##                                                  Day 1 - Part_1 LAB

###  REGRESSION MACHINE LEARNING - Baseline Model Implementation

In [None]:
#importing required libraries for data analysis
import pandas as pd
import numpy as np
import time

# Import models from sklearn
from sklearn.linear_model import LinearRegression,ElasticNet,Ridge,Lasso,LassoCV,RidgeCV
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# Import evaluation metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score

#Visuals
import matplotlib.pyplot as plt

In [None]:
# Load the train/test feature and target tables produced by the Part1 preprocessing step.
# index_col=0 makes pandas use the first CSV column as the row index.
X_train, y_train, X_test, y_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "./regression_dataset/X_train.csv",
        "./regression_dataset/y_train.csv",
        "./regression_dataset/X_test.csv",
        "./regression_dataset/y_test.csv",
    )
)

# Print the four shapes as a quick sanity check on what was loaded.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

## 1. Linear Regression Model

In [None]:
# Create the linear regression model
lrmodel = LinearRegression()

# Time only the fit. perf_counter() is a monotonic, high-resolution clock intended
# for measuring short intervals; time.time() can be coarse on some platforms and
# can jump if the system clock is adjusted, which would distort a sub-second timing.
start = time.perf_counter()
lrmodel.fit(X_train, y_train)
stop = time.perf_counter()

# Predictions for the test split and for the training split
y_pred = lrmodel.predict(X_test)
y_train_pred = lrmodel.predict(X_train)

In [None]:
# Print the fitted equation in the form: y = beta0 + beta1 * X1 + beta2 * X2 + ... + E
# intercept_ and coef_ are indexed with [0] to take the first entry/row
# (assumes y_train holds a single target column -- TODO confirm).
intercept = lrmodel.intercept_[0]
coefficients = lrmodel.coef_[0]

print("y =", intercept, " + ")
# One "<coefficient> * <feature name>  + " term per feature, all on the same output line
for coef_value, column_name in zip(coefficients, X_train.columns):
    print(coef_value, "*", column_name, " + ", end="")
print(" E")

In [None]:
# Reproduce the model's test predictions by hand:
# intercept + sum over features of (coefficient * feature value), one value per row
y_pred_mn = intercept + np.sum(coefficients * X_test, axis=1)

# Manual predictions go on the x-axis and lrmodel.predict() output on the y-axis,
# so the labels must name them in that order.
plt.scatter(y_pred_mn, y_pred)
plt.xlabel("y_pred_manual")
plt.ylabel("y_pred")

### Feature Importance 

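In regression analysis, the magnitude of a coefficient is not necessarily related to the importance of its feature, because the coefficient depends on the scale (units) of that feature. To make features comparable, the cells below multiply each absolute coefficient by the standard deviation of its feature and then divide by the largest resulting value, so the most important feature has an importance of 1.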

In [None]:
# Standard deviation of every training feature, in X_train column order
stdevs = [X_train[column].std() for column in X_train.columns]

In [None]:
# One row per feature (indexed by column name) holding its fitted coefficient
features = pd.DataFrame(coefficients, X_train.columns, columns=['coef'])

# Normalized feature importance: |coef| * feature standard deviation,
# divided by the largest such value so the top feature scores 1
weighted = np.array(stdevs) * features["coef"].abs()
features['importance'] = weighted / max(weighted)

# Sort ascending by importance, then move the feature names out of the index
# into a regular column called 'features'
features = (
    features
    .sort_values(by="importance")
    .reset_index()
    .rename(columns={'index': 'features'})
)

In [None]:
#Feature Importance
plt.figure(figsize=(10,5))
plt.barh(features['features'][-15:], features['importance'][-15:])
plt.xlabel("Feature Importance")

### Model Evaluation

In [None]:
#Model Evaluation
# Model score on the training and test splits
lrm_trscore = round(lrmodel.score(X_train, y_train),3)
lrm_tescore = round(lrmodel.score(X_test, y_test),3)

# Keep full-precision metrics and round only the reported values. Adjusted R^2
# multiplies (1 - R^2) by (n-1)/(n-p-1), so deriving it from an already-rounded
# R^2 would amplify the rounding error.
r2_raw = r2_score(y_test, y_pred)             # coefficient of determination
mse_raw = mean_squared_error(y_test, y_pred)  # computed once, reused for RMSE
n_obs = len(y_test)                           # number of test observations
n_feat = len(X_test.columns)                  # number of predictors

lrm_r2 = round(r2_raw, 3)
lrm_adj = round(1 - (1 - r2_raw) * (n_obs - 1) / (n_obs - n_feat - 1), 3)
lrm_mae = round(mean_absolute_error(y_test, y_pred), 3)
lrm_mse = round(mse_raw, 3)
lrm_rmse = round(np.sqrt(mse_raw), 3)

# `features` is sorted ascending by importance, so reverse it and take the top 3
lrm_feat = np.array(features['features'][::-1][:3])

#Training time of the model
lrm_time = stop - start

# One-row summary table for this model
results = pd.DataFrame([['Linear Regression', lrm_trscore, lrm_tescore, lrm_adj, lrm_mae, lrm_mse, lrm_rmse, lrm_time, lrm_feat]],
               columns = ['Model', 'Train Score', 'Test Score/R-Square','Adjusted R^2', 'MAE', 'MSE', 'RMSE', "Training Time(s)","Important Features"])
# Widen column display so the "Important Features" array is not truncated
pd.set_option('display.max_colwidth', 100)
results

In [None]:
# Test vs Prediction: actual test targets (x-axis) against model predictions (y-axis)
plt.figure(figsize=(6,6))
plt.scatter(y_test, y_pred)
# Label both axes so the figure can be read on its own, like the other plots in this notebook
plt.xlabel("y_test")
plt.ylabel("y_pred")

### Task 1 - **Build a different model, Evaluate the predictions & Compare the results**

In [None]:
# Import the Model from sklearn



In [None]:
# Fit the Model & Predict  - X_train, y_train, X_test, y_test





In [None]:
# Visualize Important Features





In [None]:
# Evaluate the model predictions

###  **Build a different model, Evaluate the predictions & Compare the results**


