# Linear Regression 

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html?highlight=linear%20regression#

Sample regression line:

\begin{equation}
\hat{Y}_i = \hat{\beta}_0 + \hat{\beta}_1 X_i, \qquad Y_i = \hat{Y}_i + \hat{\epsilon}_i
\end{equation}

## Diabetes dataset

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
%matplotlib inline

In [None]:

# Fetch the diabetes data as a (features, target) pair of arrays
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)

# Keep only the feature column at index 2, as a 2-D single-column array
diabetes_X = diabetes_X[:, 2:3]

In [None]:
# Show the shapes of the feature array and the target array
print(diabetes_X.shape)
print(diabetes_y.shape)

In [None]:
# Inspect the Python type of the feature array
type(diabetes_X)

In [None]:
# Features: the last 20 rows are held out for testing, the rest train the model
diabetes_X_train, diabetes_X_test = diabetes_X[:-20], diabetes_X[-20:]

In [None]:
# Show the shapes of the training and testing feature arrays
print(diabetes_X_train.shape)
print(diabetes_X_test.shape)

In [None]:
# Targets: same split as the features (last 20 rows held out for testing)
diabetes_y_train, diabetes_y_test = diabetes_y[:-20], diabetes_y[-20:]

In [None]:
# Show the shapes of the training and testing target arrays
print(diabetes_y_train.shape)
print(diabetes_y_test.shape)

In [None]:
# Create the (not yet fitted) linear regression estimator
regr = linear_model.LinearRegression()

In [None]:

# Fit on the training rows, then predict the held-out test rows.
# fit() returns the estimator itself, so the two steps can be chained.
diabetes_y_pred = regr.fit(diabetes_X_train, diabetes_y_train).predict(diabetes_X_test)

In [None]:

# The fitted coefficient(s) of the regression line
print('Coefficients: \n', regr.coef_)

# The mean squared error of the predictions on the test set
print('Mean squared error: '
      f'{mean_squared_error(diabetes_y_test, diabetes_y_pred):.2f}')

# The coefficient of determination (R^2): 1 is perfect prediction
print('Coefficient of determination: '
      f'{r2_score(diabetes_y_test, diabetes_y_pred):.2f}')


In [None]:

# Plot outputs: the observed test points in black and the model's
# predictions for the same inputs as a blue line
plt.scatter(diabetes_X_test, diabetes_y_test,  color='black')
plt.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3)

# Passing an empty tuple removes all tick marks and labels from the axis
plt.xticks(())
plt.yticks(())

plt.show()

## EPA Dataset 

In [None]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

In [None]:
# Read the EPA MPG CSV from its raw GitHub URL (requires network access)
epa = pd.read_csv('https://raw.githubusercontent.com/sqlshep/SQLShepBlog/master/data/epaMpg.csv')



In [None]:
# Preview the first 10 rows
epa.head(10)

In [None]:
# (rows, columns) of the raw data
epa.shape

In [None]:
# Summary statistics for the columns
epa.describe()

In [None]:
# Drop the first column, which holds the row number
epa = epa.drop(epa.columns[[0]], axis=1)

# Replace the "." in the column names with "_".
# regex=False forces a literal match: interpreted as a regular expression,
# "." matches every character, and the default for `regex` differs between
# pandas versions.
epa.columns = epa.columns.str.replace('.', '_', regex=False)

In [None]:
# Remove the three leading columns, then display the remaining frame
epa = epa.drop(columns=epa.columns[[0, 1, 2]])
epa

In [None]:
# Drop the description columns (positions 3, 9 and 11), then display the frame
epa = epa.drop(columns=epa.columns[[3, 9, 11]])
epa

### Visualizations 

In [None]:
# Scatter plot: Cylinders (y) against HorsePower (x)
epa.plot.scatter(
    x='HorsePower', y='Cylinders', c='DarkBlue', figsize=(15, 5),
)

In [None]:
# Scatter plot: Weight (y) against HorsePower (x)
epa.plot.scatter(
    x='HorsePower', y='Weight', c='DarkBlue', figsize=(15, 5),
)

In [None]:
# Scatter plot: FuelEcon (y) against HorsePower (x)
epa.plot.scatter(
    x='HorsePower', y='FuelEcon', c='DarkBlue', figsize=(15, 5),
)

In [None]:
# Set the default figure size used by seaborn/matplotlib
sns.set(rc={'figure.figsize': (12, 8)})
# Annotated heatmap of pairwise correlations. numeric_only=True skips
# non-numeric columns; without it pandas >= 2.0 raises when the frame
# contains string columns (requires pandas >= 1.5).
sns.heatmap(epa.corr(numeric_only=True), annot=True)


In [None]:
# Inspect the column dtypes; columns reported as `object` hold character strings
epa.dtypes

In [None]:
# Convert both code columns to the pandas categorical dtype, in place
for _code_col in ('Tested_Transmission_Type_Code', 'Drive_System_Code'):
    epa[_code_col] = epa[_code_col].astype('category')

In [None]:
# One-hot encode the categorical/string columns into indicator columns
epa = pd.get_dummies(epa)

In [None]:
# Show the shape after encoding, then display the frame
print(epa.shape)
epa

In [None]:
# Feature matrix for scikit-learn: every column except the label
# we are trying to predict (FuelEcon)
epa_X = epa.loc[:, epa.columns != 'FuelEcon']


In [None]:
# Target for scikit-learn: a single-column DataFrame holding only
# the variable to predict (FuelEcon)
epa_y = epa.loc[:, epa.columns == 'FuelEcon']

In [None]:
# Hold out 20% of the rows as the test set. A fixed random_state makes the
# shuffle, and therefore every score computed from this split, reproducible
# across runs.
X_train, X_test, y_train, y_test = train_test_split(
    epa_X, epa_y, test_size=0.20, random_state=42
)

In [None]:
# Show the shapes of the four pieces produced by the split
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape )

In [None]:
from sklearn import linear_model

In [None]:
# Create the (not yet fitted) linear regression estimator for the EPA data
epa_lm = linear_model.LinearRegression()

In [None]:
# Fit the model on the training features and targets
epa_lm.fit(X_train, y_train)

In [None]:
import math 
# Shared, initially empty results table; metrics() appends one row per call
model_metrics = pd.DataFrame(columns=["Model", "MSE", "RMSE", "R2"])


def metrics(model, y, y_hat):
    """Record MSE, RMSE and R2 for one model in the shared ``model_metrics`` table.

    Args:
        model: Label stored in the "Model" column of the new row.
        y: True target values.
        y_hat: Predicted target values, passed with ``y`` to the scorers.

    Returns:
        The module-level ``model_metrics`` DataFrame, which this call has
        grown by one row (the table is mutated in place).
    """
    # Score once and derive the RMSE from it instead of computing the MSE twice.
    mse = mean_squared_error(y, y_hat)

    # Add the new row under the temporary index label -1 ...
    model_metrics.loc[-1] = {"Model": model,
                             "MSE": mse,
                             "RMSE": math.sqrt(mse),
                             "R2": r2_score(y, y_hat)}

    # ... then shift every label up by one, so the newest row is labelled 0
    # and -1 is free again for the next call.
    model_metrics.index = model_metrics.index + 1
    return model_metrics

#metrics("PCA Forest",y_test, epa_pca_y_pred)

In [None]:
# Predict the target for every row of the held-out test features
epa_y_pred = epa_lm.predict(X_test)

In [None]:
# Score the test-set predictions and add them to the results table
metrics("linear_model",y_test, epa_y_pred)

In [None]:
# Inspect the fitted model: coefficients, rank of the feature matrix, intercept
print(epa_lm.coef_)
print(epa_lm.rank_)
print(epa_lm.intercept_ )

In [None]:
# Make a prediction for a single test row (position 64).
# Passing the one-row DataFrame slice keeps the column names the model was
# fitted with; converting to a plain list drops them and makes scikit-learn
# warn that X does not have valid feature names.
epa_lm.predict(X_test.iloc[64:65])


### Visualize the Error

In [None]:
from sklearn.model_selection import cross_val_predict

# cross_val_predict returns an array of the same size as `y` where each entry
# is a prediction obtained by cross validation:
# NOTE(review): this refits copies of the estimator on 5 folds of the *test*
# set, so the plotted predictions do not come from the model fitted on
# X_train above — confirm this is the intended comparison.
predicted = cross_val_predict(epa_lm, X_test, y_test, cv=5)

# Measured vs. predicted values; the dashed diagonal marks perfect prediction
fig, ax = plt.subplots()
ax.scatter(y_test, predicted, edgecolors=(0, 0, 0))
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()
