## Import the required packages

In [None]:
# importing required packages
# data operation libraries

import numpy as np
import pandas as pd
# visualization libraries

import matplotlib.pyplot as plt
import seaborn as sns
# For regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from mpl_toolkits.mplot3d import axes3d

from sklearn.preprocessing import scale
import sklearn.linear_model as skl_lm
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
import statsmodels.formula.api as smf

%matplotlib inline
plt.style.use('seaborn-white')

# the blow code hide the python warnings
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

## Load the dataset 

In [None]:
# Load the CSV file (the path is relative to the notebook's working directory).
dataset1 = pd.read_csv(filepath_or_buffer=r'dataset/auto-mpg.csv')

# Render the loaded frame as the cell output.
dataset1

In [None]:
# Check the data type pandas inferred for each column.
dataset1.dtypes

## Replacing special-character values with null

A few columns in the dataset contain special characters such as `.` and `?` as values. In order to convert the horsepower column to float, we first have to replace such values with null.

In [None]:
# Confirm that the column holds only valid values by listing its distinct
# (sorted) entries; the output shows a "?" among the data.
# The same check can be repeated for every other column.
np.unique(dataset1["horsepower"])

In [None]:
# Boolean frame with the same shape as dataset1: True wherever a cell equals "?".
dot_mask = dataset1.eq('?')
dot_mask

In [None]:
# List the column labels; the loop in the next cell maps positions back to these names.
dataset1.columns

In [None]:
# Report every cell flagged in the mask. np.argwhere yields the (row, column)
# positions of the True entries in row-major order.
for row, col in np.argwhere(dot_mask.to_numpy()):
    print(f"Row {row}, Column '{dataset1.columns[col]}' contains '?'")


In [None]:
# Treat the "?" placeholder as missing, then drop every row that contains a
# missing value in any column.
# Only "?" is replaced here; add "." to the replace call if the data also uses
# it as a placeholder.
# `replace` and `dropna` both return new frames (no `inplace=True`), and the
# explicit `.copy()` makes `dataset1` an independent frame, so the column
# assignments in later cells are unambiguous and cannot trigger pandas'
# SettingWithCopyWarning.
dataset1 = dataset1.replace('?', np.nan).dropna().copy()
dataset1.head()

In [None]:
# Display the frame again to inspect it at this point in the notebook.
dataset1

In [None]:
# Re-check the column data types.
dataset1.dtypes

In [None]:
# The horsepower column holds numeric values, yet its dtype is reported as object.
# An object dtype usually means the column contains strings, mixed types, or
# arbitrary Python objects; here it was caused by the "?" entries.
# Those entries have been removed, so the column can now be cast to float.
horsepower_as_float = dataset1["horsepower"].astype(float)
dataset1["horsepower"] = horsepower_as_float
dataset1.dtypes

In [None]:
# Display the horsepower column.
dataset1.horsepower

In [None]:
# Display the mpg column.
dataset1.mpg

In [None]:
# Seaborn's regplot() draws data together with a fitted regression line and can
# fit higher-order polynomials through its `order` parameter (order=1, the
# default, is a straight line).
# The raw observations are drawn once with plt.scatter:
#   facecolors='None' -> hollow markers (no interior fill)
#   edgecolors='k'    -> black marker edges
plt.scatter(dataset1.horsepower, dataset1.mpg, facecolors='None', edgecolors='k', alpha=.5)

# One fitted curve per polynomial degree: (legend label, polynomial order, line colour).
polynomial_fits = [
    ('Linear', 1, 'orange'),
    ('Degree 2', 2, 'lightblue'),
    ('Degree 5', 5, 'g'),
    # ('Degree 70', 70, 'r'),  # left disabled
]
for fit_label, fit_order, fit_color in polynomial_fits:
    # scatter=False: only the fitted line is drawn; ci=None: no confidence band.
    sns.regplot(x=dataset1.horsepower, y=dataset1.mpg, ci=None, label=fit_label,
                order=fit_order, scatter=False, color=fit_color)

plt.legend()
plt.ylim(5, 55)
plt.xlim(40, 240);


# mpg = b0 + b1*hp
# mpg = b0 + b1*hp + b2*hp^2
# mpg = b0 + b1*hp + b2*hp^2 + b3*hp^3

In [None]:
# Create a new predictor column: horsepower squared.
dataset1['horsepower2'] = dataset1['horsepower'].pow(2)
dataset1.head(3)

## Using the statsmodels package

In [None]:
# Ordinary least squares fit of the linear model: mpg ~ horsepower.
est = smf.ols('mpg ~ horsepower ', dataset1).fit()
# Only the last expression of a cell is rendered, so the summary is shown once;
# the full summary already contains the coefficient table.
est.summary()

In [None]:
# Ordinary least squares fit of the quadratic model: mpg ~ horsepower + horsepower^2.
est = smf.ols(formula='mpg ~ horsepower + horsepower2', data=dataset1).fit()
# Show only the coefficient table (the second table of the summary).
est.summary().tables[1]

In [None]:
# Full summary of the most recently fitted `est` model.
est.summary()

In [None]:
# Degree 3: add the cubed horsepower as another predictor column.
dataset1['horsepower3'] = dataset1['horsepower'].pow(3)
dataset1.head(3)

In [None]:
# Ordinary least squares fit of the cubic model:
# mpg ~ horsepower + horsepower^2 + horsepower^3.
est1 = smf.ols('mpg ~ horsepower +horsepower2 + horsepower3', dataset1).fit()
# Only the last expression of a cell is rendered, so the summary is shown once;
# the full summary already contains the coefficient table.
est1.summary()

In [None]:
# Quadratic horsepower model with weight added as a further predictor.
est2 = smf.ols(formula='mpg ~ horsepower + horsepower2 + weight', data=dataset1).fit()
est2.summary()


In [None]:
# List the column names currently in the frame.
dataset1.columns

In [None]:
# Quadratic horsepower model with displacement and acceleration as further predictors.
# Note: this rebinds `est2`, replacing whatever model the name held before.
est2 = smf.ols(
    formula='mpg ~ horsepower + horsepower2  + displacement + acceleration ',
    data=dataset1,
).fit()
est2.summary()


## Using the scikit-learn package

In [None]:
regr = skl_lm.LinearRegression()

# Response and the two design matrices:
#   X  -> linear fit     mpg = b0 + b1*hp
#   X2 -> quadratic fit  mpg = b0 + b1*hp + b2*hp^2
y = dataset1["mpg"]
X = dataset1[["horsepower"]].to_numpy()
X2 = dataset1[["horsepower", "horsepower2"]].to_numpy()

# The same estimator is refitted for each design matrix; fitted values and
# residuals (actual - predicted) are stored as pred<k> / resid<k> columns.
# After the loop `regr` holds the quadratic fit.
for suffix, features in (("1", X), ("2", X2)):
    regr.fit(features, y)
    dataset1[f"pred{suffix}"] = regr.predict(features)
    dataset1[f"resid{suffix}"] = dataset1["mpg"] - dataset1[f"pred{suffix}"]




In [None]:
# Display the frame, which now includes the pred*/resid* columns.
dataset1

In [None]:
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# One panel per model: (axes, fitted-value column, residual column, panel title).
residual_panels = (
    (ax1, 'pred1', 'resid1', 'Residual Plot for Linear Fit'),      # left plot
    (ax2, 'pred2', 'resid2', 'Residual Plot for Quadratic Fit'),   # right plot
)

for ax, fitted_col, resid_col, panel_title in residual_panels:
    # Residuals against fitted values, with a red LOWESS smoother on top.
    sns.regplot(x=dataset1[fitted_col], y=dataset1[resid_col], lowess=True, ax=ax,
                line_kws={'color': 'r', 'lw': 1},
                scatter_kws={'facecolors': 'None', 'edgecolors': 'k', 'alpha': 0.5})
    # Dotted zero line spanning the plotted x-range.
    x_low, x_high = ax.xaxis.get_data_interval()
    ax.hlines(0, xmin=x_low, xmax=x_high, linestyles='dotted')
    ax.set_title(panel_title)
    ax.set_xlabel('Fitted values')
    ax.set_ylabel('Residuals')

In [None]:
# In the plots above, instead of mpg vs. horsepower, we plotted the predicted values vs. the residuals (actual - predicted).
# In the first subplot we used only the linear model; there the predicted values vs. residuals form a pattern,
# which means the model does not represent the data very well.
# In the second subplot we used horsepower^2 as one of the predictors, which makes it a polynomial regression.
# In that case the plot of predicted values vs. residuals shows no pattern: the values are scattered, which indicates a good fit.