In [23]:
import pandas as pd
df = pd.read_csv('auto_mpg.csv')
df


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


# Data Properties


In [24]:
# Let us use DataFrame.info() to get to know about the data properties.

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


# Dropping null values


In [25]:
df.dropna(inplace = True)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


# Predictors and Target
The target variable is 'mpg' which has to be predicted. The predictors are the variables that are used to predict the target. Here, except name of the car, all the other variables are included as predictors.



In [26]:
#Creating matrix of predictors
X = df.iloc[:, 1:8]
#Creating target
y = df.iloc[:, 0]


In [27]:
#   Since the origin feature is a categorical variable,  get_dummies function can be used from Pandas to encode it as shown below:


X = pd.get_dummies(X)
X


Unnamed: 0,cylinders,displacement,weight,acceleration,model year,origin,horsepower_100,horsepower_102,horsepower_103,horsepower_105,...,horsepower_90,horsepower_91,horsepower_92,horsepower_93,horsepower_94,horsepower_95,horsepower_96,horsepower_97,horsepower_98,horsepower_?
0,8,307.0,3504,12.0,70,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8,350.0,3693,11.5,70,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,8,318.0,3436,11.0,70,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,8,304.0,3433,12.0,70,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8,302.0,3449,10.5,70,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393,4,140.0,2790,15.6,82,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
394,4,97.0,2130,24.6,82,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
395,4,135.0,2295,11.6,82,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
396,4,120.0,2625,18.6,82,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# The data must be divided into two parts. First, a training set on which model can be trained. Second, a testing set on which the model can be validated.



In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)


# Since all the variables in the data are with different units of measurements and different scales, it would be a good idea to standardize them. A standard scaler performs this operation by transforming the columns such that the mean of every column or variable is 0 and standard deviation is 1.



In [29]:
#Applying standard scaler on the data
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
scale.fit_transform(X_train)
scale.transform(X_test);


The linear regression model is used to build the model. A linear regression model uses the following equation:

y = B0 + B1*X1 + B2*X2 + _ _ _ _ + Bn*Xn

In this case, y refers to the target and X1,X2…..Xn refer to the predictors. B0 is the intercept  and B1,B2…..Bn are the coefficients.

Below code demonstrates the Linear Regression model building using sklearn library on the training data set. 

In [30]:
#Importing and fitting the model on training set
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
#Fitting the model on training data :
reg.fit(X_train, y_train)
#Checking the coefficient(slope) and intercept. 
#'m' represents the coefficient and 'c' represents the intercept.
m = reg.coef_
c = reg.intercept_
m,c


(array([ 1.23175939e-01,  2.61040807e-03, -3.61911050e-03, -2.01873399e-01,
         6.87842053e-01,  1.40141610e+00,  1.19870727e+10,  1.19870727e+10,
         1.19870727e+10,  1.19870727e+10,  2.23378291e+10,  1.19870727e+10,
         1.19870727e+10,  1.19870727e+10,  1.19870727e+10,  1.19870727e+10,
        -2.65638284e+10,  1.19870727e+10,  1.19870727e+10,  1.19870727e+10,
         1.19870727e+10,  1.19870727e+10,  1.19870727e+10,  1.19870727e+10,
         1.19870727e+10,  1.19870727e+10,  1.19870727e+10,  1.19870727e+10,
         1.19870727e+10,  1.19870727e+10,  1.19870727e+10, -8.00505075e+07,
         1.19870727e+10,  1.19870727e+10,  1.19870727e+10,  1.19870727e+10,
         1.03291645e+06,  1.19870727e+10,  1.19870727e+10,  1.19870727e+10,
         1.19870727e+10,  1.19870727e+10,  1.19870727e+10,  1.19870727e+10,
         1.19870727e+10,  1.19870727e+10, -7.26508841e+01,  1.07634823e+01,
         1.19870727e+10,  1.19870727e+10,  1.19870727e+10, -9.03117657e-03,
         1.1

In the next step, the linear regression model created is used for prediction against the training and testing data set.



In [31]:
#Predicting the target: mpg against the predictors in the training data set
#Predicted data stored in y_pred_train
y_pred_train = reg.predict(X_train)
#Predicting the target: mpg against the predictors in the testing data set
#Predicted data stored in y_pred_test
y_pred_test = reg.predict(X_test)


There are different metrics used to evaluate the performance of the model. Here the R Square score is used.



In [32]:
# Prediction Accuracy in terms of how close is the predicted value of target: mpg
# to the real value in training data set
from sklearn.metrics import r2_score
r2_S = r2_score(y_train, y_pred_train)
r2_S


0.9060446964021497

In [33]:
# Prediction Accuracy in terms of how close the predicted value of target: mpg
# to the real value in testing data set
from sklearn.metrics import r2_score
r2_S = r2_score(y_test, y_pred_test)
r2_S


-6.821813516727603e+17