In [1]:
import numpy as np   
import pandas as pd    
import matplotlib.pyplot as plt   
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import StandardScaler  
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from sklearn.decomposition import PCA

In [2]:
mpg_df = pd.read_csv("car-mpg.csv")  
mpg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   mpg       398 non-null    float64
 1   cyl       398 non-null    int64  
 2   disp      398 non-null    float64
 3   hp        398 non-null    object 
 4   wt        398 non-null    int64  
 5   acc       398 non-null    float64
 6   yr        398 non-null    int64  
 7   origin    398 non-null    int64  
 8   car_type  398 non-null    int64  
 9   car_name  398 non-null    object 
dtypes: float64(3), int64(5), object(2)
memory usage: 31.2+ KB


In [3]:
mpg_df = mpg_df.drop('car_name', axis=1)
mpg_df = mpg_df.replace('?', np.nan)
#mpg_df = mpg_df.apply(lambda x: x.fillna(x.median()),axis=0)
mpg_df['hp'].fillna(mpg_df['hp'].median(), inplace=True)
mpg_df['hp'] = mpg_df['hp'].astype('float64')

In [4]:
mpg_df['origin'] = mpg_df['origin'].replace({1: 'america', 2: 'europe', 3: 'asia'})
mpg_df = pd.get_dummies(mpg_df, columns=['origin'])
mpg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   mpg             398 non-null    float64
 1   cyl             398 non-null    int64  
 2   disp            398 non-null    float64
 3   hp              398 non-null    float64
 4   wt              398 non-null    int64  
 5   acc             398 non-null    float64
 6   yr              398 non-null    int64  
 7   car_type        398 non-null    int64  
 8   origin_america  398 non-null    uint8  
 9   origin_asia     398 non-null    uint8  
 10  origin_europe   398 non-null    uint8  
dtypes: float64(4), int64(4), uint8(3)
memory usage: 26.2 KB


# separate independent and dependent variables

In [5]:
# Separate Xs and Y
X = mpg_df.drop('mpg', axis=1)

# Copy the 'mpg' column alone into the y dataframe. This is the dependent variable
y = mpg_df[['mpg']]


In [6]:
# Scale the Xs

scaler = StandardScaler()  
X_scaled = scaler.fit_transform(X)  
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [7]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=1)

# fit a simple linear model

In [8]:
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

print("Intercept: ", regression_model.intercept_)
for idx, col_name in enumerate(X.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))

Intercept:  [23.66510774]
The coefficient for cyl is 2.5059518049385052
The coefficient for disp is 2.535708286056052
The coefficient for hp is -1.7889335736325254
The coefficient for wt is -5.551819873098726
The coefficient for acc is 0.11485734803440907
The coefficient for yr is 2.931846548211611
The coefficient for car_type is 2.977869737601943
The coefficient for origin_america is -0.5832955290165979
The coefficient for origin_asia is 0.3474931380432245
The coefficient for origin_europe is 0.37741646808688323


In [9]:
print(regression_model.score(X_train, y_train))
print(regression_model.score(X_test, y_test))


0.8343770256960538
0.8513421387780066


## Iteration 2 - Linear regression 
### Understand Rule for Dummy Variable Regression
### Remove 1 Dummy variable


In [10]:

#X_train = X_train.drop('origin_europe', axis=1)
#X_test = X_test.drop('origin_europe', axis=1)

X_train = X_train.drop('origin_america', axis=1)
X_test = X_test.drop('origin_america', axis=1)

In [11]:
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

print("Intercept: ", regression_model.intercept_)
for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))

Intercept:  [23.66510774]
The coefficient for cyl is 2.5059518049385003
The coefficient for disp is 2.535708286056051
The coefficient for hp is -1.788933573632526
The coefficient for wt is -5.551819873098725
The coefficient for acc is 0.11485734803440664
The coefficient for yr is 2.9318465482116074
The coefficient for car_type is 2.977869737601942
The coefficient for origin_asia is 0.8282270142957202
The coefficient for origin_europe is 0.8362781383948806


In [12]:
print(regression_model.score(X_train, y_train))
print(regression_model.score(X_test, y_test))

0.8343770256960538
0.8513421387780067


## IMPORTANT NOTE
#### The above results show that one less variable should be  defined for dummy variables
#### This can be achieved using following code (drop_first=True)
**----------------------------------------------------------------------------------------------**

mpg_df = pd.get_dummies(mpg_df, columns=['origin'], drop_first=True)

**----------------------------------------------------------------------------------------------**

# PCA

In [13]:
pca = PCA()
X_train_proj = pca.fit_transform(X_train)
X_test_proj = pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_
explained_variance

array([0.58033789, 0.14591122, 0.10687254, 0.07833465, 0.0441282 ,
       0.02739741, 0.00719863, 0.00705853, 0.00276093])

### Decide how many compoenents to choose

In [14]:
pca = PCA(n_components=8)
X_train_proj = pca.fit_transform(X_train)
X_test_proj = pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_
explained_variance

array([0.58033789, 0.14591122, 0.10687254, 0.07833465, 0.0441282 ,
       0.02739741, 0.00719863, 0.00705853])

In [15]:
X_train_proj = pd.DataFrame(X_train_proj)
regression_model = LinearRegression()
regression_model.fit(X_train_proj, y_train)

print("Intercept: ", regression_model.intercept_)
for idx, col_name in enumerate(X_train_proj.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))

Intercept:  [23.60071942]
The coefficient for 0 is -2.9052283475714455
The coefficient for 1 is -0.9653168154254849
The coefficient for 2 is 0.47596256811608567
The coefficient for 3 is -2.1270719000164378
The coefficient for 4 is -0.0064308216544859045
The coefficient for 5 is -1.5368682331538936
The coefficient for 6 is -6.232464748757214
The coefficient for 7 is -2.7714929360124714


In [16]:
print(regression_model.score(X_train_proj, y_train))
print(regression_model.score(X_test_proj, y_test))


0.8333537470215796
0.8478244288876238
