In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [24]:
# Load the encoded car dataset and discard every row that has a missing value
df = (
    pd.read_csv(
        "https://static.bc-edx.com/ai/ail-v-1-0/m12/lesson_1/datasets/car-data-encoded.csv"
    )
    .dropna()
)
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
3,2,164.0,1,1,0,4.0,3,1,0,99.8,...,109,5,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,1,1,0,4.0,3,0,0,99.4,...,136,5,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0
6,1,158.0,1,1,0,4.0,3,1,0,105.8,...,136,5,3.19,3.4,8.5,110.0,5500.0,19,25,17710.0
8,1,158.0,1,1,1,4.0,3,1,0,105.8,...,131,5,3.13,3.4,8.3,140.0,5500.0,17,20,23875.0
10,2,192.0,2,1,0,2.0,3,2,0,101.2,...,108,5,3.5,2.8,8.8,101.0,5800.0,23,29,16430.0


In [26]:
# Count how many rows fall into each encoded body-style category
df["body-style"].value_counts()

body-style
3    79
2    56
4    17
1     5
0     2
Name: count, dtype: int64

In [11]:
# Build the feature matrix: a copy of df with the "price" target column removed
X = df.copy().drop(columns=["price"])
X.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg
3,2,164.0,1,1,0,4.0,3,1,0,99.8,...,4,109,5,3.19,3.4,10.0,102.0,5500.0,24,30
4,2,164.0,1,1,0,4.0,3,0,0,99.4,...,5,136,5,3.19,3.4,8.0,115.0,5500.0,18,22
6,1,158.0,1,1,0,4.0,3,1,0,105.8,...,5,136,5,3.19,3.4,8.5,110.0,5500.0,19,25
8,1,158.0,1,1,1,4.0,3,1,0,105.8,...,5,131,5,3.13,3.4,8.3,140.0,5500.0,17,20
10,2,192.0,2,1,0,2.0,3,2,0,101.2,...,4,108,5,3.5,2.8,8.8,101.0,5800.0,23,29


In [12]:
# Extract the target ("price") and reshape it into a single-column 2-D array
y = df["price"].to_numpy().reshape(-1, 1)
y[:5]

array([[13950.],
       [17450.],
       [17710.],
       [23875.],
       [16430.]])

In [13]:
# Split the features and target into training and testing sets with sklearn's
# `train_test_split()`.
# NOTE(review): X_train / X_test are not referenced by any later visible cell,
# and y_train / y_test are reassigned by the later split that uses
# random_state=14 -- confirm whether this cell is still needed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [14]:
# Create a function to calculate VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(X):
    """Return the variance inflation factor (VIF) of every column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; every column is scored against all the others.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``X`` (in the original column order) with two
        columns: ``"variables"`` (the column name) and ``"VIF"`` (its score).

    Notes
    -----
    The matrix is handed to ``variance_inflation_factor`` exactly as given;
    no intercept column is appended here.
    NOTE(review): a column that never varies (for example one that is all
    zeros) appears to yield a NaN score together with a runtime warning --
    confirm whether an explicit constant column should be added before
    scoring.
    """
    # Convert to an array once: the same matrix is reused for every column,
    # so rebuilding it inside the loop is wasted work.
    design = X.values
    scores = [
        variance_inflation_factor(design, col_idx)
        for col_idx in range(design.shape[1])
    ]
    return pd.DataFrame({"variables": X.columns, "VIF": scores})

In [17]:
# Score every feature in X and list the results from lowest to highest VIF
calc_vif(X).sort_values("VIF")

  return 1 - self.ssr/self.uncentered_tss


Unnamed: 0,variables,VIF
4,aspiration,4.864129
0,symboling,5.581946
17,fuel-system,9.244786
2,make,11.157449
7,drive-wheels,19.063074
14,engine-type,24.333723
1,normalized-losses,33.115809
5,num-of-doors,34.431202
6,body-style,35.352569
21,horsepower,187.255117


In [19]:
# Look at the distribution of engine-location to understand why its VIF came
# back as NaN
X["engine-location"].value_counts()

engine-location
0    159
Name: count, dtype: int64

In [20]:
# Build a reduced feature set: remove engine-location together with the four
# columns that had the highest VIF scores
X_vif = X.drop(
    columns=["engine-location", "width", "wheel-base", "length", "height"]
)

# Score the remaining features again, lowest VIF first
calc_vif(X_vif).sort_values("VIF")

Unnamed: 0,variables,VIF
4,aspiration,4.006607
0,symboling,4.290569
12,fuel-system,8.421521
2,make,10.483171
7,drive-wheels,17.785714
9,engine-type,19.227824
6,body-style,28.723936
1,normalized-losses,30.085193
5,num-of-doors,31.986812
16,horsepower,157.051644


In [27]:
# Split the full feature set, the reduced feature set and the target in a
# single call so all three are partitioned together
(
    X_full_train,
    X_full_test,
    X_vif_train,
    X_vif_test,
    y_train,
    y_test,
) = train_test_split(X, X_vif, y, random_state=14)

In [28]:
# Train one linear regression per feature set

# Model 1: all features
lr1 = LinearRegression()
lr1.fit(X_full_train, y_train)

# Model 2: reduced (VIF-filtered) features
lr2 = LinearRegression()
lr2.fit(X_vif_train, y_train)

In [30]:
# Provided code to create the adjusted r-squared function
def r2_adj(x, y, model):
    """Return the adjusted r-squared of ``model`` on ``(x, y)``.

    Computes ``1 - (1 - r2) * (n - 1) / (n - p - 1)`` where ``r2`` is
    ``model.score(x, y)``, ``n`` is ``len(y)`` and ``p`` is ``x.shape[1]``.

    Parameters
    ----------
    x : 2-D array-like exposing ``.shape``
        Feature matrix; its second dimension is used as the predictor count.
    y : sized array-like
        Target values; its length is used as the observation count.
    model : object with a ``score(x, y)`` method
        The value returned by ``score`` is treated as the plain r-squared.

    Returns
    -------
    float
        The adjusted r-squared.

    Raises
    ------
    ValueError
        If ``len(y) - x.shape[1] - 1`` is not positive. The adjustment divides
        by that quantity, so the result would otherwise be a division by zero
        or a sign-flipped, meaningless number.
    """
    n_obs = len(y)
    n_cols = x.shape[1]
    dof = n_obs - n_cols - 1
    if dof <= 0:
        raise ValueError(
            "adjusted r-squared is undefined: need more than "
            f"{n_cols + 1} observations for {n_cols} features, got {n_obs}"
        )
    r2 = model.score(x, y)
    return 1 - (1 - r2) * (n_obs - 1) / dof

In [34]:
# Adjusted r-squared of each model on the training split
train_adj_score1 = r2_adj(X_full_train, y_train, lr1)
train_adj_score2 = r2_adj(X_vif_train, y_train, lr2)

# Full-feature model first, reduced-feature model second, blank line between
print(train_adj_score1, train_adj_score2, sep="\n\n")

0.865147102182799

0.8581927171285877


In [33]:
# Adjusted r-squared of each model on the held-out test split
adj_score1 = r2_adj(X_full_test, y_test, lr1)
adj_score2 = r2_adj(X_vif_test, y_test, lr2)

# Full-feature model first, reduced-feature model second, blank line between
print(adj_score1, adj_score2, sep="\n\n")

0.4460851778424917

0.6387757232749596
