In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [13]:
# Load the encoded car dataset from its hosted CSV file and preview
# the first five rows.
DATA_URL = "https://static.bc-edx.com/ai/ail-v-1-0/m12/lesson_1/datasets/car-data-encoded.csv"
car_data = pd.read_csv(DATA_URL)
car_data.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,0,1,0,2.0,0,2,0,88.6,...,130,5,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,0,1,0,2.0,0,2,0,88.6,...,130,5,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,0,1,0,2.0,2,2,0,94.5,...,152,5,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,1,1,0,4.0,3,1,0,99.8,...,109,5,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,1,1,0,4.0,3,0,0,99.4,...,136,5,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [14]:
# Show the (rows, columns) dimensions of the dataset as loaded
car_data.shape

(205, 26)

In [15]:
# Count the missing values in each column
car_data.isna().sum()

symboling             0
normalized-losses    41
make                  0
fuel-type             0
aspiration            0
num-of-doors          2
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

In [16]:
# Keep only the rows that have a value in every column.
# NOTE(review): only `horsepower` and `price` are used further down, yet
# this drops 46 of the 205 rows (normalized-losses alone has 41 missing
# values). Consider whether dropna(subset=[...]) would be more appropriate.
car_data = car_data.dropna(axis=0, how="any")

In [17]:
# Confirm how many rows remain after dropping rows with missing values
car_data.shape

(159, 26)

## Split into training and testing sets

In [18]:
# Build the single-feature matrix: a one-column DataFrame holding only
# horsepower (double brackets keep it two-dimensional).
X_one_col = car_data[["horsepower"]].copy()
X_one_col.head()

Unnamed: 0,horsepower
3,102.0
4,115.0
6,110.0
8,140.0
10,101.0


In [19]:
# Create another variable, X_multi_col, by adding columns that each
# contain a single constant value and are therefore useless to the model.
# Every added column is deterministic, so no random seed is required here.
# `assign` returns a new DataFrame, leaving X_one_col untouched.
X_multi_col = X_one_col.assign(
    ones=1,
    twos=2,
    threes=3,
    fours=4,
    fives=5,
    sixes=6,
    sevens=7,
    eights=8,
)
X_multi_col.head()

Unnamed: 0,horsepower,ones,twos,threes,fours,fives,sixes,sevens,eights
3,102.0,1,2,3,4,5,6,7,8
4,115.0,1,2,3,4,5,6,7,8
6,110.0,1,2,3,4,5,6,7,8
8,140.0,1,2,3,4,5,6,7,8
10,101.0,1,2,3,4,5,6,7,8


In [20]:
# Set the target variable y: the price column as an (n_samples, 1) array
y = car_data[["price"]].to_numpy()

In [21]:
# Split both feature sets and the target in one call so that all three
# share exactly the same training and testing rows.
(
    X_one_col_train,
    X_one_col_test,
    X_multi_col_train,
    X_multi_col_test,
    y_train,
    y_test,
) = train_test_split(X_one_col, X_multi_col, y, random_state=13)

## Train the models

In [22]:
# Create one estimator per feature set
lr1, lr2 = LinearRegression(), LinearRegression()

# lr1 is trained on the single horsepower column.
lr1.fit(X_one_col_train, y_train)

# lr2 is trained on horsepower plus the eight constant columns.
lr2.fit(X_multi_col_train, y_train)

In [23]:
# Use .coef_ to view the coefficients of the multi-column model.
# Note the coefficients: every constant column receives a weight of 0,
# so the added columns aren't being used!
lr2.coef_

array([[133.47349082,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,   0.        ,
          0.        ]])

In [24]:
# Coefficient of the horsepower-only model, for comparison with lr2
lr1.coef_

array([[133.47349082]])

## Evaluate the models

In [25]:
# Calculate the mean squared error and the r-squared value of both
# models on the testing data.

# Use our models to make predictions
predicted1 = lr1.predict(X_one_col_test)
predicted2 = lr2.predict(X_multi_col_test)

# Score the predictions with MSE and R2, rounded to two decimals for display
mse1 = round(mean_squared_error(y_test, predicted1), 2)
r21 = round(r2_score(y_test, predicted1), 2)
mse2 = round(mean_squared_error(y_test, predicted2), 2)
r22 = round(r2_score(y_test, predicted2), 2)

# NOTE(review): the first group of lines reports lr1 (horsepower only) and
# the second group reports lr2 (horsepower plus the constant columns);
# confirm that the "All"/"Select" labels describe them as intended.
print("All Features:")
print(f"mean squared error (MSE): {mse1}")
print(f"R-squared (R2): {r21}")
print("---------------------")
print("Select Features:")
print(f"mean squared error (MSE): {mse2}")
print(f"R-squared (R2): {r22}")
print("---------------------")
# Round the difference as well: subtracting two rounded floats can
# otherwise print noise such as 0.10999999999999999.
print(f"Difference: {round(r21 - r22, 2)}")

All Features:
mean squared error (MSE): 15795108.85
R-squared (R2): 0.63
---------------------
Select Features:
mean squared error (MSE): 15795108.85
R-squared (R2): 0.63
---------------------
Difference: 0.0


In [26]:
# Provided code to create the adjusted r-squared function
def r2_adj(x, y, model):
    """Return the adjusted R-squared of ``model`` evaluated on ``x`` and ``y``.

    The plain R-squared from ``model.score(x, y)`` is penalised for the
    number of feature columns:

        1 - (1 - R2) * (n - 1) / (n - p - 1)

    where ``n`` is ``len(y)`` and ``p`` is ``x.shape[1]``.

    Parameters
    ----------
    x : two-dimensional feature container exposing ``.shape``
        Its second dimension is taken as the number of features.
    y : sized container of target values
        Its length is taken as the number of samples.
    model : object exposing ``score(x, y)``
        The value returned by ``score`` is used as the unadjusted R-squared.

    Returns
    -------
    The adjusted R-squared value.

    Raises
    ------
    ValueError
        If ``len(y) - x.shape[1] - 1`` is not positive; the adjustment is
        undefined (division by zero or a sign flip) in that case.
    """
    r2 = model.score(x, y)
    n_rows = len(y)
    n_cols = x.shape[1]

    # Residual degrees of freedom; must be positive for the formula to hold.
    dof = n_rows - n_cols - 1
    if dof <= 0:
        raise ValueError(
            "adjusted R-squared is undefined: need more samples than "
            f"features + 1 (got {n_rows} samples and {n_cols} features)"
        )
    return 1 - (1 - r2) * (n_rows - 1) / dof

In [28]:
# Calculate the adjusted r-squared value of each model on the testing data.
# adj_score1 belongs to lr1 (horsepower only); adj_score2 belongs to lr2
# (horsepower plus the constant columns).
adj_score1 = round(r2_adj(x=X_one_col_test, y=y_test, model=lr1), 2)
adj_score2 = round(r2_adj(x=X_multi_col_test, y=y_test, model=lr2), 2)
score_gap = round(adj_score1 - adj_score2, 2)

print(f"All Features Adjusted R2: {adj_score1}")
print(f"Select Features Adjusted R2: {adj_score2}")
print(f"Difference: {score_gap}")

All Features Adjusted R2: 0.62
Select Features Adjusted R2: 0.52
Difference: 0.1


In [29]:
# Examine linear regression on the multi-column training data using
# cross validation (R2 is computed on each fold).
from sklearn.model_selection import cross_val_score

cv_model = LinearRegression()
cv_scores = cross_val_score(cv_model, X_multi_col_train, y_train, scoring='r2')

cv_mean = cv_scores.mean()
cv_std = cv_scores.std()

print(f"All scores: {cv_scores}")
print(f"Mean score: {cv_mean}")
print(f"Standard Deviation: {cv_std}")

All scores: [0.71203723 0.53685971 0.41357888 0.62659487 0.34464322]
Mean score: 0.526742783649534
Standard Deviation: 0.13444191347453338
