In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

### General linear regression on test data

This entire section repeats the initial 'general_regression' notebook. With a new test set of 51 cars available, however, it would have been a waste not to try again with a larger training set and a much smaller test set.

In [9]:
# Read the cleaned training frame and the separate test frame, in that order.
df, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/cleaned.csv', '../data/test_cleaned.csv')
)

In [18]:
# From the specific models, we know that these two car models are not in our dataframe.
# Dropping them ensures no astronomical values in our linear regression.
# .copy() makes the filtered frame independent of the one read from disk, so a column
# can be added to it later without pandas raising SettingWithCopyWarning.
unseen_models = ['LR2', 'Sonata Plug-in Hybrid']
test = test[~test['model'].isin(unseen_models)].copy()

In [19]:
# Attach the test dataframe to the cleaned dataframe before one-hot encoding so that
# both end up with exactly the same dummy columns, then split them apart again:
# fit on the training rows and make predictions on the test rows (temp_X).
temp = pd.concat([df, test], ignore_index=True)
dummies = pd.get_dummies(temp, columns=['manufacturer', 'model', 'trim', 'color'], drop_first=True)

# The training rows come first in the concatenated frame and everything after them is
# the test set. Splitting on the training row count (instead of a hard-coded 49) keeps
# the split correct if the test set changes size.
n_train = df.shape[0]
temp = dummies.iloc[n_train:]
temp_X = temp.drop(columns=['name', 'price'])
dummies = dummies.iloc[:n_train]
X = dummies.drop(columns=['name', 'price'])
y = dummies['price']

lr = LinearRegression()
lr.fit(X, y)
pred = lr.predict(temp_X)

In [20]:
# Store the predictions alongside the test rows. assign() returns a new frame instead of
# writing into `test` in place, which avoids SettingWithCopyWarning when `test` is a
# filtered slice and keeps this cell safe to re-run.
test = test.assign(prediction=pred)

In [21]:
# RMSE of the predictions on the test set. The square root is taken explicitly because
# the `squared=False` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(metrics.mean_squared_error(test['price'], test['prediction']))

2750.0142180276566

In [24]:
# R^2 of the linear regression on the data it was fitted on
# (the same quantity lr.score(X, y) reports).
metrics.r2_score(y, lr.predict(X))

0.9590749539400443

In [23]:
# R^2 of the linear regression on the test rows
# (the same quantity lr.score(temp_X, test['price']) reports).
metrics.r2_score(test['price'], lr.predict(temp_X))

0.9067626877164874

### General decision tree on test data

In [37]:
# Same two models dropped, although the algorithm does work (just not nearly as well)
# with the two rogue cars.
df = pd.read_csv('../data/cleaned.csv')
test = pd.read_csv('../data/test_cleaned.csv')
# .copy() so that a column can be added to the filtered frame later without pandas
# raising SettingWithCopyWarning.
test = test[~test['model'].isin(['LR2', 'Sonata Plug-in Hybrid'])].copy()

In [38]:
# One-hot encode the training and test rows together so their dummy columns match,
# then split them apart again.
temp = pd.concat([df, test], ignore_index=True)
dummies = pd.get_dummies(temp, columns=['manufacturer', 'model', 'trim', 'color'], drop_first=True)

# Everything after the training rows is the test set. Splitting on the training row
# count works whether or not the two cars were dropped (49 or 51 test rows), unlike a
# hard-coded tail(49).
n_train = df.shape[0]
temp = dummies.iloc[n_train:]
temp_X = temp.drop(columns=['name', 'price'])
dummies = dummies.iloc[:n_train]
X = dummies.drop(columns=['name', 'price'])
y = dummies['price']

# Fixed seed so the fitted tree, and therefore its predictions, are reproducible
# across runs.
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X, y)
pred = dt.predict(temp_X)

In [39]:
# Store the decision-tree predictions alongside the test rows. assign() returns a new
# frame instead of writing into `test` in place, which avoids SettingWithCopyWarning
# when `test` is a filtered slice and keeps this cell safe to re-run.
test = test.assign(prediction=pred)

In [40]:
# RMSE of the decision-tree predictions on the test set. The square root is taken
# explicitly because the `squared=False` argument was deprecated in scikit-learn 1.4
# and removed in 1.6.
np.sqrt(metrics.mean_squared_error(test['price'], test['prediction']))

5250.850271282601

In [41]:
# R^2 of the decision tree on the data it was fitted on
# (the same quantity dt.score(X, y) reports).
metrics.r2_score(y, dt.predict(X))

1.0

In [42]:
# R^2 of the decision tree on the test rows
# (the same quantity dt.score(temp_X, test['price']) reports).
metrics.r2_score(test['price'], dt.predict(temp_X))

0.6600781072205841

### Random forest on test data

In [43]:
# Reload both frames and drop the same two car models as in the sections above.
df = pd.read_csv('../data/cleaned.csv')
test = pd.read_csv('../data/test_cleaned.csv')
# .copy() so that a column can be added to the filtered frame later without pandas
# raising SettingWithCopyWarning.
test = test[~test['model'].isin(['LR2', 'Sonata Plug-in Hybrid'])].copy()

In [44]:
# One-hot encode the training and test rows together so their dummy columns match,
# then split them apart again.
temp = pd.concat([df, test], ignore_index=True)
dummies = pd.get_dummies(temp, columns=['manufacturer', 'model', 'trim', 'color'], drop_first=True)

# Everything after the training rows is the test set. Splitting on the training row
# count (instead of a hard-coded 49) keeps the split correct if the test set changes size.
n_train = df.shape[0]
temp = dummies.iloc[n_train:]
temp_X = temp.drop(columns=['name', 'price'])
dummies = dummies.iloc[:n_train]
X = dummies.drop(columns=['name', 'price'])
y = dummies['price']

# Fixed seed so the fitted forest, and therefore its predictions, are reproducible
# across runs.
rf = RandomForestRegressor(random_state=42)
rf.fit(X, y)
pred = rf.predict(temp_X)

In [45]:
# Store the random-forest predictions alongside the test rows. assign() returns a new
# frame instead of writing into `test` in place, which avoids SettingWithCopyWarning
# when `test` is a filtered slice and keeps this cell safe to re-run.
test = test.assign(prediction=pred)

In [34]:
# RMSE of the random-forest predictions on the test set. The square root is taken
# explicitly because the `squared=False` argument was deprecated in scikit-learn 1.4
# and removed in 1.6.
np.sqrt(metrics.mean_squared_error(test['price'], test['prediction']))

3342.131436358528

In [35]:
# R^2 of the random forest on the data it was fitted on
# (the same quantity rf.score(X, y) reports).
metrics.r2_score(y, rf.predict(X))

0.985610713110797

In [36]:
# R^2 of the random forest on the test rows
# (the same quantity rf.score(temp_X, test['price']) reports).
metrics.r2_score(test['price'], rf.predict(temp_X))

0.8622895434056617