In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score

### Linear regression specific to car model

In [87]:
# Read the cleaned training listings and the cleaned test listings from disk.
df, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/cleaned.csv', '../data/test_cleaned.csv')
)

In [88]:
# Split the training frame into one sub-frame per distinct value of 'model'.
# Technique taken from https://stackoverflow.com/questions/23691133/split-pandas-dataframe-based-on-groupby

gb = df.groupby('model')
# Iterating gb.groups yields the group keys; get_group turns each key into its sub-frame.
group_list = list(map(gb.get_group, gb.groups))

In [89]:
# Create model_list for indexing purposes: entry i is built from the value in
# the 4th column (position 3) of the first row of group_list[i], with its
# whitespace-separated parts joined by underscores. Position 3 is presumably the
# 'model' column -- confirm against the CSV column order.
#
# A comprehension is used so that no loop variable leaks into the notebook
# namespace (a plain `for df in group_list` loop would overwrite the global
# training frame `df` with the last group).
model_list = ['_'.join(group.iat[0, 3].split()) for group in group_list]

In [90]:
# attempt to concat single row, remove, fit model, and get preds
def row_prediction(row):
    """Predict the price of one listing with a linear regression fitted on its own group.

    The row's 4th value (position 3, presumably the ``model`` column -- confirm
    against the CSV column order) is normalised and looked up in ``model_list``
    to pick the matching frame from ``group_list``. The row is appended to that
    frame before one-hot encoding so the training rows and the row being
    predicted end up with identical dummy columns, and is split off again
    before fitting.

    Parameters
    ----------
    row : pandas.Series
        A single row, as passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    float or str
        The predicted price, or the sentinel string ``'Model not in dataframe.'``
        when no prediction could be produced (no matching group, or any other
        ordinary error raised while encoding or fitting).
    """
    try:
        # Join whitespace-separated parts with underscores, matching how the
        # entries of model_list are built.
        model_key = '_'.join(row.iat[3].split())
        # list.index raises ValueError when there is no group for this key.
        training_group = group_list[model_list.index(model_key)]

        combined = pd.concat([training_group, row.to_frame().T], ignore_index=True)
        dummies = pd.get_dummies(combined, columns=['manufacturer', 'model', 'trim', 'color'], drop_first=True)

        features = dummies.drop(columns=['name', 'price'])
        # The appended row is always the last one; everything before it is training data.
        X, row_X = features.iloc[:-1], features.iloc[[-1]]
        y = dummies['price'].iloc[:-1]

        lr = LinearRegression()
        lr.fit(X, y)
        return lr.predict(row_X)[0]
    except Exception:
        # Best-effort: any ordinary failure yields the sentinel so the caller can
        # drop the row later. Catching Exception rather than using a bare
        # `except:` lets KeyboardInterrupt / SystemExit propagate, so a long
        # DataFrame.apply can still be interrupted.
        return 'Model not in dataframe.'

In [91]:
# Reload both frames so the evaluation below starts from freshly read data.
df, test = map(pd.read_csv, ('../data/cleaned.csv', '../data/test_cleaned.csv'))

In [92]:
# Score every test row; rows that could not be predicted carry the sentinel string.
test['prediction'] = test.apply(func=row_prediction, axis='columns')

In [93]:
# Coerce the 'Model not in dataframe.' sentinel to NaN so the affected rows can
# be dropped afterwards.
# Approach based on https://stackoverflow.com/questions/54938727/python-pandas-drop-rows-based-on-a-columns-data-type
test['prediction'] = pd.to_numeric(test['prediction'], errors='coerce')

In [94]:
# Drop only the rows that cannot be scored: those without a prediction or
# without a true price. An unrestricted dropna() would also discard rows that
# merely have a missing value in some other column.
test = test.dropna(subset=['price', 'prediction'])

In [95]:
# Root-mean-squared error of the predictions against the true prices.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(metrics.mean_squared_error(test['price'], test['prediction']))

2292.684317055039

### Decision tree specific to car model

In [78]:
from sklearn.tree import DecisionTreeRegressor

In [79]:
# Freshly read training and test listings for the decision-tree run.
df = pd.read_csv(filepath_or_buffer='../data/cleaned.csv')
test = pd.read_csv(filepath_or_buffer='../data/test_cleaned.csv')

In [80]:
def row_prediction(row):
    """Predict the price of one listing with a decision tree fitted on its own group.

    The row's 4th value (position 3, presumably the ``model`` column -- confirm
    against the CSV column order) is normalised and looked up in ``model_list``
    to pick the matching frame from ``group_list``. The row is appended to that
    frame before one-hot encoding so the training rows and the row being
    predicted end up with identical dummy columns, and is split off again
    before fitting.

    Parameters
    ----------
    row : pandas.Series
        A single row, as passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    float or str
        The predicted price, or the sentinel string ``'Model not in dataframe.'``
        when no prediction could be produced (no matching group, or any other
        ordinary error raised while encoding or fitting).
    """
    try:
        # Join whitespace-separated parts with underscores, matching how the
        # entries of model_list are built.
        model_key = '_'.join(row.iat[3].split())
        # list.index raises ValueError when there is no group for this key.
        training_group = group_list[model_list.index(model_key)]

        combined = pd.concat([training_group, row.to_frame().T], ignore_index=True)
        dummies = pd.get_dummies(combined, columns=['manufacturer', 'model', 'trim', 'color'], drop_first=True)

        features = dummies.drop(columns=['name', 'price'])
        # The appended row is always the last one; everything before it is training data.
        X, row_X = features.iloc[:-1], features.iloc[[-1]]
        y = dummies['price'].iloc[:-1]

        # Fixed seed so that re-running the notebook reproduces the same tree
        # (and therefore the same reported error).
        dt = DecisionTreeRegressor(random_state=42)
        dt.fit(X, y)
        return dt.predict(row_X)[0]
    except Exception:
        # Best-effort: any ordinary failure yields the sentinel so the caller can
        # drop the row later. Catching Exception rather than using a bare
        # `except:` lets KeyboardInterrupt / SystemExit propagate, so a long
        # DataFrame.apply can still be interrupted.
        return 'Model not in dataframe.'

In [81]:
# Run the per-row predictor over every test listing (one call per row).
test['prediction'] = test.apply(func=row_prediction, axis='columns')

In [82]:
# Turn the sentinel string into NaN, keep only the rows that can be scored, and
# report the root-mean-squared error.
test['prediction'] = pd.to_numeric(test['prediction'], errors='coerce')
# Restrict the drop to the two columns the metric needs; an unrestricted
# dropna() would also discard rows with a missing value in any other column.
test = test.dropna(subset=['price', 'prediction'])
# Explicit square root: `squared=False` is deprecated in scikit-learn 1.4 and
# removed in 1.6.
np.sqrt(metrics.mean_squared_error(test['price'], test['prediction']))

3342.7350405050665

### Random forest specific to car model

In [52]:
from sklearn.ensemble import RandomForestRegressor

In [83]:
# Freshly read training and test listings for the random-forest run.
df, test = pd.read_csv('../data/cleaned.csv'), pd.read_csv('../data/test_cleaned.csv')

In [84]:
def row_prediction(row):
    """Predict the price of one listing with a random forest fitted on its own group.

    The row's 4th value (position 3, presumably the ``model`` column -- confirm
    against the CSV column order) is normalised and looked up in ``model_list``
    to pick the matching frame from ``group_list``. The row is appended to that
    frame before one-hot encoding so the training rows and the row being
    predicted end up with identical dummy columns, and is split off again
    before fitting.

    Parameters
    ----------
    row : pandas.Series
        A single row, as passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    float or str
        The predicted price, or the sentinel string ``'Model not in dataframe.'``
        when no prediction could be produced (no matching group, or any other
        ordinary error raised while encoding or fitting).
    """
    try:
        # Join whitespace-separated parts with underscores, matching how the
        # entries of model_list are built.
        model_key = '_'.join(row.iat[3].split())
        # list.index raises ValueError when there is no group for this key.
        training_group = group_list[model_list.index(model_key)]

        combined = pd.concat([training_group, row.to_frame().T], ignore_index=True)
        dummies = pd.get_dummies(combined, columns=['manufacturer', 'model', 'trim', 'color'], drop_first=True)

        features = dummies.drop(columns=['name', 'price'])
        # The appended row is always the last one; everything before it is training data.
        X, row_X = features.iloc[:-1], features.iloc[[-1]]
        y = dummies['price'].iloc[:-1]

        # Fixed seed: the forest is stochastic, so without it every run of the
        # notebook would report a different error.
        rf = RandomForestRegressor(random_state=42)
        rf.fit(X, y)
        return rf.predict(row_X)[0]
    except Exception:
        # Best-effort: any ordinary failure yields the sentinel so the caller can
        # drop the row later. Catching Exception rather than using a bare
        # `except:` lets KeyboardInterrupt / SystemExit propagate, so a long
        # DataFrame.apply can still be interrupted.
        return 'Model not in dataframe.'

In [85]:
# Produce one prediction (or the sentinel string) per test listing.
test['prediction'] = test.apply(func=row_prediction, axis='columns')

In [86]:
# Turn the sentinel string into NaN, keep only the rows that can be scored, and
# report the root-mean-squared error.
test['prediction'] = pd.to_numeric(test['prediction'], errors='coerce')
# Restrict the drop to the two columns the metric needs; an unrestricted
# dropna() would also discard rows with a missing value in any other column.
test = test.dropna(subset=['price', 'prediction'])
# Explicit square root: `squared=False` is deprecated in scikit-learn 1.4 and
# removed in 1.6.
np.sqrt(metrics.mean_squared_error(test['price'], test['prediction']))

2062.6659389029055

In [68]:
# Preview the first rows of the scored test frame instead of dumping the whole frame.
test.head(10)

Unnamed: 0,name,year,manufacturer,model,trim,mileage,color,price,prediction
0,2016 Acura RDX,2016,Acura,RDX,base,30066,Blue,28590,27118.0
1,2014 Acura RLX,2014,Acura,RLX,base,62517,White,24990,24376.0
2,2013 Audi Q5,2013,Audi,Q5,2.0T Premium Plus,70260,Black,22990,22040.0
3,2019 BMW i3,2019,BMW,i3,Base w/Range Extender,23542,Black,34990,34086.0
4,2015 Cadillac CTS,2015,Cadillac,CTS,2.0 Luxury Collection,42612,White,26990,26364.0
5,2016 Cadillac CTS,2016,Cadillac,CTS,2.0 Luxury Collection,55637,Black,26990,25916.0
6,2020 Cadillac XT5,2020,Cadillac,XT5,Premium Luxury,65319,Gray,32590,31134.0
7,2011 Chevrolet Camaro,2011,Chevrolet,Camaro,LT,36861,Red,21990,23952.0
8,2019 Chevrolet Colorado Extended Cab,2019,Chevrolet,Colorado Extended Cab,Work Truck 6 ft,53973,Blue,24590,24777.6
9,2015 Chevrolet Cruze,2015,Chevrolet,Cruze,1LT,62484,White,15990,15826.0
