In [1]:
import os

import dask.array as da
import dask.dataframe as dd
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from dask.distributed import Client, progress
from dask_ml.linear_model import LinearRegression
from dask_ml.metrics import r2_score
from dask_ml.model_selection import train_test_split
from dask_ml.preprocessing import DummyEncoder
from scipy import stats

# Start a Dask distributed client with 4 workers, 2 threads per worker and a
# 2GB memory limit per worker (8 threads / 8 GB in total).
client = Client(
    n_workers=4,
    threads_per_worker=2,
    memory_limit='2GB',
)
# Leave the client as the last expression so the notebook renders its summary.
client



0,1
Client  Scheduler: tcp://127.0.0.1:62926  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 8.00 GB


In [2]:
# Lazily load the vehicle listings CSV as a Dask DataFrame.
# The file location can be overridden with the VEHICLES_CSV environment
# variable so the notebook runs on other machines; the original author's path
# is kept as the fallback so existing setups keep working unchanged.
cars_df = dd.read_csv(
    os.environ.get('VEHICLES_CSV', '/Users/steveangeli/Downloads/vehicles.csv')
)

In [3]:
# Peek at the first five raw rows to see which columns are available.
cars_df.head(n=5)

Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,...,drive,size,type,paint_color,image_url,description,county,state,lat,long
0,7034441763,https://saltlakecity.craigslist.org/cto/d/salt...,salt lake city,https://saltlakecity.craigslist.org,17899,2012.0,volkswagen,golf r,excellent,4 cylinders,...,4wd,compact,hatchback,black,https://images.craigslist.org/00G0G_fTLDWM5Xyv...,PRICE REDUCED! -Garage kept -Low Miles (63K)...,,ut,40.7372,-111.858
1,7034440610,https://saltlakecity.craigslist.org/ctd/d/sand...,salt lake city,https://saltlakecity.craigslist.org,0,2016.0,ford,f-150,excellent,,...,4wd,,,,https://images.craigslist.org/00v0v_7Cu0buIofU...,Drive it home today. Call (Or Text) us now !!C...,,ut,40.5881,-111.884
2,7034440588,https://saltlakecity.craigslist.org/ctd/d/sand...,salt lake city,https://saltlakecity.craigslist.org,46463,2015.0,gmc,sierra 1500,excellent,,...,4wd,,,white,https://images.craigslist.org/01515_lPvJ9bfbdY...,Drive it home today. Call (Or Text) us now !!C...,,ut,40.5881,-111.884
3,7034440546,https://saltlakecity.craigslist.org/ctd/d/sand...,salt lake city,https://saltlakecity.craigslist.org,0,2016.0,ford,f-150,excellent,,...,4wd,,,,https://images.craigslist.org/00T0T_6Rjfp3NS4O...,Drive it home today. Call (Or Text) us now !!C...,,ut,40.5881,-111.884
4,7034406932,https://saltlakecity.craigslist.org/ctd/d/evan...,salt lake city,https://saltlakecity.craigslist.org,49999,2018.0,ford,f-450,,,...,4wd,,pickup,white,https://images.craigslist.org/00W0W_8yIUwRBXXd...,2018 Ford F-350 F350 F 350 SD Lariat Crew Cab ...,,ut,40.3744,-104.694


In [4]:
# Remove columns that are unlikely to have much impact on the sale price, or
# that take on too many distinct values to be useful as features.
cars_df = cars_df.drop(
    columns=[
        'url', 'region_url', 'vin', 'image_url', 'description', 'county',
        'lat', 'long', 'id', 'region', 'state', 'model', 'paint_color',
        'size', 'drive', 'cylinders', 'fuel', 'transmission',
    ]
)



In [5]:
# Confirm that only the retained columns are left after the drop.
cars_df.head(n=5)

Unnamed: 0,price,year,manufacturer,condition,odometer,title_status,type
0,17899,2012.0,volkswagen,excellent,63500.0,clean,hatchback
1,0,2016.0,ford,excellent,10.0,clean,
2,46463,2015.0,gmc,excellent,7554.0,clean,
3,0,2016.0,ford,excellent,10.0,clean,
4,49999,2018.0,ford,,70150.0,clean,pickup


In [6]:
# Keep only listings inside plausible ranges (all bounds are exclusive):
#   1000 < price < 100000, 2004 < year < 2019, 5000 < odometer < 300000.
# Rows with a missing year or odometer fail the comparisons and are dropped.
cars_df = cars_df[
    (cars_df['price'] > 1000)
    & (cars_df['price'] < 100000)
    & (cars_df['year'] > 2004)
    & (cars_df['year'] < 2019)
    & (cars_df['odometer'] > 5000)
    & (cars_df['odometer'] < 300000)
]

# Number of rows that survive the filters.
len(cars_df)

303044

In [7]:
# Discard every row that still contains a missing value; plenty of data
# should remain to work with after the filters above.
cars_df = cars_df.dropna(how='any')
# Summarise the remaining columns and their dtypes.
cars_df.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 7 entries, price to type
dtypes: object(4), float64(2), int64(1)

In [8]:
# Split the column names by dtype: object columns are the ones that get
# categorized/encoded later, int64/float64 columns are the numeric ones.
non_numeric_columns = cars_df.select_dtypes(include=['object']).columns
numeric_columns = cars_df.select_dtypes(include=['int64', 'float64']).columns

In [9]:
# Convert the object columns to categoricals so they can be dummy-encoded.
cars_df = cars_df.categorize(columns=non_numeric_columns)

# Expand each categorical column into indicator columns.
de = DummyEncoder()
trn_cars_df = de.fit_transform(cars_df)

# Inspect the first rows of the encoded frame.
trn_cars_df.head(n=5)

Unnamed: 0,price,year,odometer,manufacturer_volkswagen,manufacturer_honda,manufacturer_mercedes-benz,manufacturer_bmw,manufacturer_ram,manufacturer_nissan,manufacturer_subaru,...,type_SUV,type_truck,type_convertible,type_pickup,type_wagon,type_van,type_offroad,type_mini-van,type_bus,type_other
0,17899,2012.0,63500.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24,4600,2008.0,110982.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52,18999,2015.0,37000.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53,79997,2016.0,28000.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
54,18999,2013.0,65000.0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Target: natural log of the listing price.
Y = da.log(trn_cars_df['price'])
# Features: every encoded column except the price itself.
X = trn_cars_df.drop(columns=['price'])

In [11]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and therefore the scores reported below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [12]:
# Fit an L2-penalised linear regression on the log-price target and report
# the R^2 score on both the training and the test split.
lr = LinearRegression(penalty='l2')

# Materialise each split exactly once. Every .compute() re-executes the whole
# lazy pipeline behind the collection, so the in-memory arrays are reused for
# both fitting and prediction instead of being recomputed.
X_train_np = X_train.values.compute()
X_test_np = X_test.values.compute()
y_train_np = y_train.values.compute()
y_test_np = y_test.values.compute()

lr.fit(X_train_np, y_train_np)

# Wrap predictions and targets as dask arrays before scoring, since r2_score
# here is the dask_ml.metrics implementation.
preds_train = da.asarray(lr.predict(X_train_np))
preds_test = da.asarray(lr.predict(X_test_np))

# NOTE: y_train / y_test are rebound here from the split's dask collections to
# dask arrays backed by in-memory data. Re-run the train/test split cell
# before re-running this cell.
y_train = da.asarray(y_train_np)
y_test = da.asarray(y_test_np)

print("Training score is: ", r2_score(y_train, preds_train))
print("Test score is: ", r2_score(y_test, preds_test))

Training score is:  0.7187854334276103
Test score is:  0.7188008540575616
