In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import coremltools
from sklearn import datasets, linear_model, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, Normalizer

In [2]:
df = pd.read_csv('/Users/shrav/Downloads/used-cars-database/autos.csv', sep=',', header=0, encoding='cp1252')
df.head()

Unnamed: 0,dateCrawled,name,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,nrOfPictures,postalCode,lastSeen
0,2016-03-24 11:52:17,Golf_3_1.6,privat,Angebot,480,test,,1993,manuell,0,golf,150000,0,benzin,volkswagen,,2016-03-24 00:00:00,0,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,A5_Sportback_2.7_Tdi,privat,Angebot,18300,test,coupe,2011,manuell,190,,125000,5,diesel,audi,ja,2016-03-24 00:00:00,0,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,"Jeep_Grand_Cherokee_""Overland""",privat,Angebot,9800,test,suv,2004,automatik,163,grand,125000,8,diesel,jeep,,2016-03-14 00:00:00,0,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,GOLF_4_1_4__3TÜRER,privat,Angebot,1500,test,kleinwagen,2001,manuell,75,golf,150000,6,benzin,volkswagen,nein,2016-03-17 00:00:00,0,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,Skoda_Fabia_1.4_TDI_PD_Classic,privat,Angebot,3600,test,kleinwagen,2008,manuell,69,fabia,90000,7,diesel,skoda,nein,2016-03-31 00:00:00,0,60437,2016-04-06 10:17:21


In [3]:
valid_models = df.dropna()

In [4]:
#### Removing the duplicates
dedups = valid_models.drop_duplicates(['name','seller','offerType','price','abtest','vehicleType','yearOfRegistration'
                         ,'gearbox','powerPS','model','kilometer','monthOfRegistration','fuelType'
                         ,'notRepairedDamage','postalCode'])

In [5]:
#### Removing the outliers
no_outliers = dedups[
        (valid_models.yearOfRegistration <= 2016) 
      & (valid_models.yearOfRegistration >= 1950) 
      & (valid_models.price >= 100) 
      & (valid_models.price <= 150000) 
      & (valid_models.powerPS >= 10) 
      & (valid_models.powerPS <= 500)]

rel_cols = no_outliers[['price'
                        ,'yearOfRegistration'
                        ,'gearbox'
                        ,'powerPS'
                        ,'model'
                        ,'kilometer'
                        ,'fuelType'
                        ,'vehicleType'
                        ,'brand'
                        ,'notRepairedDamage']]
rel_cols.head()

  


Unnamed: 0,price,yearOfRegistration,gearbox,powerPS,model,kilometer,fuelType,vehicleType,brand,notRepairedDamage
3,1500,2001,manuell,75,golf,150000,benzin,kleinwagen,volkswagen,nein
4,3600,2008,manuell,69,fabia,90000,diesel,kleinwagen,skoda,nein
5,650,1995,manuell,102,3er,150000,benzin,limousine,bmw,ja
6,2200,2004,manuell,109,2_reihe,150000,benzin,cabrio,peugeot,nein
10,2000,2004,manuell,105,3_reihe,150000,benzin,limousine,mazda,nein


In [6]:
rel_cols.to_csv('/Users/shrav/Downloads/used-cars-database/cars.csv')

In [7]:
labels = ['gearbox', 'notRepairedDamage', 'model', 'brand', 'fuelType', 'vehicleType']
les = {}

for l in labels:
    les[l] = preprocessing.LabelEncoder()
    les[l].fit(rel_cols[l])
    #print(les[l].classes_)
    tr = les[l].transform(rel_cols[l]) 
    rel_cols.loc[:, l + ''] = pd.Series(tr, index=rel_cols.index)

df_autos = rel_cols[ ['price'
                        ,'yearOfRegistration'
                        ,'powerPS'
                        ,'kilometer'] 
                    + [x+"" for x in labels]]
autos = df_autos.values.astype(float)

Y = autos[:,0]
X = autos[:,1:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [8]:
autos[:,]

array([[  1.50000000e+03,   2.00100000e+03,   7.50000000e+01, ...,
          3.70000000e+01,   1.00000000e+00,   4.00000000e+00],
       [  3.60000000e+03,   2.00800000e+03,   6.90000000e+01, ...,
          3.10000000e+01,   3.00000000e+00,   4.00000000e+00],
       [  6.50000000e+02,   1.99500000e+03,   1.02000000e+02, ...,
          2.00000000e+00,   1.00000000e+00,   6.00000000e+00],
       ..., 
       [  1.19900000e+03,   2.00000000e+03,   1.01000000e+02, ...,
          3.20000000e+01,   1.00000000e+00,   2.00000000e+00],
       [  9.20000000e+03,   1.99600000e+03,   1.02000000e+02, ...,
          3.70000000e+01,   3.00000000e+00,   1.00000000e+00],
       [  2.89900000e+04,   2.01300000e+03,   3.20000000e+02, ...,
          2.00000000e+00,   1.00000000e+00,   6.00000000e+00]])

In [9]:
model = linear_model.LinearRegression()

# Percent of the X array to use as training set. This implies that the rest will be test set
test_size = .6

#Split into train and validation
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state = 3)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
model.fit(X_train, y_train)
# Explained variance score: 1 is perfect prediction
score = model.score(X_test, y_test)

((97555, 9), (146333, 9), (97555,), (146333,))




In [10]:
print(score)

0.58002844631


In [11]:
sample1 = [2010,200,20000,1,1,221,1,37,1,3]
sample2 = [2016.0,150.0,10000,1.0,1.0,221.0,2.0,3.0,6.0]
s_predict = model.predict([sample2])
print(s_predict)

[ 20249.86034974]


In [12]:
coreml_model = coremltools.converters.sklearn.convert(model, ["yearOfRegistration", "powerPS","kilometer", "gearbox","notRepairedDamage","model", "brand", "fuelType", "vehicleType"], "price")

In [13]:
coreml_model.save('/Users/shrav/Downloads/used-cars-database/usedcars.mlmodel')