In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
from sklearn.preprocessing import LabelEncoder
import pickle

In [2]:
df = pd.read_csv("autos.csv", header=0, sep=',', encoding='Latin1',)
df.head()

Unnamed: 0,dateCrawled,name,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,nrOfPictures,postalCode,lastSeen
0,2016-03-24 11:52:17,Golf_3_1.6,privat,Angebot,480,test,,1993,manuell,0,golf,150000,0,benzin,volkswagen,,2016-03-24 00:00:00,0,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,A5_Sportback_2.7_Tdi,privat,Angebot,18300,test,coupe,2011,manuell,190,,125000,5,diesel,audi,ja,2016-03-24 00:00:00,0,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,"Jeep_Grand_Cherokee_""Overland""",privat,Angebot,9800,test,suv,2004,automatik,163,grand,125000,8,diesel,jeep,,2016-03-14 00:00:00,0,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,GOLF_4_1_4__3TÜRER,privat,Angebot,1500,test,kleinwagen,2001,manuell,75,golf,150000,6,benzin,volkswagen,nein,2016-03-17 00:00:00,0,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,Skoda_Fabia_1.4_TDI_PD_Classic,privat,Angebot,3600,test,kleinwagen,2008,manuell,69,fabia,90000,7,diesel,skoda,nein,2016-03-31 00:00:00,0,60437,2016-04-06 10:17:21


In [3]:
print(df.seller.value_counts())

privat        371525
gewerblich         3
Name: seller, dtype: int64


In [4]:
df[df.seller != 'gewerblich']
df=df.drop('seller',1)

In [5]:
print(df.offerType.value_counts())

Angebot    371516
Gesuch         12
Name: offerType, dtype: int64


In [6]:
df[df.offerType !='Gesuch']
df=df.drop('offerType',1)

In [7]:
print(df.shape)
df = df[(df.powerPS > 50) & (df.powerPS < 900)]
print(df.shape)
df = df[(df.yearOfRegistration >= 1950) & (df.yearOfRegistration < 2017)]
print(df.shape)

(371528, 18)
(319709, 18)
(309171, 18)


In [8]:
df.drop(['name','abtest','dateCrawled','nrOfPictures','lastSeen','postalCode','dateCreated'], axis='columns', inplace = True)

In [9]:
new_df = df.copy()
new_df = new_df.drop_duplicates(['price','vehicleType','yearOfRegistration','gearbox','powerPS','model','kilometer','monthOfRegistration','fuelType','notRepairedDamage'])

In [10]:
new_df.gearbox.replace(('manuell', 'automatik'), ('manual','automatic'), inplace=True)
new_df.fuelType.replace(('benzin','andere','elektro'),('petrol','others','electric'),inplace=True)
new_df.vehicleType.replace(('kleinwagen','cabrio','kombi','andere'),('snall car','convertible','combination','others'),inplace=True)
new_df.notRepairedDamage.replace(('ja','nein'),('Yes','No'),inplace=True)

In [11]:
new_df = new_df[(new_df.price >= 100) & (new_df.price <=150000)]

In [12]:
new_df['notRepairedDamage'].fillna(value='not-declared', inplace=True)
new_df['fuelType'].fillna(value='not-declared', inplace=True)
new_df['gearbox'].fillna(value='not-declared', inplace=True)
new_df['vehicleType'].fillna(value='not-declared', inplace=True)
new_df['model'].fillna(value='not-declared', inplace=True)

In [13]:
new_df.to_csv("autos_preprocessed.csv")

In [14]:
labels = ['gearbox','notRepairedDamage','model','brand','fuelType','vehicleType']

In [15]:
mapper = {}
for i in labels:
    mapper[i] = LabelEncoder()
    mapper[i].fit(new_df[i])
    tr = mapper[i].transform(new_df[i])
    np.save(str('classes'+i+'.npy'),mapper[i].classes_)
    print(i,":",mapper[i])
    new_df.loc[:,i + '_labels'] = pd.Series(tr, index=new_df.index)


gearbox : LabelEncoder()
notRepairedDamage : LabelEncoder()
model : LabelEncoder()
brand : LabelEncoder()
fuelType : LabelEncoder()
vehicleType : LabelEncoder()


In [16]:
labeled = new_df[['price','yearOfRegistration','powerPS','kilometer','monthOfRegistration'] + [x+"_labels" for x in labels]]
print(labeled.columns)

Index(['price', 'yearOfRegistration', 'powerPS', 'kilometer',
       'monthOfRegistration', 'gearbox_labels', 'notRepairedDamage_labels',
       'model_labels', 'brand_labels', 'fuelType_labels',
       'vehicleType_labels'],
      dtype='object')


In [17]:
Y = labeled.iloc[:,0].values
X = labeled.iloc[:,1:].values

In [18]:
X

array([[  2011,    190, 125000, ...,      1,      1,      3],
       [  2004,    163, 125000, ...,     14,      1,      8],
       [  2001,     75, 150000, ...,     38,      7,      7],
       ...,
       [  1996,    102, 150000, ...,     38,      1,      0],
       [  2002,    100, 150000, ...,     38,      1,      1],
       [  2013,    320,  50000, ...,      2,      7,      4]], dtype=int64)

In [19]:
Y

array([18300,  9800,  1500, ...,  9200,  3400, 28990], dtype=int64)

In [20]:
Y = Y.reshape(-1,1)

In [21]:
from sklearn.model_selection import cross_val_score, train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state = 3)

In [22]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
regressor = RandomForestRegressor(n_estimators=1000,max_depth=10,random_state=34)

In [23]:
regressor.fit(X_train, np.ravel(Y_train,order='C'))

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=1000,
                      n_jobs=None, oob_score=False, random_state=34, verbose=0,
                      warm_start=False)

In [24]:
y_pred = regressor.predict(X_test)
print(r2_score(Y_test,y_pred))

0.834527626497731


In [25]:
filename = 'resale_model.sav'
pickle.dump(regressor, open(filename, 'wb'))