In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
from sklearn.preprocessing import LabelEncoder
import pickle

In [2]:
df = pd.read_csv(r"D:\autos.csv", header=0, sep=',', encoding='Latin1',)

In [3]:
print(df.seller.value_counts())

privat        371525
gewerblich         3
Name: seller, dtype: int64


In [4]:
df[df.seller != 'gewerblich']
df=df.drop('seller',1)

In [5]:
print(df.offerType.value_counts())

Angebot    371516
Gesuch         12
Name: offerType, dtype: int64


In [6]:
df[df.offerType !='Gesuch']
df=df.drop('offerType',1)

In [7]:
print(df.shape)
df = df[(df.powerPS > 50) & (df.powerPS < 900)]
print(df.shape)
df = df[(df.yearOfRegistration >= 1950) & (df.yearOfRegistration < 2017)]
print(df.shape)

(371528, 18)
(319709, 18)
(309171, 18)


In [8]:
df.drop(['name','abtest','dateCrawled','nrOfPictures','lastSeen','postalCode','dateCreated'], axis='columns', inplace = True)

In [9]:
new_df = df.copy()
new_df = new_df.drop_duplicates(['price','vehicleType','yearOfRegistration','gearbox','powerPS','model','kilometer','monthOfRegistration','fuelType','notRepairedDamage'])

In [10]:
new_df.gearbox.replace(('manuell', 'automatik'), ('manual','automatic'), inplace=True)
new_df.fuelType.replace(('benzin','andere','elektro'),('petrol','others','electric'),inplace=True)
new_df.vehicleType.replace(('kleinwagen','cabrio','kombi','andere'),('snall car','convertible','combination','others'),inplace=True)
new_df.notRepairedDamage.replace(('ja','nein'),('Yes','No'),inplace=True)

In [11]:
new_df = new_df[(new_df.price >= 100) & (new_df.price <=150000)]

In [12]:
new_df['notRepairedDamage'].fillna(value='not-declared', inplace=True)
new_df['fuelType'].fillna(value='not-declared', inplace=True)
new_df['gearbox'].fillna(value='not-declared', inplace=True)
new_df['vehicleType'].fillna(value='not-declared', inplace=True)
new_df['model'].fillna(value='not-declared', inplace=True)

In [13]:
new_df.to_csv("autos_preprocessed.csv")

In [14]:
labels = ['gearbox','notRepairedDamage','model','brand','fuelType','vehicleType']

In [15]:
mapper = {}
for i in labels:
    mapper[i] = LabelEncoder()
    mapper[i].fit(new_df[i])
    tr = mapper[i].transform(new_df[i])
    np.save(str('classes'+i+'.npy'),mapper[i].classes_)
    print(i,":",mapper[i])
    new_df.loc[:,i + '_labels'] = pd.Series(tr, index=new_df.index)

gearbox : LabelEncoder()
notRepairedDamage : LabelEncoder()
model : LabelEncoder()
brand : LabelEncoder()
fuelType : LabelEncoder()
vehicleType : LabelEncoder()


In [16]:
labeled = new_df[['price','yearOfRegistration','powerPS','kilometer','monthOfRegistration'] + [x+"_labels" for x in labels]]
print(labeled.columns)

Index(['price', 'yearOfRegistration', 'powerPS', 'kilometer',
       'monthOfRegistration', 'gearbox_labels', 'notRepairedDamage_labels',
       'model_labels', 'brand_labels', 'fuelType_labels',
       'vehicleType_labels'],
      dtype='object')


In [17]:
Y = labeled.iloc[:,0].values
X = labeled.iloc[:,1:].values

In [18]:
Y = Y.reshape(-1,1)

In [19]:
from sklearn.model_selection import cross_val_score, train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state = 3)

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
regressor = RandomForestRegressor(n_estimators=1000,max_depth=10,random_state=34)

In [21]:
regressor.fit(X_train, np.ravel(Y_train,order='C'))

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=1000,
                      n_jobs=None, oob_score=False, random_state=34, verbose=0,
                      warm_start=False)

In [22]:
y_pred = regressor.predict(X_test)
print(r2_score(Y_test,y_pred))

0.834527626497731


In [23]:
filename = 'resale_model.sav'
pickle.dump(regressor, open(filename, 'wb'))

In [24]:
df.head()

Unnamed: 0,price,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
1,18300,coupe,2011,manuell,190,,125000,5,diesel,audi,ja
2,9800,suv,2004,automatik,163,grand,125000,8,diesel,jeep,
3,1500,kleinwagen,2001,manuell,75,golf,150000,6,benzin,volkswagen,nein
4,3600,kleinwagen,2008,manuell,69,fabia,90000,7,diesel,skoda,nein
5,650,limousine,1995,manuell,102,3er,150000,10,benzin,bmw,ja


In [25]:
df.tail()

Unnamed: 0,price,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
371520,3200,limousine,2004,manuell,225,leon,150000,5,benzin,seat,ja
371524,1199,cabrio,2000,automatik,101,fortwo,125000,3,benzin,smart,nein
371525,9200,bus,1996,manuell,102,transporter,150000,3,diesel,volkswagen,nein
371526,3400,kombi,2002,manuell,100,golf,150000,6,diesel,volkswagen,
371527,28990,limousine,2013,manuell,320,m_reihe,50000,8,benzin,bmw,nein


In [26]:
df

Unnamed: 0,price,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
1,18300,coupe,2011,manuell,190,,125000,5,diesel,audi,ja
2,9800,suv,2004,automatik,163,grand,125000,8,diesel,jeep,
3,1500,kleinwagen,2001,manuell,75,golf,150000,6,benzin,volkswagen,nein
4,3600,kleinwagen,2008,manuell,69,fabia,90000,7,diesel,skoda,nein
5,650,limousine,1995,manuell,102,3er,150000,10,benzin,bmw,ja
6,2200,cabrio,2004,manuell,109,2_reihe,150000,8,benzin,peugeot,nein
8,14500,bus,2014,manuell,125,c_max,30000,8,benzin,ford,
9,999,kleinwagen,1998,manuell,101,golf,150000,0,,volkswagen,
10,2000,limousine,2004,manuell,105,3_reihe,150000,12,benzin,mazda,nein
11,2799,kombi,2005,manuell,140,passat,150000,12,diesel,volkswagen,ja


In [27]:
new_df.head()

Unnamed: 0,price,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage,gearbox_labels,notRepairedDamage_labels,model_labels,brand_labels,fuelType_labels,vehicleType_labels
1,18300,coupe,2011,manual,190,not-declared,125000,5,diesel,audi,Yes,1,1,162,1,1,3
2,9800,suv,2004,automatic,163,grand,125000,8,diesel,jeep,not-declared,0,2,118,14,1,8
3,1500,snall car,2001,manual,75,golf,150000,6,petrol,volkswagen,No,1,0,117,38,7,7
4,3600,snall car,2008,manual,69,fabia,90000,7,diesel,skoda,No,1,0,102,31,1,7
5,650,limousine,1995,manual,102,3er,150000,10,petrol,bmw,Yes,1,1,11,2,7,4
