In [3]:
import numpy as np
import pandas as pd
import matplotlib as plt
from sklearn.preprocessing import LabelEncoder
import pickle

In [4]:
df = pd.read_csv(r"F:\New folder\autos.csv", header=0, sep=',', encoding='Latin1',)


In [5]:
print(df.seller.value_counts())

privat        371525
gewerblich         3
Name: seller, dtype: int64


In [6]:
df[df.seller != 'gewerblich']
df=df.drop('seller',1)

In [7]:
print(df.offerType.value_counts())

Angebot    371516
Gesuch         12
Name: offerType, dtype: int64


In [8]:
df[df.offerType !='Gesuch']
df=df.drop('offerType',1)

In [9]:
print(df.shape)
df = df[(df.powerPS > 50) & (df.powerPS < 900)]
print(df.shape)
df = df[(df.yearOfRegistration >= 1950) & (df.yearOfRegistration < 2017)]
print(df.shape)


(371528, 18)
(319709, 18)
(309171, 18)


In [10]:
df.drop(['name','abtest','dateCrawled','nrOfPictures','lastSeen','postalCode','dateCreated'], axis='columns', inplace = True)

In [11]:
new_df = df.copy()
new_df = new_df.drop_duplicates(['price','vehicleType','yearOfRegistration','gearbox','powerPS','model','kilometer','monthOfRegistration','fuelType','notRepairedDamage'])

In [12]:
new_df.gearbox.replace(('manuell', 'automatik'), ('manual','automatic'), inplace=True)
new_df.fuelType.replace(('benzin','andere','elektro'),('petrol','others','electric'),inplace=True)
new_df.vehicleType.replace(('kleinwagen','cabrio','kombi','andere'),('snall car','convertible','combination','others'),inplace=True)
new_df.notRepairedDamage.replace(('ja','nein'),('Yes','No'),inplace=True)


In [13]:
new_df = new_df[(new_df.price >= 100) & (new_df.price <=150000)]


In [14]:
new_df['notRepairedDamage'].fillna(value='not-declared', inplace=True)
new_df['fuelType'].fillna(value='not-declared', inplace=True)
new_df['gearbox'].fillna(value='not-declared', inplace=True)
new_df['vehicleType'].fillna(value='not-declared', inplace=True)
new_df['model'].fillna(value='not-declared', inplace=True)


In [15]:
new_df.to_csv("autos_preprocessed.csv")


In [16]:
labels = ['gearbox','notRepairedDamage','model','brand','fuelType','vehicleType']


In [17]:
mapper = {}
for i in labels:
    mapper[i] = LabelEncoder()
    mapper[i].fit(new_df[i])
    tr = mapper[i].transform(new_df[i])
    np.save(str('classes'+i+'.npy'),mapper[i].classes_)
    print(i,":",mapper[i])
    new_df.loc[:,i + '_labels'] = pd.Series(tr, index=new_df.index)

gearbox : LabelEncoder()
notRepairedDamage : LabelEncoder()
model : LabelEncoder()
brand : LabelEncoder()
fuelType : LabelEncoder()
vehicleType : LabelEncoder()


In [18]:
labeled = new_df[['price','yearOfRegistration','powerPS','kilometer','monthOfRegistration'] + [x+"_labels" for x in labels]]
print(labeled.columns)

Index(['price', 'yearOfRegistration', 'powerPS', 'kilometer',
       'monthOfRegistration', 'gearbox_labels', 'notRepairedDamage_labels',
       'model_labels', 'brand_labels', 'fuelType_labels',
       'vehicleType_labels'],
      dtype='object')


In [19]:
Y = labeled.iloc[:,0].values
X = labeled.iloc[:,1:].values

In [20]:
Y = Y.reshape(-1,1)

In [21]:
from sklearn.model_selection import cross_val_score, train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state = 3)

In [22]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
regressor = RandomForestRegressor(n_estimators=1000,max_depth=10,random_state=34)

In [23]:
regressor.fit(X_train, np.ravel(Y_train,order='C'))

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=1000,
                      n_jobs=None, oob_score=False, random_state=34, verbose=0,
                      warm_start=False)

In [24]:
y_pred = regressor.predict(X_test)
print(r2_score(Y_test,y_pred))

0.834527626497731


In [25]:
filename = 'resale_model.sav'
pickle.dump(regressor, open(filename, 'wb'))