# Modelling

## Loading the libraries

In [187]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
#import pandas_profiling as pp

# models
from sklearn.linear_model import LinearRegression, SGDRegressor, RidgeCV
from sklearn.svm import SVR, LinearSVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor 
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
import sklearn.model_selection
from sklearn.model_selection import cross_val_predict as cvp
from sklearn import metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
#import xgboost as xgb
#import lightgbm as lgb

# model tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval

import warnings
warnings.filterwarnings("ignore")

In [188]:
valid_part = 0.3
pd.set_option('max_columns',100)

## Reading the Data

In [189]:
train0 = pd.read_csv(r'D:/github/dataSets/dpa_project/cleanedDataForModelling.csv')

train0.head()

Unnamed: 0,id,url,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,type,paint_color,state,lat,long
0,7034441763,https://saltlakecity.craigslist.org/cto/d/salt...,salt lake city,17899,2012.0,volkswagen,golf r,excellent,4,gas,63500.0,clean,manual,4,hatchback,black,ut,40.76,-111.89
1,7034440588,https://saltlakecity.craigslist.org/ctd/d/sand...,salt lake city,46463,2015.0,gmc,sierra 1500,excellent,6,gas,7554.0,clean,automatic,4,truck,white,ut,40.76,-111.89
2,7034406932,https://saltlakecity.craigslist.org/ctd/d/evan...,salt lake city,49999,2018.0,ford,f-450,like new,6,diesel,70150.0,clean,automatic,4,pickup,white,ut,40.76,-111.89
3,7034406582,https://saltlakecity.craigslist.org/ctd/d/evan...,salt lake city,13999,2009.0,ram,unknown,good,6,gas,120057.0,clean,automatic,4,pickup,silver,ut,40.76,-111.89
4,7034405619,https://saltlakecity.craigslist.org/ctd/d/evan...,salt lake city,34500,2017.0,ford,f-350,excellent,6,diesel,95484.0,clean,automatic,4,pickup,white,ut,40.76,-111.89


In [131]:
#train0 = pd.read_csv(r'D:/Projects/DPA/Used-Cars-Analysis/data/vehicles_1.csv')

#train0.head()

In [190]:
train0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 446992 entries, 0 to 446991
Data columns (total 19 columns):
id              446992 non-null int64
url             446992 non-null object
region          446992 non-null object
price           446992 non-null int64
year            446992 non-null float64
manufacturer    446992 non-null object
model           446992 non-null object
condition       446992 non-null object
cylinders       446992 non-null int64
fuel            446992 non-null object
odometer        446992 non-null float64
title_status    446992 non-null object
transmission    446992 non-null object
drive           446992 non-null int64
type            446992 non-null object
paint_color     446992 non-null object
state           446992 non-null object
lat             446992 non-null float64
long            446992 non-null float64
dtypes: float64(4), int64(4), object(11)
memory usage: 64.8+ MB


### Specific Variables

In [221]:
drop_columns = ['id','url', 'region', 'model', 'title_status', 'state', 'lat','long']
train0 = train0.drop(columns = drop_columns)

In [222]:
null_values=train0.isnull().sum()
null_values=pd.DataFrame(null_values,columns=['null'])
j=1
sum_tot=len(train0)
null_values['percent']=null_values['null']/sum_tot
round(null_values*100,3).sort_values('percent',ascending=False)

Unnamed: 0,null,percent
price,0,0.0
year,0,0.0
manufacturer,0,0.0
condition,0,0.0
cylinders,0,0.0
fuel,0,0.0
odometer,0,0.0
transmission,0,0.0
drive,0,0.0
type,0,0.0


In [223]:
train0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 446992 entries, 0 to 446991
Data columns (total 11 columns):
price           446992 non-null int32
year            446992 non-null float16
manufacturer    446992 non-null int8
condition       446992 non-null int8
cylinders       446992 non-null int8
fuel            446992 non-null int8
odometer        446992 non-null float32
transmission    446992 non-null int8
drive           446992 non-null int8
type            446992 non-null int8
paint_color     446992 non-null int8
dtypes: float16(1), float32(1), int32(1), int8(8)
memory usage: 7.7 MB


### Reducing memory - helps for modelling the data

In [224]:
# Thanks to : https://www.kaggle.com/aantonova/some-new-risk-and-clusters-features
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [225]:
train0 = reduce_mem_usage(train0)

Mem. usage decreased to  7.67 Mb (0.0% reduction)


# AllVars + Label Encoding + Standarization + Modelling -> 1

# Label Encoding

In [226]:
# Encoding the categorical features
categorical_columns = train0.select_dtypes(exclude=['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']).columns

for col in categorical_columns:
    if col in train0.columns:
        le = LabelEncoder()
        train0[[col]] = train0[[col]].apply(le.fit_transform)

In [210]:
train0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 446992 entries, 0 to 446991
Data columns (total 19 columns):
id              446992 non-null int64
url             446992 non-null int32
region          446992 non-null int16
price           446992 non-null int32
year            446992 non-null float16
manufacturer    446992 non-null int8
model           446992 non-null int16
condition       446992 non-null int8
cylinders       446992 non-null int8
fuel            446992 non-null int8
odometer        446992 non-null float32
title_status    446992 non-null int8
transmission    446992 non-null int8
drive           446992 non-null int8
type            446992 non-null int8
paint_color     446992 non-null int8
state           446992 non-null int8
lat             446992 non-null float16
long            446992 non-null float16
dtypes: float16(3), float32(1), int16(2), int32(2), int64(1), int8(10)
memory usage: 17.1 MB


In [211]:
train0.corr()

Unnamed: 0,id,url,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,type,paint_color,state,lat,long
id,1.0,-0.01,-0.03,-0.03,-0.02,0.02,-0.01,-0.01,-0.03,0.01,0.0,0.02,-0.01,-0.01,-0.01,-0.01,-0.05,-0.04,-0.09
url,-0.01,1.0,0.95,-0.02,-0.02,0.0,-0.0,-0.02,-0.0,0.01,0.02,-0.01,-0.01,-0.02,-0.0,0.0,-0.06,-0.05,-0.05
region,-0.03,0.95,1.0,-0.02,-0.02,0.0,-0.0,-0.02,0.0,0.01,0.02,-0.01,-0.01,-0.02,-0.0,0.0,-0.03,-0.06,-0.01
price,-0.03,-0.02,-0.02,1.0,0.42,0.0,-0.01,0.03,0.21,-0.21,-0.51,-0.06,0.03,0.21,0.01,0.0,0.01,0.03,-0.12
year,-0.02,-0.02,-0.02,0.42,1.0,-0.0,0.02,-0.0,-0.15,0.05,-0.44,0.02,-0.08,0.01,-0.02,-0.02,0.02,0.03,-0.03
manufacturer,0.02,0.0,0.0,0.0,-0.0,1.0,-0.11,0.01,-0.19,-0.05,-0.01,0.02,0.06,0.01,0.03,0.0,-0.01,-0.01,-0.03
model,-0.01,-0.0,-0.0,-0.01,0.02,-0.11,1.0,0.0,0.04,0.08,0.03,-0.01,-0.02,0.06,-0.1,-0.0,0.01,0.02,0.01
condition,-0.01,-0.02,-0.02,0.03,-0.0,0.01,0.0,1.0,0.0,0.01,-0.01,-0.0,0.02,0.12,0.09,0.1,0.01,0.02,-0.03
cylinders,-0.03,-0.0,0.0,0.21,-0.15,-0.19,0.04,0.0,1.0,-0.1,0.11,-0.07,-0.04,0.22,0.07,0.04,0.02,-0.0,-0.0
fuel,0.01,0.01,0.01,-0.21,0.05,-0.05,0.08,0.01,-0.1,1.0,-0.1,0.01,0.02,-0.11,-0.13,-0.05,-0.03,-0.01,0.02


In [212]:
# Thanks to : https://www.kaggle.com/aantonova/some-new-risk-and-clusters-features
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [213]:
train0 = reduce_mem_usage(train0)

Mem. usage decreased to 17.05 Mb (0.0% reduction)


In [214]:
train0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 446992 entries, 0 to 446991
Data columns (total 19 columns):
id              446992 non-null int64
url             446992 non-null int32
region          446992 non-null int16
price           446992 non-null int32
year            446992 non-null float16
manufacturer    446992 non-null int8
model           446992 non-null int16
condition       446992 non-null int8
cylinders       446992 non-null int8
fuel            446992 non-null int8
odometer        446992 non-null float32
title_status    446992 non-null int8
transmission    446992 non-null int8
drive           446992 non-null int8
type            446992 non-null int8
paint_color     446992 non-null int8
state           446992 non-null int8
lat             446992 non-null float16
long            446992 non-null float16
dtypes: float16(3), float32(1), int16(2), int32(2), int64(1), int8(10)
memory usage: 17.1 MB


In [215]:
train0.corr()

Unnamed: 0,id,url,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,type,paint_color,state,lat,long
id,1.0,-0.01,-0.03,-0.03,-0.02,0.02,-0.01,-0.01,-0.03,0.01,0.0,0.02,-0.01,-0.01,-0.01,-0.01,-0.05,-0.04,-0.09
url,-0.01,1.0,0.95,-0.02,-0.02,0.0,-0.0,-0.02,-0.0,0.01,0.02,-0.01,-0.01,-0.02,-0.0,0.0,-0.06,-0.05,-0.05
region,-0.03,0.95,1.0,-0.02,-0.02,0.0,-0.0,-0.02,0.0,0.01,0.02,-0.01,-0.01,-0.02,-0.0,0.0,-0.03,-0.06,-0.01
price,-0.03,-0.02,-0.02,1.0,0.42,0.0,-0.01,0.03,0.21,-0.21,-0.51,-0.06,0.03,0.21,0.01,0.0,0.01,0.03,-0.12
year,-0.02,-0.02,-0.02,0.42,1.0,-0.0,0.02,-0.0,-0.15,0.05,-0.44,0.02,-0.08,0.01,-0.02,-0.02,0.02,0.03,-0.03
manufacturer,0.02,0.0,0.0,0.0,-0.0,1.0,-0.11,0.01,-0.19,-0.05,-0.01,0.02,0.06,0.01,0.03,0.0,-0.01,-0.01,-0.03
model,-0.01,-0.0,-0.0,-0.01,0.02,-0.11,1.0,0.0,0.04,0.08,0.03,-0.01,-0.02,0.06,-0.1,-0.0,0.01,0.02,0.01
condition,-0.01,-0.02,-0.02,0.03,-0.0,0.01,0.0,1.0,0.0,0.01,-0.01,-0.0,0.02,0.12,0.09,0.1,0.01,0.02,-0.03
cylinders,-0.03,-0.0,0.0,0.21,-0.15,-0.19,0.04,0.0,1.0,-0.1,0.11,-0.07,-0.04,0.22,0.07,0.04,0.02,-0.0,-0.0
fuel,0.01,0.01,0.01,-0.21,0.05,-0.05,0.08,0.01,-0.1,1.0,-0.1,0.01,0.02,-0.11,-0.13,-0.05,-0.03,-0.01,0.02


In [216]:
#train0 = train0[train0['price'] > 1000]
#train0 = train0[train0['price'] < 40000]
# Rounded ['odometer'] to 5000
#train0['odometer'] = train0['odometer'] // 5000
#train0 = train0[train0['year'] > 110]

In [217]:
#train0.corr()

### Preparaing to the Modelling

In [218]:
y = train0['price']
X = train0.drop('price',axis=1)

#y = sample['price']
#X = sample.drop('price',axis=1)

#### Standarization

In [219]:
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)

In [220]:
train, test, target, target_test = train_test_split(X, y, test_size=valid_part, random_state=0)

In [121]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#print (X_train.shape, y_train.shape)
#print (X_test.shape, y_test.shape)

(340887, 10) (340887,)
(85222, 10) (85222,)


#### Min-max scaler:


#### For each value in a feature, MinMaxScaler subtracts the minimum value in the feature and then divides by the range. The range is the difference between the original maximum and original minimum.

In [None]:
#from sklearn.preprocessing import MinMaxScaler
#sc = MinMaxScaler()   
#X_train= sc.fit_transform(X_train)
#X_test= sc.transform(X_test)

In [19]:
#from sklearn.preprocessing import MinMaxScaler
#sc = MinMaxScaler()   
#X_train= sc.fit_transform(train)
#X_test= sc.transform(test)

In [167]:
acc_train_r2 = []
acc_test_r2 = []
acc_train_d = []
acc_test_d = []
acc_train_rmse = []
acc_test_rmse = []

In [168]:
def acc_d(y_meas, y_pred):
    # Relative error between predicted y_pred and measured y_meas values
    return mean_absolute_error(y_meas, y_pred)*len(y_meas)/sum(abs(y_meas))

def acc_rmse(y_meas, y_pred):
    # RMSE between predicted y_pred and measured y_meas values
    return (mean_squared_error(y_meas, y_pred))**0.5

In [169]:
def acc_model(num,model,train,test):
    # Calculation of accuracy of model акщь Sklearn by different metrics   
  
    global acc_train_r2, acc_test_r2, acc_train_d, acc_test_d, acc_train_rmse, acc_test_rmse
    
    ytrain = model.predict(train)  
    ytest = model.predict(test)

    print('target = ', target[:5].values)
    print('ytrain = ', ytrain[:5])

    acc_train_r2_num = round(r2_score(target, ytrain) * 100, 2)
    print('acc(r2_score) for train =', acc_train_r2_num)   
    acc_train_r2.insert(num, acc_train_r2_num)

    acc_train_d_num = round(acc_d(target, ytrain) * 100, 2)
    print('acc(relative error) for train =', acc_train_d_num)   
    acc_train_d.insert(num, acc_train_d_num)

    acc_train_rmse_num = round(acc_rmse(target, ytrain) * 100, 2)
    print('acc(rmse) for train =', acc_train_rmse_num)   
    acc_train_rmse.insert(num, acc_train_rmse_num)
    
    #print('Accuracy for train',model.score(target, ytrain))


    print('target_test =', target_test[:5].values)
    print('ytest =', ytest[:5])
    
    acc_test_r2_num = round(r2_score(target_test, ytest) * 100, 2)
    print('acc(r2_score) for test =', acc_test_r2_num)
    acc_test_r2.insert(num, acc_test_r2_num)
    
    acc_test_d_num = round(acc_d(target_test, ytest) * 100, 2)
    print('acc(relative error) for test =', acc_test_d_num)
    acc_test_d.insert(num, acc_test_d_num)
    
    acc_test_rmse_num = round(acc_rmse(target_test, ytest) * 100, 2)
    print('acc(rmse) for test =', acc_test_rmse_num)
    acc_test_rmse.insert(num, acc_test_rmse_num)
    
    #print('Accuracy for test',model.score(target_test, ytest))


In [170]:
# Linear Regression
linreg = LinearRegression()
linreg.fit(train, target)
acc_model(0,linreg,train,test)

target =  [25480 15000  8995 12888 29292]
ytrain =  [13173.69801108 -2185.91509221 12660.98713476 16183.27599374
 22655.07013151]
acc(r2_score) for train = 49.37
acc(relative error) for train = 39.03
acc(rmse) for train = 804838.58
target_test = [ 3525 10995  8900 35430  2500]
ytest = [ 5200.05195106 12777.17628823 11129.28481128 20727.27988609
 15377.50812151]
acc(r2_score) for test = 49.52
acc(relative error) for test = 38.87
acc(rmse) for test = 797064.79


In [None]:
# Support Vector Machines
svr = SVR()
svr.fit(train, target)
acc_model(1,svr,train,test)

In [171]:
# Linear SVR

linear_svr = LinearSVR()
linear_svr.fit(train, target)
acc_model(1,linear_svr,train,test)

target =  [25480 15000  8995 12888 29292]
ytrain =  [12023.34628596 -8321.47727269 12246.3232633  14264.18093458
 20281.40348459]
acc(r2_score) for train = 45.44
acc(relative error) for train = 37.93
acc(rmse) for train = 835525.07
target_test = [ 3525 10995  8900 35430  2500]
ytest = [ 4174.27278253 11495.61370195  3909.54569907 18981.642424
 13927.92916905]
acc(r2_score) for test = 45.76
acc(relative error) for test = 37.74
acc(rmse) for test = 826237.63


In [None]:

mlp = MLPRegressor()
param_grid = {'hidden_layer_sizes': [i for i in range(2,20)],
              'activation': ['relu'],
              'solver': ['adam'],
              'learning_rate': ['constant'],
              'learning_rate_init': [0.01],
              'power_t': [0.5],
              'alpha': [0.0001],
              'max_iter': [1000],
              'early_stopping': [True],
              'warm_start': [False]}
mlp_GS = GridSearchCV(mlp, param_grid=param_grid, 
                   cv=10, verbose=True, pre_dispatch='2*n_jobs')
mlp_GS.fit(train, target)
acc_model(3,mlp_GS,train,test)

In [172]:
random_forest = RandomForestRegressor()
random_forest.fit(train, target)
acc_model(2,random_forest,train,test)

target =  [25480 15000  8995 12888 29292]
ytrain =  [24010.8 10315.  10142.9 14829.  25279.3]
acc(r2_score) for train = 96.97
acc(relative error) for train = 6.91
acc(rmse) for train = 196987.02
target_test = [ 3525 10995  8900 35430  2500]
ytest = [ 3519.5  9289.4 18179.5 33998.2  3047.7]
acc(r2_score) for test = 83.41
acc(relative error) for test = 17.14
acc(rmse) for test = 456954.54


In [173]:
sgd = SGDRegressor()
sgd.fit(train, target)
acc_model(3,sgd,train,test)

target =  [25480 15000  8995 12888 29292]
ytrain =  [12820.05742004 -1338.40412443 12785.40248047 16783.37398361
 22457.1605805 ]
acc(r2_score) for train = 49.18
acc(relative error) for train = 39.1
acc(rmse) for train = 806388.67
target_test = [ 3525 10995  8900 35430  2500]
ytest = [ 5513.59645014 13465.82791886 11852.05751534 20682.36371416
 14735.17788092]
acc(r2_score) for test = 49.31
acc(relative error) for test = 38.94
acc(rmse) for test = 798727.61


In [86]:
# Ridge Regressor
ridge = RidgeCV(cv=5)
ridge.fit(train, target)
acc_model(4,ridge,train,test)

target =  [25480 15000  8995 12888 29292]
ytrain =  [13674.16228462 -3441.26413823 10973.74844973 15415.16307964
 22627.52219685]
acc(r2_score) for train = 48.19
acc(relative error) for train = 39.68
acc(rmse) for train = 814138.17
target_test = [ 3525 10995  8900 35430  2500]
ytest = [ 5034.52338764 12153.95848961  9543.88924709 19422.50251772
 15020.18632168]
acc(r2_score) for test = 48.37
acc(relative error) for test = 39.49
acc(rmse) for test = 806125.24


In [174]:
# Bagging Regressor

bagging = BaggingRegressor()
bagging.fit(train, target)
acc_model(5,bagging,train,test)

target =  [25480 15000  8995 12888 29292]
ytrain =  [22976.  10699.9 10115.9 12169.7 24973.3]
acc(r2_score) for train = 97.02
acc(relative error) for train = 6.93
acc(rmse) for train = 195392.78
target_test = [ 3525 10995  8900 35430  2500]
ytest = [ 3548.2 10210.3 10634.5 30462.3  2250. ]
acc(r2_score) for test = 83.36
acc(relative error) for test = 17.1
acc(rmse) for test = 457641.02


In [175]:
# Extra Trees Regressor

etr = ExtraTreesRegressor()
etr.fit(train, target)
acc_model(6,etr,train,test)

target =  [25480 15000  8995 12888 29292]
ytrain =  [25480. 15000.  8995. 12888. 29292.]
acc(r2_score) for train = 100.0
acc(relative error) for train = 0.0
acc(rmse) for train = 210.55
target_test = [ 3525 10995  8900 35430  2500]
ytest = [ 3477.5  9715.2  7989.5 34277.   2500. ]
acc(r2_score) for test = 83.9
acc(relative error) for test = 16.14
acc(rmse) for test = 450075.36


In [176]:
# AdaBoost Regression

Ada_Boost = AdaBoostRegressor()
Ada_Boost.fit(train, target)
acc_model(7,Ada_Boost,train,test)

target =  [25480 15000  8995 12888 29292]
ytrain =  [23027.93294341 48711.28215406 20802.4418648  20802.4418648
 34124.5735186 ]
acc(r2_score) for train = -31.03
acc(relative error) for train = 79.33
acc(rmse) for train = 1294762.27
target_test = [ 3525 10995  8900 35430  2500]
ytest = [20802.4418648  20802.4418648  24467.09166184 23027.93294341
 23027.93294341]
acc(r2_score) for test = -32.57
acc(relative error) for test = 79.44
acc(rmse) for test = 1291680.38


In [184]:
len(['Linear Regression', 'Linear SVR', 
              'Random Forest','Stochastic Gradient Decent',
               'RidgeRegressor', 'BaggingRegressor', 'ExtraTreesRegressor', 
              'AdaBoostRegressor'])

8

In [186]:
len(acc_train_r2)

7

In [180]:
models = pd.DataFrame({
    'Model': ['Linear Regression', 'Linear SVR', 
              'Random Forest','Stochastic Gradient Decent',
               'RidgeRegressor', 'BaggingRegressor', 'ExtraTreesRegressor', 
              'AdaBoostRegressor'],
    
    'r2_train': acc_train_r2,
    'r2_test': acc_test_r2,
    'd_train': acc_train_d,
    'd_test': acc_test_d,
    'rmse_train': acc_train_rmse,
    'rmse_test': acc_test_rmse
                     })


ValueError: arrays must all be same length

In [None]:
models = models[models.Model != 'Stochastic Gradient Decent']

In [None]:
pd.options.display.float_format = '{:,.2f}'.format

In [None]:
print('Prediction accuracy for models by R2 criterion - r2_test')
models.sort_values(by=['r2_test', 'r2_train'], ascending=False)

In [None]:
print('Prediction accuracy for models by relative error - d_test')
models.sort_values(by=['d_test', 'd_train'], ascending=True)

In [None]:
print('Prediction accuracy for models by RMSE - rmse_test')
models.sort_values(by=['rmse_test', 'rmse_train'], ascending=True)

In [None]:
# Plot
plt.figure(figsize=[25,6])
xx = models['Model']
plt.tick_params(labelsize=14)
plt.plot(xx, models['r2_train'], label = 'r2_train')
plt.plot(xx, models['r2_test'], label = 'r2_test')
plt.legend()
plt.title('R2-criterion for 15 popular models for train and test datasets')
plt.xlabel('Models')
plt.ylabel('R2-criterion, %')
plt.xticks(xx, rotation='vertical')
plt.savefig('graph.png')
plt.show()

In [None]:
# Plot
plt.figure(figsize=[25,6])
xx = models['Model']
plt.tick_params(labelsize=14)
plt.plot(xx, models['d_train'], label = 'd_train')
plt.plot(xx, models['d_test'], label = 'd_test')
plt.legend()
plt.title('Relative errors for 15 popular models for train and test datasets')
plt.xlabel('Models')
plt.ylabel('Relative error, %')
plt.xticks(xx, rotation='vertical')
plt.savefig('graph.png')
plt.show()

In [None]:
# Plot
plt.figure(figsize=[25,6])
xx = models['Model']
plt.tick_params(labelsize=14)
plt.plot(xx, models['rmse_train'], label = 'rmse_train')
plt.plot(xx, models['rmse_test'], label = 'rmse_test')
plt.legend()
plt.title('RMSE for 15 popular models for train and test datasets')
plt.xlabel('Models')
plt.ylabel('RMSE, %')
plt.xticks(xx, rotation='vertical')
plt.savefig('graph.png')
plt.show()

### Efficient Categorical encoding

In [111]:
train0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 446992 entries, 0 to 446991
Data columns (total 19 columns):
id              446992 non-null int64
url             446992 non-null object
region          446992 non-null object
price           446992 non-null int64
year            446992 non-null float64
manufacturer    446992 non-null object
model           446992 non-null object
condition       446992 non-null object
cylinders       446992 non-null int64
fuel            446992 non-null object
odometer        446992 non-null float64
title_status    446992 non-null object
transmission    446992 non-null object
drive           446992 non-null int64
type            446992 non-null object
paint_color     446992 non-null object
state           446992 non-null object
lat             446992 non-null float64
long            446992 non-null float64
dtypes: float64(4), int64(4), object(11)
memory usage: 64.8+ MB
