In [0]:
!pip install --upgrade tables
!pip install eli5
!pip install xgboost

Collecting tables
[?25l  Downloading https://files.pythonhosted.org/packages/ed/c3/8fd9e3bb21872f9d69eb93b3014c86479864cca94e625fd03713ccacec80/tables-3.6.1-cp36-cp36m-manylinux1_x86_64.whl (4.3MB)
[K     |████████████████████████████████| 4.3MB 2.8MB/s 
Installing collected packages: tables
  Found existing installation: tables 3.4.4
    Uninstalling tables-3.4.4:
      Successfully uninstalled tables-3.4.4
Successfully installed tables-3.6.1
Collecting eli5
[?25l  Downloading https://files.pythonhosted.org/packages/97/2f/c85c7d8f8548e460829971785347e14e45fa5c6617da374711dec8cb38cc/eli5-0.10.1-py2.py3-none-any.whl (105kB)
[K     |████████████████████████████████| 112kB 2.7MB/s 
Installing collected packages: eli5
Successfully installed eli5-0.10.1


In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import pandas as pd
import numpy as np

In [0]:
from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [0]:
import xgboost as xgb

In [0]:
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score, KFold

In [0]:
import eli5 

In [0]:
from eli5.sklearn import PermutationImportance

In [0]:
%cd /content/drive/My\ Drive/Colab\ Notebooks/matrix/matrix_two/dw_matrix_car

/content/drive/My Drive/Colab Notebooks/matrix/matrix_two/dw_matrix_car


In [0]:
df = pd.read_hdf("data/car.h5")
df.shape

(106494, 155)

## Features Engineering

In [0]:
SUFFIX_CAT = '_cat'
for feat in df.columns:
  # skip list columns
  if isinstance(df[feat][0], list): continue
  factorized_values = df[feat].factorize()[0]
  if SUFFIX_CAT in feat: # guard against multiply restarts of this notebook
    df[feat] = factorized_values
  else:
    df[feat + SUFFIX_CAT] = factorized_values

#cat_feats = [x for x in df.columns if SUFFIX_CAT in x and 'price' not in x]
cat_feats = [x for x in df.columns if SUFFIX_CAT in x]
cat_feats = [x for x in cat_feats if 'price' not in x]
len(cat_feats)

151

In [0]:
def run_model(model, feats):
  X = df[ feats].values
  y = df.price_value.values
  scores = cross_val_score(model, X, y, cv = 3, scoring='neg_mean_absolute_error') # runs fit and cross-validation
  return np.mean(scores), np.std(scores)


## Decision Tree

In [74]:

model = DecisionTreeRegressor(max_depth=5,random_state=0) # only one tree
run_model(model, cat_feats)

(-19747.81093847179, 75.95212671816309)

# Random Forrest

In [0]:
m2 = RandomForestRegressor(max_depth=5, n_estimators=50, random_state=0) # 50 trees
run_model(m2, cat_feats)

(-18864.27843495869, 38.24233578175403)

## XGBoost

In [0]:
xgb_params ={'max_depth':5, 'n_estimators':50, 'learning_rate':0.1, 'seed':0}
m3 = xgb.XGBRegressor(**xgb_params) 
run_model(m3, cat_feats)



(-13034.249470063352, 103.73566521784606)

In [0]:
m4 = xgb.XGBRegressor(**xgb_params) 
m4.fit(X,y)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=None, n_estimators=50,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=None,
             subsample=1, verbosity=1)

In [0]:
imp = PermutationImportance(m4, random_state=0).fit(X,y)
eli5.show_weights(imp, feature_names=cat_feats)

Weight,Feature
0.1274  ± 0.0017,param_napęd_cat
0.1159  ± 0.0025,param_stan_cat
0.1131  ± 0.0037,param_rok-produkcji_cat
0.0565  ± 0.0010,param_faktura-vat_cat
0.0482  ± 0.0014,param_moc_cat
0.0416  ± 0.0017,param_skrzynia-biegów_cat
0.0252  ± 0.0010,param_marka-pojazdu_cat
0.0247  ± 0.0014,feature_kamera-cofania_cat
0.0232  ± 0.0008,param_typ_cat
0.0198  ± 0.0006,param_pojemność-skokowa_cat


## Simplify model - use only most influential features

In [0]:
feats = ['param_stan_cat',
'param_rok-produkcji_cat',
'param_napęd_cat',
'param_faktura-vat_cat',
'param_moc_cat',
'param_skrzynia-biegów_cat',
'param_marka-pojazdu_cat',
'feature_kamera-cofania_cat',
'param_typ_cat',
'param_pojemność-skokowa_cat',
'seller_name_cat',
'param_wersja_cat',
'feature_wspomaganie-kierownicy_cat',
'param_model-pojazdu_cat',
'feature_system-start-stop_cat',
'param_kod-silnika_cat',
'feature_asystent-pasa-ruchu_cat',
'feature_łopatki-zmiany-biegów_cat',
'feature_światła-led_cat',
'feature_czujniki-parkowania-przednie_cat']

In [0]:
len(cat_feats),len(feats)

(151, 20)

In [0]:
run_model(xgb.XGBRegressor(**xgb_params), feats)



(-13406.842046925769, 47.84300012683049)

## Recover numerical information lost during categorization

In [40]:
df['param_rok-produkcji'].unique()

array(['2018', '2011', '2015', '2009', '2017', '2012', '2013', '2007',
       '2001', '2016', '2006', '2008', '2004', '1999', '2000', '2010',
       '2005', '2002', '1998', '2014', '2003', '1982', '1995', '1997',
       '1992', '1993', '1994', '1996', '1989', '1988', '1967', '1987',
       '1970', '1959', '1990', '1991', '1974', None, '1975', '1973',
       '1953', '1985', '1984', '1986', '1981', '1979', '1960', '1983',
       '1978', '1964', '1980', '1972', '1969', '1956', '1966', '1977',
       '1962', '1965', '1971', '1963', '1961', '1952', '1949', '1976',
       '1937', '1968', '1958', '1955', '1933', '1929', '1957', '1944',
       '1954', '1932', '1936', '1947', '1948'], dtype=object)

In [49]:
df['param_rok-produkcji'].map(lambda r: -1 if r is None else int(r)).unique()

array([2018, 2011, 2015, 2009, 2017, 2012, 2013, 2007, 2001, 2016, 2006,
       2008, 2004, 1999, 2000, 2010, 2005, 2002, 1998, 2014, 2003, 1982,
       1995, 1997, 1992, 1993, 1994, 1996, 1989, 1988, 1967, 1987, 1970,
       1959, 1990, 1991, 1974,   -1, 1975, 1973, 1953, 1985, 1984, 1986,
       1981, 1979, 1960, 1983, 1978, 1964, 1980, 1972, 1969, 1956, 1966,
       1977, 1962, 1965, 1971, 1963, 1961, 1952, 1949, 1976, 1937, 1968,
       1958, 1955, 1933, 1929, 1957, 1944, 1954, 1932, 1936, 1947, 1948])

In [0]:
df['param_rok-produkcji_NUM'] = df['param_rok-produkcji'].map(lambda r: -1 if r is None else int(r))

In [0]:
feats2 = ['param_stan_cat',
'param_rok-produkcji_NUM',
'param_napęd_cat',
'param_faktura-vat_cat',
'param_moc_cat',
'param_skrzynia-biegów_cat',
'param_marka-pojazdu_cat',
'feature_kamera-cofania_cat',
'param_typ_cat',
'param_pojemność-skokowa_cat',
'seller_name_cat',
'param_wersja_cat',
'feature_wspomaganie-kierownicy_cat',
'param_model-pojazdu_cat',
'feature_system-start-stop_cat',
'param_kod-silnika_cat',
'feature_asystent-pasa-ruchu_cat',
'feature_łopatki-zmiany-biegów_cat',
'feature_światła-led_cat',
'feature_czujniki-parkowania-przednie_cat']

In [52]:
run_model(xgb.XGBRegressor(**xgb_params), feats2)



(-11363.551396150078, 80.64673639548026)

In [55]:
df['param_moc'].value_counts()

150 KM        6405
140 KM        6275
90 KM         4419
110 KM        4371
105 KM        4275
              ... 
399 KM           1
2 018 KM         1
175 000 KM       1
1 540 KM         1
526 KM           1
Name: param_moc, Length: 558, dtype: int64

In [58]:
df['param_moc'].map(lambda x: -1 if x is None else int(x.split(' ')[0])).unique()

array([ 90, 115, 262, 110, 310, 105, 140, 175, 125, 185, 190, 440, 141,
       200, 224,  75,  99, 184, 109, 233, 116,  68, 286, 126, 160, 135,
       120, 272,  -1, 150, 180, 136, 102, 131, 218, 245, 170, 112, 250,
       252,  73, 100, 313, 101, 285,  70, 383, 174, 277, 132, 130, 215,
        60, 330, 163, 177,  98,  78, 189, 156, 143,  69, 113,  65, 122,
        82, 251,  95, 197, 235, 238, 171, 381, 400, 178,  80, 165,  85,
       258, 142, 204, 124,  55, 144, 231, 248, 152, 181, 210, 340, 129,
       147,  50,  54, 290, 306, 193,  77, 164,  96, 194, 111, 166, 206,
       118, 360, 211, 271, 455, 280, 106, 114, 421,  74, 213, 121, 275,
       435, 384, 326,  88, 220, 260,  64,  86, 128, 256, 240, 244, 162,
       237, 350,  35, 265, 202, 133,  83, 117, 146,  92, 192, 145, 525,
       254, 182, 328, 367, 148, 456,  97, 270, 107, 108, 203, 155,  94,
        93, 241,  20,  71, 173,  58, 205, 236,   1, 557,  84, 457,  72,
       295, 134, 425, 228,  81, 230, 201,  87, 234, 299, 585, 20

In [0]:
df['param_moc_NUM'] = df['param_moc'].map(lambda x: -1 if x is None else int(x.split(' ')[0]))

In [0]:
feats3 = ['param_stan_cat',
'param_rok-produkcji_NUM',
'param_napęd_cat',
'param_faktura-vat_cat',
'param_moc_NUM',
'param_skrzynia-biegów_cat',
'param_marka-pojazdu_cat',
'feature_kamera-cofania_cat',
'param_typ_cat',
'param_pojemność-skokowa_cat',
'seller_name_cat',
'param_wersja_cat',
'feature_wspomaganie-kierownicy_cat',
'param_model-pojazdu_cat',
'feature_system-start-stop_cat',
'param_kod-silnika_cat',
'feature_asystent-pasa-ruchu_cat',
'feature_łopatki-zmiany-biegów_cat',
'feature_światła-led_cat',
'feature_czujniki-parkowania-przednie_cat']

In [62]:
run_model(xgb.XGBRegressor(**xgb_params), feats3)



(-9725.119337009854, 83.59778127364514)

In [64]:
df['param_pojemność-skokowa'].unique()

array(['898 cm3', '1 560 cm3', '3 000 cm3', ..., '5 992 cm3', '1 966 cm3',
       '142 280 cm3'], dtype=object)

In [67]:
df['param_pojemność-skokowa'].map(lambda x: -1 if x is None else int(x.split('cm')[0].replace(' ',''))).unique()

array([   898,   1560,   3000, ...,   5992,   1966, 142280])

In [0]:
df['param_pojemność-skokowa_NUM'] = df['param_pojemność-skokowa'].map(lambda x: -1 if x is None else int(x.split('cm')[0].replace(' ','')))

In [0]:
feats4 = ['param_stan_cat',
'param_rok-produkcji_NUM',
'param_napęd_cat',
'param_faktura-vat_cat',
'param_moc_NUM',
'param_skrzynia-biegów_cat',
'param_marka-pojazdu_cat',
'feature_kamera-cofania_cat',
'param_typ_cat',
'param_pojemność-skokowa_NUM',
'seller_name_cat',
'param_wersja_cat',
'feature_wspomaganie-kierownicy_cat',
'param_model-pojazdu_cat',
'feature_system-start-stop_cat',
'param_kod-silnika_cat',
'feature_asystent-pasa-ruchu_cat',
'feature_łopatki-zmiany-biegów_cat',
'feature_światła-led_cat',
'feature_czujniki-parkowania-przednie_cat']

In [79]:
run_model(xgb.XGBRegressor(**xgb_params), feats4)



(-9570.177529662946, 86.42883374907932)

In [80]:
run_model(xgb.XGBRegressor(**xgb_params, objective="reg:squarederror"), feats4)

(-9570.177529662946, 86.42883374907932)