In [3]:
!pip install --upgrade tables
!pip install eli5

Collecting tables
[?25l  Downloading https://files.pythonhosted.org/packages/ed/c3/8fd9e3bb21872f9d69eb93b3014c86479864cca94e625fd03713ccacec80/tables-3.6.1-cp36-cp36m-manylinux1_x86_64.whl (4.3MB)
[K     |████████████████████████████████| 4.3MB 3.0MB/s 
Installing collected packages: tables
  Found existing installation: tables 3.4.4
    Uninstalling tables-3.4.4:
      Successfully uninstalled tables-3.4.4
Successfully installed tables-3.6.1
Collecting eli5
[?25l  Downloading https://files.pythonhosted.org/packages/97/2f/c85c7d8f8548e460829971785347e14e45fa5c6617da374711dec8cb38cc/eli5-0.10.1-py2.py3-none-any.whl (105kB)
[K     |████████████████████████████████| 112kB 3.5MB/s 
Installing collected packages: eli5
Successfully installed eli5-0.10.1


In [0]:
import pandas as pd
import numpy as np

from sklearn.dummy import  DummyRegressor
from sklearn.tree import  DecisionTreeRegressor

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score

import eli5
from eli5.sklearn import PermutationImportance

## Read data

In [12]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [39]:
%pwd
%cd "/content/drive/My Drive/Colab Notebooks/matrix/matrix_two/dw_matrix_car" 
%ls

/content/drive/My Drive/Colab Notebooks/matrix/matrix_two/dw_matrix_car
[0m[01;34mdata[0m/  day2_visualisation.ipynb  LICENSE  README.md


In [41]:
df = pd.read_hdf('data/car.h5')
df.shape

(106494, 155)

In [42]:
df.columns

Index(['breadcrumb', 'created_at', 'price_currency', 'price_details',
       'price_value', 'seller_address', 'seller_name', 'seller_type',
       'feature_czujniki-parkowania-przednie',
       'feature_poduszka-powietrzna-chroniąca-kolana',
       ...
       'param_pearl', 'param_stan', 'param_wersja', 'param_emisja-co2',
       'param_body-type', 'param_matowy', 'param_bezwypadkowy',
       'param_akryl-(niemetalizowany)', 'param_monthly-payment-value',
       'car_id'],
      dtype='object', length=155)

## Dummy Model

In [43]:
df.select_dtypes(np.number).columns

Index(['price_value', 'car_id'], dtype='object')

In [45]:
feats = ['car_id']
X = df[ feats ].values
y = df[ 'price_value' ].values

model = DummyRegressor()
model.fit(X,y)
y_pred = model.predict(X)

mae(y, y_pred)

39465.934630440985

In [46]:
[x for x in df.columns if 'price' in x]

['price_currency', 'price_details', 'price_value']

In [49]:
df.price_currency.value_counts()

PLN    106290
EUR       204
Name: price_currency, dtype: int64

In [51]:
# distribution in %
df.price_currency.value_counts(normalize=True) * 100

PLN    99.80844
EUR     0.19156
Name: price_currency, dtype: float64

In [52]:
# ignore EUR
df = df [ df.price_currency != 'EUR'  ] 
df.shape

(106290, 155)

## Features

In [59]:
SUFFIX_CAT = '_cat'
for feat in df.columns:
  # skip list columns
  if isinstance(df[feat][0], list): 
    continue
  factorized_values = df[feat].factorize()[0]
  if SUFFIX_CAT in feat: # guard against multiply restarts of this notebook
    df[feat] = factorized_values
  else:
    df[feat + SUFFIX_CAT] = factorized_values

cat_feats = [x for x in df.columns if SUFFIX_CAT in x and 'price' not in x]
cat_feats, len(cat_feats)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


(['created_at_cat',
  'seller_address_cat',
  'seller_name_cat',
  'seller_type_cat',
  'feature_czujniki-parkowania-przednie_cat',
  'feature_poduszka-powietrzna-chroniąca-kolana_cat',
  'feature_kurtyny-powietrzne_cat',
  'feature_klimatyzacja-dwustrefowa_cat',
  'feature_światła-led_cat',
  'feature_czujnik-zmierzchu_cat',
  'feature_elektrycznie-ustawiane-lusterka_cat',
  'feature_asr-(kontrola-trakcji)_cat',
  'feature_poduszka-powietrzna-kierowcy_cat',
  'feature_cd_cat',
  'feature_elektryczne-szyby-przednie_cat',
  'feature_poduszka-powietrzna-pasażera_cat',
  'feature_system-start-stop_cat',
  'feature_światła-do-jazdy-dziennej_cat',
  'feature_komputer-pokładowy_cat',
  'feature_elektryczne-szyby-tylne_cat',
  'feature_klimatyzacja-manualna_cat',
  'feature_tapicerka-welurowa_cat',
  'feature_czujnik-deszczu_cat',
  'feature_światła-przeciwmgielne_cat',
  'feature_ogrzewanie-postojowe_cat',
  'feature_radio-niefabryczne_cat',
  'feature_regulowane-zawieszenie_cat',
  'feature

In [60]:
X = df[ cat_feats].values
y = df.price_value.values

model = DecisionTreeRegressor(max_depth=5)
scores = cross_val_score(model, X, y, cv = 3, scoring='neg_mean_absolute_error') # runs fit and cross-validation
np.mean(scores)

-19650.45289201444

## Which features dominate?

In [64]:
m2 = DecisionTreeRegressor(max_depth=5)
m2.fit(X,y)
imp = PermutationImportance(m2, random_state=0).fit(X,y)
eli5.show_weights(imp, feature_names=cat_feats)

Weight,Feature
0.1956  ± 0.0054,param_faktura-vat_cat
0.1903  ± 0.0027,param_napęd_cat
0.1815  ± 0.0087,param_stan_cat
0.1512  ± 0.0078,param_rok-produkcji_cat
0.0928  ± 0.0019,param_skrzynia-biegów_cat
0.0607  ± 0.0051,param_moc_cat
0.0436  ± 0.0015,feature_kamera-cofania_cat
0.0191  ± 0.0022,param_pojemność-skokowa_cat
0.0162  ± 0.0004,feature_bluetooth_cat
0.0110  ± 0.0007,feature_łopatki-zmiany-biegów_cat


In [87]:
df["param_napęd"].value_counts(dropna=False, normalize=True)

Na przednie koła                 0.620246
NaN                              0.153759
Na tylne koła                    0.080807
4x4 (stały)                      0.065340
4x4 (dołączany automatycznie)    0.064531
4x4 (dołączany ręcznie)          0.015317
Name: param_napęd, dtype: float64

In [88]:
df["param_faktura-vat"].value_counts(dropna=False, normalize=True) # bez faktury => tanie ???

NaN    0.752131
Tak    0.247869
Name: param_faktura-vat, dtype: float64

In [92]:
df.param_stan.value_counts(dropna=False, normalize=True)

Używane    0.909982
Nowe       0.088992
NaN        0.001025
Name: param_stan, dtype: float64

In [93]:
df['param_rok-produkcji'].value_counts(dropna=False, normalize=True)

2017    0.110424
2007    0.070411
2008    0.068661
2006    0.065246
2009    0.058670
          ...   
1936    0.000009
1957    0.000009
1944    0.000009
1932    0.000009
1933    0.000009
Name: param_rok-produkcji, Length: 77, dtype: float64

In [80]:
df.index.size # of Rows

106290