In [2]:
%pylab inline
import pandas as pd
import numpy as np

Populating the interactive namespace from numpy and matplotlib


In [106]:
# Read the three input CSV files in one pass; the order of the names matches the
# order of the targets on the left-hand side.
transactions, macro, test = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'macro.csv', 'test.csv')
)

In [107]:
# Left join: every transaction row is kept and gets the macro columns that share its timestamp.
train = transactions.merge(macro, how='left', on='timestamp')

In [108]:
# Same left join as for the training rows, applied to the test rows.
test_all = test.merge(macro, how='left', on='timestamp')

In [109]:
# (rows, columns) of each merged frame, test first and train second.
tuple(frame.shape for frame in (test_all, train))

((7662, 390), (30471, 391))

## Preparing the data: categorical variables, gap filling

In [110]:
# Split the merged frame by dtype: object (string) columns are handled separately
# from the remaining columns.
object_columns = train.select_dtypes(include=['object'])

# Keep the timestamp column in `train`, appended as the last column. `assign` builds a
# new frame, so nothing is written into the result of `select_dtypes` in place.
train = train.select_dtypes(exclude=['object']).assign(timestamp=object_columns['timestamp'])

# '#!' is replaced by a missing value. `np.nan` is spelled out because the bare `NaN`
# name is only available through `%pylab`'s star-import of numpy and no longer exists
# in NumPy 2.0.
categorical = object_columns.drop('timestamp', axis=1).replace(to_replace='#!', value=np.nan)

In [111]:
# List the distinct values found in each categorical column.
for column_name, column_values in categorical.items():
    print(column_name, column_values.unique())

product_type ['Investment' 'OwnerOccupier']
sub_area ['Bibirevo' 'Nagatinskij Zaton' "Tekstil'shhiki" 'Mitino' 'Basmannoe'
 'Nizhegorodskoe' "Sokol'niki" 'Koptevo' 'Kuncevo' 'Kosino-Uhtomskoe'
 'Zapadnoe Degunino' 'Presnenskoe' 'Lefortovo' "Mar'ino" "Kuz'minki"
 'Nagornoe' "Gol'janovo" 'Vnukovo' 'Juzhnoe Tushino' 'Severnoe Tushino'
 "Chertanovo Central'noe" 'Fili Davydkovo' 'Otradnoe' 'Novo-Peredelkino'
 'Bogorodskoe' 'Jaroslavskoe' 'Strogino' 'Hovrino' "Moskvorech'e-Saburovo"
 'Staroe Krjukovo' 'Ljublino' 'Caricyno' 'Veshnjaki' 'Danilovskoe'
 'Preobrazhenskoe' "Kon'kovo" 'Brateevo' 'Vostochnoe Izmajlovo'
 'Vyhino-Zhulebino' 'Donskoe' 'Novogireevo' 'Juzhnoe Butovo' 'Sokol'
 'Kurkino' 'Izmajlovo' 'Severnoe Medvedkovo' 'Rostokino'
 'Orehovo-Borisovo Severnoe' 'Ochakovo-Matveevskoe' 'Taganskoe'
 'Dmitrovskoe' 'Orehovo-Borisovo Juzhnoe' 'Teplyj Stan' 'Babushkinskoe'
 'Pokrovskoe Streshnevo' 'Obruchevskoe' 'Filevskij Park'
 'Troparevo-Nikulino' 'Severnoe Butovo' 'Hamovniki' 'Solncevo'
 'Dor

0) Some columns were loaded as categorical (string) variables but are actually quantitative: child_on_acc_pre_school, modern_education_share, old_education_build_share. Convert them to numbers.

In [112]:
import locale
from locale import atof

# Switch the process-wide numeric locale to 'de_DE'; the conversion cell below calls
# locale.atof, which parses strings according to this setting.
# NOTE(review): this raises locale.Error when 'de_DE' is not installed on the host.
locale.setlocale(category=locale.LC_NUMERIC, locale='de_DE')

'de_DE'

In [113]:
# Columns that were loaded as strings but are parsed into numbers here.
cs = ['child_on_acc_pre_school', 'modern_education_share', 'old_education_build_share']

# Series.map with na_action='ignore' passes missing values through untouched and calls
# locale.atof (which follows the current LC_NUMERIC locale) on every other value.
# This avoids both the deprecated DataFrame.applymap and the bare `NaN` name that only
# exists through `%pylab`.
new_cat = categorical[cs].apply(lambda column: column.map(locale.atof, na_action='ignore'))

for c in cs:
    train[c] = new_cat[c]

# The converted columns are numeric now, so they leave the categorical frame.
categorical = categorical.drop(cs, axis=1)

1) Convert categorical variables to dummy features

Cool TODO: convert area labels to actual coordinates

In [114]:
# One-hot encode all categorical columns in one call. `columns=` forces every column to
# be encoded, and the generated names are '<column>_<value>' - the same names the
# column-by-column loop produced. Appending the whole block with a single concat avoids
# inserting hundreds of columns one at a time, which fragments the frame and is slow.
dummies = pd.get_dummies(categorical, columns=list(categorical.columns), prefix_sep='_')
train = pd.concat([train, dummies], axis=1)

2) Convert timestamp to unix time

In [122]:
# Seconds since the Unix epoch. Dividing the offset from 1970-01-01 by a one-second
# Timedelta gives the same integer whatever resolution the parsed datetimes are stored
# in, whereas `astype(np.int64) // 10**9` is only correct for nanosecond storage.
# NOTE(review): a missing timestamp would turn the column into floats with NaN.
train['timestamp'] = (
    (pd.to_datetime(train['timestamp']) - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
)

3) Replace NaNs with -1

In [115]:
# Use -1 as the placeholder for every missing value. `fillna` does not need the bare
# `NaN` name (available only through `%pylab`, gone in NumPy 2.0) and returns a new
# frame instead of mutating `train` in place.
train = train.fillna(-1)

## Feature selection

In [116]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [117]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

def classification_metrics(y_test, y_pred):
    """Compute, print and return a fixed set of classification scores.

    Each scorer is called as ``scorer(y_test, y_pred)``. All scores are computed
    first, then printed one per line with the title-cased key as label.

    Returns:
        dict mapping the scorer name to the value it returned.
    """
    scorers = [
        ('accuracy_score', accuracy_score),
        ('precision_score', precision_score),
        ('recall_score', recall_score),
        ('f1_score', f1_score),
        ('roc_auc_score', roc_auc_score),
    ]
    result = {}
    for label, scorer in scorers:
        result[label] = scorer(y_test, y_pred)
    for label in result:
        print(label.title(), result[label])
    return result

In [118]:
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, median_absolute_error, r2_score

def regression_metrics(y_test, y_pred):
    """Compute, print and return a fixed set of regression scores.

    Each scorer is called as ``scorer(y_test, y_pred)``. All scores are computed
    first, then printed one per line with the title-cased key as label.

    Returns:
        dict mapping the scorer name to the value it returned.
    """
    scorers = [
        ('explained_variance_score', explained_variance_score),
        ('mean_absolute_error', mean_absolute_error),
        ('mean_squared_error', mean_squared_error),
        ('median_absolute_error', median_absolute_error),
        ('r2_score', r2_score),
    ]
    result = {}
    for label, scorer in scorers:
        result[label] = scorer(y_test, y_pred)
    for label in result:
        print(label.title(), result[label])
    return result

In [124]:
# Features are every column except the target. Dropping the target keeps the frame's
# own column order; building the list from a `set` made the order depend on string
# hashing, which changes between kernel sessions and so changed the feature matrix.
X, y = train.drop('price_doc', axis=1), train['price_doc']

In [125]:
# NOTE(review): train_size=0.25 fits on a quarter of the rows and evaluates on the
# remaining three quarters - confirm this is intended rather than test_size=0.25.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.25, random_state=42)

# Seed the forest too: with only the split seeded, the scores and feature importances
# below changed on every run.
clf = RandomForestRegressor(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [126]:
# Score the hold-out predictions; regression_metrics prints each score and returns them as a dict.
m = regression_metrics(y_test, y_pred)

Explained_Variance_Score 0.615038145335
Mean_Absolute_Error 1571191.24966
Mean_Squared_Error 9.07799983285e+12
Median_Absolute_Error 740358.25
R2_Score 0.614198412793


In [130]:
# Pair each feature column with the importance the fitted forest assigned to it.
ranking = {
    feature: importance
    for feature, importance in zip(X.columns, clf.feature_importances_)
}

In [134]:
# Ten features with the highest importance, largest first.
top_features = sorted(ranking.items(), key=lambda item: item[1], reverse=True)[:10]
for feature, importance in top_features:
    print(feature, importance)

full_sq 0.392350665688
cafe_count_5000_price_high 0.0355157335722
cafe_count_3000 0.0296479087603
cafe_count_2000 0.0227638609083
nuclear_reactor_km 0.0213784796685
sport_count_3000 0.0177688363275
office_sqm_5000 0.016115769557
cafe_count_3000_price_2500 0.0142852787354
cafe_count_5000_price_2500 0.0124524726676
cafe_count_1000_price_1500 0.0107487297313


Total area in square meters (full_sq) is the most important feature by a huge margin (not surprising).

In [136]:
# Preview the first rows only; displaying the whole frame renders every row of a
# very wide table into the notebook output.
train.head(10)

Unnamed: 0,id,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,...,water_1line_yes,big_road1_1line_no,big_road1_1line_yes,railroad_1line_no,railroad_1line_yes,ecology_excellent,ecology_good,ecology_no data,ecology_poor,ecology_satisfactory
0,1,43,27.0,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0,1,0,1,0,0,1,0,0,0
1,2,34,19.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0,1,0,1,0,1,0,0,0,0
2,3,43,29.0,2.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0,1,0,1,0,0,0,0,1,0
3,4,89,50.0,9.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0,1,0,1,0,0,1,0,0,0
4,5,77,77.0,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0,1,0,0,1,1,0,0,0,0
5,6,67,46.0,14.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0,1,0,1,0,0,0,0,1,0
6,7,25,14.0,10.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0,1,0,1,0,0,0,0,1,0
7,8,44,44.0,5.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0,1,0,1,0,0,1,0,0,0
8,9,42,27.0,5.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0,1,0,1,0,0,0,0,1,0
9,10,36,21.0,9.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0,1,0,1,0,0,0,0,0,1
