## Load Libs

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import label_binarize
from datetime import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline

## Functions

In [3]:
def normalize(x):
    return (x-np.mean(x))/np.std(x)

def extract_date_variables(input_data, date_in_index=False):
    
    '''This function extracts data variables from a series or index of a series. Returns
        a DataFrame with extracted variables'''

    if date_in_index:
        
        input_data['weeknr'] = map(lambda x: str(x), input_data.index.week)
        input_data['year'] = map(lambda x: str(x), input_data.index.year)
        input_data['month'] = map(lambda x: str(x), input_data.index.month)

        input_data['week_year'] = input_data['weeknr'] + '_' + input_data['year']
        input_data['month_year'] = input_data['month'] + '_' + input_data['year']

        return input_data
    
    else:
        output_data = input_data.to_frame(name='timestamp')
        
        output_data['weeknr'] = map(lambda x: str(x.week), output_data['timestamp'])
        output_data['year'] = map(lambda x: str(x.year), output_data['timestamp'])
        output_data['month'] = map(lambda x: str(x.month), output_data['timestamp'])

        output_data['week_year']  = output_data['weeknr'] + '_' + output_data['year']
        output_data['month_year'] = output_data['month'] + '_' + output_data['year']

        return output_data

## Load Data

In [4]:
df_macro = pd.read_csv('~/Desktop/sberbank/macro_pred.csv', sep=';')
df_train = pd.read_csv('~/Desktop/sberbank/train.csv')
df_test = pd.read_csv('~/Desktop/sberbank/test.csv')

In [5]:
df_test

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,cafe_count_5000_price_1500,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000
0,30474,2015-07-01,39.00,20.70,2,9,1,1998.0,1,8.9,...,8,0,0,0,1,10,1,0,14,1
1,30475,2015-07-01,79.20,,8,17,1,0.0,3,1.0,...,4,1,1,0,2,11,0,1,12,1
2,30476,2015-07-01,40.50,25.10,3,5,2,1960.0,2,4.8,...,42,11,4,0,10,21,0,10,71,11
3,30477,2015-07-01,62.80,36.00,17,17,1,2016.0,2,62.8,...,1,1,2,0,0,10,0,0,2,0
4,30478,2015-07-01,40.00,40.00,17,17,1,0.0,1,1.0,...,5,1,1,0,2,12,0,1,11,1
5,30479,2015-07-01,48.43,,21,1,1,2015.0,1,1.0,...,143,99,57,12,23,42,1,13,123,7
6,30480,2015-07-01,38.80,,15,17,1,,1,1.0,...,12,7,2,0,5,14,0,3,17,2
7,30481,2015-07-01,43.10,,5,0,1,,1,0.0,...,5,2,2,0,3,12,0,0,6,3
8,30482,2015-07-01,45.40,28.50,9,12,5,1972.0,2,6.0,...,3,1,0,0,1,7,0,0,7,0
9,30483,2015-07-01,43.30,43.30,7,22,1,0.0,1,1.0,...,5,2,1,0,2,9,0,0,7,0


## Process Data

In [6]:
df_train['timestamp'] = pd.to_datetime(df_train['timestamp'])
df_test['timestamp'] = pd.to_datetime(df_test['timestamp'])

In [7]:
time_data_train = extract_date_variables(df_train['timestamp'])[['timestamp', 'month_year']]
time_data_test  = extract_date_variables(df_test['timestamp'])[['timestamp', 'month_year']]

df_train['month_year'] = time_data_train['month_year']
df_test['month_year'] = time_data_test['month_year']

Merge with macro price

In [8]:
df_train = df_train.merge(df_macro, on='month_year')
df_test  = df_test.merge(df_macro, on='month_year')

In [9]:
y_train = np.log(df_train['price_doc']+1).copy()
X_train = df_train.drop(['price_doc', 'id'], axis=1).copy()

Feature cleaning

In [10]:
def clean_bank(raw_data):
    
    raw_data_features = raw_data.copy()
    
    fsq = np.log(raw_data['full_sq']+1)
    fsq[fsq <1.5] = np.nan
    fsq[fsq >8] = np.nan
    raw_data_features['full_sq'] = fsq

    lsq = np.log(raw_data['life_sq']+1)
    lsq[lsq <1.5] = np.nan
    lsq[lsq >8] = np.nan
    raw_data_features['life_sq'] = lsq


    flr = raw_data_features['floor']
    mflr = raw_data_features['max_floor']

    flr[flr==0] = np.nan
    mflr[mflr==0] = np.nan

    raw_data_features['floor']=flr
    raw_data_features['max_floor']=mflr

    building_type = pd.cut(mflr, np.append(np.arange(0,30), 200))
    raw_data_features['build_type_maxfloor'] = building_type

    # make first floor and last floor as separate var
    raw_data_features['first_floor'] = flr==1
    raw_data_features['last_floor'] = (flr==mflr)

    raw_data.loc[raw_data['material']==3, 'material'] = np.nan
    raw_data_features['material'] = raw_data['material']

    byr = raw_data['build_year'].copy()
    byr[(byr<1000) | (byr>2020)]=np.nan 
    raw_data_features['build_year'] = np.log(byr)

    room_cat = pd.cut(raw_data['num_room'], np.append(np.arange(0,7), 20))
    raw_data_features['num_room']=room_cat

    ksq = raw_data['kitch_sq'].copy()
    ksq[(ksq>250) | (ksq<2)] = np.nan
    raw_data_features['kitch_sq'] = np.log(ksq+1)

    stat = raw_data['state'].copy()
    stat[stat==33] = 3
    raw_data_features['state'] = stat

    
    X_clean = raw_data_features[['full_sq', 'life_sq', 'floor', 'material', 'build_year' ,
                                 'num_room', 'kitch_sq', 'state', 'product_type', 'sub_area',
                                 'first_floor', 'last_floor','build_type_maxfloor', 'macro_price']]

    X_clean['num_room'] = X_clean.num_room.astype('object')
    X_clean['build_type_maxfloor'] = X_clean.build_type_maxfloor.astype('object')


    X_clean['first_floor'] = X_clean.first_floor.astype('int')
    X_clean['last_floor'] = X_clean.last_floor.astype('int')
    
    
    # replace missings
    X_clean.fillna(X_clean.mean(), inplace=True)
    X_clean.loc[X_clean['num_room'].isnull(),'num_room'] = 'not_available'
    X_clean.loc[X_clean['build_type_maxfloor'].isnull(),'build_type_maxfloor'] = 'not_available'
    
    
    # encode dummies
    def makeDum(ser, naam):
        if len(np.unique(ser)) == 2:
            return pd.DataFrame(label_binarize(ser, classes=np.unique(ser)), index=ser.keys(), columns=[naam+'_'+np.unique(ser)[0]] )
        else:
            return pd.DataFrame(label_binarize(ser, classes=np.unique(ser)), index=ser.keys(), columns=naam+'_'+np.unique(ser)) 


    X_clean = X_clean.merge(makeDum(X_clean.num_room, 'num_room').iloc[:,:-1], left_index=True, right_index=True)
    X_clean = X_clean.drop('num_room', axis=1)

    X_clean = X_clean.merge(makeDum(X_clean.product_type, 'product_type').iloc[:,:-1], left_index=True, right_index=True)
    X_clean = X_clean.drop('product_type', axis=1)

    X_clean = X_clean.merge(makeDum(X_clean.sub_area, 'sub_area').iloc[:,:-1], left_index=True, right_index=True)
    X_clean = X_clean.drop('sub_area', axis=1)
        
    X_clean = X_clean.merge(makeDum(X_clean.build_type_maxfloor, 'build_type_maxfloor').iloc[:,:-1], left_index=True, right_index=True)
    X_clean = X_clean.drop('build_type_maxfloor', axis=1)
    
    
    return X_clean

In [11]:
X_train_clean = clean_bank(X_train)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [12]:
# there is suddenly an NA in owner-occupier variable

df_test.loc[df_test['product_type'].isnull(), 'product_type'] = df_test['product_type'].mode()[0]

X_test_clean = clean_bank(df_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [13]:
# colnames do not match

print X_train_clean.shape
print X_test_clean.shape

missing_names = list(set(X_train_clean.columns) - set(X_test_clean.columns))
print missing_names

X_train_clean = X_train_clean.drop(labels=missing_names, axis=1)
X_train_clean

(30471, 473)
(7662, 472)
['num_room_(6, 20]', 'sub_area_Poselenie Klenovskoe']


Unnamed: 0,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,kitch_sq,state,area_m,...,"build_type_maxfloor_(27, 28]","build_type_maxfloor_(28, 29]","build_type_maxfloor_(29, 200]","build_type_maxfloor_(3, 4]","build_type_maxfloor_(4, 5]","build_type_maxfloor_(5, 6]","build_type_maxfloor_(6, 7]","build_type_maxfloor_(7, 8]","build_type_maxfloor_(8, 9]","build_type_maxfloor_(9, 10]"
0,2011-08-20,3.784190,3.332205,4.0,12.898423,1.827065,7.593495,2.165884,2.105251,6.407578e+06,...,0,0,0,0,0,0,0,0,0,0
1,2011-08-23,3.555348,2.995732,3.0,12.898423,1.827065,7.593495,2.165884,2.105251,9.589337e+06,...,0,0,0,0,0,0,0,0,0,0
2,2011-08-27,3.784190,3.401197,2.0,12.898423,1.827065,7.593495,2.165884,2.105251,4.808270e+06,...,0,0,0,0,0,0,0,0,0,0
3,2011-09-01,4.499810,3.931826,9.0,12.898423,1.827065,7.593495,2.165884,2.105251,1.258354e+07,...,0,0,0,0,0,0,0,0,0,0
4,2011-09-05,4.356709,4.356709,4.0,12.898423,1.827065,7.593495,2.165884,2.105251,8.398461e+06,...,0,0,0,0,0,0,0,0,0,0
5,2011-09-06,4.219508,3.850148,14.0,12.898423,1.827065,7.593495,2.165884,2.105251,7.506452e+06,...,0,0,0,0,0,0,0,0,0,0
6,2011-09-08,3.258097,2.708050,10.0,12.898423,1.827065,7.593495,2.165884,2.105251,1.032047e+07,...,0,0,0,0,0,0,0,0,0,0
7,2011-09-09,3.806662,3.806662,5.0,12.898423,1.827065,7.593495,2.165884,2.105251,6.407578e+06,...,0,0,0,0,0,0,0,0,0,0
8,2011-09-10,3.761200,3.332205,5.0,12.898423,1.827065,7.593495,2.165884,2.105251,4.787424e+06,...,0,0,0,0,0,0,0,0,0,0
9,2011-09-13,3.610918,3.091042,9.0,12.898423,1.827065,7.593495,2.165884,2.105251,5.235177e+07,...,0,0,0,0,0,0,0,0,0,0


Normalize features

# Fit a linreg on the training set

In [13]:
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor

regr = linear_model.LinearRegression()
rf   = RandomForestRegressor()

regr.fit(X_train_clean, y_train)
rf.fit(X_train_clean, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [14]:
# The mean squared error
print("Root mean squared error REGRESSION: %.2f"
      % np.sqrt(np.mean((regr.predict(X_train_clean) - y_train) ** 2)))

print("Root mean squared error RANDOM FOREST: %.2f"
      % np.sqrt(np.mean((rf.predict(X_train_clean) - y_train) ** 2)))


Root mean squared error REGRESSION: 0.47
Root mean squared error RANDOM FOREST: 0.21


# Cross Validation

In [27]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import TimeSeriesSplit



def cv_macro_enet(alpha, l1_ratio, n_splits, X, y):

    tscv = TimeSeriesSplit(n_splits=n_splits)
    enet_model = ElasticNet(max_iter=5000, alpha=alpha, l1_ratio=l1_ratio)

    rmse = []
    dims = []
    for train_index, test_index in tscv.split(X):

        enet_model.fit(X.iloc[train_index,:], y.iloc[train_index])
        pred = enet_model.predict(X.iloc[test_index,:])

        rmse.append(np.sqrt(np.mean( (pred-y.iloc[test_index])**2)))
        dims.append((len(y.iloc[train_index].values), len(y.iloc[test_index])))
    
    return [np.mean(rmse), pred, y.iloc[test_index]]

In [28]:
alphas = np.arange(0.01, 1, 0.05)
l1_rats = np.arange(0, 1, 0.1)
print alphas, l1_rats

[ 0.01  0.06  0.11  0.16  0.21  0.26  0.31  0.36  0.41  0.46  0.51  0.56
  0.61  0.66  0.71  0.76  0.81  0.86  0.91  0.96] [ 0.   0.1  0.2  0.3  0.4  0.5  0.6  0.7  0.8  0.9]


In [29]:
cv_result = [ {'alpha': a, 'l1_rat':l, 'rmse': cv_macro_enet(a,l,10, X_train_clean, y_train)[0]} for a in alphas for l in l1_rats ]

  positive)


KeyboardInterrupt: 

In [None]:
pd.DataFrame(cv_result).sort_values('rmse')

# Predict on the Testset

In [19]:
regr.fit(X_train_clean, y_train)
submission_pred = regr.predict(X_test_clean)

In [20]:
submission_pred = np.exp(submission_pred)-1

In [21]:
print submission_pred

[ 4991539.00407912  8279064.40383018  5144493.49472311 ...,
  5244539.96523729  4903309.58134972  8281197.93082849]


In [22]:
df_sub = pd.DataFrame()
df_sub['id'] = df_test['id'].copy()
df_sub['price_doc'] = submission_pred
df_sub.to_csv('./submission4_with_macro.csv')

# Deviations from MacroPrice

In [21]:
from sklearn import linear_model
regr_dev = linear_model.LinearRegression()

In [22]:
y_train_dev = y_train - X_train_clean['macro_price'].values

In [23]:
X_train_dev = X_train.drop('macro_price', axis=1)

In [28]:
X_train_dev
#regr_dev.fit(X_train_dev, y_train_dev.values)

Unnamed: 0,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,month_year
0,2011-08-20,43,27.0,4.0,,,,,,,...,9,4,0,13,22,1,0,52,4,8_2011
1,2011-08-23,34,19.0,3.0,,,,,,,...,15,3,0,15,29,1,10,66,14,8_2011
2,2011-08-27,43,29.0,2.0,,,,,,,...,10,3,0,11,27,0,4,67,10,8_2011
3,2011-09-01,89,50.0,9.0,,,,,,,...,11,2,1,4,4,0,0,26,3,9_2011
4,2011-09-05,77,77.0,4.0,,,,,,,...,319,108,17,135,236,2,91,195,14,9_2011
5,2011-09-06,67,46.0,14.0,,,,,,,...,62,14,1,53,78,1,20,113,17,9_2011
6,2011-09-08,25,14.0,10.0,,,,,,,...,81,16,3,38,80,1,27,127,8,9_2011
7,2011-09-09,44,44.0,5.0,,,,,,,...,9,4,0,11,18,1,0,47,4,9_2011
8,2011-09-10,42,27.0,5.0,,,,,,,...,19,8,1,18,34,1,3,85,11,9_2011
9,2011-09-13,36,21.0,9.0,,,,,,,...,19,13,0,10,20,1,3,67,1,9_2011
