# Load Libs and Data

In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import label_binarize
from datetime import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
df_macro = pd.read_csv('~/Desktop/sberbank/macro.csv')
df_train = pd.read_csv('~/Desktop/sberbank/train.csv')
df_test = pd.read_csv('~/Desktop/sberbank/test.csv')

In [13]:
y_train = np.log(df_train['price_doc']+1).copy()
X_train = df_train.drop(['price_doc', 'id'], axis=1).copy()


Feature cleaning

In [14]:
def clean_bank(raw_data):
    
    raw_data_features = raw_data.copy()
    
    fsq = np.log(raw_data['full_sq']+1)
    fsq[fsq <1.5] = np.nan
    fsq[fsq >8] = np.nan
    raw_data_features['full_sq'] = fsq

    lsq = np.log(raw_data['life_sq']+1)
    lsq[lsq <1.5] = np.nan
    lsq[lsq >8] = np.nan
    raw_data_features['life_sq'] = lsq


    flr = raw_data_features['floor']
    mflr = raw_data_features['max_floor']

    flr[flr==0] = np.nan
    mflr[mflr==0] = np.nan

    raw_data_features['floor']=flr
    raw_data_features['max_floor']=mflr

    building_type = pd.cut(mflr, np.append(np.arange(0,30), 200))
    raw_data_features['build_type_maxfloor'] = building_type

    # make first floor and last floor as separate var
    raw_data_features['first_floor'] = flr==1
    raw_data_features['last_floor'] = (flr==mflr)

    raw_data.loc[raw_data['material']==3, 'material'] = np.nan
    raw_data_features['material'] = raw_data['material']

    byr = raw_data['build_year'].copy()
    byr[(byr<1000) | (byr>2020)]=np.nan 
    raw_data_features['build_year'] = np.log(byr)

    room_cat = pd.cut(raw_data['num_room'], np.append(np.arange(0,7), 20))
    raw_data_features['num_room']=room_cat

    ksq = raw_data['kitch_sq'].copy()
    ksq[(ksq>250) | (ksq<2)] = np.nan
    raw_data_features['kitch_sq'] = np.log(ksq+1)

    stat = raw_data['state'].copy()
    stat[stat==33] = 3
    raw_data_features['state'] = stat

    
    X_clean = raw_data_features[['full_sq', 'life_sq', 'floor', 'material', 'build_year' ,'num_room', 'kitch_sq', 'state', 'product_type', 'sub_area','first_floor', 'last_floor','build_type_maxfloor']]

    X_clean['num_room'] = X_clean.num_room.astype('object')
    X_clean['build_type_maxfloor'] = X_clean.build_type_maxfloor.astype('object')


    X_clean['first_floor'] = X_clean.first_floor.astype('int')
    X_clean['last_floor'] = X_clean.last_floor.astype('int')
    
    
    # replace missings
    X_clean.fillna(X_clean.mean(), inplace=True)
    X_clean.loc[X_clean['num_room'].isnull(),'num_room'] = 'not_available'
    X_clean.loc[X_clean['build_type_maxfloor'].isnull(),'build_type_maxfloor'] = 'not_available'
    
    
    # encode dummies
    def makeDum(ser, naam):
        if len(np.unique(ser)) == 2:
            return pd.DataFrame(label_binarize(ser, classes=np.unique(ser)), index=ser.keys(), columns=[naam+'_'+np.unique(ser)[0]] )
        else:
            return pd.DataFrame(label_binarize(ser, classes=np.unique(ser)), index=ser.keys(), columns=naam+'_'+np.unique(ser)) 


    X_clean = X_clean.merge(makeDum(X_clean.num_room, 'num_room').iloc[:,:-1], left_index=True, right_index=True)
    X_clean = X_clean.drop('num_room', axis=1)

    X_clean = X_clean.merge(makeDum(X_clean.product_type, 'product_type').iloc[:,:-1], left_index=True, right_index=True)
    X_clean = X_clean.drop('product_type', axis=1)

    X_clean = X_clean.merge(makeDum(X_clean.sub_area, 'sub_area').iloc[:,:-1], left_index=True, right_index=True)
    X_clean = X_clean.drop('sub_area', axis=1)
        
    X_clean = X_clean.merge(makeDum(X_clean.build_type_maxfloor, 'build_type_maxfloor').iloc[:,:-1], left_index=True, right_index=True)
    X_clean = X_clean.drop('build_type_maxfloor', axis=1)    
    
    return X_clean

Select clean features only

In [15]:
X_train_clean = clean_bank(X_train)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See th

In [16]:
# there is suddenly an NA in owner-occupier variable

df_test.loc[df_test['product_type'].isnull(), 'product_type'] = df_test['product_type'].mode()[0]

X_test_clean = clean_bank(df_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See th

In [17]:
# colnames do not match

print X_train_clean.shape
print X_test_clean.shape

missing_names = list(set(X_train_clean.columns) - set(X_test_clean.columns))
print missing_names

X_train_clean = X_train_clean.drop(labels=missing_names, axis=1)
X_train_clean

(30471, 191)
(7662, 189)
['num_room_(6, 20]', 'sub_area_Poselenie Klenovskoe']


Unnamed: 0,full_sq,life_sq,floor,material,build_year,kitch_sq,state,first_floor,last_floor,"num_room_(0, 1]",...,"build_type_maxfloor_(27, 28]","build_type_maxfloor_(28, 29]","build_type_maxfloor_(29, 200]","build_type_maxfloor_(3, 4]","build_type_maxfloor_(4, 5]","build_type_maxfloor_(5, 6]","build_type_maxfloor_(6, 7]","build_type_maxfloor_(7, 8]","build_type_maxfloor_(8, 9]","build_type_maxfloor_(9, 10]"
0,3.784190,3.332205,4.0,1.827065,7.593495,2.165884,2.105251,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3.555348,2.995732,3.0,1.827065,7.593495,2.165884,2.105251,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3.784190,3.401197,2.0,1.827065,7.593495,2.165884,2.105251,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4.499810,3.931826,9.0,1.827065,7.593495,2.165884,2.105251,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4.356709,4.356709,4.0,1.827065,7.593495,2.165884,2.105251,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,4.219508,3.850148,14.0,1.827065,7.593495,2.165884,2.105251,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,3.258097,2.708050,10.0,1.827065,7.593495,2.165884,2.105251,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,3.806662,3.806662,5.0,1.827065,7.593495,2.165884,2.105251,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,3.761200,3.332205,5.0,1.827065,7.593495,2.165884,2.105251,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,3.610918,3.091042,9.0,1.827065,7.593495,2.165884,2.105251,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Fit a linreg on the training set

In [18]:
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor

regr = linear_model.LinearRegression()
rf   = RandomForestRegressor()

#regr.fit(X_clean, target)
#rf.fit(X_clean, target)

# Cross Validation

In [20]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error


#mean_squared_error(ytest,ypred)

k_fold = KFold(n_splits=10)


In [21]:
regr_results = []
rf_results = []

for train, test in k_fold.split(X_train_clean):
    regr.fit(X_train_clean.loc[train,:], y_train[train])
    regr_results.append(mean_squared_error( regr.predict(X_train_clean.loc[test,:]), y_train[test] ) )
    
    rf.fit(X_train_clean.loc[train,:], y_train[train])
    rf_results.append(mean_squared_error( rf.predict(X_train_clean.loc[test,:]), y_train[test] ) )

    
    
print regr_results
print rf_results  
    

[0.48234523463071788, 0.18434196861255997, 0.1776081633997291, 0.20584605238944717, 0.22422933443946119, 0.22825385138711188, 0.23139928807933094, 0.1934493423766605, 0.19597270899499503, 0.17538722692072861]
[0.55214332462865112, 0.22778238198588072, 0.2174860957468728, 0.24150046703061595, 0.26691753213275238, 0.25086956866385296, 0.25745955559684131, 0.2188458776611294, 0.22532775727525545, 0.20895816537109924]


# Predict on the Testset

In [23]:
regr.fit(X_train_clean, y_train)
submission_pred = regr.predict(X_test_clean)

In [24]:
submission_pred = np.exp(submission_pred)-1

In [25]:
print submission_pred

[ 4726180.60226424  7971024.4809037   4884681.73054696 ...,
  5258228.35509862  4914362.64302078  8313414.20133331]


In [26]:
df_sub = pd.DataFrame()
df_sub['id'] = df_test['id'].copy()
df_sub['price_doc'] = submission_pred
df_sub.to_csv('./submission2.csv')