# Preparing a model to predict Moscow flat prices

## Import libraries

In [None]:
# Data handling and gradient-boosting libraries.
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import catboost as ctb

# Seed NumPy's global random generator with a fixed value.
np.random.seed(0)

import matplotlib.pyplot as plt

# Cross-validation and learning-curve helpers.
from sklearn.model_selection import learning_curve
from sklearn.model_selection import KFold
from scikitplot.estimators import plot_learning_curve

# Tree-based regressors from scikit-learn.
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

# NOTE(review): KFold is imported a second time here; the repeat is harmless.
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# IPython magic selecting matplotlib's inline backend.
%matplotlib inline

## Import data

In [None]:
# Load both splits; the test split carries no target values.
train = pd.read_hdf('../input/property.train.h5')
test = pd.read_hdf('../input/property.test.h5')

# Stack train on top of test so that the categorical codes assigned below are
# shared by both splits.
df_all = pd.concat([train, test])
print("train & test: ", train.shape, test.shape)

# Features considered unimportant; they are left out of the encoding step.
black_list = [
    'Fridge:',
    'Furniture:',
    'It is possible to bargain:',
    'Floor covering:',
    'Kitchen furniture:',
    'TV:',
    'Washing machine:',
    'Foundation type:',
    'Overlap type:',
    'Type of the building:',
    'Playground:',
]

# Columns whose name contains a colon are treated as categorical features.
factorize_feats = [name for name in train.columns if ":" in name]
factorize_feats_light = [name for name in factorize_feats if name not in black_list]

# Add an integer-coded "<column>_cat" copy of every retained categorical column.
for feat in factorize_feats_light:
    df_all[feat + "_cat"] = pd.factorize(df_all[feat])[0]

## Feature engineering

In [None]:
# Take element 0 of the geo_block and breadcrumbs entries.
def _first_item(seq):
    """Return element 0 of *seq*."""
    return seq[0]


df_all['geo_block_0'] = df_all['geo_block'].map(_first_item)
df_all['breadcrumbs_0'] = df_all['breadcrumbs'].map(_first_item)

# Integer-encode every column whose name contains '_0'.
ff_0 = [col for col in df_all.columns if '_0' in col]
for feat in ff_0:
    df_all[feat + "_cat"] = pd.factorize(df_all[feat])[0]

In [None]:
# Take element 1 of the geo_block and breadcrumbs entries. Entries that are too
# short to have one get the placeholder -1. geo_block is guarded the same way
# as breadcrumbs here (and as both columns are for element 2), so a one-element
# entry no longer raises IndexError.
df_all['geo_block_1'] = df_all['geo_block'].map(lambda x: x[1] if len(x) > 1 else -1)
df_all['breadcrumbs_1'] = df_all['breadcrumbs'].map(lambda x: x[1] if len(x) > 1 else -1)
#df_all['date_1'] = df_all['date'].map(lambda x: x[1]) # tried: decreased the model score

# Integer-encode every column whose name contains '_1'.
ff_1 = [f for f in df_all.columns if '_1' in f]
for feat in ff_1:
    df_all["{}_cat".format(feat)] = df_all[feat].factorize()[0]

# Take element 2 of the geo_block and breadcrumbs entries.
def _third_item(seq):
    """Return element 2 of *seq*, or -1 when *seq* has fewer than three items."""
    if len(seq) > 2:
        return seq[2]
    return -1


df_all['geo_block_2'] = df_all['geo_block'].map(_third_item)
df_all['breadcrumbs_2'] = df_all['breadcrumbs'].map(_third_item)

# Integer-encode every column whose name contains '_2'.
ff_2 = [col for col in df_all.columns if '_2' in col]
for feat in ff_2:
    df_all[feat + "_cat"] = pd.factorize(df_all[feat])[0]

# Abandoned experiment, kept for reference only: combine elements 0 and 1 of
# geo_block and breadcrumbs into a single string feature and factorize it.
# The original note on the last step says it decreased the model score.
#df_all['geo_block_01'] = df_all['geo_block'].map(lambda x: x[0:2])
#df_all['breadcrumbs_01'] = df_all['breadcrumbs'].map(lambda x: x[0:2])

# Convert a list to a string.
#def convert_list_to_string(org_list, seperator=' '):
    #return seperator.join(org_list)

#ff_3 = [f for f in df_all.columns if '_01' in f]
#for feat in ff_3:
    #df_all['{}_str'.format(feat)] = df_all[feat].map(lambda x: ' '.join(x))
    
#ff_4 = [f for f in df_all.columns if '_01_str' in f]
#for feat in ff_4:
    #df_all['{}_cat'.format(feat)] = df_all[feat].factorize()[0] # decreased the model score

# Print a summary of the combined frame, including the columns added above.
df_all.info()

## Prepare features, cross-validate and fit the XGBoost model

In [None]:
# Every encoded column (name containing "_cat") is used as a model feature.
feats = [x for x in df_all.columns if "_cat" in x]
print("feats: ", feats)

# Rows with a known price form the training set; rows whose price is missing
# are the ones to predict.
has_price = df_all["price"].notnull()
df_train, df_test = df_all[has_price], df_all[~has_price]

# The model is trained on log(price). Predictions are mapped back with exp
# before scoring, so the MAE is expressed on the original price scale.
X = df_train[feats].values
y = np.log(df_train['price'].values)

# Hyper-parameters of the chosen model.
model = xgb.XGBRegressor(n_estimators=70, learning_rate=0.5, max_depth=8, random_state=0)

# 5-fold cross-validation with KFold's default settings.
cv = KFold(n_splits=5)

# Fit on each training fold and collect the MAE on the held-out fold.
scores = []
for train_idx, test_idx in cv.split(X):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Back to the price scale before measuring the error.
    y_pred = np.exp(y_pred)
    y_test = np.exp(y_test)

    score = mean_absolute_error(y_test, y_pred)
    scores.append(score)

print(scores, np.mean(scores), np.std(scores))

# Learning curve, used to check whether the model overfits.
plot_learning_curve(model, X, y, cv=5, shuffle=True)

## Extracting model with the best score

In [None]:
# Index of the cross-validation fold whose model is kept.
# NOTE(review): hard-coded; presumably the fold with the lowest MAE in the
# cross-validation run above - confirm whenever the data or the model changes.
best_fold = 1

# Materialise the splits once and pick the selected fold. `cv` is reused from
# the cross-validation cell, so the split matches the one scored there as long
# as `cv` does not shuffle.
train_idx, test_idx = list(cv.split(X))[best_fold]
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Refit the model on the training part of that fold only.
model.fit(X_train, y_train)

# Targets are log-prices: convert back before computing the MAE.
y_pred = np.exp(model.predict(X_test))
y_test = np.exp(y_test)

score = mean_absolute_error(y_test, y_pred)
scores = [score]

print(scores, np.mean(scores), np.std(scores))

## Predict values on test data

In [None]:
# Feature matrix of the rows without a price (the data to predict).
X_test = df_test[feats].values

# NOTE(review): `model` is used exactly as the preceding cell left it; it is
# not refitted here on the complete training set - confirm this is intended.
print("predict")
y_log_pred = model.predict(X_test)

# The model outputs log-prices. exp maps them back to prices and is always
# positive, so no clipping of negative predictions is needed.
y_pred = np.exp(y_log_pred)

print("save file")
# Positional assignment: relies on df_test holding the rows of `test` in the
# same order, which is the case when no training row has a missing price.
test['price'] = y_pred

# Save the results for the Kaggle competition.
test[['id', 'price']].to_csv('rg_v14_xgboost.csv', index=False)