In [94]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import operator
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor


random_state=7

%matplotlib inline
pd.set_option('display.max_columns', 500)

# Reading Data

In [2]:
train=pd.read_csv('input/train.csv', parse_dates=['timestamp'])
test=pd.read_csv('input/test.csv', parse_dates=['timestamp'])
macro=pd.read_csv('input/macro.csv', parse_dates=['timestamp'])

test_id=test['id']

print('The shape of training data is', train.shape)
print('The shape of test data is', test.shape)
#print('The shape of macro data is', macro.shape)


#fts contains the feature names (exclude id and year)
fts=list(train.columns[1:-1])

The shape of training data is (30471, 292)
The shape of test data is (7662, 291)


In [3]:
trainsub=train[train.timestamp<'2015-01-01']
trainsub=trainsub[trainsub.product_type=='Investment']
print('shape of the investment type', trainsub.shape)

ind_1m=trainsub[trainsub.price_doc <= 1000000].index
ind_2m=trainsub[trainsub.price_doc == 2000000].index
ind_3m=trainsub[trainsub.price_doc==3000000].index

print('num of ind_1m', len(ind_1m))
print('num of ind_2m', len(ind_2m))
print('num of ind_3m', len(ind_3m))

train_index=set(train.index.copy())

for ind, gap in zip([ind_1m, ind_2m, ind_3m], [10, 3, 2]):
    ind_set=set(ind)
    ind_set_cut=ind.difference(set(ind[::gap]))
    
    train_index=train_index.difference(ind_set_cut)
    
train=train.loc[train_index]

target=np.log(train.price_doc+1)
#number of training and test example
n_train=train.shape[0]
n_test=test.shape[0]

shape of the investment type (17693, 292)
num of ind_1m 930
num of ind_2m 680
num of ind_3m 292


In [122]:
print('num of train', n_train)
print('num of test', n_test)
print('num of target', len(target))

num of train 29035
num of test 7662
num of target 29035


In [124]:
target_transfer=train.price_doc
%store target_transfer

Stored 'target_transfer' (Series)


# Combine train and test and Change Categorical Data to Dummies

In [4]:
#concatenate training and test
raw_data=pd.concat([train.loc[:,fts[0]:fts[-1]], test.loc[:, fts[0]:fts[-1]]])
print('shape of train plus test', raw_data.shape)

shape of train plus test (36697, 290)


# Look at Nas in important features

In [5]:
raw_data_nas=raw_data.isnull().sum()
raw_data_nas=raw_data_nas.sort_values(ascending=False)
raw_data_nas=pd.DataFrame(raw_data_nas, columns=['na_counts'])
raw_data_nas=raw_data_nas[raw_data_nas.na_counts>0.1*(n_train+n_test)]

feature_na=set(raw_data_nas.index)
print('number of elements in feature_na', len(feature_na))

feature_important=pd.read_csv('features_importance_rate_0.01_withallmacro')
feature_important=set(feature_important.feature.values)
print('number of features in feature_importance', len(feature_important))

feature_intersect=feature_important & feature_na
print('number of feature_intersect', len(feature_intersect))

number of elements in feature_na 35
number of features in feature_importance 208
number of feature_intersect 25


In [6]:
raw_data[list(feature_intersect)].iloc[1:20,:]

#if build year if larger than 2019 or smaller than 1800 change them to median
raw_data['build_year'][(raw_data.build_year<1800) | (raw_data.build_year>2019)]=raw_data.build_year.median()

#if full sq is zero, assign it to median
raw_data['full_sq'][raw_data.full_sq==0]=raw_data.full_sq.median()

#if max_floor is zero, assign it to median
raw_data['max_floor'][raw_data.max_floor==0]=raw_data.full_sq.median()

#preschool quota is zero assign it to median
raw_data['preschool_quota'][raw_data.preschool_quota==0]=raw_data.preschool_quota.median()

#if kitch_sq is too large or larger than life_sq use 20% full_sq
raw_data['kitch_sq'][(raw_data.kitch_sq > 50) | (raw_data.kitch_sq > raw_data.life_sq)]=kitch_est=raw_data['full_sq'][(raw_data.kitch_sq > 50) | (raw_data.kitch_sq > raw_data.life_sq)]*0.2

#if life_sq is na or life_sq is too small or too large, change it to 0.7 full_sq
raw_data['life_sq'][raw_data.life_sq.isnull()]=raw_data['full_sq'][raw_data.life_sq.isnull()]*0.7
raw_data['life_sq'][(raw_data.life_sq<0.1*raw_data.full_sq)| (raw_data.life_sq>raw_data.full_sq)] = raw_data['full_sq'][(raw_data.life_sq<0.1*raw_data.full_sq)| (raw_data.life_sq>raw_data.full_sq)]

#fill no data with satisfactory
raw_data['ecology'][raw_data.ecology=='no data']='satisfactory'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a

In [7]:
#Add month-year count
month_year = (raw_data.timestamp.dt.month + raw_data.timestamp.dt.year*100)
month_year_cnt_map=month_year.value_counts().to_dict()
raw_data['month_year_cnt']=month_year.map(month_year_cnt_map)

#Add week-year count
week_year=(raw_data.timestamp.dt.week + raw_data.timestamp.dt.year*100)
week_year_cnt_map=week_year.value_counts().to_dict()
raw_data['week_year_cnt']=week_year.map(week_year_cnt_map)

#Add month and day-of-week
raw_data['month']=raw_data.timestamp.dt.month
raw_data['dow']=raw_data.timestamp.dt.month

In [8]:
selected_f=["timestamp", "balance_trade_growth", "eurrub", "average_provision_of_build_contract", 
"micex_rgbi_tr", "micex_cbi_tr", "deposits_rate", "mortgage_value", 
"mortgage_rate", "income_per_cap", "rent_price_4+room_bus","museum_visitis_per_100_cap","apartment_build"]

selected_macro=macro[selected_f]

In [9]:
raw_data=pd.merge(raw_data, selected_macro, how='left', on='timestamp')
print('shape of the merged data', raw_data.shape)

shape of the merged data (36697, 306)


In [10]:
data=pd.get_dummies(raw_data)
print('shape after get_dummies', data.shape)

#change timestamp to year
data['year']=data['timestamp'].dt.year.astype(int)
data.drop('timestamp', axis=1, inplace=True)

shape after get_dummies (36697, 466)


# Fill NaN with median values

In [11]:
#when using get_dummies the Nan in categorical data are ignored. The possible Nan are numbers. fill them with mean
print('number of NaN in train and test', data.isnull().sum().sum())

data=data.fillna(data.median())

print('number of NaN in train and test', data.isnull().sum().sum())

number of NaN in train and test 333386
number of NaN in train and test 0


In [12]:
#Add more features:

#relative floor
data['relative_floor']=data['floor'].div(data['max_floor'].astype(float))

#relative life_sq
data['relative_life_sq']=data['life_sq'].div(data['full_sq'].astype(float))

#ratio of number of pupils and preschool seats
data['ratio_preschool']=data['children_preschool'].div(data['preschool_quota'].astype(float))

#ratio of number of pupils and school seats
data['ratio_school']=data['children_school'].div(data['school_quota'].astype(float))

#young porpulation ratio
data['ratio_young']=data['young_all'].div(data['full_all'].astype(float))

# Try standardscaler before fitting

In [13]:
from sklearn.preprocessing import StandardScaler
col_names=list(data.columns)

ss=StandardScaler(with_mean=False, with_std=True)
data_std=ss.fit_transform(data)
data_std=pd.DataFrame(data_std, columns=col_names)

train_std=data_std.iloc[:n_train, :]
test_std=data_std.iloc[n_train :, :]

# Random Forest

In [14]:
from sklearn.ensemble import RandomForestRegressor

In [15]:
#train rf:

rf=RandomForestRegressor(n_estimators=800, n_jobs=-1, 
                         max_features='auto', max_depth=10, verbose=1)
rf.fit(train_std, target)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 14.7min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 26.9min
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed: 27.2min finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=800, n_jobs=-1, oob_score=False, random_state=None,
           verbose=1, warm_start=False)

In [57]:
rf_importance=rf.feature_importances_
rf_dict=dict()

for f, importance in zip (train_std.columns, rf_importance):
    rf_dict[f]=importance
    
rf_dict=sorted(rf_dict.items(), key=operator.itemgetter(1), reverse=True)
rf_dict=pd.DataFrame(rf_dict, columns=['feature', 'score'])
#rf_dict.to_csv(path_or_buf='submissions/0527/rf_feature_importance.csv')

importance_rate=0.01
rf_index=rf_dict[rf_dict.score>importance_rate*rf_dict.score.max()].feature.values
print('number of rf_index', len(rf_index))

number of rf_index 14


In [58]:
rf_new=RandomForestRegressor(n_estimators=800, n_jobs=-1,
                            max_features='auto', max_depth=10, verbose=1)
rf_new.fit(train_std[rf_index], target)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   14.2s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   40.4s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:  1.3min finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=800, n_jobs=-1, oob_score=False, random_state=None,
           verbose=1, warm_start=False)

In [59]:
in_rf=rf_new.predict(train_std[rf_index])
in_rf=np.exp(in_rf)-1
insample_rf=pd.DataFrame({'id': train.id, 'price_doc_rf': in_rf})
insample_rf.to_csv(path_or_buf='ensamble/rf_train_0.01importance.csv', index=False)

pre_rf=rf_new.predict(test_std[rf_index])
pre_rf=np.exp(pre_rf)-1

submission=pd.DataFrame({'id': test_id, 'price_doc': pre_rf})
submission.to_csv(path_or_buf='ensamble/170527subission_rf_with_12macro_cleandata_8newfeature.cs_0.01importv',index=False)
submission.head()

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    1.5s
[Parallel(n_jobs=4)]: Done 800 out of 800 | elapsed:    1.5s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 800 out of 800 | elapsed:    0.7s finished


Unnamed: 0,id,price_doc
0,30474,5101030.540339
1,30475,8362429.300093
2,30476,6274865.593472
3,30477,6374278.885423
4,30478,5145806.001251


# XGBOOST

In [60]:
import xgboost as xgb



In [61]:
from sklearn.model_selection import train_test_split
train_part, val_train, target_part, val_target=train_test_split(train_std, target, test_size=0.2, 
                                                              random_state=random_state)
print('shape of train_part', train_part.shape)
print('shape of target_part', target_part.shape)
print('shape of val_train', val_train.shape)
print('shape of val_target', val_target.shape)

shape of train_part (23228, 471)
shape of target_part (23228,)
shape of val_train (5807, 471)
shape of val_target (5807,)


In [73]:
cols=train_part.columns
len(cols)

471

In [87]:
dtrain=xgb.DMatrix(train_part, target_part)
dval=xgb.DMatrix(val_train, val_target)
dtest=xgb.DMatrix(test_std)

xgb_params = {
    'eta': 0.02,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 0,
    'lambda': 100,
    'base_score': 7
}

model=xgb.train(xgb_params, dtrain, num_boost_round=2000, 
                evals=[(dval, 'validation')], early_stopping_rounds=20,
               verbose_eval=100)
num_boost_round=model.best_iteration

[0]	validation-rmse:8.52292
Will train until validation-rmse hasn't improved in 20 rounds.
[100]	validation-rmse:1.22996
[200]	validation-rmse:0.388079
[300]	validation-rmse:0.332577
[400]	validation-rmse:0.326317
[500]	validation-rmse:0.323773
[600]	validation-rmse:0.32226
[700]	validation-rmse:0.32147
[800]	validation-rmse:0.321004
[900]	validation-rmse:0.320729
[1000]	validation-rmse:0.320453
Stopping. Best iteration:
[988]	validation-rmse:0.320446



In [65]:
score=model.get_fscore()
score=sorted(score.items(), key=operator.itemgetter(1), reverse=True)
score=pd.DataFrame(score, columns=['feature', 'fscore'])
score.to_csv(path_or_buf='submissions/scores/xgb_score.csv')

importance_rate=0.03
f_index=score[score.fscore>importance_rate*score.fscore.max()].feature.values
print('number of f_index', len(f_index))

number of f_index 133


In [88]:
train_select=pd.DataFrame(train_part)
train_select=train_select[f_index]

val_select=pd.DataFrame(val_train)
val_select=val_select[f_index]

dtrain_select=xgb.DMatrix(train_select, target_part)
dval_select=xgb.DMatrix(val_select, val_target)

xgb_params = {
    'eta': 0.02,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 0,
    'lambda': 100,
    'base_score': 7
}

model=xgb.train(xgb_params, dtrain_select, num_boost_round=2000, 
                evals=[(dval_select, 'validation')], early_stopping_rounds=20,
               verbose_eval=100)
num_boost_round=model.best_iteration

[0]	validation-rmse:8.52292
Will train until validation-rmse hasn't improved in 20 rounds.
[100]	validation-rmse:1.23039
[200]	validation-rmse:0.389826
[300]	validation-rmse:0.333597
[400]	validation-rmse:0.326868
[500]	validation-rmse:0.324236
[600]	validation-rmse:0.322624
[700]	validation-rmse:0.321829
[800]	validation-rmse:0.32117
[900]	validation-rmse:0.320711
Stopping. Best iteration:
[932]	validation-rmse:0.320608



In [90]:
#train xgb with full train data
xgb_params = {
    'eta': 0.02,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 0,
    'lambda': 100,
    'base_score': 7
}

Ddata_std=xgb.DMatrix(train_std[f_index], target)
#Ddata_std=xgb.DMatrix(train_std, target)
full_model=xgb.train(xgb_params, Ddata_std, num_boost_round=num_boost_round,
                    verbose_eval=100)
#xgb.cv(xgb_params, Ddata_std, num_boost_round=3000, 
#       nfold=5, verbose_eval=100, early_stopping_rounds=20)

In [119]:
in_xgb=full_model.predict(Ddata_std)
in_xgb=np.exp(in_xgb)-1
insample_xgb=pd.DataFrame({'id': train.id, 'price_doc_xgb': in_xgb})
insample_xgb.to_csv(path_or_buf='ensamble/xgb_train_0.03importance.csv', index=False)

dtest_std=xgb.DMatrix(test_std[f_index])
pre_xgb=full_model.predict(dtest_std)
pre_xgb=np.exp(pre_xgb)-1

submission=pd.DataFrame({'id': test_id, 'price_doc_xgb': pre_xgb})
submission.to_csv(path_or_buf='ensamble/170527subission_xgb_with_12macro_cleandata_8newfeature_0.03import.csv',index=False)
submission.head()


Unnamed: 0,id,price_doc_xgb
0,30474,5619100.0
1,30475,8844294.0
2,30476,5284690.0
3,30477,6106908.0
4,30478,5312507.5


In [118]:
pre_xgb=np.exp(pre_xgb)-1

submission=pd.DataFrame({'id': test_id, 'price_doc': pre_xgb})
submission.to_csv(path_or_buf='170525subission_xgb_with_12macro_cleandata_8newfeature_nosubsample_0.03impoartance.csv',index=False)
submission.head()

Unnamed: 0,id,price_doc
0,30474,5860943.5
1,30475,8614750.0
2,30476,4888695.5
3,30477,6142954.0
4,30478,5388001.5


# GBRT

In [20]:
from sklearn.ensemble import GradientBoostingRegressor

GBRT_clf=GradientBoostingRegressor(n_estimators=300,learning_rate=0.03,
                                   verbose=1, random_state=random_state)

#clf.fit(train_std[f_index], target)
#clf.fit(train_std, target)

GBRT_params={'loss': ['ls', 'lad'],
            'max_features': ['auto', 'sqrt'],
            'subsample': [0.6, 0.7, 0.8, 0.9],
            'max_leaf_nodes': [7,8,9,10]}

GBRT_cv=GridSearchCV(GBRT_clf, GBRT_params, cv=5, n_jobs=-1)
GBRT_cv.fit(train_std, target)

In [96]:
GBRT_clf=GradientBoostingRegressor(n_estimators=600,learning_rate=0.02,
                                   verbose=0, random_state=random_state,
                                  subsample=0.9, max_leaf_nodes=10)

GBRT_clf.fit(train_std, target)

# pre_GBRT=GBRT_clf.predict(test_std)
# pre_GBRT=np.exp(pre_GBRT)-1

# submission=pd.DataFrame({'id': test_id, 'price_doc': pre_GBRT})
# submission.to_csv(path_or_buf='170526submission_GBRT_with_12features_cleandata_macro.csv',index=False)
# submission.head()

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.02, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=10, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=600,
             presort='auto', random_state=7, subsample=0.9, verbose=0,
             warm_start=False)

In [116]:
GBRT_dict=dict()
for f, importance in zip(train_std.columns, GBRT_clf.feature_importances_):
    GBRT_dict[f]=importance

GBRT_dict=sorted(GBRT_dict.items(), key=operator.itemgetter(1), reverse=True)
GBRT_dict=pd.DataFrame(GBRT_dict, columns=['feature', 'GBRT_score'])
GBRT_dict.to_csv(path_or_buf='submissions/scores/GBRT_score.csv')

importance_rate=0.01
GBRT_index=GBRT_dict[xgb_dict.GBRT_score>importance_rate*GBRT_dict.GBRT_score.max()].feature.values
print('number of f_index', len(GBRT_index))

number of f_index 135


In [117]:
GBRT_fullmode=GradientBoostingRegressor(n_estimators=600,learning_rate=0.02,
                                   verbose=0, random_state=random_state,
                                  subsample=0.9, max_leaf_nodes=10)

#scores = cross_val_score(GBRT_cv, train_std[GBRT_index], target, cv=5)
#print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std()))
GBRT_fullmode.fit(train_std[GBRT_index], target)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.02, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=10, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=600,
             presort='auto', random_state=7, subsample=0.9, verbose=0,
             warm_start=False)

In [118]:
in_GBRT=GBRT_fullmode.predict(train_std[GBRT_index])
in_GBRT=np.exp(in_GBRT)-1
insample_GBRT=pd.DataFrame({'id': train.id, 'price_doc_GBRT': in_GBRT})
insample_GBRT.to_csv(path_or_buf='ensamble/GBRT_train_0.03importance.csv', index=False)


pre_GBRT=GBRT_fullmode.predict(test_std[GBRT_index])
pre_GBRT=np.exp(pre_GBRT)-1

submission=pd.DataFrame({'id': test_id, 'price_doc_GBRT': pre_GBRT})
submission.to_csv(path_or_buf='ensamble/170527submission_GBRT_with_12features_macro_0.03import.csv',index=False)
submission.head()

Unnamed: 0,id,price_doc_GBRT
0,30474,5633902.113439
1,30475,8287396.831267
2,30476,5465193.683202
3,30477,5576789.08992
4,30478,5103606.320849
