## <font color='green'>Setup 0</font>: Envrionment Setup & Import Data

In [357]:
import warnings
warnings.filterwarnings('ignore')
import time
import os
import numpy as np
import pandas as pd
import pickle
pd.options.display.max_columns = None
pd.options.display.mpl_style = 'default'

from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor,AdaBoostRegressor,GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso,ElasticNet,LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn import svm
from sklearn.metrics import mean_squared_error, make_scorer,mean_absolute_error
from  collections import Counter

import matplotlib.pyplot as plt
%matplotlib inline

In [358]:
PROCESSINGTEXT_DIR =  '../home_depot/data'
df_train = pd.read_csv(PROCESSINGTEXT_DIR + '/merged_train_df.csv', encoding="ISO-8859-1")
df_test = pd.read_csv(PROCESSINGTEXT_DIR + '/merged_test_df.csv', encoding="ISO-8859-1")

## <font color='green'>Setup 1</font>: Define Train/Test Sets

In [360]:
df_test = df_test.drop('relevance',axis=1)
df_score = pd.read_csv('../IRDM/data/solution.csv', encoding="ISO-8859-1")
df_test = pd.merge(df_test, df_score, on='id')
df_test = df_test[df_test['Usage'] != 'Ignored']

In [473]:
test_count= Counter(df_test['search_term_id'])
df_test['query_freq'] = df_test['search_term_id'].map(lambda x:test_count[x])

In [474]:
df_test_pair_wise = df_test[df_test['query_freq']>4]

In [363]:
id_test_pair_wise = df_test_pair_wise['id']
search_term_id_test_pair_wise = df_test_pair_wise['search_term_id']

y_test_pair_wise = df_test_pair_wise['relevance'].values

X_test_pair_wise = df_test_pair_wise.drop(['id','relevance','Usage','query_freq'],axis=1).values

In [364]:
id_test = df_test['id']
search_term_id_test = df_test['search_term_id']
id_train = df_train['id']
search_term_id_train = df_train['search_term_id']
y_train = df_train['relevance'].values
y_test = df_test['relevance'].values

X_train = df_train.drop(['id','relevance','query_bullets_material_matching','query_decription_material_matching','query_brand_matching'],axis=1).values
X_test = df_test.drop(['id','relevance','Usage','query_freq','query_bullets_material_matching','query_decription_material_matching','query_brand_matching'],axis=1).values

## <font color='green'>Setup 2</font>: Construct the Model

### sklearn.linear_model

In [365]:
clf_lr = LinearRegression(fit_intercept=True, normalize=True, copy_X=True, n_jobs=1)
t = time.time()
clf_lr.fit(X_train,y_train)
print('model training time:',round((time.time()-t)/60,3) ,'minutes\n')
t = time.time()
y_pred_lr = clf_lr.predict(X_test)
y_pred_pair_wise_lr = clf_lr.predict(X_test_pair_wise)
print('predicting time:',round((time.time()-t)/60,3) ,'minutes\n')
# public score:  0.467252897931   private score:  0.466536583211   average score:  0.466751771442

model training time: 0.004 minutes

predicting time: 0.001 minutes



In [369]:
clf_lasso = Lasso(alpha=0.00001)
t = time.time()
clf_lasso.fit(X_train, y_train)
print('model training time:',round((time.time()-t)/60,3) ,'minutes\n')
t = time.time()
y_pred_lasso = clf_lasso.predict(X_test)
y_pred_pair_wise_lasso = clf_lasso.predict(X_test_pair_wise)
print('predicting time:',round((time.time()-t)/60,3) ,'minutes\n')
# public score:  0.467129328129   private score:  0.46644225209   average score:  0.466648652226

model training time: 0.08 minutes

predicting time: 0.001 minutes



In [370]:
clf_ridge = Ridge(alpha=0.5)
t = time.time()
clf_ridge.fit(X_train, y_train)
print('model training time:',round((time.time()-t)/60,3) ,'minutes\n')
t = time.time()
y_pred_rige = clf_ridge.predict(X_test)
y_pred_pair_wise_rige = clf_ridge.predict(X_test_pair_wise)
print('predicting time:',round((time.time()-t)/60,3) ,'minutes\n')
# public score:  0.467147977456   private score:  0.466460599048   average score:  0.466667090059

model training time: 0.002 minutes

predicting time: 0.001 minutes



In [371]:
clf_EN = ElasticNet(alpha=0.00001, l1_ratio=0.7)
t = time.time()
clf_EN.fit(X_train, y_train)
print('model training time:',round((time.time()-t)/60,3) ,'minutes\n')
t = time.time()
y_pred_EN = clf_EN.predict(X_test)
y_pred_pair_wise_EN = clf_EN.predict(X_test_pair_wise)
print('predicting time:',round((time.time()-t)/60,3) ,'minutes\n')
# public score:  0.467132273987   private score:  0.466447202556   average score:  0.46665300019

model training time: 0.086 minutes

predicting time: 0.001 minutes



### sklearn.neural_network

### sklearn.ensemble

In [372]:
mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-6.3.0-posix-seh-rt_v5-rev1\\mingw64\\bin'
os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']
import xgboost as xgb

In [373]:
clf_AB = pickle.load(open('clf_AB.sav','rb'))
clf_rf = pickle.load(open('clf_rf.sav','rb'))
clf_ET = pickle.load(open('clf_ET.sav','rb'))
clf = pickle.load(open('clf.sav','rb'))
clf_xgb = pickle.load(open('clf_xgb.sav','rb'))
clf_ABbagging = pickle.load(open('clf_ABbagging.sav','rb'))
clf_rfbagging = pickle.load(open('clf_rfbagging.sav','rb'))
clf_ETbagging = pickle.load(open('clf_ETbagging.sav','rb'))
clf_xgbbagging = pickle.load(open('clf_xgbbagging.sav','rb'))
clf_svr = pickle.load(open('clf_svr.sav','rb'))

In [374]:
clf_GB = pickle.load(open('clf_GB.sav','rb'))
clf_GBbagging = pickle.load(open('clf_GBbagging.sav','rb'))

In [375]:
t = time.time()
y_pred_AB = clf_AB.predict(X_test)
y_pred_pair_wise_AB = clf_AB.predict(X_test_pair_wise)
print('predicting time:',round((time.time()-t)/60,3) ,'minutes\n')
# public score:  0.47822394677   private score:  0.477928687053   average score:  0.478017357634

t = time.time()
y_pred_ABbagging = clf_ABbagging.predict(X_test)
y_pred_pair_wise_ABbagging = clf_ABbagging.predict(X_test_pair_wise)
print('predicting time:',round((time.time()-t)/60,3) ,'minutes\n')
# public score:  0.482595622317   private score:  0.482664808561   average score:  0.482644036505

predicting time: 0.038 minutes

predicting time: 1.87 minutes



In [376]:
t = time.time()
y_pred_rf = clf_rf.predict(X_test)
y_pred_pair_wise_rf = clf_rf.predict(X_test_pair_wise)
print('predicting time:',round((time.time()-t)/60,3) ,'minutes\n')
# public score:  0.459624923702   private score:  0.45949920384   average score:  0.459536954711

t = time.time()
y_pred_rfbagging = clf_rfbagging.predict(X_test)
y_pred_pair_wise_rfbagging = clf_rfbagging.predict(X_test_pair_wise)
print('predicting time:',round((time.time()-t)/60,3) ,'minutes\n')
# public score:  0.458506149542   private score:  0.458063795194   average score:  0.458196656488

predicting time: 0.085 minutes

predicting time: 3.13 minutes



In [377]:
t = time.time()
y_pred_ET = clf_ET.predict(X_test)
y_pred_pair_wise_ET = clf_ET.predict(X_test_pair_wise)
print('predicting time:',round((time.time()-t)/60,3) ,'minutes\n')
# public score:  0.458415690893   private score:  0.458139769673   average score:  0.458222632186

t = time.time()
y_pred_ETbagging = clf_ETbagging.predict(X_test)
y_pred_pair_wise_ETbagging = clf_ETbagging.predict(X_test_pair_wise)
print('predicting time:',round((time.time()-t)/60,3) ,'minutes\n')
# public score:  0.457843408244   private score:  0.457141823561   average score:  0.45735258669

predicting time: 0.29 minutes

predicting time: 10.282 minutes



In [None]:
import pickle
clf_GBbagging2 = pickle.load(open('clf_GBbagging2.sav','rb'))
X_test2 = df_test.drop(['id','relevance','Usage','query_bullets_material_matching','query_decription_material_matching'],axis=1).values
y_pred_GBbagging2 = clf_GBbagging2.predict(X_test2)

In [380]:
t = time.time()
y_pred_GB = clf_GB.predict(X_test)
y_pred_pair_wise_GB = clf_GB.predict(X_test_pair_wise)
print('predicting time:',round((time.time()-t)/60,3) ,'minutes\n')
# public score:  0.456267609619   private score:  0.4551328414   average score:  0.455473851368

# t = time.time()
# y_pred_GBbagging = clf_GBbagging.predict(X_test)
# y_pred_pair_wise_GBbagging = clf_GBbagging.predict(X_test_pair_wise)
# print('predicting time:',round((time.time()-t)/60,3) ,'minutes\n')
# # public score:  0.455671294414   private score:  0.454882006571   average score:  0.455119133218

predicting time: 0.082 minutes

predicting time: 3.217 minutes



In [18]:
from sklearn.model_selection import train_test_split
X_train_new, X_dev_new, y_train_new, y_dev_new = train_test_split(X_train, y_train, test_size=0.2, random_state=2017)

t_1 = time.time()
train = xgb.DMatrix(X_train_new, y_train_new)
val = xgb.DMatrix(X_dev_new, y_dev_new)
watchlist = [(val, 'eval')]
params = {
    'booster': 'gbtree',
    'eta': 0.08,
    'colsample_bytree': 0.85,
    'max_depth': 4,
    'seed': 2017,
    'n_estimators': 150,
    'silent': 0,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'verbose':1
}
clf = xgb.train(params, train, 500, watchlist, early_stopping_rounds=30,)
print('model training time:',round((time.time()-t_1)/60,3) ,'minutes\n')

t_1 = time.time()
test = xgb.DMatrix(X_test)
y_pred = clf.predict(test)
print('predicting time:',round((time.time()-t_1)/60,3) ,'minutes\n')
# public score:  0.457315110884   private score:  0.455979172162   average score:  0.456380697182

[0]	eval-rmse:1.80999
Will train until eval-rmse hasn't improved in 30 rounds.
[1]	eval-rmse:1.67561
[2]	eval-rmse:1.55267
[3]	eval-rmse:1.44034
[4]	eval-rmse:1.33776
[5]	eval-rmse:1.2442
[6]	eval-rmse:1.15913
[7]	eval-rmse:1.08177
[8]	eval-rmse:1.01167
[9]	eval-rmse:0.948214
[10]	eval-rmse:0.89086
[11]	eval-rmse:0.839272
[12]	eval-rmse:0.792874
[13]	eval-rmse:0.751282
[14]	eval-rmse:0.714165
[15]	eval-rmse:0.681208
[16]	eval-rmse:0.651888
[17]	eval-rmse:0.625956
[18]	eval-rmse:0.603132
[19]	eval-rmse:0.58307
[20]	eval-rmse:0.565428
[21]	eval-rmse:0.550046
[22]	eval-rmse:0.536632
[23]	eval-rmse:0.525042
[24]	eval-rmse:0.514978
[25]	eval-rmse:0.506236
[26]	eval-rmse:0.498624
[27]	eval-rmse:0.492033
[28]	eval-rmse:0.486397
[29]	eval-rmse:0.481466
[30]	eval-rmse:0.47728
[31]	eval-rmse:0.473747
[32]	eval-rmse:0.47065
[33]	eval-rmse:0.468013
[34]	eval-rmse:0.465639
[35]	eval-rmse:0.463748
[36]	eval-rmse:0.462106
[37]	eval-rmse:0.46066
[38]	eval-rmse:0.459385
[39]	eval-rmse:0.45831
[40]	eval

[332]	eval-rmse:0.44708
[333]	eval-rmse:0.44709
[334]	eval-rmse:0.447087
[335]	eval-rmse:0.447084
[336]	eval-rmse:0.447089
[337]	eval-rmse:0.447068
[338]	eval-rmse:0.447074
[339]	eval-rmse:0.447084
[340]	eval-rmse:0.447074
[341]	eval-rmse:0.447084
[342]	eval-rmse:0.447086
[343]	eval-rmse:0.447074
[344]	eval-rmse:0.447066
[345]	eval-rmse:0.44707
[346]	eval-rmse:0.447072
[347]	eval-rmse:0.447061
[348]	eval-rmse:0.44704
[349]	eval-rmse:0.447062
[350]	eval-rmse:0.447057
[351]	eval-rmse:0.447052
[352]	eval-rmse:0.447019
[353]	eval-rmse:0.447005
[354]	eval-rmse:0.447023
[355]	eval-rmse:0.447025
[356]	eval-rmse:0.447011
[357]	eval-rmse:0.446974
[358]	eval-rmse:0.446932
[359]	eval-rmse:0.446929
[360]	eval-rmse:0.446924
[361]	eval-rmse:0.446939
[362]	eval-rmse:0.446949
[363]	eval-rmse:0.446938
[364]	eval-rmse:0.446938
[365]	eval-rmse:0.446937
[366]	eval-rmse:0.446936
[367]	eval-rmse:0.446948
[368]	eval-rmse:0.446915
[369]	eval-rmse:0.446917
[370]	eval-rmse:0.446922
[371]	eval-rmse:0.446917
[372

In [381]:
t = time.time()
y_pred_xgb = clf_xgb.predict(X_test)
y_pred_pair_wise_xgb = clf_xgb.predict(X_test_pair_wise)
print('predicting time:',round((time.time()-t)/60,3) ,'minutes\n')
# public score:  0.457000348812   private score:  0.455574277633   average score:  0.456002922519

t = time.time()
y_pred_xgbbagging = clf_xgbbagging.predict(X_test)
y_pred_pair_wise_xgbbagging = clf_xgbbagging.predict(X_test_pair_wise)
print('predicting time:',round((time.time()-t)/60,3) ,'minutes\n')
# public score:  0.45676901447   private score:  0.455857536779   average score:  0.456131398342

predicting time: 0.013 minutes

predicting time: 0.566 minutes



### sklearn.svm

In [382]:
clf_lsvr = svm.LinearSVR(C = 0.5)
t = time.time()
clf_lsvr.fit(X_train, y_train)
print('model training time:',round((time.time()-t)/60,3) ,'minutes\n')
t = time.time()
y_pred_lsvr = clf_lsvr.predict(X_test)
y_pred_pair_wise_lsvr = clf_lsvr.predict(X_test_pair_wise)
print('predicting time:',round((time.time()-t)/60,3) ,'minutes\n')
# public score:  0.880135243323   private score:  0.879463548456   average score:  0.87966527802

model training time: 0.599 minutes

predicting time: 0.001 minutes



In [383]:
t = time.time()
y_pred_svr = clf_svr.predict(X_test)
y_pred_pair_wise_svr = clf_svr.predict(X_test_pair_wise)
print('predicting time:',round((time.time()-t)/60,3) ,'minutes\n')
# public score:  0.540211492078   private score:  0.539543775977   average score:  0.539744343815

predicting time: 34.237 minutes



In [456]:
pred_test=(y_pred_GBbagging2+y_pred_ET+y_pred_xgb)/3
# public score:  0.455038807126   private score:  0.454203897013   average score:  0.454454739037
# public score:  0.455123742485   private score:  0.45415826103   average score:  0.454448361307
# public score:  0.454984401645   private score:  0.454098886705   average score:  0.454364942936
# public score:  0.454978499222   private score:  0.454089902646   average score:  0.454356885403

## <font color='green'>Setup 3</font>: Evaluation

In [465]:
p_pred = pd.DataFrame({"id": nn_pred.id, "pred_relevance": pred_test})

total = pd.merge(p_pred, df_test, on='id')

public_score = total[total.Usage == 'Public']
private_score = total[total.Usage == 'Private']

### RMSE

In [466]:
def RMSE(ground_truth, predictions):
    fmean_squared_error_ = mean_squared_error(ground_truth, predictions)**0.5
    return fmean_squared_error_
print('public score: ',RMSE(list(public_score.pred_relevance), list(public_score.relevance)),\
      '  private score: ',RMSE(list(private_score.pred_relevance), list(private_score.relevance)),\
      '  average score: ',RMSE(list(total.pred_relevance),list(total.relevance)))

public score:  0.54115246183   private score:  0.541086472367   average score:  0.541106211185


In [467]:
k=2
bad_total = total[total.relevance <k]
bad_public_score = public_score[public_score.relevance <k]
bad_private_score = private_score[private_score.relevance <k]
good_total = total[total.relevance >=k]
good_public_score = public_score[public_score.relevance >=k]
good_private_score = private_score[private_score.relevance >=k]

In [468]:
print('<',k,' public score: ',RMSE(list(bad_public_score.pred_relevance), list(bad_public_score.relevance)),\
      ' <',k,' private score: ',RMSE(list(bad_private_score.pred_relevance), list(bad_private_score.relevance)),\
      ' <',k,' average score: ',RMSE(list(bad_total.pred_relevance),list(bad_total.relevance)))
print('>=',k, 'public score: ',RMSE(list(good_public_score.pred_relevance), list(good_public_score.relevance)),\
      '  >=',k, 'private score: ',RMSE(list(good_private_score.pred_relevance), list(good_private_score.relevance)),\
      '  >=',k,' average score: ',RMSE(list(good_total.pred_relevance),list(good_total.relevance)))

< 2  public score:  0.833083021596  < 2  private score:  0.830851321374  < 2  average score:  0.831527761715
>= 2 public score:  0.453768450562   >= 2 private score:  0.456564318096   >= 2  average score:  0.455732063848


### MAP@K

In [469]:
total_test = total.sort_values(['search_term_id','relevance'], ascending=[True, False])
total_pred = total.sort_values(['search_term_id','pred_relevance'], ascending=[True, False])

In [470]:
import ml_metrics as metrics #pip install ml_metrics

k1,k2 = 3,10

Y_p1 = total_test.groupby(by='search_term_id', sort=False).id.apply( lambda x: x.values[:k1].tolist()).values.tolist()
Y_p2 = total_test.groupby(by='search_term_id', sort=False).id.apply( lambda x: x.values[:k2].tolist()).values.tolist()
P_p = total_pred.groupby(by='search_term_id', sort=False).id.apply( lambda x: x.values.tolist()).values.tolist()

score1 = metrics.mapk(Y_p1, P_p,k1)
score2 = metrics.mapk(Y_p2, P_p,k2)

print("MAP@3: %.4f" % score1,"  MAP@10: %.4f" % score2)

MAP@3: 0.7063   MAP@10: 0.9706


### NDCG

In [471]:
df_test_pair_wise = df_test_pair_wise.sort_values('search_term_id')
test_group = [value for (key, value) in sorted(Counter(df_test_pair_wise['search_term_id']).items())]

In [472]:
from evaluation import *

eval_lst=eval_construct(y_pred_pair_wise_ABbagging,y_test_pair_wise,test_group)
eval_lst_random=eval_construct_random(y_test_pair_wise,test_group)
mean_NDCG(eval_lst),mean_NDCG(eval_lst_random)

(0.9519695523214009, 0.90002571539926024)

# Stacking(not used)