In [None]:
#script by Rob Treichler...basic python script 

import warnings; warnings.filterwarnings("ignore");

#create working directory
import os#Miscellaneous operating system interfaces
# Working directory that holds the competition CSV files. The HOME_DEPOT_DIR
# environment variable overrides the machine-specific default below, so the
# notebook can run on another machine without editing this cell. The default
# is a raw string, so single backslashes are the correct path separators.
os.chdir(os.environ.get('HOME_DEPOT_DIR', r'C:\Users\rtreichl\Documents\competitions\home_depot'))

#%matplotlib inline allows graphics to show below each cell (or graphics in line)
# for some reason, %matplotlib inline won't work if comments are made in the same cell

In [None]:
%matplotlib inline

In [None]:
import pandas as pd #munging and wrangling
import numpy as np  #for arrays, etc.
import matplotlib.pyplot as plt #graphs/plots

#preprocessing
from sklearn import preprocessing

#Model Selection -> Metrics
from sklearn.metrics import mean_squared_error
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV

#xgboost
import xgboost as xgb #xgboost
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor

from nltk.stem.snowball import SnowballStemmer

# Shared English Snowball stemmer instance; str_stemmer calls its .stem() per word.
stemmer = SnowballStemmer('english')

In [None]:
#load the four competition CSVs into dataframes for EDA (every file is decoded as cp437)
train, test, product_descriptions, attributes = (
    pd.read_csv(csv_name, encoding='cp437')
    for csv_name in ('train.csv', 'test.csv', 'product_descriptions.csv', 'attributes.csv')
)

In [None]:
#quick view of train: display the first rows of the training dataframe
train.head()

In [None]:
#shape of train as a (rows, columns) tuple
train.shape

In [None]:
#quick view of test: display the first rows of the test dataframe
test.head()

In [None]:
#shape of test as a (rows, columns) tuple
test.shape

In [None]:
#quick view of product descriptions: display the first rows
product_descriptions.head()

In [None]:
#quick view of attributes: display the first rows
attributes.head()

In [None]:
#shape of attributes as a (rows, columns) tuple, before rows with nulls are dropped
attributes.shape

In [None]:
#discard every attribute row that contains at least one null, then report the new shape
attributes = attributes.dropna(axis=0, how='any')
print(attributes.shape)

In [None]:
#row count of train; used later to split the combined frame back into train and test
num_train = train.shape[0]

In [None]:
#stem function
def str_stemmer(s):
    """Lower-case `s`, split it on whitespace, stem every token with the
    module-level `stemmer`, and return the stems joined by single spaces."""
    stemmed_tokens = map(stemmer.stem, s.lower().split())
    return " ".join(stemmed_tokens)

In [None]:
#count words
def str_common_word(str1, str2):
    """Count the whitespace-separated words of `str1` that occur in `str2`.

    Matching is by substring, so a word counts even when it only appears
    inside a longer word of `str2`; repeated words in `str1` are counted
    once per occurrence.
    """
    hits = 0
    for token in str1.split():
        if token in str2:
            hits += 1
    return hits

In [None]:
#stack train on top of test into one frame with a fresh 0..n-1 index
df_all = pd.concat([train, test], ignore_index=True)

In [None]:
#attach each product's description; rows without a matching product_uid are kept (left join)
df_all = df_all.merge(product_descriptions, on='product_uid', how='left')

In [None]:
#name_list=list(attributes.groupby('name').size().index)
#for i in name_list:
#    df_temp = attributes[attributes.name == i][["product_uid", "value"]]
#    df_all = pd.merge(df_all, df_temp, how='left', on='product_uid')
#    df_all[i] = df_all[i].replace(np.nan,'0', regex=True)
#    print(len(df_all.columns))
#    del df_temp

In [None]:
#add the brand name
# Keep a single brand row per product so the left merge cannot multiply rows
# of df_all: the later train/test split relies on row order and row count.
df_brand = (
    attributes.loc[attributes['name'] == "MFG Brand Name", ["product_uid", "value"]]
    .rename(columns={"value": "brand"})
    .drop_duplicates(subset="product_uid")
)
df_all = pd.merge(df_all, df_brand, how='left', on='product_uid')
#where a brand name is absent, set string value of 0
df_all['brand'] = df_all['brand'].fillna('0')

In [None]:
#keep only the non-brand attribute rows and report the remaining shape
attributes = attributes.loc[attributes['name'] != "MFG Brand Name"]
print(attributes.shape)

In [None]:
#create string of all other attributes
#df_attributes=attributes.groupby(['product_uid'])['value'].apply(lambda x: str(x)).reset_index()
#df_all = pd.merge(df_all, df_attributes, how='left', on='product_uid')
#df_all['value'] = df_all['value'].replace(np.nan,'0', regex=True)

In [None]:
#search term -> lower case and stem (overwrites the column in place)
df_all['search_term'] = df_all['search_term'].map(str_stemmer)

In [None]:
#product title -> lower case and stem (overwrites the column in place)
df_all['product_title'] = df_all['product_title'].map(str_stemmer)

In [None]:
#product description -> lower case and stem (overwrites the column in place)
df_all['product_description'] = df_all['product_description'].map(str_stemmer)

In [None]:
#attribute values -> lower case and stem
#df_all['value'] = df_all['value'].map(lambda x:str_stemmer(x))

In [None]:
#tab-joined "query, brand, title, description" string consumed by the common-word features
df_all['product_info'] = (
    df_all['search_term'] + "\t" + df_all['brand'] + "\t"
    + df_all['product_title'] + "\t" + df_all['product_description']
)  #+"\t"+df_all['value']

In [None]:
#perform a word count....identify stuff like spaces, etc.

#stop words?

In [None]:
#number of whitespace-separated words in the search term, title, description and brand
df_all['len_of_query'] = df_all['search_term'].str.split().str.len().astype(np.int64)
df_all['len_of_title'] = df_all['product_title'].str.split().str.len().astype(np.int64)
df_all['len_of_description'] = df_all['product_description'].str.split().str.len().astype(np.int64)
df_all['len_of_brand'] = df_all['brand'].str.split().str.len().astype(np.int64)


In [None]:
#number of search-term words that occur (as substrings) in the brand name
# Compare the source columns directly instead of re-splitting the tab-joined
# product_info string twice per row; this also stays correct if a field ever
# contains a tab character.
df_all['word_in_brand'] = [
    str_common_word(query, brand)
    for query, brand in zip(df_all['search_term'], df_all['brand'])
]

In [None]:
#number of search-term words that occur (as substrings) in the product title
# Compare the source columns directly instead of re-splitting the tab-joined
# product_info string twice per row; this also stays correct if a field ever
# contains a tab character.
df_all['word_in_title'] = [
    str_common_word(query, title)
    for query, title in zip(df_all['search_term'], df_all['product_title'])
]

In [None]:
#number of search-term words that occur (as substrings) in the product description
# Compare the source columns directly instead of re-splitting the tab-joined
# product_info string (which embeds the whole description) twice per row;
# this also stays correct if a field ever contains a tab character.
df_all['word_in_description'] = [
    str_common_word(query, description)
    for query, description in zip(df_all['search_term'], df_all['product_description'])
]

In [None]:
#max number of words in attribute value matching search term
#df_all['word_in_value'] = df_all['product_info'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[4]))

In [None]:
#additional variables: matched-word counts divided by word counts
df_all['ratio_title'] = df_all['word_in_title'].div(df_all['len_of_query'])
df_all['ratio_description'] = df_all['word_in_description'].div(df_all['len_of_query'])
df_all['ratio_brand'] = df_all['word_in_brand'].div(df_all['len_of_brand'])
#df_all['ratio_value'] = df_all['word_in_value']/df_all['len_of_brand']

In [None]:
#def getNgrams(input, n):
#  input = input.split(' ')
#  output = []
#  for i in range(len(input)-n+1):
#    output.append(input[i:i+n])
#  return output

In [None]:
#content = str(df_all['search_term'])
#ngrams = getNgrams(content, 1)
#print(ngrams)

In [None]:
#drop the text columns now that the numeric features have been derived from them
df_all = df_all.drop(
    labels=['search_term', 'product_title', 'product_description', 'product_info', 'brand',
            # 'value'
            ],
    axis=1,
)

In [None]:
#recover train and test from df_all by position: the first num_train rows are train
df_train, df_test = df_all.iloc[:num_train], df_all.iloc[num_train:]
#ids of the test rows, written to the submission files
id_test = df_test.loc[:, 'id']

In [None]:
#target vector, then feature matrices with the identifier and target columns removed
y_train = df_train.loc[:, 'relevance'].values
x_train = df_train.drop(labels=['id', 'relevance', 'product_uid'], axis=1).values
x_test = df_test.drop(labels=['id', 'relevance', 'product_uid'], axis=1).values

In [None]:
#hold out 30% of the training rows for validation; the fixed seed makes the split repeatable
x_traincv, x_testcv, y_traincv, y_testcv = cross_validation.train_test_split(
    x_train, y_train,
    test_size=0.30,
    random_state=0,
)

In [None]:
#As a starting point, we will look for a potentially optimal set of parameters to train and test with

#function to grid-search random forest hyper-parameters
def rf_params(xt,yt):
    """Grid-search RandomForestRegressor hyper-parameters with 5-fold CV.

    Parameters
    ----------
    xt : array-like
        Feature matrix the search is fitted on.
    yt : array-like
        Target values matching the rows of `xt`.

    Returns
    -------
    dict
        The best parameter combination found (also printed).
    """
    # min_samples_split starts at 2: a node holding a single sample cannot be
    # split, and newer scikit-learn releases reject an integer value of 1.
    param_grid = {
        "max_depth": list(range(1, 49)),
        "min_samples_split": list(range(2, 31)),
        "min_samples_leaf": [1, 2],
    }

    grid = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid,
                        scoring="mean_squared_error", n_jobs=-1, cv=5)
    grid.fit(xt, yt)

    print(grid.best_params_)
    return grid.best_params_

# NOTE(review): the search runs on the 30% hold-out split (x_testcv/y_testcv)
# produced by train_test_split, not on x_traincv/y_traincv — confirm this is intended.
rf_params(xt=x_testcv,yt=y_testcv)

In [None]:
#random forest base learner; its settings match the first parameter set noted below
#{'max_depth': 7, 'min_samples_split': 29, 'min_samples_leaf': 2}
#{'min_samples_leaf': 1, 'min_samples_split': 22, 'max_depth': 8}
rf = RandomForestRegressor(
    n_estimators=1000,
    max_depth=7,
    min_samples_split=29,
    min_samples_leaf=2,
    n_jobs=-1,
)

#earlier configurations, kept for reference
#rfr = RandomForestRegressor(n_estimators = 500, n_jobs = -1, random_state = 2016, verbose = 1)
#rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
#clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)

#bag the forest (n_estimators=45 is disabled, so the library default applies),
#fit on the full training set and predict the test rows
clf = BaggingRegressor(rf, max_samples=0.1, random_state=25)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

#write the submission file: one row per test id with its predicted relevance
pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('submission_value4.csv', index=False)

In [None]:
#identify appropriate number of boost rounds

def xg_paraml(label,x_train,y_train,x_test,y_test,boost_rounds=range(600,800,10)):
    """Train one XGBoost regressor per boosting-round count and print train/test RMSE.

    Parameters
    ----------
    label : str
        Prefix printed in front of every result line.
    x_train, y_train
        Features and targets the boosters are fitted on.
    x_test, y_test
        Held-out features and targets used for the second RMSE figure.
    boost_rounds : iterable of int, optional
        Values of ``num_boost_round`` to try; defaults to 600, 610, ..., 790.

    Returns
    -------
    None
        Results are only printed, one line per round count.
    """
    # The DMatrix objects and the parameter dict do not depend on the round
    # count, so they are built once instead of on every iteration.
    xgb_train_data = xgb.DMatrix(x_train, np.array(y_train))
    xgb_test_data = xgb.DMatrix(x_test)
    params = {'objective':'reg:linear','eval_metric' : 'rmse','eta': 0.01 ,
              'subsample':1.0, 'nthread' : -1,
              #'max_depth':6, 'colsample_bytree':0.3
             }

    for bb in boost_rounds:
        # Every candidate is trained from scratch for exactly bb rounds.
        xgb_estimator = xgb.train(params, xgb_train_data, num_boost_round=bb)
        pred = xgb_estimator.predict(xgb_train_data)
        pred_test = xgb_estimator.predict(xgb_test_data)
        rmse_tr = mean_squared_error(y_train, pred)**.5
        rmse_te = mean_squared_error(y_test, pred_test)**.5
        print(label+ str(rmse_tr) + '   '+
              str(rmse_te) +
              ' boost rounds '+str(bb))

# Run the boosting-round search on the cross-validation split.
xg_paraml(
    label=' parameter ',
    x_train=x_traincv,
    y_train=y_traincv,
    x_test=x_testcv,
    y_test=y_testcv,
)

In [None]:
#identify optimal parameters

def xg_paraml(label,x_train,y_train,x_test,y_test):
    """Train XGBoost regressors over a max_depth / colsample_bytree grid and print train/test RMSE.

    This definition reuses the name of the boosting-round search defined
    earlier in the notebook and replaces it once this cell has run.

    Parameters
    ----------
    label : str
        Prefix printed in front of every result line.
    x_train, y_train
        Features and targets the boosters are fitted on.
    x_test, y_test
        Held-out features and targets used for the second RMSE figure.

    Returns
    -------
    None
        Results are only printed, one line per parameter combination.
    """
    cb_list=[1.0]

    # The DMatrix objects are identical for every combination; build them once.
    xgb_train_data = xgb.DMatrix(x_train, np.array(y_train))
    xgb_test_data = xgb.DMatrix(x_test)

    for md in range(4,10,1):
        for cb in cb_list:
            # 'nthread' (not 'nthreads') is the key used by the other
            # xgboost parameter dicts in this notebook.
            params={'objective':'reg:linear','eval_metric' : 'rmse','eta': 0.01 ,
                    'subsample':1.0, 'max_depth':md, 'colsample_bytree':cb,'nthread':-1}
            xgb_estimator = xgb.train(params, xgb_train_data, num_boost_round= 650)
            pred = xgb_estimator.predict(xgb_train_data)
            pred_test = xgb_estimator.predict(xgb_test_data)
            rmse_tr = mean_squared_error(y_train, pred)**.5
            rmse_te = mean_squared_error(y_test, pred_test)**.5
            print(label+ str(rmse_tr) + '   '+str(rmse_te) +#' subsample '+str(ss)+
                  ' max_depth '+
                  str(md)+' colsample '+str(cb))


# Run the parameter search on the cross-validation split.
xg_paraml(
    label=' parameter ',
    x_train=x_traincv,
    y_train=y_traincv,
    x_test=x_testcv,
    y_test=y_testcv,
)


In [None]:
#final model with submission file
#def xg_sub(label,x_train,y_train,x_test,parameters):
    #xgb parameters
    
#xgboost settings for the final model
params = {
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'eta': 0.01,
    'max_depth': 6,
    'colsample_bytree': 1.0,
    'subsample': 1.0,
    'nthread': -1,
}

#fit on the full training set
xgb_train_data = xgb.DMatrix(x_train, label=y_train)
xgb_test_data = xgb.DMatrix(x_test)

xgb_estimator = xgb.train(params, xgb_train_data, num_boost_round=650)

#in-sample predictions wrapped in a dataframe; test predictions stay a raw array
train_pred = pd.DataFrame(xgb_estimator.predict(xgb_train_data))
test_pred = xgb_estimator.predict(xgb_test_data)

#write the submission file: one row per test id with its predicted relevance
pd.DataFrame({"id": id_test, "relevance": test_pred}).to_csv('submission_xg_value4.csv', index=False)
