imports

In [2]:
import numpy as np
import pandas as pd
import re
import random
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import xgboost
from xgboost import plot_importance

In [2]:
train_df=pd.read_csv('train_new.csv')
train_df.head(5)

Unnamed: 0,id,product_uid,product_title,search_term,relevance,brand,product_description,rank
0,61851,116711,ge z wave 1800 watt resist cfl led indoor plug...,zwave switch,3.0,<UNK>,transform ani home into a smart home with the ...,1
1,122225,141628,leviton z wave control 3 way/remot scene capab...,zwave switch,3.0,leviton,the leviton dzmx1 is a z wave enabl univers di...,2
2,123081,142033,leviton decora z wave control 15 amp scene cap...,zwave switch,3.0,leviton,the leviton dzs15 is a z wave enabl univers sw...,3
3,109349,135547,ge z wave 600 watt cfl led indoor in wall dimm...,zwave switch,2.67,<UNK>,transform ani home into a smart home with the ...,4
4,143778,152640,z wave wireless light control with keypad control,zwave switch,2.67,<UNK>,transform ani home into a smart home with the ...,5


finding word counts and substrings to make features

In [3]:
def str_common_word(str1, str2):
    str1, str2 = str1.lower(), str2.lower()
    words, count = str1.split(), 0
    for word in words:
        if str2.find(word)>=0:
            count+=1
    return count
    
def str_whole_word(str1, str2, i_):
    str1, str2 = str1.lower().strip(), str2.lower().strip()
    count = 0
    while i_ < len(str2):
        i_ = str2.find(str1, i_)
        if i_ == -1:
            return count
        else:
            count += 1
            i_ += len(str1)
    return count

making features using counts, term frequency of search in title and description etc.

In [4]:
def features(df):
    df['word_len_of_search_term'] = df['search_term'].apply(lambda x:len(x.split())).astype(np.int64)
    df['word_len_of_title'] = df['product_title'].apply(lambda x:len(x.split())).astype(np.int64)
    df['word_len_of_description'] = df['product_description'].apply(lambda x:len(x.split())).astype(np.int64)
    df['word_len_of_brand'] = df['brand'].apply(lambda x:len(x.split())).astype(np.int64)
    
    # Create a new column that combine "search_term", "product_title" and "product_description"
    df['complete_product_desc'] = df['search_term']+"\t"+df['product_title'] +"\t"+df['product_description']
    
    # Number of times the entire search term appears in product title. 
    df['search_in_title'] = df['complete_product_desc'].map(lambda x:str_whole_word(x.split('\t')[0],x.split('\t')[1],0))
    
    # Number of times the entire search term appears in product description
    df['search_in_description'] = df['complete_product_desc'].map(lambda x:str_whole_word(x.split('\t')[0],x.split('\t')[2],0))
    
    # Number of words that appear in search term also appear in product title.
    df['word_in_title'] = df['complete_product_desc'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[1]))
    
    # Number of words that appear in search term also appear in production description.
    df['word_in_description'] = df['complete_product_desc'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[2]))
    
    # The ratio of product title word length to search term word length
    df['query_title_len_prop']=df['word_len_of_title']/df['word_len_of_search_term']
    
    # The ratio of product description word length to search term word length
    df['query_desc_len_prop']=df['word_len_of_description']/df['word_len_of_search_term']
    
    # The ratio of product title and search term common word count to search term word count
    df['ratio_title'] = df['word_in_title']/df['word_len_of_search_term']
    
    # The ratio of product description and search term common word count to search term word count.
    df['ratio_description'] = df['word_in_description']/df['word_len_of_search_term']
    
    # new column that combine "search_term", "brand" and "product_title".
    df['attr'] = df['search_term']+"\t"+df['brand']+"\t"+df['product_title']
    
    # Number of words that appear in search term also apprears in brand.
    df['word_in_brand'] = df['attr'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[1]))
    
    # The ratio of search term and brand common word count to brand word count
    #df['ratio_brand'] = df['word_in_brand']/df['word_len_of_brand']
    # because of <unk>
    
    #just keep features
    df.drop(['id', 'product_uid', 'product_title', 'search_term', 'product_description',\
             'brand', 'complete_product_desc', 'attr'], axis=1, inplace=True)
    
    return df

In [5]:
feature_df=train_df.copy()


In [6]:
feature_df= features(feature_df)
feature_df.columns

Index(['relevance', 'rank', 'word_len_of_search_term', 'word_len_of_title',
       'word_len_of_description', 'word_len_of_brand', 'search_in_title',
       'search_in_description', 'word_in_title', 'word_in_description',
       'query_title_len_prop', 'query_desc_len_prop', 'ratio_title',
       'ratio_description', 'word_in_brand'],
      dtype='object')

In [7]:
feature_df.head(5)

Unnamed: 0,relevance,rank,word_len_of_search_term,word_len_of_title,word_len_of_description,word_len_of_brand,search_in_title,search_in_description,word_in_title,word_in_description,query_title_len_prop,query_desc_len_prop,ratio_title,ratio_description,word_in_brand
0,3.0,1,2,15,237,1,0,0,1,0,7.5,118.5,0.5,0.0,0
1,3.0,2,2,14,251,1,0,0,0,0,7.0,125.5,0.0,0.0,0
2,3.0,3,2,12,287,1,0,0,1,1,6.0,143.5,0.5,0.5,0
3,2.67,4,2,16,282,1,0,0,1,1,8.0,141.0,0.5,0.5,0
4,2.67,5,2,8,274,1,0,0,0,1,4.0,137.0,0.0,0.5,0


- testing by running model for rank and relevance separately.
- Also, using rank as a feature to boost relevance prediction.

In [13]:
feature_relev=feature_df.copy()
feature_rank=feature_df.copy()
feature_relev=feature_relev.drop(['rank'],axis=1)
feature_rank=feature_rank.drop(['relevance'],axis=1)

In [14]:
#need to work on relevance and rank as one, so using rank as a feature for now(can remove later) to test
X = feature_df.loc[:, feature_df.columns != 'relevance'] 
y = feature_df.loc[:, feature_df.columns == 'relevance']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [15]:
X_train

Unnamed: 0,rank,word_len_of_search_term,word_len_of_title,word_len_of_description,word_len_of_brand,search_in_title,search_in_description,word_in_title,word_in_description,query_title_len_prop,query_desc_len_prop,ratio_title,ratio_description,word_in_brand
34505,7,3,15,74,1,0,0,2,2,5.000000,24.666667,0.666667,0.666667,0
66440,3,4,14,117,1,0,0,2,3,3.500000,29.250000,0.500000,0.750000,0
17901,3,3,7,53,2,0,0,2,3,2.333333,17.666667,0.666667,1.000000,0
46209,6,2,9,111,1,0,0,1,1,4.500000,55.500000,0.500000,0.500000,0
71028,2,5,14,152,1,0,0,5,2,2.800000,30.400000,1.000000,0.400000,0
45039,9,4,17,174,1,0,0,2,2,4.250000,43.500000,0.500000,0.500000,0
10771,5,3,7,140,2,0,0,2,2,2.333333,46.666667,0.666667,0.666667,0
25446,5,3,5,49,1,0,0,2,3,1.666667,16.333333,0.666667,1.000000,0
13294,4,4,10,68,1,0,0,1,1,2.500000,17.000000,0.250000,0.250000,0
790,11,2,10,177,2,0,0,1,1,5.000000,88.500000,0.500000,0.500000,0


***Gradient boosting on relevance***

In [16]:
est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0, loss='ls').fit(X_train, y_train.values.ravel())
y_pred = est.predict(X_test)
est_mse = mean_squared_error(y_pred, y_test)
est_rmse = np.sqrt(est_mse)
print('Gradient boosting RMSE: %.4f' % est_rmse)

Gradient boosting RMSE: 0.4036


***Gradient boosting by removing rank***

In [17]:
X_rel = feature_relev.loc[:, feature_relev.columns != 'relevance'] 
y_rel = feature_relev.loc[:, feature_relev.columns == 'relevance']
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_rel, y_rel, test_size=0.3, random_state=0)

est.fit(X_train_r, y_train_r.values.ravel())
y_pred_r = est.predict(X_test_r)
est_mse2 = mean_squared_error(y_pred_r, y_test_r)
est_rmse2 = np.sqrt(est_mse2)
print('Gradient boosting RMSE: %.4f' % est_rmse2)

Gradient boosting RMSE: 0.4848


***Gradient boosting (rank)***

In [18]:
X_rank = feature_rank.loc[:, feature_rank.columns != 'rank'] 
y_rank = feature_rank.loc[:, feature_rank.columns == 'rank']
X_train_rk, X_test_rk, y_train_rk, y_test_rk = train_test_split(X_rank, y_rank, test_size=0.3, random_state=0)

est.fit(X_train_rk, y_train_rk.values.ravel())
y_pred_rk = est.predict(X_test_rk)
est_mse_rank = mean_squared_error(y_pred_rk, y_test_rk)
est_rmse_rank = np.sqrt(est_mse_rank)
print('Gradient boosting RMSE: %.4f' % est_rmse_rank)

Gradient boosting RMSE: 2.8686


In [19]:
y_pred

array([2.78213166, 2.33392544, 2.43229991, ..., 2.42526146, 2.53173481,
       2.23408954])

In [20]:
xgb = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75, colsample_bytree=1, max_depth=7)
xgb.fit(X_train, y_train.values.ravel())
y_pred_x = xgb.predict(X_test)
xgb_mse = mean_squared_error(y_pred_x, y_test)
xgb_rmse = np.sqrt(xgb_mse)
print('Xgboost RMSE: %.4f' % xgb_rmse)


Xgboost RMSE: 0.3930


In [103]:
xgb.save_model('xgb_1.model')

***test data***

In [89]:
test_df=pd.read_csv('data/test_new.csv')
test_df.columns

Index(['Unnamed: 0', '_unit_id', 'relevance', 'relevance:variance',
       'product_image', 'product_link', 'product_price', 'product_title',
       'query', 'rank', 'source', 'url', 'product_description', 'brand'],
      dtype='object')

In [90]:
predicted_df=test_df.copy()

In [91]:
predicted_df=predicted_df[['product_title','query','product_description','brand']]

In [78]:
test_df=test_df[['Unnamed: 0', '_unit_id','relevance','product_title','query','rank','product_description','brand']]
test_df.rename(columns={'Unnamed: 0':'id', '_unit_id':'product_uid','query':'search_term'}, inplace=True)
test_df['product_description'] = test_df['product_description'].replace(np.nan, '', regex=True)
test_df['relevance'] = test_df['relevance'].replace(np.nan, 0, regex=True)
test_df.head(5)

Unnamed: 0,id,product_uid,relevance,product_title,search_term,rank,product_description,brand
0,0,711158459,3.67,Sony PlayStation 4 (PS4) (Latest Model)- 500 G...,playstation 4,1,The PlayStation 4 system opens the door to an ...,Sony
1,1,711158460,4.0,Sony PlayStation 4 (Latest Model)- 500 GB Jet ...,playstation 4,2,The PlayStation 4 system opens the door to an ...,Sony
2,2,711158461,4.0,Sony PlayStation 4 PS4 500 GB Jet Black Console,playstation 4,3,The PlayStation 4 system opens the door to an ...,Sony
3,3,711158462,3.67,Sony - PlayStation 4 500GB The Last of Us Rema...,playstation 4,4,,Sony
4,4,711158463,3.33,Sony PlayStation 4 (PS4) (Latest Model)- 500 G...,playstation 4,5,The PlayStation 4 system opens the door to an ...,Sony


In [79]:
test_features=features(test_df)
test_features.columns

Index(['relevance', 'rank', 'word_len_of_search_term', 'word_len_of_title',
       'word_len_of_description', 'word_len_of_brand', 'search_in_title',
       'search_in_description', 'word_in_title', 'word_in_description',
       'query_title_len_prop', 'query_desc_len_prop', 'ratio_title',
       'ratio_description', 'word_in_brand'],
      dtype='object')

In [80]:
test_X = test_features.loc[:, test_features.columns != 'relevance'] 
test_y = test_features.loc[:, test_features.columns == 'relevance']


In [81]:
test_pred = xgb.predict(test_X)
test_xgb_mse = mean_squared_error(test_pred, test_y)
test_xgb_rmse = np.sqrt(xgb_mse)
print('Xgboost RMSE: %.4f' % test_xgb_rmse)

Xgboost RMSE: 0.3930


In [93]:
predicted_df['predicted_relevance']=test_pred
predicted_df.head(5)

Unnamed: 0,product_title,query,product_description,brand,predicted_relevance
0,Sony PlayStation 4 (PS4) (Latest Model)- 500 G...,playstation 4,The PlayStation 4 system opens the door to an ...,Sony,2.932604
1,Sony PlayStation 4 (Latest Model)- 500 GB Jet ...,playstation 4,The PlayStation 4 system opens the door to an ...,Sony,2.889153
2,Sony PlayStation 4 PS4 500 GB Jet Black Console,playstation 4,The PlayStation 4 system opens the door to an ...,Sony,2.814829
3,Sony - PlayStation 4 500GB The Last of Us Rema...,playstation 4,,Sony,2.36468
4,Sony PlayStation 4 (PS4) (Latest Model)- 500 G...,playstation 4,The PlayStation 4 system opens the door to an ...,Sony,2.695257


In [96]:
predicted_df=predicted_df.sort_values('predicted_relevance', ascending=False).drop_duplicates(['query','predicted_relevance'])

In [97]:
predicted_df

Unnamed: 0,product_title,query,product_description,brand,predicted_relevance
12366,Better Homes and Gardens Metal Bicycle Table C...,table clock,The Better Homes and Gardens Metal Bicycle Tab...,<UNK>,3.076689
5562,VIGO Stainless-Steel Pull-Out Spray Single-Han...,kitchen faucet,This Vigo kitchen faucet showcases a sleek sta...,VIGO,3.061520
9405,Barbie Princess Power Super Sparkle Doll,barbie,"details\nIn Barbie in Princess Power movie, a ...",<UNK>,3.050361
6279,Pyrex Smart Essentials 8-piece Bowl Set,pyrex,This eight-piece set of clear glass bowls from...,Pyrex,3.042910
2654,MLB Pittsburgh Pirates Snuggle Bear,pittsburgh pirates,This Pittsburgh Pirates Snuggle Bear features ...,<UNK>,3.031968
1707,Speck CandyShell Grip Case for iPhone 5 & 5s P...,speck iphone 5 case,,<UNK>,3.030064
10234,Cone Shape Aroma Diffuser,aroma diffuser,This new design aroma diffuser has water rated...,<UNK>,3.029851
4839,Gray Earth Men's Belted Cargo Shorts,cargo shorts,These men's belted cargo shorts by Gray Earth ...,<UNK>,3.027064
6880,Pittsburgh Pirates Water Bottle - Black (26 oz.),pittsburgh pirates,Quench your thirst and support your favorite b...,<UNK>,3.022171
12028,Nikon COOLPIX L29 16MP Digital Camera with Mem...,digital camera,Bundle and save. Get more and spend less when ...,<UNK>,3.018920


In [98]:
xgb.save_model('xgb_train.model')

In [105]:
xgb_est=xgboost.Booster({'nthread': 4})
xgb_est.load_model('/Users/shray/Desktop/nlp/nlpclass-1197-g-nlpcorps/project/xgb_train.model')



After loading the model with XGBoost or sklearn you have to convert the data to DMatrix
in order to predict using the loaded model !

In [109]:
dtest = xgboost.DMatrix(test_X)

pred_4=xgb_est.predict(dtest)
dtest

<xgboost.core.DMatrix at 0x1a2784e7f0>

Since there is no way to directly convert Dmatrix to numpy array/list etc. use pickle

In [110]:
import pickle
pickle.dump(xgb, open("pima.pickle.dat", "wb"))

In [111]:
# load model from file
loaded_model = pickle.load(open("pima.pickle.dat", "rb"))
# make predictions for test data
pred_5 = loaded_model.predict(test_X)
pred_5



array([2.9326043, 2.889153 , 2.8148293, ..., 1.775797 , 1.7656626,
       1.9540547], dtype=float32)