# Kaggle:Home Depot Product Search Relevance

## step1 导入数据

In [1]:
import numpy as np
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.model_selection import KFold, cross_val_predict

In [2]:
# The train and test CSVs are decoded as ISO-8859-1; the product description
# file is read with pandas' default encoding.
df_train, df_test = (
    pd.read_csv(csv_path, encoding='ISO-8859-1')
    for csv_path in ('train.csv', 'test.csv')
)
df_desc = pd.read_csv('product_descriptions.csv')

In [3]:
# Stack the training rows on top of the test rows under a fresh 0..n-1 index,
# then attach each product's description with a left join on product_uid.
df_all = (
    pd.concat((df_train, df_test), axis=0, ignore_index=True)
    .merge(df_desc, how='left', on='product_uid')
)

## step2 文本预处理

对原始文本信息进行三点预处理

（1）将单词小写并分为tokens

（2）去除停止词和数字

（3）使用SnowballStemmer 提取词干

In [4]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
import re
def hasNumbers(inputString):
    """Return True when ``inputString`` contains at least one digit character."""
    digit_match = re.search(r'\d', inputString)
    return digit_match is not None
def check(word):
    """Tell whether ``word`` should be kept as a token.

    A word is rejected when it appears in the module-level ``stop`` list or
    when ``hasNumbers`` finds a digit in it; every other word is kept.
    """
    return not (word in stop or hasNumbers(word))

stemmer = SnowballStemmer('english')


def preprocess_text(text):
    """Normalise one raw text field into a space-joined string of stems.

    Steps: lowercase, strip single and double quote characters, split on
    whitespace, drop tokens rejected by ``check`` (stop words and tokens
    containing digits), stem the survivors with ``stemmer``.

    A string is returned rather than a token list because the later cells
    pass these columns to ``Levenshtein.ratio``, concatenate them with '.',
    and re-tokenise them.

    Non-string input (e.g. a missing value) yields an empty string.
    """
    if not isinstance(text, str):
        return ''
    tokens = text.lower().replace('"', '').replace("'", '').split()
    return ' '.join(stemmer.stem(token) for token in tokens if check(token))


# The previous version computed the lowercase/split result without assigning
# it, and then passed a generator of lambdas to ``.map`` (which fails because
# ``x`` is unbound). Apply the whole pipeline per cell value instead.
for text_column in ('search_term', 'product_title', 'product_description'):
    df_all[text_column] = df_all[text_column].map(preprocess_text)

## step3 文本特征

提取三项特征

（1）文本距离

（2）基于TF-IDF算法的文本余弦相似度

（3）基于Word2Vec的向量余弦相似度

（1）新建两列'dist_in_title'和'dist_in_desc'

是search_term与product_title和product_description的Levenshtein相似度（由Levenshtein.ratio计算，值越大表示两段文本越相似）

In [5]:
import Levenshtein
# Row-wise Levenshtein.ratio between the search term and, respectively, the
# product title and the product description.
# NOTE(review): despite the `dist_` prefix, `Levenshtein.ratio` appears to be a
# similarity score rather than a distance — confirm against the library docs.
df_all['dist_in_title'] = [
    Levenshtein.ratio(query, title)
    for query, title in zip(df_all['search_term'], df_all['product_title'])
]
df_all['dist_in_desc'] = [
    Levenshtein.ratio(query, description)
    for query, description in zip(df_all['search_term'], df_all['product_description'])
]

（2）利用TF-IDF计算文本余弦相似度

新建两列'tfidf_cos_sim_in_title'和'tfidf_cos_sim_in_desc'

为'search_term'与'product_title'和'product_description'的余弦相似度

In [6]:
# One text blob per row: "<title>.<description>." — used below as the corpus
# for the dictionary, the TF-IDF model and Word2Vec.
df_all['all_texts'] = (
    df_all['product_title']
    .add('.')
    .add(df_all['product_description'])
    .add('.')
)

In [8]:
from gensim.utils import tokenize
from gensim.corpora.dictionary import Dictionary
# Token -> id vocabulary built from every row of df_all['all_texts'].
dictionary = Dictionary(
    list(tokenize(text, errors='ignore')) for text in df_all['all_texts'].values
)


class MyCorpus(object):
    """Iterable that yields one bag-of-words vector per row of ``df_all['all_texts']``.

    Each iteration re-tokenises the text and converts it with the module-level
    ``dictionary``, so the full corpus is never held in memory as a list.
    """

    def __iter__(self):
        for text in df_all['all_texts'].values:
            tokens = list(tokenize(text, errors='ignore'))
            yield dictionary.doc2bow(tokens)


corpus = MyCorpus()

In [9]:
from gensim.models.tfidfmodel import TfidfModel
# Fit the TF-IDF model on the streamed bag-of-words corpus defined above.
tfidf = TfidfModel(corpus)

In [10]:
from gensim.similarities import MatrixSimilarity
def to_tfidf(text):
    """Return the TF-IDF vector of ``text``.

    The text is tokenised with gensim's ``tokenize``, converted to a
    bag-of-words with the module-level ``dictionary``, and weighted by the
    module-level ``tfidf`` model.
    """
    bow = dictionary.doc2bow(list(tokenize(text, errors='ignore')))
    return tfidf[bow]
def cos_sim(text1, text2):
    """Return the TF-IDF similarity between ``text1`` and ``text2`` as a float.

    A one-document ``MatrixSimilarity`` index is built around ``text1`` and
    queried with ``text2``; the single resulting score is returned.
    """
    indexed_vec = to_tfidf(text1)
    query_vec = to_tfidf(text2)
    single_doc_index = MatrixSimilarity([indexed_vec], num_features=len(dictionary))
    scores = single_doc_index[query_vec]
    return float(scores[0])

In [11]:
# Row-wise TF-IDF similarity of the search term against the product title and
# against the product description.
df_all['tfidf_cos_sim_in_title'] = [
    cos_sim(query, title)
    for query, title in zip(df_all['search_term'], df_all['product_title'])
]
df_all['tfidf_cos_sim_in_desc'] = [
    cos_sim(query, description)
    for query, description in zip(df_all['search_term'], df_all['product_description'])
]

（3）利用Word2Vec计算单词向量的余弦相似度

新建两列w2v_cos_sim_in_title和w2v_cos_sim_in_desc

为search_term与product_title和product_description各自所有单词向量的均值之间的余弦相似度

In [12]:
import nltk
# Split every row of df_all['all_texts'] into sentences with NLTK's English
# Punkt model and collect them in one flat list (the Word2Vec training units).
# The former stand-alone `tokenizer.tokenize(...values[0])` call was a leftover
# debugging statement whose result was discarded, so it has been removed.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentences = [
    sentence
    for text in df_all['all_texts'].values
    for sentence in tokenizer.tokenize(text)
]

In [13]:
from nltk.tokenize import word_tokenize
# Word-tokenise each sentence; the result is a list of token lists.
w2v_corpus = list(map(word_tokenize, sentences))

In [14]:
from gensim.models.word2vec import Word2Vec
# Train Word2Vec on the tokenised sentences with 128-dimensional vectors.
# NOTE(review): `size=` (here) and `model.wv.vocab` (next cell) look like the
# pre-4.0 gensim API — confirm the gensim version this notebook is pinned to.
# NOTE(review): no seed is passed, so the embeddings (and the features derived
# from them) are presumably not reproducible between runs — confirm.
model = Word2Vec(w2v_corpus, size=128, window=5, min_count=5, workers=4)

In [15]:
vocab = model.wv.vocab


def get_vector(text):
    """Return the mean Word2Vec vector of the in-vocabulary tokens of ``text``.

    ``text`` is split with ``word_tokenize``; tokens missing from ``vocab``
    are ignored. When no token is in the vocabulary a zero vector is returned
    (previously this divided 0 by 0 and produced an all-NaN vector).

    Returns:
        numpy.ndarray of length ``model.vector_size``.
    """
    total = np.zeros(model.vector_size)
    count = 0
    for word in word_tokenize(text):
        if word in vocab:
            # Look vectors up through `model.wv`, consistent with how `vocab`
            # is obtained above.
            total += model.wv[word]
            count += 1
    if count == 0:
        return total
    return total / count

In [16]:
from scipy import spatial
def w2v_cos_sim(text1, text2):
    """Return the cosine similarity of the mean word vectors of two texts.

    Both texts are embedded with ``get_vector``; the similarity is
    ``1 - scipy.spatial.distance.cosine``.

    Returns 0.0 when the similarity cannot be computed: either an error is
    raised while embedding/comparing, or the result is not finite (which
    happens when one of the vectors is all-zero or contains NaN). This keeps
    NaN out of the feature matrix.
    """
    try:
        w2v1 = get_vector(text1)
        w2v2 = get_vector(text2)
        sim = 1 - spatial.distance.cosine(w2v1, w2v2)
    except Exception:
        # Best-effort fallback, as before — but unlike a bare `except:` this
        # no longer swallows KeyboardInterrupt, so a long-running `apply`
        # can be interrupted from the notebook.
        return 0.0
    if not np.isfinite(sim):
        return 0.0
    return float(sim)


In [17]:
# Row-wise Word2Vec similarity of the search term against the product title
# and against the product description.
df_all['w2v_cos_sim_in_title'] = [
    w2v_cos_sim(query, title)
    for query, title in zip(df_all['search_term'], df_all['product_title'])
]
df_all['w2v_cos_sim_in_desc'] = [
    w2v_cos_sim(query, description)
    for query, description in zip(df_all['search_term'], df_all['product_description'])
]

  if __name__ == '__main__':


In [19]:
# The raw text columns have been turned into numeric features; remove them so
# only numeric columns remain.
df_all = df_all.drop(
    ['search_term', 'product_title', 'product_description', 'all_texts'],
    axis='columns',
)

In [33]:
# Preview the first five rows of the feature table.
df_all.head()

Unnamed: 0,id,product_uid,relevance,dist_in_title,dist_in_desc,tfidf_cos_sim_in_title,tfidf_cos_sim_in_desc,w2v_cos_sim_in_title,w2v_cos_sim_in_desc
0,2,100001,3.0,0.190476,0.030418,0.274539,0.182836,0.465428,0.459099
1,3,100001,2.5,0.153846,0.022901,0.0,0.0,0.337834,0.147382
2,9,100002,3.0,0.175,0.017875,0.0,0.053455,0.347624,0.468898
3,16,100005,2.33,0.326087,0.048632,0.133577,0.043712,0.558218,0.489915
4,17,100005,2.67,0.382979,0.054545,0.39732,0.098485,0.727232,0.489721


## step4 分割训练、测试集

In [20]:
# df_all was built by concatenating train then test with ignore_index=True, so
# the first len(df_train) rows are the training rows and the rest are test
# rows. The split must therefore be positional: the previous
# `df_all.loc[df_test.index]` used the test frame's own 0-based index and so
# selected rows from the *start* of df_all (training rows) instead of the
# test rows.
n_train = len(df_train)
df_train = df_all.iloc[:n_train]
df_test = df_all.iloc[n_train:]
test_ids = df_test['id']
y_train = df_train['relevance'].values
# 'id' is an identifier and 'relevance' is the target; neither is a feature.
X_train = df_train.drop(['id', 'relevance'], axis=1).values
X_test = df_test.drop(['id', 'relevance'], axis=1).values

In [28]:
# Persist the feature matrices and the target as plain-text files.
np.savetxt(fname='X_train_V1.out', X=X_train)
np.savetxt(fname='y_train_V1.out', X=y_train)
np.savetxt(fname='X_test_V1.out', X=X_test)

## step5 建立模型

选用RandomForestRegressor，XGBRegressor，Ridge三种模型，

超参数使用GridSearch方法调整

In [29]:
from sklearn.ensemble import RandomForestRegressor  
from xgboost import XGBRegressor 
from sklearn.linear_model import Ridge
# Level-0 models of the stack: a random forest, a gradient-boosted tree model
# and a ridge regression.
# NOTE(review): `silent`, `nthread`, `seed` and objective='reg:linear' look
# like parameter spellings from older xgboost releases — confirm against the
# installed version.
clfs = [
    RandomForestRegressor(n_estimators=15, max_depth=6),
    XGBRegressor(
        max_depth=10,
        learning_rate=0.3,
        n_estimators=150,
        silent=True,
        objective='reg:linear',
        nthread=-1,
        gamma=0,
        eval_metric='rmse',
        max_delta_step=0,
        subsample=0.85,
        colsample_bytree=0.7,
        colsample_bylevel=1,
        reg_alpha=0,
        reg_lambda=1,
        scale_pos_weight=1,
        seed=0,
        missing=None,
    ),
    Ridge(alpha=12),
]

## step6 模型融合并输出结果

采用stacking方式模型融合

In [30]:
# Level-0 of the stack: one meta-feature column per base model.
#
# The training meta-features must be *out-of-fold* predictions. Previously
# each model was fitted on X_train and then asked to predict X_train itself,
# so the level-1 model was trained on in-sample (leaked) predictions that are
# far more accurate than anything it will see for the test set.
# The same fixed folds are used for every base model.
stack_cv = KFold(n_splits=5, shuffle=True, random_state=0)
dataset_stack_train = np.zeros((X_train.shape[0], len(clfs)))
dataset_stack_test = np.zeros((X_test.shape[0], len(clfs)))
for j, clf in enumerate(clfs):
    # cross_val_predict fits clones of `clf` on the other folds, so every
    # training row is predicted by a model that never saw it.
    stack_y_train = cross_val_predict(clf, X_train, y_train, cv=stack_cv)
    # For the test set, refit the model itself on all training rows.
    clf.fit(X_train, y_train)
    stack_y_submission = clf.predict(X_test)
    dataset_stack_train[:, j] = stack_y_train
    dataset_stack_test[:, j] = stack_y_submission

In [31]:
# Level-1 model: a random forest trained on the base models' predictions,
# then used to predict the test set and write the submission file.
clf = RandomForestRegressor(n_estimators=30, max_depth=6).fit(dataset_stack_train, y_train)
predict = clf.predict(dataset_stack_test)
pd.DataFrame(
    {"id": test_ids, "relevance": predict}
).to_csv('submission.csv', index=False)