In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [None]:
# Load the raw competition files. The query tables (train, test) are read as
# latin-1; the product tables (descriptions, attributes) use the pandas default.
train = pd.read_csv(
    '../input/home-depot-product-search-relevance/train.csv.zip',
    encoding='latin-1',
)
test = pd.read_csv(
    '../input/home-depot-product-search-relevance/test.csv.zip',
    encoding='latin-1',
)
desc = pd.read_csv(
    '../input/home-depot-product-search-relevance/product_descriptions.csv.zip'
)
att = pd.read_csv(
    '../input/home-depot-product-search-relevance/attributes.csv.zip'
)

In [None]:
# Clean the attribute file and build one "name value" text snippet per row.
#
# Rows without a product_uid cannot be joined to any product, so they are
# dropped instead of being lumped together under a made-up id 0. Missing
# name/value cells become empty strings rather than the literal token "0",
# which would otherwise end up in the attribute text and count as a word
# match for any query containing "0".
att = (
    att.dropna(subset=['product_uid'])
       .fillna({'name': '', 'value': ''})
       # ids must be integers so they line up with product_uid in the other tables
       .astype({'product_uid': np.int64})
       # map(str) guards against non-string cells before concatenating
       .assign(string=lambda frame: frame['name'].map(str) + ' ' + frame['value'].map(str))
)
att

In [None]:
# Collapse the attribute rows into a single space-separated text blob per
# product: the result has one row per product_uid (as the index) and a single
# 'string' column.
att1 = att.groupby('product_uid')[['string']].agg(lambda parts: ' '.join(parts))
att1

In [None]:
# Assemble the raw training frame: attach each product's description and its
# combined attribute text. Left joins keep every training row, including
# products that have no attribute entry.
train = (
    train
    .merge(desc, how='left', on='product_uid')
    .merge(att1, how='left', on='product_uid')
)
train

In [None]:
# Feature engineering on the training frame.
# Products without attributes get an empty attribute string so it can be tokenised.
train['string'] = train['string'].fillna('')

# For each row, count the distinct whitespace-separated query words that also
# appear in the description, the attribute text and the product title.
train_query_words = [set(term.split()) for term in train['search_term']]
for overlap_col, text_col in (
    ('term_desc', 'product_description'),
    ('term_att', 'string'),
    ('term_prod', 'product_title'),
):
    train[overlap_col] = [
        len(words & set(text.split()))
        for words, text in zip(train_query_words, train[text_col])
    ]

# Combined description+attribute overlap, query length in words, and the
# overlap counts normalised by query length.
train['sum'] = train['term_desc'] + train['term_att']
train['q_length'] = train['search_term'].map(lambda term: len(term.split()))
train['ratio1'] = train['term_prod'] / train['q_length']
train['ratio2'] = train['sum'] / train['q_length']
train.sort_values('relevance')

In [None]:
# Build the test frame with exactly the same joins and features as the
# training frame, so the fitted model sees identically defined columns.
test = (
    test
    .merge(desc, how='left', on='product_uid')
    .merge(att1, how='left', on='product_uid')
)

# Products without attributes get an empty attribute string so it can be tokenised.
test['string'] = test['string'].fillna('')

# Distinct query words shared with the description, attribute text and title.
test_query_words = [set(term.split()) for term in test['search_term']]
for overlap_col, text_col in (
    ('term_desc', 'product_description'),
    ('term_att', 'string'),
    ('term_prod', 'product_title'),
):
    test[overlap_col] = [
        len(words & set(text.split()))
        for words, text in zip(test_query_words, test[text_col])
    ]

# Combined overlap, query length in words, and length-normalised ratios.
test['sum'] = test['term_desc'] + test['term_att']
test['q_length'] = test['search_term'].map(lambda term: len(term.split()))
test['ratio1'] = test['term_prod'] / test['q_length']
test['ratio2'] = test['sum'] / test['q_length']



In [None]:
# Keep only the modelling columns. Both frames share the same feature list, in
# the same order; the training frame additionally carries the target
# ('relevance') and the test frame the submission key ('id').
feature_cols = ['term_desc', 'term_att', 'sum', 'term_prod', 'q_length', 'ratio1', 'ratio2']
ready = train[['relevance'] + feature_cols]
test1 = test[['id'] + feature_cols]


In [None]:
# Hold out 30% of the labelled rows for validation, fit a Ridge regression on
# the remaining 70%, and report the validation RMSE.
X = ready.drop(columns='relevance')
y = ready[['relevance']]  # one-column DataFrame; flattened to 1-D for fitting below
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

rg = Ridge(alpha=.1)
rg.fit(X_train, y_train.values.ravel())
y_pred = rg.predict(X_test)
# scikit-learn metrics take the ground truth first: (y_true, y_pred).
rg_mse = mean_squared_error(y_test, y_pred)
rg_rmse = np.sqrt(rg_mse)
print('Ridge RMSE: %.4f' % rg_rmse)

In [None]:
# Predict relevance for the competition test set using the fitted Ridge model.
Xt = test1.drop(columns='id')
# A linear model can extrapolate beyond the label scale, so the predictions are
# clipped to the range of relevance values observed in the training data.
y_pred = np.clip(
    rg.predict(Xt),
    ready['relevance'].min(),
    ready['relevance'].max(),
)
y_pred

In [None]:
# Write the submission file: one row per test id with its predicted relevance,
# without the DataFrame index.
results = pd.DataFrame({'id': test1['id'], 'relevance': y_pred})
results.to_csv("home_depot.csv", index=False)