In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pickle
import zipfile
from datetime import datetime
from wordcloud import WordCloud, STOPWORDS
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost
from scipy.sparse import hstack
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge,Lasso,ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SAPEKSHA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
train_data = zipfile.ZipFile('G:/Applied_AI/case_study_1/train.csv.zip')
train_data = pd.read_csv(train_data.open('train.csv'),encoding = "ISO-8859-1")
print('train_data',train_data.shape)
attribute_data = zipfile.ZipFile('G:/Applied_AI/case_study_1/attributes.csv.zip')
attribute_data = pd.read_csv(attribute_data.open('attributes.csv'),encoding = "ISO-8859-1")
print('Attribute_data',attribute_data.shape)
description_data = zipfile.ZipFile('G:/Applied_AI/case_study_1/product_descriptions.csv.zip')
description_data = pd.read_csv(description_data.open('product_descriptions.csv'),encoding = "ISO-8859-1")
print('description_data',description_data.shape)
test_data = zipfile.ZipFile('G:/Applied_AI/case_study_1/test.csv.zip')
test_data = pd.read_csv(test_data.open('test.csv'),encoding = "ISO-8859-1")
print('test_data',test_data.shape)

train_data (74067, 5)
Attribute_data (2044803, 3)
description_data (124428, 2)
test_data (166693, 4)


In [3]:
def merge_attributes(df):
    attr = attribute_data.copy()
    product_uid = df['product_uid'].values
    
    temp = attr.loc[attr['product_uid'].isin(product_uid)] 
    temp['combine_feature'] = temp['name'] + ' ' + temp['value']
    
    brands = temp[temp['name']=='MFG Brand Name']
    brands['brand'] = brands['value']
    brands.drop(['name','value','combine_feature'],axis=1,inplace=True)

    temp= temp.merge(brands,on='product_uid',how='left')
    temp['combine_feature_'] = temp.groupby('product_uid')['combine_feature'].transform(lambda x :''.join(str(x)))
    temp = temp.drop_duplicates(subset=['product_uid'])
    df = df.merge(temp,on='product_uid',how='left').set_index(df.index)
    df.drop(['name','value','combine_feature'],axis=1,inplace=True)
    return df



def merge_description(df):
    descrip = description_data.copy()
    product_uid = df['product_uid'].values
    temp = descrip.loc[descrip['product_uid'].isin(product_uid)]
    df = df.merge(temp,on='product_uid',how='left').set_index(df.index)
    return df


def extract_n_words(n,text):
    if n>len(text.split()):
        return 'invalid'
    return ' '.join(text.split()[:n])

def fill_brand(df):
    null_brand_values = df[df['brand'].isna()]
    unique_brands = df['brand'].unique()

    for i,j in null_brand_values.iterrows():
        title=j['product_title']
        if extract_n_words(6,title) in unique_brands:
            null_brand_values['brand'].loc[i] = extract_n_words(6, title)
        elif extract_n_words(5,title) in unique_brands:
            null_brand_values['brand'].loc[i] = extract_n_words(5, title)
        elif extract_n_words(4,title) in unique_brands:
            null_brand_values['brand'].loc[i] = extract_n_words(4, title)
        elif extract_n_words(3,title) in unique_brands:
            null_brand_values['brand'].loc[i] = extract_n_words(3, title)
        elif extract_n_words(2,title) in unique_brands:
            null_brand_values['brand'].loc[i] = extract_n_words(2, title)
        else:
            null_brand_values['brand'].loc[i] = extract_n_words(1, title)
            
    df['brand'].loc[null_brand_values.index]=null_brand_values['brand'].values
    return df

def fill_attributes(df):
    null_df = df[df['combine_feature_'].isna()]
    null_df['combine_feature_'] = null_df['product_description'].copy()
    df['combine_feature_'].loc[null_df.index] = null_df['combine_feature_'].values
    return df



#Reference : https://towardsdatascience.com/modeling-product-search-relevance-in-e-commerce-home-depot-case-study-8ccb56fbc5ab

def standardize_units(text):
    text = " " + text + " "
    text = re.sub('( gal | gals | galon )',' gallon ',text)
    text = re.sub('( ft | fts | feets | foot | foots )',' feet ',text)
    text = re.sub('( squares | sq )',' square ',text)
    text = re.sub('( lb | lbs | pounds )',' pound ',text)
    text = re.sub('( oz | ozs | ounces | ounc )',' ounce ',text)
    text = re.sub('( yds | yd | yards )',' yard ',text)
    return text

def preprocessing(text):
    
    text = text.replace('in.','inch')  # Replace in. with inch
    text = re.sub('[^A-Za-z0-9.]+',' ',text) # remove special characters except '.'
    text = re.sub(r"(?<!\d)[.,;:](?!\d)",'',text,0) # https://stackoverflow.com/questions/43142710/remove-all-punctuation-from-string-except-if-its-between-digits
    text = re.sub("[A-Za-z]+", lambda ele: " " + ele[0] + " ", text)
    text = standardize_units(text)
    text = text.lower()
    text = ' '.join(text.split())
    return text

stop_words = stopwords.words('english')
ps = PorterStemmer()

def stopwords_stemming(text):
    words = text.split()
    words = [w for w in words if w not in stop_words] # Stopwords
    words = [ps.stem(word) for word in words] # stemming
    return ' '.join(words)

def stemming_search(text):
    words = text.split()
    words = [ps.stem(word) for word in words] # stemming
    return ' '.join(words)

In [4]:
train_set = train_data.copy()
test_set = test_data.copy()

In [5]:
train_set = merge_attributes(train_set)
train_set= merge_description(train_set)
print(train_set.shape)
train_set.head()

(74067, 8)


Unnamed: 0,id,product_uid,product_title,search_term,relevance,brand,combine_feature_,product_description
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0,Simpson Strong-Tie,0 Bullet01 Versatile connector for various...,"Not only do angles make joints stronger, they ..."
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5,Simpson Strong-Tie,0 Bullet01 Versatile connector for various...,"Not only do angles make joints stronger, they ..."
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0,BEHR Premium Textured DeckOver,"15 Application Method Brush,Rol...",BEHR Premium Textured DECKOVER is an innovativ...
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33,Delta,50 Bath Faucet Type Combo Tub a...,Update your bathroom with the Delta Vero Singl...
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67,Delta,50 Bath Faucet Type Combo Tub a...,Update your bathroom with the Delta Vero Singl...


In [6]:
test_set = merge_attributes(test_set)
test_set= merge_description(test_set)
print(test_set.shape)
test_set.head()

(166693, 7)


Unnamed: 0,id,product_uid,product_title,search_term,brand,combine_feature_,product_description
0,1,100001,Simpson Strong-Tie 12-Gauge Angle,90 degree bracket,Simpson Strong-Tie,0 Bullet01 Versatile connector for various...,"Not only do angles make joints stronger, they ..."
1,4,100001,Simpson Strong-Tie 12-Gauge Angle,metal l brackets,Simpson Strong-Tie,0 Bullet01 Versatile connector for various...,"Not only do angles make joints stronger, they ..."
2,5,100001,Simpson Strong-Tie 12-Gauge Angle,simpson sku able,Simpson Strong-Tie,0 Bullet01 Versatile connector for various...,"Not only do angles make joints stronger, they ..."
3,6,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong ties,Simpson Strong-Tie,0 Bullet01 Versatile connector for various...,"Not only do angles make joints stronger, they ..."
4,7,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong tie hcc668,Simpson Strong-Tie,0 Bullet01 Versatile connector for various...,"Not only do angles make joints stronger, they ..."


In [7]:
merge_data= pd.concat([train_set,test_set],axis=0).reset_index()

In [8]:
print(merge_data.shape)
merge_data.head()

(240760, 9)


Unnamed: 0,index,id,product_uid,product_title,search_term,relevance,brand,combine_feature_,product_description
0,0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0,Simpson Strong-Tie,0 Bullet01 Versatile connector for various...,"Not only do angles make joints stronger, they ..."
1,1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5,Simpson Strong-Tie,0 Bullet01 Versatile connector for various...,"Not only do angles make joints stronger, they ..."
2,2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0,BEHR Premium Textured DeckOver,"15 Application Method Brush,Rol...",BEHR Premium Textured DECKOVER is an innovativ...
3,3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33,Delta,50 Bath Faucet Type Combo Tub a...,Update your bathroom with the Delta Vero Singl...
4,4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67,Delta,50 Bath Faucet Type Combo Tub a...,Update your bathroom with the Delta Vero Singl...


In [9]:
merge_data = fill_brand(merge_data)
merge_data = fill_attributes(merge_data)

In [10]:
merge_data

Unnamed: 0,index,id,product_uid,product_title,search_term,relevance,brand,combine_feature_,product_description
0,0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.00,Simpson Strong-Tie,0 Bullet01 Versatile connector for various...,"Not only do angles make joints stronger, they ..."
1,1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.50,Simpson Strong-Tie,0 Bullet01 Versatile connector for various...,"Not only do angles make joints stronger, they ..."
2,2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.00,BEHR Premium Textured DeckOver,"15 Application Method Brush,Rol...",BEHR Premium Textured DECKOVER is an innovativ...
3,3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33,Delta,50 Bath Faucet Type Combo Tub a...,Update your bathroom with the Delta Vero Singl...
4,4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67,Delta,50 Bath Faucet Type Combo Tub a...,Update your bathroom with the Delta Vero Singl...
...,...,...,...,...,...,...,...,...,...
240755,166688,240756,224424,stufurhome Norma 24 in. W x 16 in. D x 34 in. ...,24 whtie storage cabinet,,stufurhome,1719301 Assemb...,Create a neat yet stylish storage space for or...
240756,166689,240757,224425,Home Decorators Collection 49 in. D Alessandro...,adirondeck cusion,,Home Decorators Collection,1719319 Assembled Dep...,Our Bullnose Adirondack Chair Cushions fit Adi...
240757,166690,240758,224426,Simpson Strong-Tie HB 3-1/2 x 14 in. Top Flang...,hb,,Simpson Strong-Tie,Joist hangers are designed to provide support ...,Joist hangers are designed to provide support ...
240758,166691,240759,224427,1/4 in. -20 tpi x 1-1/2 in. Stainless Steel Bu...,hex sockets,,1/4,These socket cap screws are ideal for applicat...,These socket cap screws are ideal for applicat...


In [11]:
data_1 = merge_data.copy()

In [12]:
data_1 = data_1.sort_values('relevance', ascending=False).drop_duplicates('product_uid').sort_index()

In [13]:
data_1

Unnamed: 0,index,id,product_uid,product_title,search_term,relevance,brand,combine_feature_,product_description
0,0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.00,Simpson Strong-Tie,0 Bullet01 Versatile connector for various...,"Not only do angles make joints stronger, they ..."
2,2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.00,BEHR Premium Textured DeckOver,"15 Application Method Brush,Rol...",BEHR Premium Textured DECKOVER is an innovativ...
4,4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67,Delta,50 Bath Faucet Type Combo Tub a...,Update your bathroom with the Delta Vero Singl...
7,7,21,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...,microwaves,3.00,Whirlpool,75 Appliance Type Over the Range...,Achieving delicious results is almost effortle...
8,8,23,100007,Lithonia Lighting Quantum 2-Light Black LED Em...,emergency light,2.67,Lithonia Lighting,123 Battery Power T...,The Quantum Adjustable 2-Light LED Black Emerg...
...,...,...,...,...,...,...,...,...,...
240755,166688,240756,224424,stufurhome Norma 24 in. W x 16 in. D x 34 in. ...,24 whtie storage cabinet,,stufurhome,1719301 Assemb...,Create a neat yet stylish storage space for or...
240756,166689,240757,224425,Home Decorators Collection 49 in. D Alessandro...,adirondeck cusion,,Home Decorators Collection,1719319 Assembled Dep...,Our Bullnose Adirondack Chair Cushions fit Adi...
240757,166690,240758,224426,Simpson Strong-Tie HB 3-1/2 x 14 in. Top Flang...,hb,,Simpson Strong-Tie,Joist hangers are designed to provide support ...,Joist hangers are designed to provide support ...
240758,166691,240759,224427,1/4 in. -20 tpi x 1-1/2 in. Stainless Steel Bu...,hex sockets,,1/4,These socket cap screws are ideal for applicat...,These socket cap screws are ideal for applicat...


In [14]:
data_1.isnull().sum()

index                      0
id                         0
product_uid                0
product_title              0
search_term                0
relevance              69761
brand                      0
combine_feature_           0
product_description        0
dtype: int64

In [15]:
data_1.drop(['relevance'],axis=1,inplace=True)

In [16]:
data_1

Unnamed: 0,index,id,product_uid,product_title,search_term,brand,combine_feature_,product_description
0,0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,Simpson Strong-Tie,0 Bullet01 Versatile connector for various...,"Not only do angles make joints stronger, they ..."
2,2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,BEHR Premium Textured DeckOver,"15 Application Method Brush,Rol...",BEHR Premium Textured DECKOVER is an innovativ...
4,4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,Delta,50 Bath Faucet Type Combo Tub a...,Update your bathroom with the Delta Vero Singl...
7,7,21,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...,microwaves,Whirlpool,75 Appliance Type Over the Range...,Achieving delicious results is almost effortle...
8,8,23,100007,Lithonia Lighting Quantum 2-Light Black LED Em...,emergency light,Lithonia Lighting,123 Battery Power T...,The Quantum Adjustable 2-Light LED Black Emerg...
...,...,...,...,...,...,...,...,...
240755,166688,240756,224424,stufurhome Norma 24 in. W x 16 in. D x 34 in. ...,24 whtie storage cabinet,stufurhome,1719301 Assemb...,Create a neat yet stylish storage space for or...
240756,166689,240757,224425,Home Decorators Collection 49 in. D Alessandro...,adirondeck cusion,Home Decorators Collection,1719319 Assembled Dep...,Our Bullnose Adirondack Chair Cushions fit Adi...
240757,166690,240758,224426,Simpson Strong-Tie HB 3-1/2 x 14 in. Top Flang...,hb,Simpson Strong-Tie,Joist hangers are designed to provide support ...,Joist hangers are designed to provide support ...
240758,166691,240759,224427,1/4 in. -20 tpi x 1-1/2 in. Stainless Steel Bu...,hex sockets,1/4,These socket cap screws are ideal for applicat...,These socket cap screws are ideal for applicat...


In [17]:
data_2 = data_1.copy()

In [20]:
data_2['product_title'] = data_2['product_title'].apply(lambda x: preprocessing(x))
data_2['search_term'] = data_2['search_term'].apply(lambda x: preprocessing(x)) 
data_2['brand'] = data_2['brand'].apply(lambda x: preprocessing(x))
data_2['combine_feature_'] = data_2['combine_feature_'].apply(lambda x: preprocessing(x))
data_2['product_description'] =data_2['product_description'].apply(lambda x: preprocessing(x))

data_2['product_title'] = data_2['product_title'].apply(lambda x: stopwords_stemming(x))
data_2['search_term'] = data_2['search_term'].apply(lambda x: stemming_search(x))
data_2['brand'] = data_2['brand'].apply(lambda x: stopwords_stemming(x))
data_2['combine_feature_'] = data_2['combine_feature_'].apply(lambda x: stopwords_stemming(x))
data_2['product_description'] = data_2['product_description'].apply(lambda x: stopwords_stemming(x))

In [21]:
data_2.head(3)

Unnamed: 0,index,id,product_uid,product_title,search_term,brand,combine_feature_,product_description
0,0,2,100001,simpson strong tie 12 gaug angl,angl bracket,simpson strong tie,0 bullet 01 versatil connector variou 90 1 bul...,angl make joint stronger also provid consist s...
2,2,9,100002,behr premium textur deckov 1 gallon sc 141 tug...,deck over,behr premium textur deckov,15 applic method brush roller spray 16 assembl...,behr premium textur deckov innov solid color c...
4,4,17,100005,delta vero 1 handl shower faucet trim kit chro...,shower onli faucet,delta,50 bath faucet type combo tub shower 51 built ...,updat bathroom delta vero singl handl shower f...


In [22]:
data_2['product_info']=data_2['product_title']+' '+data_2['brand']+' '+data_2['product_description']

In [23]:
data_2.head(3)

Unnamed: 0,index,id,product_uid,product_title,search_term,brand,combine_feature_,product_description,product_info
0,0,2,100001,simpson strong tie 12 gaug angl,angl bracket,simpson strong tie,0 bullet 01 versatil connector variou 90 1 bul...,angl make joint stronger also provid consist s...,simpson strong tie 12 gaug angl simpson strong...
2,2,9,100002,behr premium textur deckov 1 gallon sc 141 tug...,deck over,behr premium textur deckov,15 applic method brush roller spray 16 assembl...,behr premium textur deckov innov solid color c...,behr premium textur deckov 1 gallon sc 141 tug...
4,4,17,100005,delta vero 1 handl shower faucet trim kit chro...,shower onli faucet,delta,50 bath faucet type combo tub shower 51 built ...,updat bathroom delta vero singl handl shower f...,delta vero 1 handl shower faucet trim kit chro...


In [24]:
data_2['product_info'][0]

'simpson strong tie 12 gaug angl simpson strong tie angl make joint stronger also provid consist straight corner simpson strong tie offer wide varieti angl variou size thick handl light duti job project structur connect need bent skew match project outdoor project moistur present use zmax zinc coat connector provid extra resist corros look z end model number versatil connector variou 90 connect home repair projectsstrong angl nail screw fasten alonehelp ensur joint consist straight strongdimens 3 inch x 3 inch x 1 1 2 inchmad 12 gaug steelgalvan extra corros resistanceinstal 10 common nail 9 x 1 1 2 inch strong drive sd screw'

In [25]:
corpus=data_2['product_title']+' '+data_2['brand']+' '+data_2['product_description']

In [26]:
np.savetxt('G:/Final Data_1/dataset_title_brand_descrip.txt',corpus.values, fmt='%s')

In [27]:
# http://norvig.com/spell-correct.html
# Spell Corrector
import re
from collections import Counter

def words(text): return re.findall(r'\w+', text.lower())

WORDS = Counter(words(open('G:/Final Data_1/dataset_title_brand_descrip.txt').read()))

def P(word, N=sum(WORDS.values())): 
    "Probability of `word`."
    return WORDS[word] / N

def correction(word): 
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)

def candidates(word): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words): 
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)

def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))
def corrected_terms(text):
    temp = text.split()
    temp = [correction(word) for word in temp]
    return ' '.join(temp)

In [28]:
from rank_bm25 import BM25Okapi
corpus = data_2['product_info'].values
tokenized_corpus = [doc.split(" ") for doc in corpus]

bm25 = BM25Okapi(tokenized_corpus)

In [29]:
with open('G:/Final Data_1/BM25_model.pkl', 'wb') as f:
    pickle.dump(bm25, f)

In [30]:
query = 'aor condition'
corrected_search_term=corrected_terms(query)
print(corrected_search_term)
tokenized_query = corrected_search_term.split(" ")
bm25.get_top_n(tokenized_query, corpus, n=3)

air condition


['duck cover elit 34 inch round air condit cover duck cover duck cover air condition cover provid breakthrough protect keep air condition protect use innov multi layer materi creat superior airflow air condition cover air condition elimin condens damag outdoor condens duck cover crack fade time migrat best air condition cover today air condition cover 34 inch dia x 30 inch h .100 waterproof air condition cover like water duck backbreath uv treat materi use duck cover patio heater cover materi crack cold weathereasi use lightweight materi that easi fold store 2 inch wide velcro strap secur air condition cover place',
 'duck cover elit 34 inch squar air condit cover duck cover duck cover air condition cover provid breakthrough protect keep air condition protect use innov multi layer materi creat superior airflow air condition cover air condition elimin condens damag outdoor condens duck cover crack fade time migrat best air condition cover today air condition cover 34 inch w x 34 inch l 

In [31]:
data_2=data_2.drop(['index','id'],axis=1)

In [32]:
with open('G:/Final Data_1/Final_Database_bm25.pkl', 'wb') as f:
    pickle.dump(data_2, f)