In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pickle
import zipfile
from datetime import datetime
from wordcloud import WordCloud, STOPWORDS
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost
from scipy.sparse import hstack
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge,Lasso,ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SAPEKSHA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load Data

In [2]:
train_data = zipfile.ZipFile('G:/Applied_AI/case_study_1/train.csv.zip')
train_data = pd.read_csv(train_data.open('train.csv'),encoding = "ISO-8859-1")
print('train_data',train_data.shape)
attribute_data = zipfile.ZipFile('G:/Applied_AI/case_study_1/attributes.csv.zip')
attribute_data = pd.read_csv(attribute_data.open('attributes.csv'),encoding = "ISO-8859-1")
print('Attribute_data',attribute_data.shape)
description_data = zipfile.ZipFile('G:/Applied_AI/case_study_1/product_descriptions.csv.zip')
description_data = pd.read_csv(description_data.open('product_descriptions.csv'),encoding = "ISO-8859-1")
print('description_data',description_data.shape)

train_data (74067, 5)
Attribute_data (2044803, 3)
description_data (124428, 2)


## Pre-processing

* Merge data

In [3]:
dataset = train_data.copy()

In [4]:
def merge_attributes(df):
    attr = attribute_data.copy()
    product_uid = df['product_uid'].values
    
    temp = attr.loc[attr['product_uid'].isin(product_uid)] 
    temp['combine_feature'] = temp['name'] + ' ' + temp['value']
    
    brands = temp[temp['name']=='MFG Brand Name']
    brands['brand'] = brands['value']
    brands.drop(['name','value','combine_feature'],axis=1,inplace=True)

    temp= temp.merge(brands,on='product_uid',how='left')
    temp['combine_feature_'] = temp.groupby('product_uid')['combine_feature'].transform(lambda x :''.join(str(x)))
    temp = temp.drop_duplicates(subset=['product_uid'])
    df = df.merge(temp,on='product_uid',how='left').set_index(df.index)
    df.drop(['name','value','combine_feature'],axis=1,inplace=True)
    return df


In [5]:
dataset = merge_attributes(dataset)
dataset.head(3)

Unnamed: 0,id,product_uid,product_title,search_term,relevance,brand,combine_feature_
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0,Simpson Strong-Tie,0 Bullet01 Versatile connector for various...
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5,Simpson Strong-Tie,0 Bullet01 Versatile connector for various...
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0,BEHR Premium Textured DeckOver,"15 Application Method Brush,Rol..."


In [6]:
def merge_description(df):
    descrip = description_data.copy()
    product_uid = df['product_uid'].values
    temp = descrip.loc[descrip['product_uid'].isin(product_uid)]
    df = df.merge(temp,on='product_uid',how='left').set_index(df.index)
    return df

In [7]:
dataset = merge_description(dataset)
dataset.head(3)

Unnamed: 0,id,product_uid,product_title,search_term,relevance,brand,combine_feature_,product_description
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0,Simpson Strong-Tie,0 Bullet01 Versatile connector for various...,"Not only do angles make joints stronger, they ..."
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5,Simpson Strong-Tie,0 Bullet01 Versatile connector for various...,"Not only do angles make joints stronger, they ..."
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0,BEHR Premium Textured DeckOver,"15 Application Method Brush,Rol...",BEHR Premium Textured DECKOVER is an innovativ...


In [8]:
def extract_n_words(n,text):
    if n>len(text.split()):
        return 'invalid'
    return ' '.join(text.split()[:n])

def fill_brand(df):
    null_brand_values = df[df['brand'].isna()]
    unique_brands = df['brand'].unique()

    for i,j in null_brand_values.iterrows():
        title=j['product_title']
        if extract_n_words(6,title) in unique_brands:
            null_brand_values['brand'].loc[i] = extract_n_words(6, title)
        elif extract_n_words(5,title) in unique_brands:
            null_brand_values['brand'].loc[i] = extract_n_words(5, title)
        elif extract_n_words(4,title) in unique_brands:
            null_brand_values['brand'].loc[i] = extract_n_words(4, title)
        elif extract_n_words(3,title) in unique_brands:
            null_brand_values['brand'].loc[i] = extract_n_words(3, title)
        elif extract_n_words(2,title) in unique_brands:
            null_brand_values['brand'].loc[i] = extract_n_words(2, title)
        else:
            null_brand_values['brand'].loc[i] = extract_n_words(1, title)
            
    df['brand'].loc[null_brand_values.index]=null_brand_values['brand'].values
    return df

In [9]:
dataset = fill_brand(dataset)

In [10]:
def fill_attributes(df):
    null_df = df[df['combine_feature_'].isna()]
    null_df['combine_feature_'] = null_df['product_description'].copy()
    df['combine_feature_'].loc[null_df.index] = null_df['combine_feature_'].values
    return df


In [11]:
dataset = fill_attributes(dataset)

In [12]:
#Reference : https://towardsdatascience.com/modeling-product-search-relevance-in-e-commerce-home-depot-case-study-8ccb56fbc5ab
def standardize_units(text):
    text = " " + text + " "
    text = re.sub('( gal | gals | galon )',' gallon ',text)
    text = re.sub('( ft | fts | feets | foot | foots )',' feet ',text)
    text = re.sub('( squares | sq )',' square ',text)
    text = re.sub('( lb | lbs | pounds )',' pound ',text)
    text = re.sub('( oz | ozs | ounces | ounc )',' ounce ',text)
    text = re.sub('( yds | yd | yards )',' yard ',text)
    return text

def preprocessing(text):
    
    text = text.replace('in.','inch')  # Replace in. with inch
    text = re.sub('[^A-Za-z0-9.]+',' ',text) # remove special characters except '.'
    text = re.sub(r"(?<!\d)[.,;:](?!\d)",'',text,0) # https://stackoverflow.com/questions/43142710/remove-all-punctuation-from-string-except-if-its-between-digits
    text = re.sub("[A-Za-z]+", lambda ele: " " + ele[0] + " ", text)
    text = standardize_units(text)
    text = text.lower()
    text = ' '.join(text.split())
    return text

In [13]:
stop_words = stopwords.words('english')
ps = PorterStemmer()

def stopwords_stemming(text):
    words = text.split()
    words = [w for w in words if w not in stop_words] # Stopwords
    words = [ps.stem(word) for word in words] # stemming
    return ' '.join(words)

def stemming_search(text):
    words = text.split()
    words = [ps.stem(word) for word in words] # stemming
    return ' '.join(words)

In [14]:
data = dataset.copy()

In [15]:
data['product_title'] = data['product_title'].apply(lambda x: preprocessing(x))
data['search_term'] = data['search_term'].apply(lambda x: preprocessing(x)) 
data['brand'] = data['brand'].apply(lambda x: preprocessing(x))
data['combine_feature_'] = data['combine_feature_'].apply(lambda x: preprocessing(x))
data['product_description'] =data['product_description'].apply(lambda x: preprocessing(x))

"""
furthur preprocessing
"""
data['product_title'] = data['product_title'].apply(lambda x: stopwords_stemming(x))
data['search_term'] = data['search_term'].apply(lambda x: stemming_search(x))
data['brand'] = data['brand'].apply(lambda x: stopwords_stemming(x))
data['combine_feature_'] = data['combine_feature_'].apply(lambda x: stopwords_stemming(x))
data['product_description'] = data['product_description'].apply(lambda x: stopwords_stemming(x))

In [16]:
print(data.shape)
data.head()

(74067, 8)


Unnamed: 0,id,product_uid,product_title,search_term,relevance,brand,combine_feature_,product_description
0,2,100001,simpson strong tie 12 gaug angl,angl bracket,3.0,simpson strong tie,0 bullet 01 versatil connector variou 90 1 bul...,angl make joint stronger also provid consist s...
1,3,100001,simpson strong tie 12 gaug angl,l bracket,2.5,simpson strong tie,0 bullet 01 versatil connector variou 90 1 bul...,angl make joint stronger also provid consist s...
2,9,100002,behr premium textur deckov 1 gallon sc 141 tug...,deck over,3.0,behr premium textur deckov,15 applic method brush roller spray 16 assembl...,behr premium textur deckov innov solid color c...
3,16,100005,delta vero 1 handl shower faucet trim kit chro...,rain shower head,2.33,delta,50 bath faucet type combo tub shower 51 built ...,updat bathroom delta vero singl handl shower f...
4,17,100005,delta vero 1 handl shower faucet trim kit chro...,shower onli faucet,2.67,delta,50 bath faucet type combo tub shower 51 built ...,updat bathroom delta vero singl handl shower f...


In [17]:
data.isnull().sum()

id                     0
product_uid            0
product_title          0
search_term            0
relevance              0
brand                  0
combine_feature_       0
product_description    0
dtype: int64

In [18]:
with open('G:/Final Data_1/clean_dataset_.pkl','wb') as f:
    pickle.dump(data,f)