# Data Preparation

This workbook is mainly to prepare test data in the same form of the processed train data for final predictions for Kaggle submission.

A lot of the steps are just a repetition of what used in the data_expo section.

In [1]:
import pickle
import re
import string
import warnings
from collections import Counter, OrderedDict

import en_core_web_md
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
import swifter
from fuzzywuzzy import fuzz, process
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from spacy.lang.en import English

stop = stopwords.words('english')
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('mode.chained_assignment', None)

In [2]:
df = pd.read_csv('./data/test.csv')

In [3]:
df['prod_clean'] = df['product_title'].str.lower().apply(
    lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df['prod_clean'] = df['prod_clean'].apply(
    lambda x: x.strip(string.punctuation))
df['prod_clean'] = df['prod_clean'].str.replace('\d+', '')

df['desc_clean'] = df['product_description'].str.lower().fillna('none').apply(
    lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df['desc_clean'] = df['desc_clean'].apply(
    lambda x: x.strip(string.punctuation))
df['desc_clean'] = df['desc_clean'].str.replace('\d+', '')

In [4]:
lemmat = WordNetLemmatizer()
df['prod_lemat'] = df.apply(lambda row: lemmat.lemmatize(row['prod_clean']),
                            axis=1)
df['desc_lemat'] = df.apply(lambda row: lemmat.lemmatize(row['desc_clean']),
                            axis=1)

In [5]:
df['prod_token'] = df.apply(lambda row: word_tokenize(row['prod_lemat']),
                            axis=1)
df['desc_token'] = df.apply(lambda row: word_tokenize(row['desc_lemat']),
                            axis=1)

In [6]:
df['join_text'] = df['prod_token'] + df['desc_token']

In [7]:
df.head()

Unnamed: 0,id,query,product_title,product_description,prod_clean,desc_clean,prod_lemat,desc_lemat,prod_token,desc_token,join_text
0,3,electric griddle,Star-Max 48 in Electric Griddle,,star-max electric griddle,none,star-max electric griddle,none,"[star-max, electric, griddle]",[none],"[star-max, electric, griddle, none]"
1,6,phillips coffee maker,Philips SENSEO HD7810 WHITE Single Serve Pod C...,,philips senseo hd white single serve pod coffe...,none,philips senseo hd white single serve pod coffe...,none,"[philips, senseo, hd, white, single, serve, po...",[none],"[philips, senseo, hd, white, single, serve, po..."
2,9,san francisco 49ers,2013 San Francisco 49ers Clock,A 2013 San Francisco 49ers clock is the ultima...,san francisco ers clock,san francisco ers clock ultimate way show tea...,san francisco ers clock,san francisco ers clock ultimate way show tea...,"[san, francisco, ers, clock]","[san, francisco, ers, clock, ultimate, way, sh...","[san, francisco, ers, clock, san, francisco, e..."
3,11,aveeno shampoo,AVEENO 10.5FLOZ NRSH SHINE SH,"Water, Ammonium Lauryl Sulfate, Dimethicone, S...",aveeno .floz nrsh shine sh,"water, ammonium lauryl sulfate, dimethicone, s...",aveeno .floz nrsh shine sh,"water, ammonium lauryl sulfate, dimethicone, s...","[aveeno, .floz, nrsh, shine, sh]","[water, ,, ammonium, lauryl, sulfate, ,, dimet...","[aveeno, .floz, nrsh, shine, sh, water, ,, amm..."
4,12,flea and tick control for dogs,Merial Frontline Plus Flea and Tick Control fo...,,merial frontline plus flea tick control dogs p...,none,merial frontline plus flea tick control dogs p...,none,"[merial, frontline, plus, flea, tick, control,...",[none],"[merial, frontline, plus, flea, tick, control,..."


In [8]:
df['text'] = df['join_text'].apply(lambda x: ' '.join(map(str, x)))

In [9]:
df['text_fin'] = (df['text'].str.split().apply(
    lambda x: OrderedDict.fromkeys(x).keys()).str.join(' '))

In [10]:
df_select = df[['id', 'query', 'text_fin']]

In [11]:
df_select.head()

Unnamed: 0,id,query,text_fin
0,3,electric griddle,star-max electric griddle none
1,6,phillips coffee maker,philips senseo hd white single serve pod coffe...
2,9,san francisco 49ers,san francisco ers clock ultimate way show team...
3,11,aveeno shampoo,"aveeno .floz nrsh shine sh water , ammonium la..."
4,12,flea and tick control for dogs,merial frontline plus flea tick control dogs p...


In [12]:
nlp = en_core_web_md.load()

In [13]:
df_select['q_nlp'] = df_select['query'].swifter.apply(nlp)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=22513.0, style=ProgressStyle(descripti…




In [14]:
df_select['t_nlp'] = df_select['text_fin'].swifter.apply(nlp)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=22513.0, style=ProgressStyle(descripti…




In [15]:
df_select['sim'] = [
    df_select['q_nlp'][i].similarity(df_select['t_nlp'][i])
    for i in range(len(df_select))
]

  df_select['q_nlp'][i].similarity(df_select['t_nlp'][i])


In [16]:
df_select['fuzzy'] = [
    fuzz.partial_ratio(df_select['query'][x], df_select['text_fin'][x])
    for x in range(len(df_select))
]

In [17]:
df_select['query_len'] = df_select['query'].str.count(' ') + 1

In [18]:
# loading the keywords list as generated in the expolaratory stage.

tup_list = pickle.load(open('data/tup_list.pkl', "rb"))

In [19]:
for term in tup_list:
    df_select[term] = df_select['query'].str.contains(term)

In [20]:
df_select.set_index('id').to_csv('./data/df_test.csv')