# Data Preparation

This notebook prepares the test data in the same form as the processed training data, so that it can be used for the final predictions submitted to Kaggle.

Many of the steps simply repeat those used in the data_expo section.

In [1]:
import pickle
import re
import string
import warnings
from collections import Counter, OrderedDict

import en_core_web_md
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
import swifter
from fuzzywuzzy import fuzz, process
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from spacy.lang.en import English

# Notebook-wide setup.
# Silence FutureWarning messages so they do not clutter the cell output.
warnings.simplefilter('ignore', FutureWarning)
# Turn off pandas' chained-assignment check; later cells add columns to a
# column subset of `df`.
pd.options.mode.chained_assignment = None
# English stop-word list from NLTK, used by the text-cleaning cell.
stop = stopwords.words('english')

In [2]:
# Read the raw test set and report its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer='./data/test.csv')
print(df.shape)

(22513, 4)


In [3]:
def _clean_text(lowered):
    """Return a cleaned copy of an already lower-cased string Series.

    Steps, in order:
      1. drop every whitespace-separated word found in the module-level
         ``stop`` list;
      2. strip punctuation characters from the two ends of the resulting
         string (``str.strip`` semantics: interior punctuation is kept);
      3. remove every run of digits.

    Parameters
    ----------
    lowered : pandas.Series
        Series of strings; it must not contain missing values because each
        element is split with ``str.split``.

    Returns
    -------
    pandas.Series
        The cleaned strings, on the same index as ``lowered``.
    """
    # Build the lookup once: set membership is O(1) per word, whereas the
    # original list lookup scanned the full stop-word list for every word.
    stop_words = set(stop)
    without_stop = lowered.apply(
        lambda text: ' '.join(
            word for word in text.split() if word not in stop_words))
    trimmed = without_stop.str.strip(string.punctuation)
    # Raw string + explicit regex=True: without it, pandas >= 2.0 treats the
    # pattern as a literal and no digits are removed.
    return trimmed.str.replace(r'\d+', '', regex=True)


df['prod_clean'] = _clean_text(df['product_title'].str.lower())

# Missing descriptions are replaced by the placeholder 'none' before cleaning.
df['desc_clean'] = _clean_text(
    df['product_description'].str.lower().fillna('none'))

In [4]:
# Run the WordNet lemmatizer over the cleaned title and description columns.
# Each cell's whole cleaned string is passed to `lemmatize` as one argument.
# NOTE(review): `lemmatize` is documented as operating on a single word, so
# multi-word strings presumably come back unchanged -- confirm. Left as-is so
# the test features stay in the same form as the processed train data.
lemmat = WordNetLemmatizer()
for src_col, dst_col in (('prod_clean', 'prod_lemat'),
                         ('desc_clean', 'desc_lemat')):
    df[dst_col] = df[src_col].apply(lemmat.lemmatize)

In [None]:
# Split the lemmatized title and description strings into token lists with
# NLTK's word_tokenize.
df['prod_token'] = df['prod_lemat'].apply(word_tokenize)
df['desc_token'] = df['desc_lemat'].apply(word_tokenize)

In [None]:
# Concatenate, row by row, the title tokens followed by the description tokens.
df['join_text'] = [
    title_tokens + desc_tokens
    for title_tokens, desc_tokens in zip(df['prod_token'], df['desc_token'])
]

In [None]:
# Preview the first five rows to eyeball the intermediate columns.
df.head(n=5)

In [None]:
# Turn each combined token list back into one space-separated string.
df['text'] = df['join_text'].map(
    lambda tokens: ' '.join(str(token) for token in tokens))

In [None]:
# Drop repeated words from each text, keeping the first occurrence of every
# word and the original word order (OrderedDict keys preserve insertion order).
df['text_fin'] = df['text'].apply(
    lambda text: ' '.join(OrderedDict.fromkeys(text.split())))

In [None]:
# Keep only the columns needed for feature building.
# Take an explicit copy: later cells add new columns to `df_select`, and
# assigning onto a bare slice of `df` is exactly what pandas'
# chained-assignment warning guards against.
df_select = df[['id', 'query', 'text_fin']].copy()

In [None]:
# Preview the first five rows of the reduced frame.
df_select.head(n=5)

In [None]:
# Load the spaCy `en_core_web_md` English pipeline; `nlp` is applied to the
# query and text columns in the following cells.
nlp = en_core_web_md.load()

In [None]:
# Run the spaCy pipeline over every search query and keep the result per row.
# The call goes through swifter's `.swifter.apply` accessor -- presumably to
# speed up the row-wise apply; swifter's behaviour is not visible here.
df_select['q_nlp'] = df_select['query'].swifter.apply(nlp)

In [None]:
# Run the spaCy pipeline over every de-duplicated product text (`text_fin`),
# again through swifter's apply accessor.
df_select['t_nlp'] = df_select['text_fin'].swifter.apply(nlp)

In [None]:
# Similarity between each query and its product text, computed by calling
# `.similarity` on the processed query with the processed text.
# The two columns are walked in lockstep by position, so this no longer
# depends on the frame having a default 0..n-1 integer index (the previous
# `series[i]` lookup was label-based).
df_select['sim'] = [
    query_doc.similarity(text_doc)
    for query_doc, text_doc in zip(df_select['q_nlp'], df_select['t_nlp'])
]

In [None]:
# fuzzywuzzy partial-ratio score between each raw query and its product text.
# Rows are paired by position with zip rather than by label-based
# `series[x]` lookups, so a non-default index cannot misalign or break it.
df_select['fuzzy'] = [
    fuzz.partial_ratio(query, text)
    for query, text in zip(df_select['query'], df_select['text_fin'])
]

In [None]:
# Query length measured as the number of space characters plus one.
space_count = df_select['query'].str.count(' ')
df_select['query_len'] = space_count + 1

In [None]:
# Load the keyword list generated in the exploratory stage.
# NOTE: unpickling can execute arbitrary code -- only load a tup_list.pkl
# that was produced by this project's own pipeline.
# The `with` block guarantees the file handle is closed after loading.
with open('data/tup_list.pkl', 'rb') as pkl_file:
    tup_list = pickle.load(pkl_file)

In [None]:
# Add one column per keyword, named after the keyword, holding the result of
# `str.contains` for that keyword on each query.
# NOTE(review): `str.contains` treats its pattern as a regular expression by
# default -- confirm the keywords contain no regex metacharacters.
for keyword in tup_list:
    df_select[keyword] = df_select['query'].str.contains(keyword)

In [None]:
# Write the prepared test features to disk, using `id` as the index column.
indexed_by_id = df_select.set_index('id')
indexed_by_id.to_csv('./data/df_test.csv')