# Introduction

# Importing Libraries


In [0]:
import pandas as pd
import numpy as np

! python -m spacy download en_core_web_md
import spacy
spacy.prefer_gpu()

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
import xgboost as xgb
from sklearn import svm

from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

import matplotlib.pyplot as plt

import warnings; warnings.simplefilter('ignore')

# Functions

In [0]:
def flatten(lst):
    """Return a new flat list holding the non-list items of ``lst``.

    Nested ``list`` instances are expanded depth-first, left to right.
    Only ``list`` objects are expanded; tuples and other iterables are
    kept as single items.
    """
    flat = []
    flatten_helper(lst, flat)
    return flat


def flatten_helper(lst, new_lst):
    """Append the non-list items of ``lst`` to ``new_lst`` in place.

    Recurses into every nested ``list``. Returns ``None``; the result is
    accumulated in ``new_lst``.
    """
    for item in lst:
        if not isinstance(item, list):
            new_lst.append(item)
            continue
        # Nested list: expand its content into the same accumulator.
        flatten_helper(item, new_lst)

In [0]:
def RemoveSmallWords(Series, min_length=4):
    """Drop short words from every string of a pandas Series.

    Each element is split on whitespace, words shorter than ``min_length``
    characters are discarded, and the surviving words of each element are
    joined back together with single spaces.

    Parameters
    ----------
    Series : pandas.Series
        Series of strings to clean.
    min_length : int, default 4
        Minimum number of characters a word needs in order to be kept.

    Returns
    -------
    pandas.Series
        One joined string per index label that keeps at least one word.
        Labels whose element has no word of sufficient length are absent
        from the result (they are not returned as empty strings), and the
        result is ordered by index label because of the group-by.
    """
    # One row per (original label, word position); missing cells are dropped.
    words = Series.str.split(expand=True).stack()
    long_enough = words.loc[words.str.len() >= min_length]
    # Level 0 of the stacked index is the original label of each element.
    return long_enough.groupby(level=0).apply(' '.join)

In [0]:
def TextPreProcessing(df, text_column, nlp=None):
    """Lemmatise and filter the text column of ``df`` in place.

    Every text is run through a spaCy pipeline and reduced to a
    space-separated string of the lemmas that

    * are purely alphabetic,
    * are not stop words according to the pipeline vocabulary,
    * do not exactly match a word of a named entity found in that text,
    * are at least four characters long.

    Missing texts, and texts for which no lemma survives the filters, are
    replaced by the placeholder ``'-EMPTY-'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text. It is modified in place and also returned.
    text_column : str
        Name of the column to clean.
    nlp : spaCy pipeline, optional
        An already loaded pipeline to reuse. When omitted,
        ``'en_core_web_md'`` is loaded on every call, which is slow when
        several frames are processed; pass a shared instance to avoid it.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``text_column`` overwritten.
    """
    placeholder = '-EMPTY-'
    min_length = 4

    df.loc[df[text_column].isna(), text_column] = placeholder
    if nlp is None:
        nlp = spacy.load('en_core_web_md')

    cleaned = []
    # Consume nlp.pipe lazily so only one parsed document is alive at a time.
    for doc in nlp.pipe(df[text_column]):
        try:
            # Individual words of every named entity recognised in this text.
            ent_words = {word for ent in doc.ents for word in ent.text.split()}
            kept = [
                lemma
                for lemma in (token.lemma_ for token in doc)
                if lemma.isalpha()
                and not nlp.vocab[lemma].is_stop
                and lemma not in ent_words
                and len(lemma) >= min_length
            ]
        except Exception:
            # Best effort: a document that cannot be processed is treated
            # like an empty one instead of aborting the whole run.
            kept = []
        # Join only real token lists; joining the placeholder itself would
        # insert a space between each of its characters.
        cleaned.append(' '.join(kept) if kept else placeholder)

    df[text_column] = cleaned
    return df

# Importing Data

In [5]:
# Location of the reviews CSV read by this notebook.
DATA_PATH = '/content/drive/My Drive/ML Projects/Womens Clothing E-Commerce/Womens Clothing E-Commerce Reviews.csv'
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses
...,...,...,...,...,...,...,...,...,...,...,...
23481,23481,1104,34,Great dress for many occasions,I was very happy to snag this dress at such a ...,5,1,0,General Petite,Dresses,Dresses
23482,23482,862,48,Wish it was made of cotton,"It reminds me of maternity clothes. soft, stre...",3,1,0,General Petite,Tops,Knits
23483,23483,1104,31,"Cute, but see through","This fit well, but the top was very see throug...",3,0,1,General Petite,Dresses,Dresses
23484,23484,1084,28,"Very cute dress, perfect for summer parties an...",I bought this dress for a wedding i have this ...,3,1,2,General,Dresses,Dresses


# Splitting Data

In [6]:
# 70 % of the rows go to train; the remaining 30 % are halved into test and val.
train, holdout = train_test_split(df, test_size=0.3, random_state=7)
test, val = train_test_split(holdout, test_size=0.5, random_state=7)

# Renumber the rows from 0; the previous index is kept as an 'index' column.
train = train.reset_index()
val = val.reset_index()
test = test.reset_index()

for part in (train, val, test):
    print(part.shape)

(16440, 12)
(3523, 12)
(3523, 12)


# Text Pre-Processing

In [0]:
# Clean the training reviews; the returned frame is bound back to `train`.
train = TextPreProcessing(df=train, text_column='Review Text')