## Importing Packages

In [39]:
import os
import sys
import pandas as pd
import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
import matplotlib.pyplot as plt
import string
import re
import numpy as np
import itertools
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer
from nltk import WordNetLemmatizer # lemmatizer using WordNet
from nltk.corpus import wordnet # imports WordNet
from nltk import pos_tag # nltk's native part of speech tagging


In [40]:
df = pd.read_csv('fake reviews dataset.csv')
df.head()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


## Data Undersatding

In [41]:
df['category'].value_counts()

category
Kindle_Store_5                  4730
Books_5                         4370
Pet_Supplies_5                  4254
Home_and_Kitchen_5              4056
Electronics_5                   3988
Sports_and_Outdoors_5           3946
Tools_and_Home_Improvement_5    3858
Clothing_Shoes_and_Jewelry_5    3848
Toys_and_Games_5                3794
Movies_and_TV_5                 3588
Name: count, dtype: int64

In [42]:
df.describe()

Unnamed: 0,rating
count,40432.0
mean,4.256579
std,1.144354
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


## Text Preprocessing: Tokenization

In [43]:
#accessing and storing the content of the first comment in the 'text_' column
first_doc = df['text_'].iloc[0]
first_doc

'Love this!  Well made, sturdy, and very comfortable.  I love it!Very pretty'

In [44]:
print(word_tokenize(first_doc, language='english'))
first_doc

['Love', 'this', '!', 'Well', 'made', ',', 'sturdy', ',', 'and', 'very', 'comfortable', '.', 'I', 'love', 'it', '!', 'Very', 'pretty']


'Love this!  Well made, sturdy, and very comfortable.  I love it!Very pretty'

In [45]:
#tokenizing the text data in the 'text_' column of df
corpus = [word_tokenize(doc) for doc in df['text_']]
print(corpus[0:4])

[['Love', 'this', '!', 'Well', 'made', ',', 'sturdy', ',', 'and', 'very', 'comfortable', '.', 'I', 'love', 'it', '!', 'Very', 'pretty'], ['love', 'it', ',', 'a', 'great', 'upgrade', 'from', 'the', 'original', '.', 'I', "'ve", 'had', 'mine', 'for', 'a', 'couple', 'of', 'years'], ['This', 'pillow', 'saved', 'my', 'back', '.', 'I', 'love', 'the', 'look', 'and', 'feel', 'of', 'this', 'pillow', '.'], ['Missing', 'information', 'on', 'how', 'to', 'use', 'it', ',', 'but', 'it', 'is', 'a', 'great', 'product', 'for', 'the', 'price', '!', 'I']]


In [46]:
# flattening the list of tokenized words contained in the corpus list
flattenedcorpus_tokens = pd.Series(list(itertools.chain(*corpus)))
print(flattenedcorpus_tokens.shape)

(3099953,)


In [47]:
dictionary = pd.Series(
    flattenedcorpus_tokens.unique())
print(len(dictionary))

60431


### Dealing with Stop words + lowecase

In [48]:
# getting common stop words in english that we'll remove during tokenization/text normalization
stop_words = stopwords.words('english')
print(stop_words[0:5])

['i', 'me', 'my', 'myself', 'we']


In [49]:
def first_step_normalizer(doc):
    norm_text = [x.lower() for x in word_tokenize(doc) if ((x.isalpha()) & (x not in stop_words)) ]
    return norm_text

In [50]:
df['tok_norm'] = df['text_'].apply(first_step_normalizer)
df.head()

Unnamed: 0,category,rating,label,text_,tok_norm
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor...","[love, well, made, sturdy, comfortable, i, lov..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I...","[love, great, upgrade, original, i, mine, coup..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...,"[this, pillow, saved, back, i, love, look, fee..."
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i...","[missing, information, use, great, product, pr..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...,"[very, nice, set, good, quality, we, set, two,..."


In [51]:
norm_toks_flattened = pd.Series(list(
    itertools.chain(*df['tok_norm'])))
new_dictionary = norm_toks_flattened.unique()
print(len(new_dictionary))

37936


In [52]:
print(len(dictionary))

60431


- Process removed 22.500 features

## Text Preprocessing: Lemmatization

#### We created function which takes in untokenized document and returns fully normalized token list

In [53]:
def process_doc(doc):

    wnl = WordNetLemmatizer()

    def pos_tagger(nltk_tag):
        if nltk_tag.startswith('J'):
            return wordnet.ADJ
        elif nltk_tag.startswith('V'):
            return wordnet.VERB
        elif nltk_tag.startswith('N'):
            return wordnet.NOUN
        elif nltk_tag.startswith('R'):
            return wordnet.ADV
        else:         
            return None
        
    # remove stop words and punctuations, then lower case
    doc_norm = [tok.lower() for tok in word_tokenize(doc) if ((tok.isalpha()) & (tok not in stop_words)) ]

    # creates list of tuples with tokens and POS tags in wordnet format
    wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tag(doc_norm))) 
    doc_norm = [wnl.lemmatize(token, pos) for token, pos in wordnet_tagged if pos is not None]
    
    return doc_norm

### Applying text Tokenization/Normalization to whole body of df

In [54]:
fully_normalized_corpus = df['text_'].apply(process_doc)

In [55]:
fully_normalized_corpus.head()

0    [love, well, make, sturdy, comfortable, i, lov...
1    [love, great, upgrade, original, i, mine, coup...
2    [pillow, save, back, i, love, look, feel, pillow]
3    [miss, information, use, great, product, price...
4         [very, nice, set, good, quality, set, month]
Name: text_, dtype: object

In [60]:
flattened_fully_norm = pd.Series(list(itertools.chain(*fully_normalized_corpus)))
len(flattened_fully_norm.unique())

31587

In [57]:
flattened_fully_norm

0                 love
1                 well
2                 make
3               sturdy
4          comfortable
              ...     
1272945    comfortable
1272946           shoe
1272947           wear
1272948           walk
1272949            day
Length: 1272950, dtype: object

In [59]:
# flattening the lists
fnc_output = fully_normalized_corpus.apply(" ".join)
fnc_output

0        love well make sturdy comfortable i love very ...
1           love great upgrade original i mine couple year
2                 pillow save back i love look feel pillow
3               miss information use great product price i
4                     very nice set good quality set month
                               ...                        
40427    i read review say bra run small i order band c...
40428    i sure exactly little large small size i think...
40429    wear hood wear hood wear jacket hood system re...
40430    i like nothing dress reason i give star i orde...
40431    i work wed industry work long day foot outside...
Name: text_, Length: 40432, dtype: object