## Importing Packages

In [99]:
import os
import sys
import pandas as pd
import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
import matplotlib.pyplot as plt
import string
import re
import numpy as np
import itertools
from nltk import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import plot_confusion_matrix, recall_score,\
    accuracy_score, precision_score, f1_score, classification_report
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
import xgboost

ModuleNotFoundError: No module named 'xgboost'

In [2]:
df = pd.read_csv('fake reviews dataset.csv')
df.head()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


## Data Undersatding

In [3]:
df['category'].value_counts()

Kindle_Store_5                  4730
Books_5                         4370
Pet_Supplies_5                  4254
Home_and_Kitchen_5              4056
Electronics_5                   3988
Sports_and_Outdoors_5           3946
Tools_and_Home_Improvement_5    3858
Clothing_Shoes_and_Jewelry_5    3848
Toys_and_Games_5                3794
Movies_and_TV_5                 3588
Name: category, dtype: int64

In [4]:
df.describe()

Unnamed: 0,rating
count,40432.0
mean,4.256579
std,1.144354
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


In [73]:
df['target'] = np.where(df['label'] == 'CG', 1, 0)

In [74]:
df

Unnamed: 0,category,rating,label,text_,text_preproccesed,target
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor...",love well make sturdy comfortable i love very ...,1
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I...",love great upgrade original i mine couple year,1
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...,this pillow save back i love look feel pillow,1
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i...",miss information use great product price i,1
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...,very nice set good quality we set two month,1
...,...,...,...,...,...,...
40427,Clothing_Shoes_and_Jewelry_5,4.0,OR,I had read some reviews saying that this bra r...,i read review say bra run small i order two ba...,0
40428,Clothing_Shoes_and_Jewelry_5,5.0,CG,I wasn't sure exactly what it would be. It is ...,i sure exactly would it little large small siz...,1
40429,Clothing_Shoes_and_Jewelry_5,2.0,OR,"You can wear the hood by itself, wear it with ...",you wear hood wear hood wear jacket without ho...,0
40430,Clothing_Shoes_and_Jewelry_5,1.0,CG,I liked nothing about this dress. The only rea...,i like nothing dress the reason i give star i ...,1


## Text Preprocessing: Tokenization

In [32]:
#tokenizing the text data in the 'text_' column of df
def tokenizer(x):
    
    corpus = [word_tokenize(doc) for doc in x]

# getting common stop words in english that we'll remove during tokenization/text normalization
    stop_words = stopwords.words('english')
    corpus_no_stopwords = []
    for words in corpus:
        docs = [x.lower() for x in words if ((x.isalpha()) & (x not in stop_words))]
        corpus_no_stopwords.append(docs)
    return corpus_no_stopwords

## Lemmantizer

In [62]:
def lemmatizer(corpus, as_string=True):
    lem = WordNetLemmatizer()
    
    def pos_tagger(nltk_tag):
        if nltk_tag.startswith('J'):
            return wordnet.ADJ
        elif nltk_tag.startswith('V'):
            return wordnet.VERB
        elif nltk_tag.startswith('N'):
            return wordnet.NOUN
        elif nltk_tag.startswith('R'):
            return wordnet.ADV
        else:         
            return None
    lemmatized_corpus = []
    for sentence in corpus:
        pos_tags = pos_tag(sentence)
        lemmatized_sentence = []
        for word, tag in pos_tags:
            pos = pos_tagger(tag)
            if pos is not None:
                lemmatized_word = lem.lemmatize(word, pos)
            else:
                lemmatized_word = lem.lemmatize(word)
            lemmatized_sentence.append(lemmatized_word)
        lemmatized_corpus.append(lemmatized_sentence)
    if as_string:
        lemmatized_corpus  = [' '.join(x) for x in lemmatized_corpus]
    return lemmatized_corpus
        

In [27]:
corpus_tokenized = tokenizer(df['text_'])

In [63]:
lemmatized_corpus = lemmatizer(corpus_tokenized)

## Pre vectorizing

In [41]:
joined_lemm_corpus = [' '.join(x) for x in lemmatized_corpus]
df['text_preproccesed'] = pd.Series(data=lemmatized_corpus)

In [65]:
vec = CountVectorizer(min_df = 0.05, max_df = 0.95)
X = vec.fit_transform(lemmatized_corpus)
countvec_df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())

In [68]:
tfidf = TfidfVectorizer(min_df = 0.05, max_df = 0.95)
Y = tfidf.fit_transform(lemmatized_corpus)
tfidf_df = pd.DataFrame(Y.toarray(), columns=tfidf.get_feature_names())

In [69]:
tfidf_df

Unnamed: 0,also,anyone,best,big,bit,book,buy,ca,character,come,...,try,two,use,want,way,well,work,would,write,year
0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.365431,0.000000,0.000000,0.0,0.000000
1,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.728227
2,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
3,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.436464,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
4,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.000000,...,0.000000,0.350925,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40427,0.000000,0.0,0.112657,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.103073,...,0.115232,0.206830,0.000000,0.099786,0.000000,0.075666,0.000000,0.077660,0.0,0.000000
40428,0.000000,0.0,0.000000,0.091199,0.000000,0.0,0.13296,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.061434,0.067497,0.252213,0.0,0.000000
40429,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.000000,...,0.174608,0.000000,0.000000,0.000000,0.000000,0.114655,0.000000,0.117676,0.0,0.000000
40430,0.064548,0.0,0.000000,0.000000,0.145785,0.0,0.11615,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.107335,0.000000,0.055082,0.0,0.000000


In [45]:
X

<40432x68 sparse matrix of type '<class 'numpy.int64'>'
	with 329640 stored elements in Compressed Sparse Row format>

In [75]:
X_train, X_test, y_train,y_test = train_test_split(df['text_'],df['target'], test_size=0.3,random_state=42)

In [81]:
X_train_preprocessed = lemmatizer(tokenizer(X_train))
X_test_preprocessed = lemmatizer(tokenizer(X_test))

In [91]:
steps = [('countvec',CountVectorizer(min_df = 0.05, max_df = 0.95)),('rfc',RandomForestClassifier(n_estimators=200))]
pipe = Pipeline(steps)
pipe.fit(X_train_preprocessed, y_train)
y_pred = pipe.predict(X_test_preprocessed)

In [None]:
steps = [('tfidfvec',TfidfVectorizer(min_df = 0.05, max_df = 0.95)),('rfc',RandomForestClassifier(n_estimators=200))]
pipe = Pipeline(steps)
pipe.fit(X_train_preprocessed, y_train)
y_pred = pipe.predict(X_test_preprocessed)

In [93]:
steps = [('tfidfvec',TfidfVectorizer(min_df = 0.05, max_df = 0.95)),('logreg',LogisticRegression())]
pipe = Pipeline(steps)
pipe.fit(X_train_preprocessed, y_train)
y_pred = pipe.predict(X_test_preprocessed)

In [None]:
steps = [('tfidfvec',TfidfVectorizer(min_df = 0.05, max_df = 0.95)),('logreg',LogisticRegression())]
pipe = Pipeline(steps)
pipe.fit(X_train_preprocessed, y_train)
y_pred = pipe.predict(X_test_preprocessed)

In [98]:
steps = [('tfidfvec',TfidfVectorizer(min_df = 0.05, max_df = 0.95)),('xgboost',xgboost.XGBClassifier(random_state=42, objective='binary:logistic'))]
pipe = Pipeline(steps)
pipe.fit(X_train_preprocessed, y_train)
y_pred = pipe.predict(X_test_preprocessed)

NameError: name 'xgboost' is not defined

In [97]:
classification_report(y_test,y_pred,output_dict=True)

{'0': {'precision': 0.7330609465440926,
  'recall': 0.7100923482849604,
  'f1-score': 0.7213938683196515,
  'support': 6064},
 '1': {'precision': 0.7189897698209718,
  'recall': 0.7415100560501154,
  'f1-score': 0.7300762863171564,
  'support': 6066},
 'accuracy': 0.7258037922506183,
 'macro avg': {'precision': 0.7260253581825322,
  'recall': 0.725801202167538,
  'f1-score': 0.7257350773184039,
  'support': 12130},
 'weighted avg': {'precision': 0.726024198151475,
  'recall': 0.7258037922506183,
  'f1-score': 0.7257357930989478,
  'support': 12130}}

In [53]:
flattenedcorpus_tokens = pd.Series(list(itertools.chain(*lemmatized_corpus)))
tokens_unique = pd.Series(flattenedcorpus_tokens.unique())

In [54]:
tokens_unique

0               love
1               well
2               make
3             sturdy
4        comfortable
            ...     
31966    gallbladder
31967        hippora
31968         hyvent
31969            dwr
31970        merrels
Length: 31971, dtype: object

In [None]:
 AdaBoostClassifier, GradientBoostingClassifier

### Dealing with Stop words + lowercase

In [9]:
len(tokens_no_stop_words)

47953

In [10]:
df['tok_norm'] = df['text_'].apply(first_step_normalizer)
df.head()

NameError: name 'first_step_normalizer' is not defined

In [None]:
norm_toks_flattened = pd.Series(list(
    itertools.chain(*df['tok_norm'])))
new_dictionary = norm_toks_flattened.unique()
print(len(new_dictionary))

In [None]:
print(len(dictionary))

- Process removed 22.500 features

## Text Preprocessing: Lemmatization

#### We created function which takes in untokenized document and returns fully normalized token list

In [None]:
def process_doc(doc):

    wnl = WordNetLemmatizer()

    def pos_tagger(nltk_tag):
        if nltk_tag.startswith('J'):
            return wordnet.ADJ
        elif nltk_tag.startswith('V'):
            return wordnet.VERB
        elif nltk_tag.startswith('N'):
            return wordnet.NOUN
        elif nltk_tag.startswith('R'):
            return wordnet.ADV
        else:         
            return None
        
    # remove stop words and punctuations, then lower case
    doc_norm = [tok.lower() for tok in word_tokenize(doc) if ((tok.isalpha()) & (tok not in stop_words)) ]

    # creates list of tuples with tokens and POS tags in wordnet format
    wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tag(doc_norm))) 
    doc_norm = [wnl.lemmatize(token, pos) for token, pos in wordnet_tagged if pos is not None]
    
    return doc_norm

### Applying text Tokenization/Normalization to whole body of df

In [None]:
fully_normalized_corpus = df['text_'].apply(process_doc)

In [None]:
fully_normalized_corpus.head()

In [None]:
flattened_fully_norm = pd.Series(list(itertools.chain(*fully_normalized_corpus)))
len(flattened_fully_norm.unique())

In [None]:
flattened_fully_norm

In [None]:
# flattening the lists
fnc_output = fully_normalized_corpus.apply(" ".join)
fnc_output