In [2]:
import time
import pandas as pd
import numpy as np
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
pd.set_option('max_colwidth', None)

In [3]:
df = pd.read_csv('fake reviews dataset.csv', names=['category', 'rating', 'label', 'text'])

In [4]:
df.head()

Unnamed: 0,category,rating,label,text
0,category,rating,label,text_
1,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfortable. I love it!Very pretty"
2,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I've had mine for a couple of years"
3,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and feel of this pillow.
4,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it is a great product for the price! I"


In [5]:
df['label'].value_counts()

label
OR       20358
CG       20236
label        1
Name: count, dtype: int64

In [6]:
df['text'] = df['text'].str.replace('\n', ' ')
df['target'] = np.where(df['label']=='CG', 1, 0)
df['target'].value_counts()

target
0    20359
1    20236
Name: count, dtype: int64

In [7]:
def punctuation_to_features(df, column):
    """Identify punctuation within a column and convert to a text representation.
    
    Args:
        df (object): Pandas dataframe.
        column (string): Name of column containing text. 
        
    Returns:
        df[column]: Original column with punctuation converted to text, 
                    i.e. "Wow! > "Wow exclamation"
    
    """
    
    df[column] = df[column].replace('!', ' exclamation ')
    df[column] = df[column].replace('?', ' question ')
    df[column] = df[column].replace('\'', ' quotation ')
    df[column] = df[column].replace('\"', ' quotation ')
    
    return df[column]

In [8]:
df['text'] = punctuation_to_features(df, 'text')

In [9]:
#Tokenize the data

In [10]:
nltk.download('punkt');

def tokenize(column):
    """Tokenizes a Pandas dataframe column and returns a list of tokens.
    
    Args:
        column: Pandas dataframe column (i.e. df['text']).
    
    Returns:
        tokens (list): Tokenized list, i.e. [Donald, Trump, tweets]
    
    """
    
    tokens = nltk.word_tokenize(column)
    return [w for w in tokens if w.isalpha()]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shashank.shetty\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
df['tokenized'] = df.apply(lambda x: tokenize(x['text']), axis=1)
df.head()

Unnamed: 0,category,rating,label,text,target,tokenized
0,category,rating,label,text_,0,[]
1,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfortable. I love it!Very pretty",1,"[Love, this, Well, made, sturdy, and, very, comfortable, I, love, it, Very, pretty]"
2,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I've had mine for a couple of years",1,"[love, it, a, great, upgrade, from, the, original, I, had, mine, for, a, couple, of, years]"
3,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and feel of this pillow.,1,"[This, pillow, saved, my, back, I, love, the, look, and, feel, of, this, pillow]"
4,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it is a great product for the price! I",1,"[Missing, information, on, how, to, use, it, but, it, is, a, great, product, for, the, price, I]"


In [12]:
#Stopword removal

In [13]:
nltk.download('stopwords');

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shashank.shetty\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
def remove_stopwords(tokenized_column):
    """Return a list of tokens with English stopwords removed. 
    
    Args:
        column: Pandas dataframe column of tokenized data from tokenize()
    
    Returns:
        tokens (list): Tokenized list with stopwords removed.
    
    """
    stops = set(stopwords.words("english"))
    return [word for word in tokenized_column if not word in stops]

In [15]:
df['stopwords_removed'] = df.apply(lambda x: remove_stopwords(x['tokenized']), axis=1)
df.head()

Unnamed: 0,category,rating,label,text,target,tokenized,stopwords_removed
0,category,rating,label,text_,0,[],[]
1,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfortable. I love it!Very pretty",1,"[Love, this, Well, made, sturdy, and, very, comfortable, I, love, it, Very, pretty]","[Love, Well, made, sturdy, comfortable, I, love, Very, pretty]"
2,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I've had mine for a couple of years",1,"[love, it, a, great, upgrade, from, the, original, I, had, mine, for, a, couple, of, years]","[love, great, upgrade, original, I, mine, couple, years]"
3,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and feel of this pillow.,1,"[This, pillow, saved, my, back, I, love, the, look, and, feel, of, this, pillow]","[This, pillow, saved, back, I, love, look, feel, pillow]"
4,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it is a great product for the price! I",1,"[Missing, information, on, how, to, use, it, but, it, is, a, great, product, for, the, price, I]","[Missing, information, use, great, product, price, I]"


In [16]:
#Apply Porter stemming

In [17]:
def apply_stemming(tokenized_column):
    """Return a list of tokens with Porter stemming applied.
    
    Args:
        column: Pandas dataframe column of tokenized data with stopwords removed.
    
    Returns:
        tokens (list): Tokenized list with words Porter stemmed.
    
    """
    
    stemmer = PorterStemmer() 
    return [stemmer.stem(word).lower() for word in tokenized_column]

In [18]:
df['porter_stemmed'] = df.apply(lambda x: apply_stemming(x['stopwords_removed']), axis=1)
df.head()

Unnamed: 0,category,rating,label,text,target,tokenized,stopwords_removed,porter_stemmed
0,category,rating,label,text_,0,[],[],[]
1,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfortable. I love it!Very pretty",1,"[Love, this, Well, made, sturdy, and, very, comfortable, I, love, it, Very, pretty]","[Love, Well, made, sturdy, comfortable, I, love, Very, pretty]","[love, well, made, sturdi, comfort, i, love, veri, pretti]"
2,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I've had mine for a couple of years",1,"[love, it, a, great, upgrade, from, the, original, I, had, mine, for, a, couple, of, years]","[love, great, upgrade, original, I, mine, couple, years]","[love, great, upgrad, origin, i, mine, coupl, year]"
3,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and feel of this pillow.,1,"[This, pillow, saved, my, back, I, love, the, look, and, feel, of, this, pillow]","[This, pillow, saved, back, I, love, look, feel, pillow]","[thi, pillow, save, back, i, love, look, feel, pillow]"
4,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it is a great product for the price! I",1,"[Missing, information, on, how, to, use, it, but, it, is, a, great, product, for, the, price, I]","[Missing, information, use, great, product, price, I]","[miss, inform, use, great, product, price, i]"


In [19]:
#Rejoin words

In [20]:
def rejoin_words(tokenized_column):
    return ( " ".join(tokenized_column))

In [21]:
df['all_text'] = df.apply(lambda x: rejoin_words(x['porter_stemmed']), axis=1)
df[['all_text']].head()

Unnamed: 0,all_text
0,
1,love well made sturdi comfort i love veri pretti
2,love great upgrad origin i mine coupl year
3,thi pillow save back i love look feel pillow
4,miss inform use great product price i


In [22]:
X = df['all_text']
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, shuffle=True)

In [23]:
classifiers = {}
classifiers.update({"XGBClassifier": XGBClassifier(eval_metric='logloss',objective='binary:logistic',)})
classifiers.update({"CatBoostClassifier": CatBoostClassifier(silent=True)})
classifiers.update({"LinearSVC": LinearSVC()})
classifiers.update({"MultinomialNB": MultinomialNB()})
classifiers.update({"LGBMClassifier": LGBMClassifier()})
classifiers.update({"RandomForestClassifier": RandomForestClassifier()})
classifiers.update({"DecisionTreeClassifier": DecisionTreeClassifier()})
classifiers.update({"ExtraTreeClassifier": ExtraTreeClassifier()})
classifiers.update({"AdaBoostClassifier": AdaBoostClassifier()})
classifiers.update({"KNeighborsClassifier": KNeighborsClassifier()})
classifiers.update({"RidgeClassifier": RidgeClassifier()})
classifiers.update({"SGDClassifier": SGDClassifier()})
classifiers.update({"BaggingClassifier": BaggingClassifier()})
classifiers.update({"BernoulliNB": BernoulliNB()})

In [24]:
df_models = pd.DataFrame(columns=['model', 'run_time', 'roc_auc', 'roc_auc_std'])

for key in classifiers:
    
    start_time = time.time()
    pipeline = Pipeline([("tfidf", TfidfVectorizer()), ("clf", classifiers[key] )])
    cv = cross_val_score(pipeline, X, y, cv=5, scoring='roc_auc')

    row = {'model': key,
           'run_time': format(round((time.time() - start_time)/60,2)),
           'roc_auc': cv.mean(),
           'roc_auc_std': cv.std(),
    }
    
    df_models = pd.concat([df_models, pd.DataFrame(row, index=[0])], ignore_index=True)
    
df_models = df_models.sort_values(by='roc_auc', ascending=False)

In [25]:
df_models


Unnamed: 0,model,run_time,roc_auc,roc_auc_std
11,SGDClassifier,0.05,0.924304,0.009017
1,CatBoostClassifier,11.14,0.922078,0.01046
2,LinearSVC,0.06,0.921062,0.013086
10,RidgeClassifier,0.06,0.920814,0.013822
4,LGBMClassifier,0.28,0.917614,0.011333
0,XGBClassifier,0.49,0.914757,0.010398
5,RandomForestClassifier,8.13,0.912112,0.015514
3,MultinomialNB,0.05,0.900591,0.02019
12,BaggingClassifier,16.78,0.857346,0.011597
8,AdaBoostClassifier,0.66,0.842195,0.021487


In [26]:
top_ten = df_models["model"].head(10).values.tolist()
top_ten

['SGDClassifier',
 'CatBoostClassifier',
 'LinearSVC',
 'RidgeClassifier',
 'LGBMClassifier',
 'XGBClassifier',
 'RandomForestClassifier',
 'MultinomialNB',
 'BaggingClassifier',
 'AdaBoostClassifier']

In [27]:
#Assess the selected model


In [28]:
for key in top_ten:
    pipeline = Pipeline([("tfidf", TfidfVectorizer()), ("clf", classifiers[key])])
    pipeline.fit(X_train, y_train)
    model_name = f"MODEL/{key}_model.pkl"
    pickle.dump(pipeline, open(model_name,'wb'))
#y_pred = pipeline.predict(X_test)

In [29]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

for i in top_ten:
    pipeline = pickle.load(open(f"MODEL/{i}_model.pkl", 'rb'))
    y_pred = pipeline.predict(X_test)
    print("----------------------------")
    print(i)
    print("----------------------------")
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    print('Accuracy:', acc)
    print('Precision:', prec)
    print('Recall:', rec)
    print('ROC/AUC:', roc_auc)


----------------------------
SGDClassifier
----------------------------
Accuracy: 0.8688726496428278
Precision: 0.8859289617486339
Recall: 0.8481281674023214
ROC/AUC: 0.868966756086512
----------------------------
CatBoostClassifier
----------------------------
Accuracy: 0.8629608342228426
Precision: 0.876056814338857
Recall: 0.8469838155958803
ROC/AUC: 0.8630333132746806
----------------------------
LinearSVC
----------------------------
Accuracy: 0.8747023565153133
Precision: 0.8737990555284155
Recall: 0.8772273990518228
ROC/AUC: 0.8746909017693953
----------------------------
RidgeClassifier
----------------------------
Accuracy: 0.8745381394203137
Precision: 0.8769508789222934
Recall: 0.8726499918260585
ROC/AUC: 0.8745467049199578
----------------------------
LGBMClassifier
----------------------------
Accuracy: 0.8560637162328598
Precision: 0.860899768441945
Recall: 0.8509073075036783
ROC/AUC: 0.8560871080573489
----------------------------
XGBClassifier
--------------------------