In [1]:
import pandas as pd
import numpy as np
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides library warnings raised by the cells below — consider narrowing the filter.
warnings.filterwarnings('ignore') 

In [2]:
# Load the review dataset; the cells below read the `reviews.text` and
# `reviews.doRecommend` columns from it.
data = pd.read_csv("modified_data.csv")
# Preview the first rows (head() defaults to five).
data.head()

Unnamed: 0.1,Unnamed: 0,reviews.text,reviews.doRecommend
0,0,"I love reading my books on this, and the Alexa...",True
1,1,It's good to have this box if you like to watc...,True
2,2,"It is ok to be a ""android"" tablet with such a ...",False
3,3,The battery on this device cannot handle true ...,True
4,4,I bought 2 of these on Black Friday and these ...,False


# Preparing DataSet

In [3]:
# Class balance of the raw recommendation flag before it is encoded as the target.
data["reviews.doRecommend"].value_counts()

True     11170
False     1384
Name: reviews.doRecommend, dtype: int64

In [4]:
def fake(dorecommend):
    """Encode a recommendation flag as the binary target.

    Returns 0 when `dorecommend` compares equal to True and 1 for every
    other value.
    """
    # Equality rather than identity: anything that compares equal to True maps to 0.
    return 0 if dorecommend == True else 1

In [5]:
# Features are the raw review texts; the target is the recommendation flag
# encoded by fake() (0 = recommended, 1 = not recommended).
x=data["reviews.text"]
y=data["reviews.doRecommend"].apply(fake)

In [6]:
# Sanity check: class counts of the encoded target.
y.value_counts()

0    11170
1     1384
Name: reviews.doRecommend, dtype: int64

# Preparing Training and Test data

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 40% of the reviews for testing; the fixed random_state keeps the split repeatable.
# NOTE(review): the split is not stratified although the classes are imbalanced — consider stratify=y.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.4,random_state = 0)

# Feature Extraction of Text

In [8]:
import nltk
import string
from nltk.corpus import stopwords

# Fetch every NLTK resource this notebook relies on, not just the stop-word list:
# nltk.word_tokenize needs the Punkt tokenizer models and WordNetLemmatizer needs
# the WordNet corpus, so a fresh environment would otherwise fail with LookupError.
# NOTE(review): newer NLTK releases load 'punkt_tab' instead of 'punkt'; both are
# requested so either version finds its data — confirm against the installed NLTK.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)

def remove_punctuation_marks(text):
    """Return `text` with every character listed in `string.punctuation` deleted."""
    # The third argument of str.maketrans names the characters to drop outright.
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)

def get_lemmatized_tokens(text) :
    """Tokenize `text` and return the list of lemmatized tokens.

    The text is lower-cased, stripped of punctuation with
    remove_punctuation_marks, split with nltk.word_tokenize, and each token
    is passed through WordNetLemmatizer.lemmatize.
    """
    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = nltk.stem.WordNetLemmatizer()
    normalized_tokens = nltk.word_tokenize(remove_punctuation_marks(text.lower()))
    return [lemmatizer.lemmatize(normalized_token) for normalized_token in normalized_tokens]
    


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ranti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer that tokenizes with get_lemmatized_tokens and drops NLTK's
# English stop words.
# NOTE(review): the tokenizer strips punctuation before tokenizing, so apostrophe
# stop words such as "you're" can never match a token — consider normalizing the
# stop list with the same tokenizer.
tfidf_transformer = TfidfVectorizer(
    tokenizer=get_lemmatized_tokens,
    stop_words=stopwords.words('english'),
    lowercase=True,
)
# Learn the vocabulary on the training texts only, then inspect the matrix size.
x_train_tfidf = tfidf_transformer.fit_transform(x_train)
x_train_tfidf.shape

(7532, 7389)

# Classification

In [10]:
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

# MultinomialNB

In [11]:
from sklearn.naive_bayes import MultinomialNB 
# TF-IDF features feeding a multinomial naive Bayes classifier.
tfidf_multiNB_pipe = Pipeline(steps=[
    ("tfidf", tfidf_transformer),
    ("clf_nominalNB", MultinomialNB()),
])
tfidf_multiNB_pipe.fit(x_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function get_lemmatized_tokens at 0x000000000B9E03A0>)),
                ('clf_nominalNB', MultinomialNB())])

In [12]:
import numpy as np
# Hard class predictions for the held-out reviews, then plain accuracy
# (the share of predictions that match the true label).
prediction = tfidf_multiNB_pipe.predict(x_test)
(prediction == y_test).mean()

0.8870967741935484

In [13]:
# Per-class precision/recall/F1 for the naive Bayes predictions; accuracy alone
# is misleading here because class 0 dominates the test set.
print(classification_report(y_test, 
                            prediction))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94      4454
           1       1.00      0.00      0.00       568

    accuracy                           0.89      5022
   macro avg       0.94      0.50      0.47      5022
weighted avg       0.90      0.89      0.83      5022



In [14]:
# Distribution of predicted labels — shows how often the model predicts the minority class (1).
pd.DataFrame(prediction).value_counts()

0    5021
1       1
dtype: int64

# Linear SVC

In [15]:
from sklearn.svm import LinearSVC

# TF-IDF features feeding a linear support-vector classifier.
clf_svm_pipe = Pipeline(steps=[
    ("vect", tfidf_transformer),
    ("clf_svc", LinearSVC()),
])
clf_svm_pipe.fit(x_train, y_train)

# Predict the held-out reviews and report plain accuracy.
prediction = clf_svm_pipe.predict(x_test)
(prediction == y_test).mean()

0.9093986459577857

In [16]:
# Per-class precision/recall/F1 for the LinearSVC predictions.
print(classification_report(y_test, 
                            prediction))

              precision    recall  f1-score   support

           0       0.93      0.98      0.95      4454
           1       0.67      0.39      0.49       568

    accuracy                           0.91      5022
   macro avg       0.80      0.68      0.72      5022
weighted avg       0.90      0.91      0.90      5022



In [17]:
# Distribution of the labels predicted by the LinearSVC pipeline.
pd.DataFrame(prediction).value_counts()

0    4693
1     329
dtype: int64

# Logistic Regression

In [18]:
from sklearn.linear_model import LogisticRegression

# TF-IDF features feeding a logistic-regression classifier.
clf_logReg_pipe = Pipeline(steps=[
    ("vect", tfidf_transformer),
    ("clf_logReg", LogisticRegression()),
])
clf_logReg_pipe.fit(x_train, y_train)

# Predict the held-out reviews and report plain accuracy.
prediction = clf_logReg_pipe.predict(x_test)
(prediction == y_test).mean()

0.9026284348864994

In [19]:
# Per-class precision/recall/F1 for the logistic-regression predictions.
print(classification_report(y_test, 
                            prediction))

              precision    recall  f1-score   support

           0       0.90      1.00      0.95      4454
           1       0.83      0.17      0.29       568

    accuracy                           0.90      5022
   macro avg       0.87      0.58      0.62      5022
weighted avg       0.90      0.90      0.87      5022



In [20]:
# Distribution of the labels predicted by the logistic-regression pipeline.
pd.DataFrame(prediction).value_counts()

0    4903
1     119
dtype: int64

# Decision Tree Classifier

In [21]:
from sklearn.tree import DecisionTreeClassifier 
# TF-IDF features feeding a decision tree. random_state is pinned (same seed as
# the train/test split) because tree construction permutes the candidate features
# at each split, so an unseeded tree gives different metrics on every run.
clf_decision_pipe = Pipeline([
    ("vect", tfidf_transformer),
    ("clf_dec", DecisionTreeClassifier(random_state=0)),
])
clf_decision_pipe.fit(x_train, y_train)

# Predict the held-out reviews and report plain accuracy.
prediction = clf_decision_pipe.predict(x_test)
np.mean(prediction == y_test)

0.8665870171246516

In [22]:
# Per-class precision/recall/F1 for the decision-tree predictions.
print(classification_report(y_test, 
                            prediction))

              precision    recall  f1-score   support

           0       0.92      0.93      0.93      4454
           1       0.40      0.36      0.38       568

    accuracy                           0.87      5022
   macro avg       0.66      0.65      0.65      5022
weighted avg       0.86      0.87      0.86      5022



In [23]:
# Distribution of the labels predicted by the decision-tree pipeline.
pd.DataFrame(prediction).value_counts()

0    4506
1     516
dtype: int64

# Random Forest Classifier

In [24]:
from sklearn.ensemble import RandomForestClassifier

# TF-IDF features feeding a random forest. random_state is pinned (same seed as
# the train/test split) because bootstrap sampling and per-split feature sampling
# are random, so an unseeded forest gives different metrics on every run.
clf_random_pipe = Pipeline([
    ("vect", tfidf_transformer),
    ("clf_ran", RandomForestClassifier(random_state=0)),
])
clf_random_pipe.fit(x_train, y_train)

# Predict the held-out reviews and report plain accuracy.
prediction = clf_random_pipe.predict(x_test)
np.mean(prediction == y_test)

0.8928713659896456

In [25]:
# Per-class precision/recall/F1 for the random-forest predictions.
print(classification_report(y_test, 
                            prediction))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94      4454
           1       0.73      0.08      0.15       568

    accuracy                           0.89      5022
   macro avg       0.81      0.54      0.55      5022
weighted avg       0.88      0.89      0.85      5022



In [26]:
# Distribution of the labels predicted by the random-forest pipeline.
pd.DataFrame(prediction).value_counts()

0    4958
1      64
dtype: int64

# Final Model

In [27]:
# Final pipeline that gets persisted: TF-IDF features plus a decision tree.
# random_state is pinned so that re-running the notebook rebuilds the same tree;
# an unseeded DecisionTreeClassifier can produce a different model on each fit.
model = Pipeline([
    ("vect", tfidf_transformer),
    ("clf_dec", DecisionTreeClassifier(random_state=0)),
])
model.fit(x_train, y_train)

Pipeline(steps=[('vect',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function get_lemmatized_tokens at 0x000000000B9E03A0>)),
                ('clf_dec', DecisionTreeClassifier())])

In [28]:
import joblib

In [29]:
# Persist the fitted pipeline to disk.
# NOTE(review): the vectorizer's tokenizer is the notebook-level function
# get_lemmatized_tokens, which pickling stores by reference rather than by value;
# whatever loads "fake.pkl" must be able to resolve that function under the same
# name — confirm the consumer defines or imports it before loading.
joblib.dump(model, "fake.pkl")

['fake.pkl']