In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore') 

In [2]:
# Read the review dataset from disk and preview its first five rows.
data = pd.read_csv("modified_data.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,reviews.text,reviews.doRecommend
0,0,This product is fast a stylish. Good battery l...,True
1,1,Previous owner of an older kindle. Love the to...,True
2,2,"Super Slow, Would not download, Returned after...",False
3,3,Gave as a Christmas gift. Don't know how good ...,False
4,4,I LOVE THIS ECHO AND ITS HANDS FREE. LOUD SPEA...,True


In [3]:
# pop() removes the "Unnamed: 0" column from `data` in place and returns it,
# so the removed column is what this cell displays.
# NOTE(review): because the column is gone afterwards, running this cell a
# second time raises KeyError; the "Unnamed: 0.1" column is left untouched.
data.pop("Unnamed: 0")

0            0
1            1
2            2
3            3
4            4
         ...  
10944    10944
10945    10945
10946    10946
10947    10947
10948    10948
Name: Unnamed: 0, Length: 10949, dtype: int64

# Preparing the Dataset

In [4]:
# Count how many reviews fall into each recommendation class (True / False).
data["reviews.doRecommend"].value_counts()

True     9565
False    1384
Name: reviews.doRecommend, dtype: int64

In [5]:
def fake(dorecommend):
    """Map a recommendation flag to a binary label.

    Returns 0 when ``dorecommend`` compares equal to ``True`` and 1 for
    every other value.
    """
    return 0 if dorecommend == True else 1

In [6]:
# Features: the raw review texts.
x = data["reviews.text"]

# Labels: the recommendation flag converted to 0/1 by fake()
# (0 = recommended, 1 = not recommended).
y = data["reviews.doRecommend"].apply(fake)

In [7]:
# Check the class balance of the encoded 0/1 labels.
y.value_counts()

0    9565
1    1384
Name: reviews.doRecommend, dtype: int64

# Preparing Training and Test Data

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the reviews for evaluation; the fixed seed keeps the split
# identical from run to run.
# NOTE(review): the split is not stratified although the two classes are
# heavily imbalanced — consider passing stratify=y.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.4,
    random_state=0,
)

# Feature Extraction of Text

In [9]:
import nltk
import string
from nltk.corpus import stopwords

# Fetch every NLTK resource this notebook relies on, not only the stop-word
# list: nltk.word_tokenize needs the Punkt models and WordNetLemmatizer needs
# the WordNet corpus. "punkt_tab" and "omw-1.4" are only required by some NLTK
# releases; by default download() just reports a package it cannot find
# instead of raising, so listing them is harmless elsewhere.
for nltk_resource in ("stopwords", "punkt", "punkt_tab", "wordnet", "omw-1.4"):
    nltk.download(nltk_resource)

def remove_punctuation_marks(text):
    """Return ``text`` with every character of ``string.punctuation`` removed."""
    deletion_table = str.maketrans("", "", string.punctuation)
    return text.translate(deletion_table)

def get_lemmatized_tokens(text):
    """Tokenize ``text`` and return the lemma of every token.

    The text is lower-cased and stripped of punctuation before being split
    with ``nltk.word_tokenize``; each resulting token is then passed through
    a WordNet lemmatizer.

    Returns a list with one lemmatized string per token.
    """
    normalized_tokens = nltk.word_tokenize(remove_punctuation_marks(text.lower()))
    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [lemmatizer.lemmatize(normalized_token) for normalized_token in normalized_tokens]
    


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ranti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer that tokenizes with get_lemmatized_tokens and filters the
# NLTK English stop words.
# NOTE(review): the stop-word list contains apostrophes (e.g. "you're") while
# the tokenizer strips punctuation first, so those entries can never match a
# token — consider normalising the stop words with the same tokenizer.
tfidf_transformer = TfidfVectorizer(
    tokenizer=get_lemmatized_tokens,
    lowercase=True,
    stop_words=stopwords.words('english'),
)

# Learn the vocabulary on the training reviews only, then report the
# resulting (documents, terms) matrix shape.
x_train_tfidf = tfidf_transformer.fit_transform(x_train)
x_train_tfidf.shape

(6569, 7123)

# Classification

In [11]:
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

# MultinomialNB

In [12]:
from sklearn.naive_bayes import MultinomialNB

# Chain the TF-IDF vectorizer with a multinomial Naive Bayes classifier and
# fit both steps on the training reviews.
tfidf_multiNB_pipe = Pipeline(steps=[
    ("tfidf", tfidf_transformer),
    ("clf_nominalNB", MultinomialNB()),
])
tfidf_multiNB_pipe.fit(x_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 tokenizer=<function get_lemmatized_tokens at 0x000000000BBFB4C0>)),
                ('clf_nominalNB', MultinomialNB())])

In [13]:
import numpy as np
# Predict labels for the held-out reviews with the Naive Bayes pipeline.
prediction = tfidf_multiNB_pipe.predict(x_test)
# Mean of the element-wise matches = overall accuracy on the test set.
np.mean(prediction == y_test)

0.8778538812785388

In [14]:
# Per-class precision, recall and F1 for the most recent `prediction` array.
print(classification_report(y_true=y_test, y_pred=prediction))

              precision    recall  f1-score   support

           0       0.88      1.00      0.93      3843
           1       0.75      0.01      0.01       537

    accuracy                           0.88      4380
   macro avg       0.81      0.50      0.47      4380
weighted avg       0.86      0.88      0.82      4380



In [15]:
# Tally how many test reviews were assigned to each predicted class.
pd.DataFrame(prediction).value_counts()

0    4376
1       4
dtype: int64

# Linear SVC

In [16]:
from sklearn.svm import LinearSVC

# TF-IDF features feeding a linear support-vector classifier, fitted on the
# training reviews.
clf_svm_pipe = Pipeline(steps=[
    ("vect", tfidf_transformer),
    ("clf_svc", LinearSVC()),
])
clf_svm_pipe.fit(x_train, y_train)

# Share of held-out reviews whose predicted label equals the true label.
prediction = clf_svm_pipe.predict(x_test)
np.mean(prediction == y_test)

0.9031963470319635

In [17]:
# Per-class precision, recall and F1 for the most recent `prediction` array.
print(classification_report(y_true=y_test, y_pred=prediction))

              precision    recall  f1-score   support

           0       0.92      0.97      0.95      3843
           1       0.66      0.42      0.52       537

    accuracy                           0.90      4380
   macro avg       0.79      0.70      0.73      4380
weighted avg       0.89      0.90      0.89      4380



In [18]:
# Tally how many test reviews were assigned to each predicted class.
pd.DataFrame(prediction).value_counts()

0    4037
1     343
dtype: int64

# Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression

# TF-IDF features feeding a logistic-regression classifier, fitted on the
# training reviews.
clf_logReg_pipe = Pipeline(steps=[
    ("vect", tfidf_transformer),
    ("clf_logReg", LogisticRegression()),
])
clf_logReg_pipe.fit(x_train, y_train)

# Share of held-out reviews whose predicted label equals the true label.
prediction = clf_logReg_pipe.predict(x_test)
np.mean(prediction == y_test)

0.897945205479452

In [20]:
# Per-class precision, recall and F1 for the most recent `prediction` array.
print(classification_report(y_true=y_test, y_pred=prediction))

              precision    recall  f1-score   support

           0       0.90      0.99      0.94      3843
           1       0.82      0.22      0.34       537

    accuracy                           0.90      4380
   macro avg       0.86      0.60      0.64      4380
weighted avg       0.89      0.90      0.87      4380



In [21]:
# Tally how many test reviews were assigned to each predicted class.
pd.DataFrame(prediction).value_counts()

0    4238
1     142
dtype: int64

# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# TF-IDF features feeding a decision tree.
# The seed is fixed because the tree permutes candidate features at each
# split, so an unseeded tree can give different results from run to run;
# 0 matches the seed used for the train/test split.
clf_decision_pipe = Pipeline([
    ("vect", tfidf_transformer),
    ("clf_dec", DecisionTreeClassifier(random_state=0)),
])
clf_decision_pipe.fit(x_train, y_train)

# Share of held-out reviews whose predicted label equals the true label.
prediction = clf_decision_pipe.predict(x_test)
np.mean(prediction == y_test)

In [None]:
# Per-class precision, recall and F1 for the most recent `prediction` array.
print(classification_report(y_true=y_test, y_pred=prediction))

In [None]:
# Tally how many test reviews were assigned to each predicted class.
pd.DataFrame(prediction).value_counts()

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# TF-IDF features feeding a random forest.
# The seed is fixed because the forest bootstraps samples and subsamples
# features at random, so an unseeded forest is not reproducible; 0 matches
# the seed used for the train/test split.
clf_random_pipe = Pipeline([
    ("vect", tfidf_transformer),
    ("clf_ran", RandomForestClassifier(random_state=0)),
])
clf_random_pipe.fit(x_train, y_train)

# Share of held-out reviews whose predicted label equals the true label.
prediction = clf_random_pipe.predict(x_test)
np.mean(prediction == y_test)

In [None]:
# Per-class precision, recall and F1 for the most recent `prediction` array.
print(classification_report(y_true=y_test, y_pred=prediction))

In [None]:
# Tally how many test reviews were assigned to each predicted class.
pd.DataFrame(prediction).value_counts()

# Final Model

In [None]:
# Final pipeline to be persisted: TF-IDF features followed by a decision tree.
# random_state is fixed so the saved model can be reproduced exactly.
# NOTE(review): the decision-tree evaluation cells in this notebook have no
# recorded output, so this choice is not backed by metrics shown here —
# confirm before shipping.
model = Pipeline([
    ("vect", tfidf_transformer),
    ("clf_dec", DecisionTreeClassifier(random_state=0)),
])
model.fit(x_train, y_train)

In [None]:
import joblib

In [None]:
# Persist the fitted pipeline to "fake.pkl" in the working directory.
# NOTE(review): the pipeline's vectorizer holds a reference to the
# notebook-defined get_lemmatized_tokens function; pickling stores functions
# by name, so whatever loads this file presumably needs that function
# importable under the same module path — verify in the consuming code.
joblib.dump(model, "fake.pkl")