In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
import string, nltk
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\16692\AppData\Roaming\nltk_data...


True

In [2]:
df = pd.read_csv('fake reviews dataset.csv')
df.head()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


In [3]:
df.dropna(inplace=True)

In [8]:
def clean_text(text):
    nopunc = [w for w in text if w not in string.punctuation]
    nopunc = ''.join(nopunc)
    return  ' '.join([word for word in nopunc.split() if word.lower() not in stopwords.words('english')])

In [9]:
df['text_'][0], clean_text(df['text_'][0])

('Love this!  Well made, sturdy, and very comfortable.  I love it!Very pretty',
 'Love Well made sturdy comfortable love itVery pretty')

In [10]:
df['text_'] = df['text_'].astype(str)

In [11]:
def preprocess(text):
    return ' '.join([word for word in word_tokenize(text) if word not in stopwords.words('english') and not word.isdigit() and word not in string.punctuation])

In [12]:
df['text_'][:10000] = df['text_'][:10000].apply(preprocess)

In [13]:
df['text_'][10001:20000] = df['text_'][10001:20000].apply(preprocess)

In [14]:
df['text_'][20001:30000] = df['text_'][20001:30000].apply(preprocess)

In [15]:
df['text_'][30001:40000] = df['text_'][30001:40000].apply(preprocess)

In [16]:
df['text_'][40001:40432] = df['text_'][40001:40432].apply(preprocess)

In [17]:
df['text_'] = df['text_'].str.lower()

In [18]:
stemmer = PorterStemmer()
def stem_words(text):
    return ' '.join([stemmer.stem(word) for word in text.split()])
df['text_'] = df['text_'].apply(lambda x: stem_words(x))

In [19]:
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
df["text_"] = df["text_"].apply(lambda text: lemmatize_words(text))

In [20]:
df.head()

Unnamed: 0,category,rating,label,text_,length
0,Home_and_Kitchen_5,5.0,CG,love well made sturdi comfort i love veri pretti,75
1,Home_and_Kitchen_5,5.0,CG,love great upgrad origin i 've mine coupl year,80
2,Home_and_Kitchen_5,5.0,CG,thi pillow save back i love look feel pillow,67
3,Home_and_Kitchen_5,1.0,CG,miss inform use great product price i,81
4,Home_and_Kitchen_5,5.0,CG,veri nice set good qualiti we set two month,85


In [21]:
df.dropna(inplace=True)

In [22]:
df['length'] = df['text_'].apply(len)

In [23]:
def text_process(review):
    nopunc = [char for char in review if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [24]:
bow_transformer = CountVectorizer(analyzer=text_process)
bow_transformer.fit(df['text_'])
print("Total Vocabulary:",len(bow_transformer.vocabulary_))

Total Vocabulary: 34450


In [32]:
import pickle

with open('bow.pkl', 'wb') as outp:
    pickle.dump(bow_transformer, outp, pickle.HIGHEST_PROTOCOL)


In [26]:
bow_reviews = bow_transformer.transform(df['text_'])

In [29]:
pd.DataFrame(bow_reviews).to_csv("bagofwords.csv")

In [30]:
print("Shape of Bag of Words Transformer for the entire reviews corpus:",bow_reviews.shape)
print("Amount of non zero values in the bag of words model:",bow_reviews.nnz)

Shape of Bag of Words Transformer for the entire reviews corpus: (40432, 34450)
Amount of non zero values in the bag of words model: 1013898


In [33]:
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(bow_reviews)

with open('tfidf.pkl', 'wb') as outpidf:
    pickle.dump(tfidf_transformer, outpidf, pickle.HIGHEST_PROTOCOL)




<40432x34450 sparse matrix of type '<class 'numpy.float64'>'
	with 1013898 stored elements in Compressed Sparse Row format>

In [34]:
tfidf_reviews = tfidf_transformer.transform(bow_reviews)
print("Shape:",tfidf_reviews.shape)
print("No. of Dimensions:",tfidf_reviews.ndim)

Shape: (40432, 34450)
No. of Dimensions: 2


In [56]:
review_train, review_test, label_train, label_test = train_test_split(df['text_'],df['label'],test_size=0.35)

In [38]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('tfidf',TfidfTransformer()),
    ('classifier',RandomForestClassifier())
])

In [39]:
pipeline.fit(review_train,label_train)

Pipeline(steps=[('bow',
                 CountVectorizer(analyzer=<function text_process at 0x000002783CAFE790>)),
                ('tfidf', TfidfTransformer()),
                ('classifier', RandomForestClassifier())])

In [40]:
rfc_pred = pipeline.predict(review_test)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print('Classification Report:',classification_report(label_test,rfc_pred))
print('Confusion Matrix:',confusion_matrix(label_test,rfc_pred))
print('Accuracy Score:',accuracy_score(label_test,rfc_pred))
print('Model Prediction Accuracy:',str(np.round(accuracy_score(label_test,rfc_pred)*100,2)) + '%')

In [43]:
import pickle

with open('pipeline.pkl', 'wb') as out_pipeline:
    pickle.dump(pipeline, out_pipeline, pickle.HIGHEST_PROTOCOL)


In [44]:
import joblib
joblib.dump(pipeline,"pipeline2.sav")

['pipeline2.sav']

In [54]:
cv=CountVectorizer(analyzer=text_process)
tfidf = TfidfTransformer()
classifier = RandomForestClassifier()

In [57]:
review_train = cv.fit_transform(review_train)
review_train = tfidf.fit_transform(review_train)
classifier.fit(review_train,label_train)

RandomForestClassifier()

In [58]:
review_test = cv.transform(review_test)
review_test = tfidf.transform(review_test)
rfc_pred = classifier.predict(review_test)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print('Classification Report:',classification_report(label_test,rfc_pred))
print('Confusion Matrix:',confusion_matrix(label_test,rfc_pred))
print('Accuracy Score:',accuracy_score(label_test,rfc_pred))
print('Model Prediction Accuracy:',str(np.round(accuracy_score(label_test,rfc_pred)*100,2)) + '%')

Classification Report:               precision    recall  f1-score   support

          CG       0.81      0.88      0.84      7071
          OR       0.87      0.79      0.83      7081

    accuracy                           0.84     14152
   macro avg       0.84      0.84      0.84     14152
weighted avg       0.84      0.84      0.84     14152

Confusion Matrix: [[6254  817]
 [1484 5597]]
Accuracy Score: 0.837408140192199
Model Prediction Accuracy: 83.74%


In [60]:
with open('cv_final.pkl', 'wb') as out_pipeline:
    pickle.dump(cv, out_pipeline, pickle.HIGHEST_PROTOCOL)
with open('tfidf_final.pkl', 'wb') as out_pipeline:
    pickle.dump(tfidf, out_pipeline, pickle.HIGHEST_PROTOCOL)
with open('classifier.pkl', 'wb') as out_pipeline:
    pickle.dump(classifier, out_pipeline, pickle.HIGHEST_PROTOCOL)
