In [83]:
import pandas as pd
import numpy as np

# Naive Bayes
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

### Load in the Labelled Dataset

In [84]:
dfR = pd.read_csv(r'dfRLabelled.csv')
dfR.head(3)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Product,Reviewer Name,Rating,Text,Date,Country,ASIN,Language,Rules,Label
0,0,0,"Wireless Earbuds, yobola Bluetoth Earbuds, IPX...",Will of Burr,1.0 out of 5 stars,"I am a real fan of headphones, probably becaus...",11/17/2022,United States 🇺🇸,B09LCVHCVX,en,"3, 4, 5, 7 :",Suspicious
1,1,1,"Wireless Earbuds, yobola Bluetoth Earbuds, IPX...",Allen Pak,5.0 out of 5 stars,To be honest. I wasn't expecting much when I ...,10/25/2022,United States 🇺🇸,B09LCVHCVX,en,,Not Suspicious
2,2,2,"Wireless Earbuds, yobola Bluetoth Earbuds, IPX...",brenda,5.0 out of 5 stars,1st off they connected very easily which is al...,11/29/2022,United States 🇺🇸,B09LCVHCVX,en,"3, 4, 6 :",Suspicious


In [85]:
dfR.shape

(2055, 12)

### Remove Reviews with No Text

In [86]:
dfR = dfR[dfR['Text'].notna()]
dfR.shape

(2011, 12)

### Determine Baseline Accuracy

In [87]:
dfR.Label.value_counts()

Not Suspicious    1525
Suspicious         486
Name: Label, dtype: int64

In [88]:
acc_baseline = 486/(1525+486)
acc_baseline

0.2416708105420189

### Split on Suspicious vs Not Suspicious

In [89]:
# In the first step we will split the data in training and remaining dataset
dfS = dfR[dfR.Label == 'Suspicious']
dfNS = dfR[dfR.Label == 'Not Suspicious']

### tf-idf Suspicious

In [90]:
# construct tf-idf vectors
Vectorizer = TfidfVectorizer(analyzer='word',stop_words='english', ngram_range=(1,1))
tfidf      = Vectorizer.fit_transform(dfS.Text)
vocabulary = Vectorizer.get_feature_names_out()

In [91]:
tfidf.shape

(486, 1515)

In [92]:
word_counts = pd.DataFrame.sparse.from_spmatrix(tfidf,columns=vocabulary)
print(word_counts.shape)
word_counts.head()

(486, 1515)


Unnamed: 0,00,10,100,10x,12,13,14,17,18,1960,...,year,years,yes,yields,yikes,yobola,yobolaa,youtube,zero,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.069912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.124973,0.0,0.0,0.0,0.0


In [93]:
# count words
print('top ten words:')
word_counts.sum().sort_values(ascending=False).head(10)

top ten words:


good         46.572053
quality      31.539478
great        29.324432
price        25.650230
product      23.551763
sound        18.256422
love         17.057482
excellent    15.238749
money        13.456268
earbuds      13.408854
dtype: float64

In [94]:
len(vocabulary)

1515

### tf-idf Not Suspicious

In [95]:
# construct tf-idf vectors
Vectorizer = TfidfVectorizer(analyzer='word',stop_words='english', ngram_range=(1,1))
tfidf      = Vectorizer.fit_transform(dfNS.Text)
vocabulary = Vectorizer.get_feature_names_out()

In [96]:
tfidf.shape

(1525, 2725)

In [97]:
word_counts = pd.DataFrame.sparse.from_spmatrix(tfidf,columns=vocabulary)
print(word_counts.shape)
word_counts.head()

(1525, 2725)


Unnamed: 0,00,10,100,11,12,120,13,14,14k,15,...,yobola,yobolat9,young,younger,youtube,yrs,zaa,zero,zone,zoom
0,0.0,0.0,0.151877,0.0,0.0,0.0,0.0,0.0,0.0,0.088982,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.24252,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [98]:
# count words
print('top ten words:')
word_counts.sum().sort_values(ascending=False).head(10)

top ten words:


great      93.665413
good       89.611787
sound      87.455496
quality    82.827552
price      62.688733
battery    50.112931
earbuds    47.164834
use        44.676958
ear        44.616973
life       41.333399
dtype: float64

In [99]:
len(vocabulary)

2725

### Key Takeaways

##### Ways in which the results reinforce the labelling rules

Suspicious reviews use strong emotional language such as 'love' and 'excellent'. They appear to hype up the product through this language.

Suspicious reviews use generic language to desribe the product such as 'product'.

Not Suspicious reviews reference specific elements of the product more often such as the 'sound' and 'battery life'.

Not Suspicious reviews use personal language displaying that they have used the product such as 'use'.