In [1]:
# Import libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score, precision_score, f1_score
import pickle

from_pickle = True

# Dataset prep

In [2]:
if not from_pickle :
    # Load Gibberish dataset
    columns_gibberish = ["Response", "Label"]
    path_to_import_gibberish = "./dataset/Gibberish.csv"
    df_gibberish = pd.read_csv(path_to_import_gibberish, usecols=columns_gibberish, sep=',', encoding="ISO-8859-1")
    df_gibberish_count = len(df_gibberish)
    df_gibberish.tail()
    df_gibberish.to_pickle('./dataset/gibberish.pkl')
else:
    df_gibberish = pd.read_pickle('./dataset/gibberish.pkl')

In [3]:
if not from_pickle :
    # Load Amazon reviews dataset
    path_to_import_amazon = "./dataset/Amazon.csv"
    df_amazon = pd.read_csv(path_to_import_amazon, encoding="ISO-8859-1")
    df_amazon = df_amazon.sample(n = 3783)
    df_amazon.tail()
    df_amazon.to_pickle('./dataset/amazon.pkl')
else :
    df_amazon = pd.read_pickle('./dataset/amazon.pkl')

In [4]:
# Name Columns in Amazon dataset
df_amazon.drop(df_amazon.columns[0], inplace=True, axis=1)
df_amazon.columns = ["Response"]
df_amazon["Label"] = 0
df_amazon.tail()

Unnamed: 0,Response,Label
77096,Loved It!: Everytime I see this movie it touch...,0
182753,"Undercooked, Overrated: Aside from ""North Amer...",0
540146,Dissappointed: Unable to view the movie wound ...,0
993767,Cool new techie toy!: If you travel a lot then...,0
492951,Works well: These work great & get the job don...,0


In [5]:
# Remove title from text
def remove_intro(x):
    if x.find(":") < 0:
        return x
    else:
        return x[x.find(":") + 1:len(x)].strip()

# Remove Title from amazon reviews
df_amazon["Response"] = df_amazon["Response"].apply(remove_intro)


In [6]:
# Create feature list
vectorizer = CountVectorizer(stop_words='english')
all_features = vectorizer.fit_transform(df_amazon.Response)
all_features.shape
# vectorizer.vocabulary_


(3783, 19144)

In [7]:
# Merge both datasets
df_merged = pd.concat([df_amazon, df_gibberish], ignore_index=True, sort=False)
df_merged = df_merged.sample(n=all_features.shape[0])
df_merged.tail()

Unnamed: 0,Response,Label
2821,i must say this book initiated feelings i had ...,0
485,I just got the v60i and really love the phone....,0
2935,Time To Say GoodBye was my first experience an...,0
2472,"I really like this bible, and am glad that I w...",0
3523,Bought this last spring due to the great revie...,0


# Dataset analyse

In [8]:
# Train dataset
x_train, x_test, y_train, y_test, = train_test_split(all_features, df_merged.Label, test_size=0.20)
classifier = MultinomialNB()  # Create Model
classifier.fit(x_train, y_train)  # Train Model

MultinomialNB()

In [9]:
# Calculate accuracy

# My own calculation
nr_correct = (y_test == classifier.predict(x_test)).sum()
print(f'{nr_correct} correctly predicted')
nr_incorrect = y_test.size - nr_correct
print(f'{nr_incorrect} incorrectly predicted')

fraction_wrong = nr_incorrect / (nr_correct + nr_incorrect)
print(f'The testing accuracy of the model is {1-fraction_wrong:.2}%')

377 correctly predicted
380 incorrectly predicted
The testing accuracy of the model is 0.5%


In [10]:
# Calculation by sklearn
classifier.score(x_test, y_test)

0.49801849405548215

In [11]:
recall_score(y_test, classifier.predict(x_test))

0.4910941475826972

In [12]:
precision_score(y_test, classifier.predict(x_test))

0.517426273458445

In [13]:
f1_score(y_test, classifier.predict(x_test))


0.5039164490861618

In [14]:
test_data = [
    '23sadfkla2145nla',
    'hello my name is rik, this is just normal text nothing wrong with it. Lets see what the naive bayes will show',
    'the product was absolutely amazing it has everything i need',
    '1',
    '2',
    'We like having a strange assortment of reading material in our main floor bathroom. This one fit right in the collection.',
    'ddddddddddddddddddddddd',
    'ndjty yuke6u7yk rtrwtj kmt hjw yjwetyjwety',
    'sdbfdgn',
    ''
]

In [15]:
doc_term_matrix = vectorizer.transform(test_data)
classifier.predict(doc_term_matrix)

array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1], dtype=int64)