In [389]:
# Import libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score, precision_score, f1_score
import pickle

from_pickle = True

# Dataset prep

In [390]:
if not from_pickle :
    # Load Gibberish dataset
    columns_gibberish = ["Response", "Label"]
    path_to_import_gibberish = "./dataset/Gibberish.csv"
    df_gibberish = pd.read_csv(path_to_import_gibberish, usecols=columns_gibberish, sep=',', encoding="ISO-8859-1")
    df_gibberish_count = len(df_gibberish)
    df_gibberish.tail()

In [391]:
if not from_pickle :
    # Load Amazon reviews dataset
    path_to_import_amazon = "./dataset/Amazon.csv"
    df_amazon = pd.read_csv(path_to_import_amazon, encoding="ISO-8859-1")
    df_amazon = df_amazon.sample(n = df_gibberish_count)
    df_amazon.tail()

In [392]:
if not from_pickle :
    # Name Columns in Amazon dataset
    df_amazon.drop(df_amazon.columns[0], inplace=True, axis=1)
    df_amazon.columns = ["Response"]
    df_amazon["Label"] = 0
    df_amazon.tail()

In [393]:
if not from_pickle :
    # Remove title from text
    def remove_intro(x):
        if x.find(":") < 0:
            return x
        else:
            return x[x.find(":") + 1:len(x)].strip()

    # Remove Title from amazon reviews
    df_amazon["Response"] = df_amazon["Response"].apply(remove_intro)


In [394]:
if not from_pickle :
    # Create feature list
    vectorizer = CountVectorizer(stop_words='english')
    all_features = vectorizer.fit_transform(df_amazon.Response)
    all_features.shape
    # vectorizer.vocabulary_


In [395]:
# Merge both datasets
if not from_pickle :
    df_merged = pd.read_pickle('./dataset/preparated_data.pkl')
else:
    df_merged = pd.concat([df_amazon, df_gibberish], ignore_index=True, sort=False)
    df_merged.to_pickle("./dataset/preparated_data.pkl",protocol=pickle.HIGHEST_PROTOCOL)

df_merged = df_merged.sample(n=all_features.shape[0])
df_merged.tail()

Unnamed: 0,Response,Label
396,It doesn't work. It softens skin and that is a...,0
1907,I have never seen such a beautiful amalgamatio...,0
3187,This show was way over rated by Critics and I'...,0
486,This was a great movie to watch with my 14 yea...,0
4353,kjuhyfvguilo jo;ikl jnmiol hnoil uhjyiukj hyui...,1


# Dataset analyse

In [396]:
# Train dataset
x_train, x_test, y_train, y_test, = train_test_split(all_features, df_merged.Label, test_size=0.20)
classifier = MultinomialNB()  # Create Model
classifier.fit(x_train, y_train)  # Train Model

MultinomialNB()

In [397]:
# Calculate accuracy

# My own calculation
nr_correct = (y_test == classifier.predict(x_test)).sum()
print(f'{nr_correct} correctly predicted')
nr_incorrect = y_test.size - nr_correct
print(f'{nr_incorrect} incorrectly predicted')

fraction_wrong = nr_incorrect / (nr_correct + nr_incorrect)
print(f'The testing accuracy of the model is {1-fraction_wrong:.2}%')

386 correctly predicted
371 incorrectly predicted
The testing accuracy of the model is 0.51%


In [398]:
# Calculation by sklearn
classifier.score(x_test, y_test)

0.5099075297225891

In [399]:
recall_score(y_test, classifier.predict(x_test))

0.5331632653061225

In [400]:
precision_score(y_test, classifier.predict(x_test))

0.5264483627204031

In [401]:
f1_score(y_test, classifier.predict(x_test))


0.5297845373891001

In [402]:
test_data = [
    '23sadfkla2145nla',
    'hello my name is rik, this is just normal text nothing wrong with it. Lets see what the naive bayes will show',
    'the product was absolutely amazing it has everything i need',
    '1',
    '2',
    'We like having a strange assortment of reading material in our main floor bathroom. This one fit right in the collection.',
    'ddddddddddddddddddddddd',
    'ndjty yuke6u7yk rtrwtj kmt hjw yjwetyjwety',
    'sdbfdgn',
    ''
]

In [403]:
doc_term_matrix = vectorizer.transform(test_data)
classifier.predict(doc_term_matrix)

array([0, 0, 1, 0, 0, 1, 0, 0, 0, 0], dtype=int64)