# Naive Bayes

## 1 = positive
## 0 = negative

In [None]:
import pandas as pd

# Load the training split from a JSON array of records into a DataFrame.
df_train = pd.read_json("../data/aclIMDB_train.json", orient="records")
# Quick inspection: column/dtype summary, per-column histograms, first rows.
df_train.info()
df_train.hist()
df_train.head()

In [None]:
# Load the validation split from a JSON array of records into a DataFrame.
df_val = pd.read_json("../data/aclIMDB_val.json", orient="records")
# Quick inspection: column/dtype summary, per-column histograms, first rows.
df_val.info()
df_val.hist()
df_val.head()

In [None]:
# Load the test split from a JSON array of records into a DataFrame.
df_test = pd.read_json("../data/aclIMDB_test.json", orient="records")
# Quick inspection: column/dtype summary, per-column histograms, first rows.
df_test.info()
df_test.hist()
df_test.head()

In [None]:
def sentiment(value):
    """Translate a numeric label into its sentiment name.

    Returns "positive" when ``value`` equals 1 and "negative" for every
    other value.
    """
    return "positive" if value == 1 else "negative"

# Add a human-readable "sentiment" column to every split, derived from the
# numeric "label" column via sentiment().
# No fixed-length placeholder column is created first: apply() returns a
# Series aligned with each frame's own index, so the splits are free to
# have different numbers of rows.
df_train["sentiment"] = df_train["label"].apply(sentiment)
df_val["sentiment"] = df_val["label"].apply(sentiment)
df_test["sentiment"] = df_test["label"].apply(sentiment)

In [None]:
df_train[df_train["label"] == 1][:1000]

In [None]:
df_train.shape

## Pre-processing

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK stop-word corpus read by stopwords.words() below.
nltk.download('stopwords')

# English stop words; tokens in this list are skipped when word
# frequencies are counted for the vocabulary.
STOPWORDS = stopwords.words("english")

# Preview the first ten stop words.
STOPWORDS[:10]

In [None]:

def is_html_tag(word):
    """Tell whether a whitespace-delimited token is an HTML tag fragment.

    A token counts as a fragment when, after newlines are removed, it
    begins with "<", ">" or "/", or when it is a line-break tag remnant
    such as "br", "br>" or "br/>".  Ordinary words that merely begin with
    the letters "br" ("brilliant", "bring", ...) are not fragments.

    Args:
        word: A single token (string).

    Returns:
        True if the token should be discarded as markup, False otherwise.
    """
    w = word.replace("\n", "")
    if w.startswith(("<", ">", "/")):
        return True

    stripped = w.strip()
    # "br" marks a tag only when it is the whole token or is followed by a
    # non-alphanumeric character; otherwise it is just the start of a word.
    return stripped[:2] == "br" and not stripped[2:3].isalnum()

def remove_html_tags(sentence):
    """Strip HTML tag fragments from a piece of text.

    The input is converted to ``str`` and split on runs of whitespace.
    Every token flagged by ``is_html_tag`` is dropped, and the surviving
    tokens are joined back together with single spaces.
    """
    tokens = str(sentence).split()
    kept = filter(lambda token: not is_html_tag(token), tokens)
    return " ".join(kept)

In [None]:
remove_html_tags("""
<html>
<body
    <h1> <br> Hello, there br> </h1>
    <p> General Kenobi </p>
</body>
</html>
""")

In [None]:
df_train["text"].head(5)

In [None]:
# Replace every training review with its cleaned version (HTML tag
# fragments removed, whitespace collapsed to single spaces).
df_train["text"] = df_train["text"].apply(remove_html_tags)

In [None]:
df_train["text"].head(5)

In [None]:
# Fetch the NLTK 'punkt' tokenizer data before tokenising.
nltk.download('punkt')

# Lower-case every cleaned training review, then split it into word tokens.
tokenized_reviews = df_train["text"].str.lower().apply(word_tokenize)

# Preview the token lists of the first reviews.
tokenized_reviews.head()

In [None]:
from collections import Counter

# Count how often each usable token occurs across all training reviews.
# A token is usable when it is purely alphabetic and is not a stop word.
# The stop words are copied into a set so each membership test is O(1)
# instead of a scan of the whole stop-word list for every token.
_stopword_set = set(STOPWORDS)

# Counter is a dict subclass, so d still supports d[word], del d[word] and
# d.items(); keys keep first-seen order, exactly like a manual dict loop.
d = Counter(
    word
    for review in tokenized_reviews
    for word in review
    if word not in _stopword_set and word.isalpha()
)
            

In [None]:
d["br"]

In [None]:
# "br" is presumably residue of HTML line-break tags rather than a real
# word, so it is removed from the counts.  The membership guard keeps this
# cell safe to re-run (a bare `del` raises KeyError the second time).
if "br" in d:
    del d["br"]

In [None]:
# Upper bound on the number of words kept in the vocabulary.
DESIRED_VOCAB_SIZE = 4000

# Words ordered from most to least frequent, truncated to the desired size.
# Words with equal counts keep the order in which they were first counted.
VOCAB = sorted(d, key=d.get, reverse=True)[:DESIRED_VOCAB_SIZE]

In [None]:
VOCAB[:10]

In [None]:
# One-column table holding the vocabulary, persisted as a JSON array of
# records so later steps can reload it without rebuilding the counts.
word_table = pd.DataFrame({"word": VOCAB})
word_table.to_json("../data/vocab.json", orient="records")

In [None]:
# Reload the saved vocabulary and inspect it.
word_table = pd.read_json("../data/vocab.json", orient="records")
word_table.info()
word_table.head(10)

## Naive Bayes

In [None]:
# Per-class word counts: dict_freqs[class_name][word] -> number of occurrences.
# NOTE: re-running this cell resets the counts to empty.
dict_freqs = {"positive": {}, "negative": {}}

In [None]:
# Map each vocabulary word to its row position in word_table.
VOCAB_IDX = {word: i for i, word in enumerate(word_table["word"].values)}

# Accumulate, per sentiment class, how often each vocabulary word occurs in
# the training reviews.
# - Reviews are lower-cased and split with word_tokenize, the same
#   tokenisation used to build the vocabulary and used at prediction time;
#   splitting the case-preserved text on spaces would miss capitalised
#   words and words with attached punctuation ("Good", "bad.").
# - The per-review label is bound to `review_sentiment` so the sentiment()
#   helper function is not overwritten by this loop.
# - The two columns are iterated together instead of doing two iloc row
#   lookups per review.
for review_text, review_sentiment in zip(df_train["text"], df_train["sentiment"]):
    class_freqs = dict_freqs[review_sentiment]
    for word in word_tokenize(review_text.lower()):
        if word in VOCAB_IDX:
            class_freqs[word] = class_freqs.get(word, 0) + 1

In [None]:
dict_freqs["positive"]["good"]

In [None]:
dict_freqs["negative"]["good"]

In [None]:
dict_freqs["negative"]["bad"]

In [None]:
dict_freqs["positive"]["bad"]

In [None]:
word_table.head()

In [None]:
# Total number of vocabulary-word occurrences counted in positive reviews.
total_positive = sum(dict_freqs["positive"].values())

# Smoothed likelihood of each word given the positive class: one is added
# to every count and the vocabulary size to the denominator, so words never
# seen in positive reviews still get a non-zero probability.
word_table["positive"] = word_table["word"].map(
    lambda w: (dict_freqs["positive"].get(w, 0) + 1) / (total_positive + len(VOCAB))
)

In [None]:
# Total number of vocabulary-word occurrences counted in negative reviews.
total_negative = sum(dict_freqs["negative"].values())

# Smoothed likelihood of each word given the negative class: one is added
# to every count and the vocabulary size to the denominator, so words never
# seen in negative reviews still get a non-zero probability.
word_table["negative"] = word_table["word"].map(
    lambda w: (dict_freqs["negative"].get(w, 0) + 1) / (total_negative + len(VOCAB))
)

In [None]:
word_table.head()

In [None]:
import numpy as np

# Log-likelihood ratio of each word: above zero when the word is more
# likely under the positive class, below zero when it is more likely under
# the negative class.
word_table["ratio"] = np.log(word_table["positive"] / word_table["negative"])

In [None]:
word_table.head()

In [None]:
# Index the table by word (set_index moves the "word" column into the index).
word_table = word_table.set_index("word")

In [None]:
word_table.head()

In [None]:
word_table["ratio"].describe()

In [None]:
# Copy the index back into a regular "word" column so the words are part of
# each row again (a records-oriented JSON export writes columns only).
word_table["word"] = word_table.index
word_table.head()

In [None]:
# Persist the table (word, per-class likelihoods, log ratio) as a JSON
# array of records; the prediction section reloads it from this file.
word_table.to_json("../data/word_table.json",orient="records")

## Predicting

In [None]:
import pandas as pd

# Reload the saved word table from disk and preview it.
word_table = pd.read_json("../data/word_table.json", orient="records")
word_table.head()

In [None]:
# Index by word so a token's "ratio" can be looked up with .loc[token].
word_table = word_table.set_index("word")
word_table.head()

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# English stop-word list, re-created in this section.
# NOTE(review): none of the prediction functions defined below reference it.
STOPWORDS = stopwords.words("english")

def is_html_tag(word):
    """Tell whether a whitespace-delimited token is an HTML tag fragment.

    A token counts as a fragment when, after newlines are removed, it
    begins with "<", ">" or "/", or when it is a line-break tag remnant
    such as "br", "br>" or "br/>".  Ordinary words that merely begin with
    the letters "br" ("brilliant", "bring", ...) are not fragments.

    Args:
        word: A single token (string).

    Returns:
        True if the token should be discarded as markup, False otherwise.
    """
    w = word.replace("\n", "")
    if w.startswith(("<", ">", "/")):
        return True

    stripped = w.strip()
    # "br" marks a tag only when it is the whole token or is followed by a
    # non-alphanumeric character; otherwise it is just the start of a word.
    return stripped[:2] == "br" and not stripped[2:3].isalnum()

def remove_html_tags(sentence):
    """Strip HTML tag fragments from a piece of text.

    The input is converted to ``str`` and split on runs of whitespace.
    Every token flagged by ``is_html_tag`` is dropped, and the surviving
    tokens are joined back together with single spaces.
    """
    tokens = str(sentence).split()
    kept = filter(lambda token: not is_html_tag(token), tokens)
    return " ".join(kept)

def predict_for_review_raw(review):
    """Return the raw sentiment score of a review.

    The review is stripped of HTML tag fragments, lower-cased and split
    into word tokens.  The score is the sum of the "ratio" values of every
    token present in ``word_table``'s index; tokens that are not in the
    table contribute nothing.  A score above zero leans positive.
    """
    tokens = word_tokenize(remove_html_tags(review).lower())

    ratios = word_table["ratio"]
    known_words = word_table.index

    return sum(ratios.loc[token] for token in tokens if token in known_words)

In [None]:
predict_for_review_raw("This movie sucks.")

In [None]:
predict_for_review_raw("This movie was fantastic!")

In [None]:
def predict_for_review(review):
    """Classify a review: 1 when its raw score is above zero, otherwise 0."""
    score = predict_for_review_raw(review)
    return 1 if score > 0 else 0

In [None]:
# Predicted label (1 or 0) for every training review.
preds = df_train["text"].apply(predict_for_review)

preds

In [None]:
# Ground-truth labels in the same 1/0 encoding as the predictions:
# 1 where the sentiment is "positive", 0 otherwise.
real = (df_train["sentiment"] == "positive").astype(int)

In [None]:
def get_accuracy(preds, real):
    """Return the fraction of predictions that equal the true labels.

    ``preds`` and ``real`` are compared element-wise with ``==``; the
    number of matches is divided by ``len(real)``.
    """
    matches = preds == real
    hit_count = sum(matches)
    return hit_count / len(real)

In [None]:
print(f"Training set accuracy: {get_accuracy(preds, real)}")

In [None]:
# Score the validation split: predict each review, encode the true
# sentiment as 1/0, and report the fraction of matching labels.
preds_val = df_val["text"].apply(predict_for_review)
real_val = (df_val["sentiment"] == "positive").astype(int)
print(f"Validation set accuracy: {get_accuracy(preds_val, real_val)}")

In [None]:
# Score the test split: predict each review, encode the true sentiment as
# 1/0, and report the fraction of matching labels.
preds_test = df_test["text"].apply(predict_for_review)
real_test = (df_test["sentiment"] == "positive").astype(int)
print(f"Test set accuracy: {get_accuracy(preds_test, real_test)}")