In [None]:
# Naive Bayes classifier implemented without a machine-learning library

from __future__ import unicode_literals

import re
import string

import numpy as np
import pandas as pd

from hazm import *
from cleantext import clean

In [None]:
# Load the labelled training set and the test set.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Replace missing text with a placeholder so neither text column contains NaN.
for frame in (train_df, test_df):
    for text_column in ('comment', 'title'):
        frame[text_column] = frame[text_column].fillna('XXXXXXXX')

# Class sizes: label 0 is counted as spam, label 1 as ham.
status = train_df['verification_status']
train_spam_count = status[status == 0].count()
train_ham_count = status[status == 1].count()

In [None]:
# Load the stop-word list, one entry per line.
# The encoding is passed explicitly so the decoded text does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open(r'stop_words', encoding='utf-8') as f:
    stop_words = f.read().splitlines()

In [None]:
def word_extraction(sentence):
    """Split a sentence into cleaned, lower-cased tokens.

    Steps, in order: normalize the text with ``Normalizer`` (presumably the
    hazm class brought in by the star import), delete every character in
    ``string.punctuation``, tokenize with ``word_tokenize``, drop
    single-character tokens, lower-case, drop tokens listed in the
    module-level ``stop_words``, and finally split each token at runs of
    Latin letters so mixed-script tokens yield separate pieces.

    Args:
        sentence: Text to process (a ``str``).

    Returns:
        list[str]: The non-empty pieces, in order of appearance.
    """
    # Build the normalizer once and reuse it: this function runs for every
    # comment and title, and the normalizer takes no per-call configuration.
    normalizer = getattr(word_extraction, '_normalizer', None)
    if normalizer is None:
        normalizer = word_extraction._normalizer = Normalizer()

    text = normalizer.normalize(sentence)
    text = text.translate(str.maketrans('', '', string.punctuation))

    # A set makes the stop-word test O(1) per token instead of a list scan.
    blocked = set(stop_words)

    pieces = []
    for raw_token in word_tokenize(text):
        # The length filter is applied before lower-casing.
        if len(raw_token) <= 1:
            continue
        token = raw_token.lower()
        if token in blocked:
            continue
        # The capturing group makes re.split keep the Latin runs themselves;
        # the empty strings it produces at token boundaries are discarded.
        pieces.extend(part for part in re.split(r'([a-zA-Z]+)', token) if part)
    return pieces

In [None]:
def tokenize(sentences):
    """Return the sorted list of distinct tokens found in ``sentences``.

    Every sentence is passed through ``word_extraction``; the resulting
    tokens are de-duplicated and sorted.
    """
    unique_tokens = set()
    for text in sentences:
        unique_tokens.update(word_extraction(text))
    return sorted(unique_tokens)

In [None]:

# Split the training rows by label (1 = ham, 0 = spam).
ham_rows = train_df[train_df['verification_status'] == 1]
spam_rows = train_df[train_df['verification_status'] == 0]

comments_ham_comment = ham_rows['comment'].values.tolist()
comments_ham_title = ham_rows['title'].values.tolist()
comments_spam_comment = spam_rows['comment'].values.tolist()
comments_spam_title = spam_rows['title'].values.tolist()

comments_test_comment = test_df['comment'].values.tolist()
comments_test_title = test_df['title'].values.tolist()

# The vocabulary is built from the training and the test text together.
all_comments = [
    *comments_spam_comment,
    *comments_spam_title,
    *comments_ham_comment,
    *comments_ham_title,
    *comments_test_comment,
    *comments_test_title,
]

vocabulary = tokenize(all_comments)

In [None]:
ham_frequencies = {}

# Not read anywhere in this cell; kept in case another cell relies on it.
zero_temp = np.zeros((len(vocabulary), 1), dtype=np.int8).tolist()

comments_ham_df = train_df.query('verification_status==1')[['comment', 'title']]

# ham: for every vocabulary word, the fraction of ham rows whose comment or
# title contains that word.
# NOTE(review): this is a case-sensitive substring test against the raw text,
# whereas vocabulary entries are normalized, lower-cased tokens — confirm that
# this is the intended notion of "contains".
counter = 0
for counter, word in enumerate(vocabulary, start=1):
    # regex=False matches the word literally, so a token can never be
    # misread as a regular-expression pattern.
    comments_mask = comments_ham_df.comment.str.contains(word, regex=False)
    titles_mask = comments_ham_df.title.str.contains(word, regex=False)
    aggregate_mask = (comments_mask | titles_mask)
    # Summing the boolean mask counts the matching rows. Unlike
    # value_counts()[False] it does not raise KeyError when every row matches.
    count = aggregate_mask.sum() / train_ham_count
    ham_frequencies[word] = count
    # Report progress as the fraction of the vocabulary processed so far.
    if (counter % 100 == 0):
        print(counter / len(vocabulary))

In [None]:
# Persist the ham table to CSV: one row per word, with the word as the index.
ham_table = pd.DataFrame.from_dict(ham_frequencies, orient='index')
ham_table.to_csv(r'ham_frequencies.csv')

In [None]:
spam_frequencies = {}

# Not read anywhere in this cell; kept in case another cell relies on it.
zero_temp = np.zeros((len(vocabulary), 1), dtype=np.int8).tolist()

comments_spam_df = train_df.query('verification_status==0')[['comment', 'title']]

# spam: for every vocabulary word, the fraction of spam rows whose comment or
# title contains that word.
# NOTE(review): this is a case-sensitive substring test against the raw text,
# whereas vocabulary entries are normalized, lower-cased tokens — confirm that
# this is the intended notion of "contains".
counter = 0
for counter, word in enumerate(vocabulary, start=1):
    # regex=False matches the word literally, so a token can never be
    # misread as a regular-expression pattern.
    comments_mask = comments_spam_df.comment.str.contains(word, regex=False)
    titles_mask = comments_spam_df.title.str.contains(word, regex=False)
    aggregate_mask = (comments_mask | titles_mask)
    # Summing the boolean mask counts the matching rows. Unlike
    # value_counts()[False] it does not raise KeyError when every row matches.
    count = aggregate_mask.sum() / train_spam_count
    spam_frequencies[word] = count
    # Report progress as the fraction of the vocabulary processed so far.
    if (counter % 100 == 0):
        print(counter / len(vocabulary))


In [None]:
# Persist the spam table to CSV: one row per word, with the word as the index.
spam_table = pd.DataFrame.from_dict(spam_frequencies, orient='index')
spam_table.to_csv(r'spam_frequencies.csv')

In [None]:
def _load_frequencies(path):
    """Read a saved frequency table back into a ``{word: frequency}`` dict.

    The file is expected to have a header row followed by two columns: the
    word and its frequency (the layout produced by dumping a one-column,
    word-indexed DataFrame with ``to_csv``).

    Args:
        path: Location of the CSV file.

    Returns:
        dict: Maps each word (``str``) to its frequency (``float``).
    """
    table = pd.read_csv(
        path,
        header=0,                        # discard the header row that was written
        names=['word', 'frequency'],
        dtype={'word': str, 'frequency': float},
        keep_default_na=False,           # keep words such as "nan" or "null" as text
    )
    return dict(zip(table['word'], table['frequency']))


# Reload both tables from disk so the classification step can run without
# recomputing them.
ham_frequencies = _load_frequencies(r'ham_frequencies.csv')
spam_frequencies = _load_frequencies(r'spam_frequencies.csv')

In [None]:
# Generating answers for test
#
# Each test row is scored under both classes from the frequencies of the words
# in its comment and title. Scores are summed in log space: multiplying many
# fractions underflows to 0.0 for long texts, which would make both scores tie
# and label every such row as ham. Comparing log-sums gives the same ordering
# as comparing the products.
# NOTE(review): words with zero frequency in a class are skipped for that
# class (no smoothing) and no class prior is applied; this mirrors the
# existing scoring rule — confirm that it is intended.
answers = {}
for current_id, current_comment, current_title in zip(
        test_df['id'], test_df['comment'], test_df['title']):
    words = word_extraction(current_comment)
    words.extend(word_extraction(current_title))
    spam_log_prob = 0.0
    ham_log_prob = 0.0
    for word in words:
        # .get: a word absent from a table is treated like a zero frequency
        # instead of raising KeyError.
        spam_freq = spam_frequencies.get(word, 0)
        ham_freq = ham_frequencies.get(word, 0)
        if spam_freq > 0:
            spam_log_prob += np.log(spam_freq)
        if ham_freq > 0:
            ham_log_prob += np.log(ham_freq)
    # Ties go to ham (1), as before.
    if spam_log_prob > ham_log_prob:
        answers[current_id] = 0
    else:
        answers[current_id] = 1

In [None]:
# Turn the {id: label} mapping into a frame indexed by id and write it out.
answers_df = (
    pd.DataFrame(list(answers.items()), columns=['id', 'verification_status'])
    .set_index('id')
)
answers_df.to_csv(r'ans.csv')