In [None]:

import string
import re

import numpy as np
import pandas as pd
from hazm import *

In [None]:
# Load the raw train and test tables from the working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Fill missing comments/titles with a placeholder token so the string join
# below never receives NaN, then merge title and comment into one 'text' field.
for frame in (train_df, test_df):
    for column in ('comment', 'title'):
        frame[column] = frame[column].fillna('XXXXXXXX')
    frame['text'] = frame[['title', 'comment']].apply(lambda row: ' '.join(row), axis=1)

# Keep only the columns the rest of the notebook reads.
train_df = train_df[['text', 'verification_status']]
test_df = test_df[['id', 'text']]

# Number of training rows per label value (0 is named "spam", 1 "ham" here).
train_spam_count = (train_df['verification_status'] == 0).sum()
train_ham_count = (train_df['verification_status'] == 1).sum()

In [None]:
# Load the stop-word list, one entry per line.
# The encoding is pinned because open() otherwise uses the platform locale
# (e.g. cp1252 on Windows), which would mis-decode non-ASCII stop words.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open(r'stop_words', encoding='utf-8') as f:
    stop_words = f.read().splitlines()
# Built once: translation table that deletes every ASCII punctuation character
# listed in string.punctuation (other punctuation is left untouched).
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# The capturing group makes split() keep the Latin-letter runs as tokens
# instead of discarding them as separators.
_LATIN_RUN = re.compile(r'([a-zA-Z]+)')


def _get_normalizer():
    """Return one shared Normalizer instance, created on first use."""
    if _get_normalizer.instance is None:
        _get_normalizer.instance = Normalizer()
    return _get_normalizer.instance


_get_normalizer.instance = None


def word_extraction(sentence):
    """Turn one raw text into a list of cleaned tokens.

    Steps, in order:
      1. normalize the text with the Normalizer,
      2. delete ASCII punctuation,
      3. tokenize with word_tokenize,
      4. drop tokens of length 0 or 1,
      5. lower-case the remaining tokens,
      6. drop tokens found in the module-level ``stop_words``,
      7. split every surviving token around runs of Latin letters, so a token
         mixing Latin and non-Latin characters becomes several tokens.

    The length and stop-word filters run *before* step 7, so the pieces
    produced by the split are not filtered again (single-character pieces and
    stop words can therefore still appear in the result).

    Args:
        sentence: the text to process (a ``str``).

    Returns:
        list[str]: the cleaned tokens, in order of appearance, duplicates kept.
    """
    # Reuse one normalizer instead of constructing a new one for every document.
    sentence = _get_normalizer().normalize(sentence)
    sentence = sentence.translate(_PUNCTUATION_TABLE)

    # Set membership is O(1) per token; scanning the list is O(len(stop_words)).
    # Built per call so a reloaded stop_words list is always honoured.
    excluded = set(stop_words)

    cleaned_words = []
    for token in word_tokenize(sentence):
        if len(token) <= 1:
            continue
        token = token.lower()
        if token in excluded:
            continue
        # split() yields empty strings at the edges of a match; skip those.
        cleaned_words.extend(piece for piece in _LATIN_RUN.split(token) if piece)
    return cleaned_words

In [None]:
def tokenize(sentences):
    """Build the sorted vocabulary of a collection of texts.

    Each text is passed through ``word_extraction``; the distinct tokens of
    all texts are returned as one alphabetically sorted list.

    Args:
        sentences: iterable of texts accepted by ``word_extraction``.

    Returns:
        list: the unique tokens in ascending order.
    """
    vocabulary = set()
    for text in sentences:
        vocabulary.update(word_extraction(text))
    return sorted(vocabulary)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Stack the train texts on top of the test texts (train rows come first) so
# both share one vocabulary; the result is a one-column frame named 'text'.
all_comments_df = pd.concat([train_df['text'], test_df['text']], axis=0).to_frame(name='text')

# Bag-of-words count matrix. Passing word_extraction as the analyzer makes the
# vectorizer use its tokens directly instead of its built-in tokenization.
vectorizer = CountVectorizer(analyzer=word_extraction)
messages_bow = vectorizer.fit_transform(all_comments_df['text'])

In [None]:
from sklearn.model_selection import train_test_split
# messages_bow holds the train texts first, then the test texts, so its first
# len(train_df) rows are exactly the labelled ones. Deriving the bound from the
# frame (instead of a hard-coded row count) keeps features and labels aligned
# whatever the size of train.csv.
# 30% of the labelled rows are held out; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    messages_bow[:len(train_df)],
    train_df['verification_status'],
    test_size=0.3,
    random_state=0,
)

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default hyper-parameters, fitted on the
# training part of the split only (X_test / y_test are not used here).
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

In [None]:
# Every row after the training rows belongs to the test set (messages_bow was
# built from train texts followed by test texts). Slicing from len(train_df) to
# the end avoids hard-coded row counts, so the number of predictions always
# matches the number of rows in test_df.
y_pred = classifier.predict(messages_bow[len(train_df):])

# Attach the predictions to the test ids and write the file as
# "id,verification_status" with id as the index column.
output = (
    test_df
    .assign(verification_status=y_pred)
    [['id', 'verification_status']]
    .set_index('id')
)
output.to_csv(r'ans.csv')