In [1]:
# Naive Bayes without library

from __future__ import unicode_literals

import re
import string

import numpy as np
import pandas as pd

from hazm import *
from cleantext import clean

In [2]:
# Read the training and test tables from CSV files in the working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Replace missing comment/title text with a fixed placeholder string so the
# later string operations on these columns never encounter NaN.
train_df = train_df.assign(
    comment=train_df['comment'].fillna('XXXXXXXX'),
    title=train_df['title'].fillna('XXXXXXXX'),
)
test_df = test_df.assign(
    comment=test_df['comment'].fillna('XXXXXXXX'),
    title=test_df['title'].fillna('XXXXXXXX'),
)

# Number of training rows per class. The variable names treat
# verification_status == 0 as "spam" and verification_status == 1 as "ham".
train_spam_count = train_df.loc[
    train_df['verification_status'] == 0, 'verification_status'
].count()
train_ham_count = train_df.loc[
    train_df['verification_status'] == 1, 'verification_status'
].count()

In [3]:
def word_extraction(sentence, normalizer=None):
    """Turn one raw sentence into a list of cleaned, lower-cased tokens.

    Processing order:
      1. normalise the text with a hazm ``Normalizer``;
      2. delete every character listed in ``string.punctuation`` (ASCII
         punctuation only);
      3. tokenise with ``word_tokenize``;
      4. drop tokens of length 1 and lower-case the rest;
      5. split each token around runs of ASCII letters, keeping the runs, so
         that Latin and non-Latin parts of a mixed token become separate
         entries.

    Parameters
    ----------
    sentence : str
        The text to process.
    normalizer : object, optional
        Any object exposing ``normalize(str) -> str``. When omitted, a single
        ``Normalizer`` is created on first use and reused by later calls.

    Returns
    -------
    list of str
        Tokens in their order of appearance (duplicates are kept). Pieces
        produced by step 5 may be a single character long, because the length
        filter in step 4 runs before the split.
    """
    if normalizer is None:
        # This function is called once per sentence, so constructing a new
        # Normalizer every time is repeated, loop-invariant work. Build it
        # once and keep it on the function object.
        normalizer = getattr(word_extraction, '_shared_normalizer', None)
        if normalizer is None:
            normalizer = Normalizer()
            word_extraction._shared_normalizer = normalizer

    sentence = normalizer.normalize(sentence)
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))

    tokens = [token.lower() for token in word_tokenize(sentence) if len(token) > 1]

    cleaned_words = []
    for token in tokens:
        # The capturing group makes re.split return the ASCII-letter runs as
        # well as the text between them; empty strings produced at the token
        # edges are discarded.
        pieces = re.split(r'([a-zA-Z]+)', token)
        cleaned_words.extend(piece for piece in pieces if piece)
    return cleaned_words

In [4]:
def tokenize(sentences):
    """Return the sorted list of distinct tokens found in ``sentences``.

    Each sentence is passed through ``word_extraction``; the resulting tokens
    are de-duplicated and returned in ascending string order.
    """
    distinct_tokens = set()
    for text in sentences:
        distinct_tokens.update(word_extraction(text))
    return sorted(distinct_tokens)

In [5]:

# Text of the training rows, separated by class (verification_status 1 is
# named "ham", 0 is named "spam") and by column.
comments_ham_comment = train_df.loc[train_df['verification_status'] == 1, 'comment'].tolist()
comments_ham_title = train_df.loc[train_df['verification_status'] == 1, 'title'].tolist()
comments_spam_comment = train_df.loc[train_df['verification_status'] == 0, 'comment'].tolist()
comments_spam_title = train_df.loc[train_df['verification_status'] == 0, 'title'].tolist()

# Text of the test rows, by column.
comments_test_comment = test_df['comment'].tolist()
comments_test_title = test_df['title'].tolist()

# Every piece of text (train and test, comments and titles) in one list; the
# vocabulary is built from all of it.
all_comments = (
    comments_spam_comment
    + comments_spam_title
    + comments_ham_comment
    + comments_ham_title
    + comments_test_comment
    + comments_test_title
)

vocabulary = tokenize(all_comments)

In [6]:
# For every vocabulary word, store the fraction of ham training rows
# (verification_status == 1) whose comment or title contains the word.
ham_frequencies = {}

# NOTE(review): zero_temp is not read by any code visible in this notebook;
# kept unchanged in case a later cell depends on it.
zero_temp = np.zeros((len(vocabulary), 1), dtype=np.int8).tolist()

comments_ham_df = train_df.query('verification_status==1')[['comment', 'title']]

# NOTE(review): the search below is a case-sensitive substring match on the
# column text as loaded, while vocabulary entries come from word_extraction
# (normalised and lower-cased) -- confirm that this is intended.

# ham
counter = 0
for word in vocabulary:
    # regex=False: match the word as literal text rather than compiling it as
    # a regular-expression pattern.
    comments_mask = comments_ham_df.comment.str.contains(word, regex=False)
    titles_mask = comments_ham_df.title.str.contains(word, regex=False)
    aggregate_mask = (comments_mask | titles_mask)
    # Summing the boolean mask counts the matching rows directly. The previous
    # value_counts()[False] lookup raised KeyError whenever every row matched,
    # because value_counts() then has no False entry.
    count = aggregate_mask.sum()
    count /= train_ham_count
    ham_frequencies[word] = count
    counter += 1
    # Progress report: fraction of the vocabulary processed, every 100 words.
    if (counter % 100 == 0):
        print(counter / len(vocabulary))

0.0008488171732690496
0.0016976343465380992
0.0025464515198071486
0.0033952686930761983
0.004244085866345248
0.005092903039614297
0.005941720212883347
0.006790537386152397
0.007639354559421446
0.008488171732690496
0.009336988905959546
0.010185806079228595
0.011034623252497645
0.011883440425766695
0.012732257599035743
0.013581074772304793
0.014429891945573843
0.015278709118842892
0.016127526292111942
0.016976343465380992
0.017825160638650042
0.018673977811919092
0.01952279498518814
0.02037161215845719
0.02122042933172624
0.02206924650499529
0.02291806367826434
0.02376688085153339
0.024615698024802436
0.025464515198071486
0.026313332371340536
0.027162149544609587
0.028010966717878637
0.028859783891147687
0.029708601064416737
0.030557418237685784
0.03140623541095484
0.032255052584223884
0.03310386975749293
0.033952686930761984
0.03480150410403103
0.035650321277300084
0.03649913845056913
0.037347955623838185
0.03819677279710723
0.03904558997037628
0.03989440714364533
0.04074322431691438
0.

0.35310794407992463
0.3539567612531937
0.3548055784264627
0.3556543955997318
0.35650321277300084
0.35735202994626986
0.3582008471195389
0.359049664292808
0.359898481466077
0.36074729863934607
0.36159611581261514
0.36244493298588415
0.3632937501591532
0.3641425673324223
0.3649913845056913
0.36584020167896036
0.36668901885222943
0.36753783602549844
0.3683866531987675
0.3692354703720366
0.3700842875453056
0.37093310471857466
0.3717819218918437
0.3726307390651128
0.3734795562383818
0.37432837341165087
0.37517719058491994
0.37602600775818895
0.376874824931458
0.3777236421047271
0.3785724592779961
0.37942127645126517
0.38027009362453423
0.38111891079780325
0.3819677279710723
0.3828165451443414
0.3836653623176104
0.38451417949087946
0.3853629966641485
0.38621181383741754
0.3870606310106866
0.3879094481839557
0.3887582653572247
0.38960708253049375
0.3904558997037628
0.39130471687703183
0.3921535340503009
0.39300235122356997
0.393851168396839
0.39469998557010805
0.3955488027433771
0.39639761991

0.718099328585616
0.718948145758885
0.719796962932154
0.7206457801054231
0.7214945972786921
0.7223434144519612
0.7231922316252303
0.7240410487984993
0.7248898659717683
0.7257386831450374
0.7265875003183064
0.7274363174915754
0.7282851346648446
0.7291339518381136
0.7299827690113826
0.7308315861846517
0.7316804033579207
0.7325292205311897
0.7333780377044589
0.7342268548777279
0.7350756720509969
0.735924489224266
0.736773306397535
0.737622123570804
0.7384709407440732
0.7393197579173422
0.7401685750906112
0.7410173922638803
0.7418662094371493
0.7427150266104183
0.7435638437836875
0.7444126609569565
0.7452614781302256
0.7461102953034946
0.7469591124767636
0.7478079296500327
0.7486567468233017
0.7495055639965708
0.7503543811698399
0.7512031983431089
0.7520520155163779
0.752900832689647
0.753749649862916
0.754598467036185
0.7554472842094542
0.7562961013827232
0.7571449185559922
0.7579937357292613
0.7588425529025303
0.7596913700757993
0.7605401872490685
0.7613890044223375
0.7622378215956065
0.

In [11]:
# Write the ham frequencies to CSV: one row per word (the dict keys become the
# index) with the frequency as the only value column.
pd.DataFrame.from_dict(
    ham_frequencies,
    orient='index',
).to_csv(r'ham_frequencies.csv')

In [7]:
# For every vocabulary word, store the fraction of spam training rows
# (verification_status == 0) whose comment or title contains the word.
spam_frequencies = {}

# NOTE(review): zero_temp is not read by any code visible in this notebook;
# kept unchanged in case a later cell depends on it.
zero_temp = np.zeros((len(vocabulary), 1), dtype=np.int8).tolist()

comments_spam_df = train_df.query('verification_status==0')[['comment', 'title']]

# NOTE(review): the search below is a case-sensitive substring match on the
# column text as loaded, while vocabulary entries come from word_extraction
# (normalised and lower-cased) -- confirm that this is intended.

#spam
counter = 0
for word in vocabulary:
    # regex=False: match the word as literal text rather than compiling it as
    # a regular-expression pattern.
    comments_mask = comments_spam_df.comment.str.contains(word, regex=False)
    titles_mask = comments_spam_df.title.str.contains(word, regex=False)
    aggregate_mask = (comments_mask | titles_mask)
    # Summing the boolean mask counts the matching rows directly. The previous
    # value_counts()[False] lookup raised KeyError whenever every row matched,
    # because value_counts() then has no False entry.
    count = aggregate_mask.sum()
    count /= train_spam_count
    spam_frequencies[word] = count
    counter += 1
    # Progress report: fraction of the vocabulary processed, every 100 words.
    if (counter % 100 == 0):
        print(counter / len(vocabulary))


0.0008488171732690496
0.0016976343465380992
0.0025464515198071486
0.0033952686930761983
0.004244085866345248
0.005092903039614297
0.005941720212883347
0.006790537386152397
0.007639354559421446
0.008488171732690496
0.009336988905959546
0.010185806079228595
0.011034623252497645
0.011883440425766695
0.012732257599035743
0.013581074772304793
0.014429891945573843
0.015278709118842892
0.016127526292111942
0.016976343465380992
0.017825160638650042
0.018673977811919092
0.01952279498518814
0.02037161215845719
0.02122042933172624
0.02206924650499529
0.02291806367826434
0.02376688085153339
0.024615698024802436
0.025464515198071486
0.026313332371340536
0.027162149544609587
0.028010966717878637
0.028859783891147687
0.029708601064416737
0.030557418237685784
0.03140623541095484
0.032255052584223884
0.03310386975749293
0.033952686930761984
0.03480150410403103
0.035650321277300084
0.03649913845056913
0.037347955623838185
0.03819677279710723
0.03904558997037628
0.03989440714364533
0.04074322431691438
0.

0.35310794407992463
0.3539567612531937
0.3548055784264627
0.3556543955997318
0.35650321277300084
0.35735202994626986
0.3582008471195389
0.359049664292808
0.359898481466077
0.36074729863934607
0.36159611581261514
0.36244493298588415
0.3632937501591532
0.3641425673324223
0.3649913845056913
0.36584020167896036
0.36668901885222943
0.36753783602549844
0.3683866531987675
0.3692354703720366
0.3700842875453056
0.37093310471857466
0.3717819218918437
0.3726307390651128
0.3734795562383818
0.37432837341165087
0.37517719058491994
0.37602600775818895
0.376874824931458
0.3777236421047271
0.3785724592779961
0.37942127645126517
0.38027009362453423
0.38111891079780325
0.3819677279710723
0.3828165451443414
0.3836653623176104
0.38451417949087946
0.3853629966641485
0.38621181383741754
0.3870606310106866
0.3879094481839557
0.3887582653572247
0.38960708253049375
0.3904558997037628
0.39130471687703183
0.3921535340503009
0.39300235122356997
0.393851168396839
0.39469998557010805
0.3955488027433771
0.39639761991

0.718099328585616
0.718948145758885
0.719796962932154
0.7206457801054231
0.7214945972786921
0.7223434144519612
0.7231922316252303
0.7240410487984993
0.7248898659717683
0.7257386831450374
0.7265875003183064
0.7274363174915754
0.7282851346648446
0.7291339518381136
0.7299827690113826
0.7308315861846517
0.7316804033579207
0.7325292205311897
0.7333780377044589
0.7342268548777279
0.7350756720509969
0.735924489224266
0.736773306397535
0.737622123570804
0.7384709407440732
0.7393197579173422
0.7401685750906112
0.7410173922638803
0.7418662094371493
0.7427150266104183
0.7435638437836875
0.7444126609569565
0.7452614781302256
0.7461102953034946
0.7469591124767636
0.7478079296500327
0.7486567468233017
0.7495055639965708
0.7503543811698399
0.7512031983431089
0.7520520155163779
0.752900832689647
0.753749649862916
0.754598467036185
0.7554472842094542
0.7562961013827232
0.7571449185559922
0.7579937357292613
0.7588425529025303
0.7596913700757993
0.7605401872490685
0.7613890044223375
0.7622378215956065
0.

In [10]:
# Write the spam frequencies to CSV: one row per word (the dict keys become
# the index) with the frequency as the only value column.
pd.DataFrame.from_dict(
    spam_frequencies,
    orient='index',
).to_csv(r'spam_frequencies.csv')