In [1]:
import numpy as np
import scipy as sp
import os

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB


In [2]:
# utilities
def build_split(data, labels, indices):
    """Pick the entries of ``data`` and ``labels`` found at ``indices``.

    Args:
        data: indexable collection of samples.
        labels: indexable collection aligned with ``data``.
        indices: iterable of positions to select, in the order wanted.

    Returns:
        A ``(selected_data, selected_labels)`` tuple of two lists.
    """
    # One pass over `indices`, so a one-shot iterator is consumed only once.
    chosen = [(data[pos], labels[pos]) for pos in indices]
    selected_data = [sample for sample, _ in chosen]
    selected_labels = [tag for _, tag in chosen]
    return (selected_data, selected_labels)

In [3]:
###### Pre-Processing #####
# Read the raw corpus into memory, one list element per line of the file
# (line endings are kept).
with open("../data/SMSSpamCollection", "r", encoding="utf8") as fh:
    lines = list(fh)

In [4]:
# dedupe the original data
# dict.fromkeys keeps the first occurrence of every line in file order, so the
# training rows come out in the same order on every run (the order of a set of
# strings changes between interpreter sessions because of hash randomisation).
lines = list(dict.fromkeys(lines))

# split data: each line is "<label>\t<message>"
labels = []
text = []

for line in lines:
    fields = line.split('\t')
    if len(fields) < 2:
        # No tab separator (e.g. a blank line): there is no message to train on.
        continue
    # "ham" is relabelled "not_spam"; every other label is treated as "SPAM".
    labels.append("not_spam" if fields[0] == "ham" else "SPAM")
    text.append(fields[1])

In [5]:
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB

###### Training #####
# Fit the vectorizer on the training messages and get their document-term
# count matrix in the same call; the fitted `vectorizer` is reused later to
# transform the input files.
vectorizer = CountVectorizer()
mat_train = vectorizer.fit_transform(text)

# Train the Bernoulli naive Bayes classifier on the count matrix and the
# "SPAM" / "not_spam" labels.
bnb = BernoulliNB()
bnb.fit(mat_train, labels)



In [6]:
###### Input File Processing #####
in_path = "../input"

# Full paths of the regular files directly inside `in_path`, in the order
# os.listdir() reports them; sub-directories are left out.
in_files = [
    candidate
    for candidate in (os.path.join(in_path, entry) for entry in os.listdir(in_path))
    if os.path.isfile(candidate)
]

In [7]:
# Preview one input file. ".DS_Store" entries are skipped here just as the
# batch prediction loop skips them: they are not text files and would most
# likely fail to decode as UTF-8.
# Raises IndexError when there is no usable input file.
in_fn = [fn for fn in in_files if not fn.endswith(".DS_Store")][0]
with open(in_fn, "r", encoding="utf8") as fh:
    txt = fh.readlines()

In [8]:
##### Predict #####
def predict_input(file, vect, model):
    """Classify every document in ``file`` and report the model's certainty.

    Args:
        file: iterable of text documents (the notebook passes the lines of
            one input file, one message per line).
        vect: fitted vectorizer exposing ``transform``.
        model: fitted classifier exposing ``predict`` and ``predict_proba``.

    Returns:
        ``(p, c)`` where ``p`` holds the predicted label of each document and
        ``c`` the highest class probability of each document.
    """
    mat_in = vect.transform(file)

    p = model.predict(mat_in)
    # Use the classifier that was passed in, not the notebook-level `bnb`,
    # so labels and probabilities always come from the same model.
    c = np.max(model.predict_proba(mat_in), axis=1)
    return p, c # predicted outcome and certainty

In [9]:
results = [] # for the whole session

for in_fn in in_files:
    # ".DS_Store" entries are not message files; leave them out of `results`.
    if in_fn.endswith(".DS_Store"):
        continue

    with open(in_fn, "r", encoding="utf8") as fh:
        txt = list(fh)

    # One (predictions, certainty) pair per processed file, in `in_files` order.
    results.append(predict_input(txt, vectorizer, bnb))

In [10]:
##### Write Output #####

In [12]:
out_path = "../output"

# Derive the output names from `in_files` with the same ".DS_Store" filter
# that was applied when `results` was filled, so out_files[k] always names the
# file whose predictions are stored in results[k]. A second os.listdir() call
# gives no such guarantee: it also returns sub-directories, and its ordering
# is arbitrary.
out_files = [
    os.path.basename(fn) + "_results.csv"
    for fn in in_files
    if not fn.endswith(".DS_Store")
]

print(out_files)

['batch_1_results.csv', 'batch_2_results.csv']


In [15]:
# Make sure the destination directory exists before any file is written.
os.makedirs(out_path, exist_ok=True)

for file_ind, file in enumerate(out_files):
    out_fn = os.path.join(out_path, file)
    print(out_fn)
    # results[file_ind] is the (predictions, certainty) pair for this file.
    predictions, certainty = results[file_ind]

    # Write one "label,certainty" row per prediction. Iterating over the
    # predictions themselves keeps the row count right for every file; the
    # global `txt` only holds the lines of the last file that was read.
    with open(out_fn, "w", encoding="utf8") as f_out:
        for p, c in zip(predictions, certainty):
            f_out.write(str(p) + ',' + str(round(c, 2)) + "\n")

    # Read the file back and show it as a visual check of what was written.
    with open(out_fn, "r", encoding="utf8") as check:
        print(check.read())

../output/batch_1_results.csv
SPAM,1.0
not_spam,1.0
not_spam,1.0
not_spam,1.0
not_spam,0.94
not_spam,1.0
not_spam,1.0
not_spam,1.0
not_spam,1.0
not_spam,1.0

../output/batch_2_results.csv
not_spam,0.99
SPAM,1.0
not_spam,1.0
not_spam,1.0
not_spam,1.0
not_spam,1.0
not_spam,1.0
not_spam,1.0
not_spam,1.0
not_spam,1.0

