In [16]:
import numpy as np
import scipy as sp
import os

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB


In [17]:
# utilities
def build_split(data, labels, indices):
    """Pick the samples and labels found at the given positions.

    Parameters
    ----------
    data : indexable
        The samples to select from.
    labels : indexable
        The labels, aligned position-by-position with ``data``.
    indices : iterable
        Positions to extract, in the order they should appear in the result.

    Returns
    -------
    tuple of (list, list)
        The selected samples and the selected labels, in ``indices`` order.
    """
    # walk `indices` exactly once, pairing each sample with its label
    picked = [(data[pos], labels[pos]) for pos in indices]
    chosen_data = [pair[0] for pair in picked]
    chosen_labels = [pair[1] for pair in picked]
    return (chosen_data, chosen_labels)

In [18]:
###### Pre-Processing #####
# load the raw corpus file; every element of `lines` is one line of the file,
# line ending included
with open("../data/SMSSpamCollection", mode="r", encoding="utf8") as fh:
    lines = list(fh)

In [19]:
# dedupe the original data; dict.fromkeys keeps first-seen order, so the
# corpus order is the same on every run (set() order can change between runs)
lines = list(dict.fromkeys(lines))

# split each record into its label and its message text
labels = []
text = []

for line in lines:
    # split on the first tab only, so a tab inside the message is preserved
    label, sep, message = line.partition('\t')
    if not sep:
        # blank or malformed record without a tab separator: nothing to learn from
        continue
    # "ham" becomes "not_spam"; every other label is treated as "SPAM"
    labels.append("not_spam" if label == "ham" else "SPAM")
    text.append(message)

In [24]:
from random import sample

# draw up to 50 random messages for a quick manual look at the corpus;
# the min() guard avoids a ValueError when fewer than 50 messages exist
short = sample(text, min(50, len(text)))
print(short)

# write the messages back exactly as read (they still carry the line ending
# from the corpus file); utf8 matches the encoding the corpus was read with,
# and the with-block closes the file even if a write fails
with open("sample", "w", encoding="utf8") as f_out:
    for s in short:
        f_out.write(s)

["I know where the  &lt;#&gt;  is, I'll be there around 5\n", 'Sometimes we put walls around our hearts,not just to be safe from getting hurt.. But to find out who cares enough to break the walls &amp; get closer.. GOODNOON:)\n', "Misplaced your number and was sending texts to your old number. Wondering why i've not heard from you this year. All the best in your mcat. Got this number from my atlanta friends\n", 'She doesnt need any test.\n', 'As a registered optin subscriber ur draw 4 £100 gift voucher will be entered on receipt of a correct ans to 80062 Whats No1 in the BBC charts\n', "K I'll head out in a few mins, see you there\n", "So how's scotland. Hope you are not over showing your JJC tendencies. Take care. Live the dream\n", "Its not the same here. Still looking for a job. How much do Ta's earn there.\n", 'What part of "don\'t initiate" don\'t you understand\n', "I haven't forgotten you, i might have a couple bucks to send you tomorrow, k? I love ya too\n", "The last thing i e

In [5]:
# NOTE(review): BernoulliNB is already imported in the first cell, and
# MultinomialNB / ComplementNB are not used in the cells visible here.
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB

###### Training #####
# Learn the vocabulary from `text` and turn every message into a token-count
# row. The fitted `vectorizer` is reused later to transform the input files.
vectorizer = CountVectorizer()
mat_train = vectorizer.fit_transform(text)

# Fit a Bernoulli naive Bayes classifier on the whole vectorized corpus with
# `labels` as targets; no train/test split is applied in the cells visible here.
bnb = BernoulliNB()
bnb.fit(mat_train, labels)



In [9]:
###### Input File Processing #####
in_path = "../input"
in_files = []

# gather the path of every regular file in the input folder, in directory
# listing order, leaving out .DS_Store entries (for cross-platform work)
for entry in os.listdir(in_path):
    candidate = os.path.join(in_path, entry)
    if entry.endswith(".DS_Store") or not os.path.isfile(candidate):
        continue
    in_files.append(candidate)

In [11]:
##### Predict #####
def predict_input(file, vect, model):
    """Classify every text line of one input document.

    Parameters
    ----------
    file : iterable of str
        The text samples to classify, one per element.
    vect : object
        Fitted vectorizer; its ``transform`` method turns ``file`` into the
        feature matrix handed to ``model``.
    model : object
        Fitted classifier exposing ``predict`` and ``predict_proba``.

    Returns
    -------
    tuple
        ``(p, c)`` where ``p`` holds the predicted label of each sample and
        ``c`` the highest class probability of each sample (the certainty).
    """
    mat_in = vect.transform(file)

    p = model.predict(mat_in)
    # take the probabilities from the model that was passed in, so labels and
    # certainties always come from the same classifier (not a notebook global)
    c = np.max(model.predict_proba(mat_in), axis=1)
    return p, c  # predicted outcome and certainty

In [12]:
# one (predictions, certainties) pair per input file, kept for the whole session
results = []

for in_fn in in_files:
    if in_fn.endswith(".DS_Store"):
        continue

    # every line of the input file is classified as one separate message
    with open(in_fn, mode="r", encoding="utf8") as fh:
        txt = fh.readlines()

    outcome = predict_input(txt, vectorizer, bnb)
    results.append(outcome)

In [13]:
##### Write Output #####

In [14]:
out_path = "../output"
# make sure the destination folder exists before any result file is opened
os.makedirs(out_path, exist_ok=True)

# one output name per input file that was actually collected; deriving the
# names from `in_files` (rather than re-listing the folder) keeps them in the
# same order and count as `results`, even if the input folder holds
# sub-directories or changes between cells
out_files = [os.path.basename(in_fn) + "_results.csv" for in_fn in in_files]

['batch_1_results.csv', 'batch_2_results.csv']


In [17]:
# every output file needs its own (predictions, certainties) pair
if len(out_files) != len(results):
    raise ValueError(
        "out_files and results are misaligned: "
        + str(len(out_files)) + " names vs " + str(len(results)) + " results"
    )

for file, (predictions, certainty) in zip(out_files, results):
    out_fn = os.path.join(out_path, file)

    # header plus one row per classified line; iterating over this file's own
    # predictions (not the line count of the last file read) keeps the row
    # count correct when input files differ in length
    with open(out_fn, "w", encoding="utf8") as f_out:
        f_out.write("Prediction, Certainty" + "\n")
        for p, c in zip(predictions, certainty):
            f_out.write(str(p) + ',' + str(round(float(c), 2)) + "\n")

    # read the file back and show it as a quick visual check
    with open(out_fn, "r", encoding="utf8") as check:
        print(check.read())

Prediction, Certainty
SPAM,1.0
not_spam,1.0
not_spam,1.0
not_spam,1.0
not_spam,0.94
not_spam,1.0
not_spam,1.0
not_spam,1.0
not_spam,1.0
not_spam,1.0

Prediction, Certainty
not_spam,0.99
SPAM,1.0
not_spam,1.0
not_spam,1.0
not_spam,1.0
not_spam,1.0
not_spam,1.0
not_spam,1.0
not_spam,1.0
not_spam,1.0

