In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from collections import defaultdict
from math import log
from sklearn.metrics import mean_squared_error, f1_score

In [40]:
path = "messages/part"
SPAM = "spam"
LEGIT = "legit"
def getFiles(folderNumber):
    curPath = path + str(folderNumber)
    files = []
    for filename in os.listdir(curPath):
        file = open(curPath + "/" + filename, 'r')
        subject = list(map(int, file.readline().split()[1:]))
        file.readline()
        text = list(map(int, file.readline().split()))
        #print(filename)
        if LEGIT in filename:
            files.append((LEGIT, [subject, text]))
        else:
            files.append((SPAM, [subject, text]))
    return files

In [78]:
def training(trains):
    frequencies = defaultdict(lambda:0)
    classes = defaultdict(lambda:0)
    counts = defaultdict(lambda:0)
    for spam_or_legit, file in trains:
        for word in file[1]:
            frequencies[spam_or_legit, word] += 1.0     #частота встречаемости слов из письма в данном классе
            counts[word] += 1.0                         #количество данного слова в сообщении
        classes[spam_or_legit] += len(file[1])          #количество слов у данного класса
    for spam_or_legit, word in frequencies:
        frequencies[spam_or_legit, word] = (frequencies[spam_or_legit, word] + alpha) / (counts[word] + alpha * len(counts))        # P(x_i | SPAM) || P(x_i | LEGIT)

    for clazz in classes:
        classes[clazz] /= len(trains)
    return classes, frequencies

In [199]:
def trainingWithSubject(trains):
    frequencies = defaultdict(lambda:0)
    classes = defaultdict(lambda:0)
    counts = defaultdict(lambda:0)
    for spam_or_legit, file in trains:
        for word in file[0]:
            #frequencies[spam_or_legit, word] += (1.0 * (int((len(file[1]) / len(file[0])))))
            frequencies[spam_or_legit, word] += 1.0
            counts[word] += 1.0
        for word in file[1]:
            frequencies[spam_or_legit, word] += 1.0
            counts[word] += 1.0
        classes[spam_or_legit] += len(file[1]) + len(file[0])     # число слов такого-то класса
    for spam_or_legit, word in frequencies:
        frequencies[spam_or_legit, word] = (frequencies[spam_or_legit, word] + alpha) / (counts[word] + alpha * len(counts))        # P(x_i | C) || P(x_i | C)
    for clazz in classes:
        classes[clazz] /= len(trains)      # число слов такого-то класса / общее число слов
    return classes, frequencies

In [200]:
errorWeight = defaultdict(lambda:1)
errorWeight[SPAM] = 1
errorWeight[LEGIT] = 1
def classify(classifier, file):
    classes, frequencies = classifier
    # result class = argmax(C, P(C)*p(P(x_i|C))) = argmin(C, -log(P(C) - sum(log(P(x_i|C)))))
    spamProb = -log(classes[SPAM])
    legitProb = -log(classes[LEGIT])
    for word in file[1]:
        spamProb -= log(frequencies[SPAM, word] + 10**(-7))
        legitProb -= log(frequencies[LEGIT, word] + 10**(-7))
    spamProb -= log(errorWeight[SPAM])
    legitProb -= log(errorWeight[LEGIT])
    if (spamProb < legitProb):
        return SPAM
    return LEGIT

In [217]:
from sklearn.metrics import accuracy_score
folds = [getFiles(i) for i in range(1, 11)]
ar = []
for f in folds:
    ar = ar + f

mean_score = []
eWeights = []
errorWeight[SPAM] = 1
errorWeight[LEGIT] = 1
best_alpha = 0.0
best_score = 0.0
#best_lambda = 10**5
xs = []
alpha = 0.01
i = 0
while errorWeight[LEGIT] < 10**10:
    print("Lambda_legit=" + str(errorWeight[LEGIT]))
    predict = []
    answer = []
    counts = [[0, 0], [0, 0]]
    cnt = 0

#while alpha <= 1.0:
    eWeights.append(errorWeight[LEGIT])
    sum_score = 0.0
    for test in folds:
        #print("test " + str(i))
        meanClassifier = defaultdict(lambda:0), defaultdict(lambda:0)
        ar = []
        for trains in folds:
            if trains != test:
                ar = ar + trains
            meanClassifier = trainingWithSubject(ar)
        for clazz, file in test:
            predClass = classify(meanClassifier, file)
            predict.append(int(predClass == SPAM))
            answer.append(int(clazz == SPAM))
            counts[predClass == SPAM][clazz == SPAM] += 1
            cnt += 1
        sum_score += (accuracy_score(predict, answer))
        #i += 1
        print("mse = ", mean_squared_error(predict, answer))
        print("accuracy = ", accuracy_score(predict, answer))
        print("f1 score = ", f1_score(predict, answer))
        print("test count = ", cnt)
        print("Спам в спам = ", counts[0][0])
        print("Спам в реальные = ", counts[0][1])
        print("Реальные в спам = ", counts[1][0])
        print("Реальные в реальные = ", counts[1][1])
    i += 1
    xs.append(i)
    mean_score.append(sum_score / 10.0)
    errorWeight[LEGIT] *= 10
#if sum_score > best_score:
    #best_score = sum_score
    #best_alpha = alpha
    #best_lambda = errorWeight[SPAM]
#alpha += 0.01

#errorWeight[SPAM] *= (10)

#print(best_score)
print(mean_score)
#print(best_lambda)
plt.figure(figsize=(15, 6))
plt.plot(eWeights, mean_score)


Lambda_legit=1
mse =  0.027522935779816515
accuracy =  0.9724770642201835
f1 score =  0.9696969696969697
test count =  109
Спам в спам =  58
Спам в реальные =  0
Реальные в спам =  3
Реальные в реальные =  48
mse =  0.022935779816513763
accuracy =  0.9770642201834863
f1 score =  0.9746192893401014
test count =  218
Спам в спам =  117
Спам в реальные =  0
Реальные в спам =  5
Реальные в реальные =  96
mse =  0.01834862385321101
accuracy =  0.981651376146789
f1 score =  0.9795918367346939
test count =  327
Спам в спам =  177
Спам в реальные =  0
Реальные в спам =  6
Реальные в реальные =  144
mse =  0.013761467889908258
accuracy =  0.9862385321100917
f1 score =  0.9846153846153847
test count =  436
Спам в спам =  238
Спам в реальные =  0
Реальные в спам =  6
Реальные в реальные =  192
mse =  0.01834862385321101
accuracy =  0.981651376146789
f1 score =  0.9793388429752066
test count =  545
Спам в спам =  298
Спам в реальные =  3
Реальные в спам =  7
Реальные в реальные =  237
mse =  0.021

mse =  0.024464831804281346
accuracy =  0.9755351681957186
f1 score =  0.9722222222222222
test count =  327
Спам в спам =  179
Спам в реальные =  4
Реальные в спам =  4
Реальные в реальные =  140


KeyboardInterrupt: 

In [None]:
plt.plot(eWeights, mean_score)
plt.figure(figsize=(15, 6))

In [215]:
predict = []
answer = []
counts = [[0, 0], [0, 0]]
cnt = 0
errorWeight[LEGIT] = 10**70
alpha = 0.01
sum_score = 0.0
for test in folds:
    meanClassifier = defaultdict(lambda:0), defaultdict(lambda:0)
    ar = []
    for trains in folds:
        if trains != test:
            ar = ar + trains
        meanClassifier = trainingWithSubject(ar)
    for clazz, file in test:
        predClass = classify(meanClassifier, file)
        predict.append(int(predClass == SPAM))
        answer.append(int(clazz == SPAM))
        counts[predClass == SPAM][clazz == SPAM] += 1
        cnt += 1
    sum_score += (accuracy_score(predict, answer))
    print("mse = ", mean_squared_error(predict, answer))
    print("accuracy = ", accuracy_score(predict, answer))
    print("f1 score = ", f1_score(predict, answer))
    print("test count = ", cnt)
    print("Спам в спам = ", counts[0][0])
    print("Спам в реальные = ", counts[0][1])
    print("Реальные в спам = ", counts[1][0])
    print("Реальные в реальные = ", counts[1][1])

mse =  0.22018348623853212
accuracy =  0.7798165137614679
f1 score =  0.6666666666666666
test count =  109
Спам в спам =  61
Спам в реальные =  24
Реальные в спам =  0
Реальные в реальные =  24
mse =  0.22935779816513763
accuracy =  0.7706422018348624
f1 score =  0.647887323943662
test count =  218
Спам в спам =  122
Спам в реальные =  50
Реальные в спам =  0
Реальные в реальные =  46
mse =  0.23547400611620795
accuracy =  0.764525993883792
f1 score =  0.6350710900473934
test count =  327
Спам в спам =  183
Спам в реальные =  77
Реальные в спам =  0
Реальные в реальные =  67
mse =  0.22935779816513763
accuracy =  0.7706422018348624
f1 score =  0.647887323943662
test count =  436
Спам в спам =  244
Спам в реальные =  100
Реальные в спам =  0
Реальные в реальные =  92
mse =  0.23302752293577983
accuracy =  0.7669724770642202
f1 score =  0.6402266288951842
test count =  545
Спам в спам =  305
Спам в реальные =  127
Реальные в спам =  0
Реальные в реальные =  113
mse =  0.23700305810397554