In [253]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

In [254]:
df = pd.read_csv('spam.csv', encoding='Windows-1252')

In [255]:
for n, g in df.groupby(['v1']): 
    print(n, len(g))

ham 4825
spam 747


In [256]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [257]:
lenc = LabelEncoder()

X = df['v2']
y = lenc.fit_transform(df['v1']) 

lenc.inverse_transform(np.array([0, 1]))

  if diff:


array(['ham', 'spam'], dtype=object)

In [258]:
partitioner = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)

indexes = partitioner.split(X, y)
train_indexes, test_indexes = list(indexes)[0]

X_train, y_train = X[train_indexes], y[train_indexes]
X_test, y_test = X[test_indexes], y[test_indexes]

In [259]:
print('Train ham size:', len(y_train[y_train == 0]), 'Train spam size:', len(y_train[y_train == 1]))
print('Test ham size:', len(y_test[y_test == 0]), 'Test spam size:', len(y_test[y_test == 1]))

Train ham size: 3377 Train spam size: 523
Test ham size: 1448 Test spam size: 224


In [260]:
X_train.head(5)

3467    yay! finally lol. i missed our cinema trip las...
1164    Well. Im computerless. Time to make some oreo ...
5059                       I think i am disturbing her da
1721    Am watching house ‰ÛÒ very entertaining ‰ÛÒ am...
1921            No current and food here. I am alone also
Name: v2, dtype: object

In [316]:
class SpamNaiveBayes(object):
    
    def __init__(self, prior_prob=0.5, n_gram=(1, 1)):
        self.prior_prob = prior_prob
        
        self.vectorizer = CountVectorizer(analyzer='word', stop_words=None, ngram_range=n_gram, max_features=None)
        self.spam_vectorizer = CountVectorizer(analyzer='word', stop_words=None, ngram_range=n_gram, max_features=None)
        self.ham_vectorizer = CountVectorizer(analyzer='word', stop_words=None, ngram_range=n_gram, max_features=None)
        
        self.spam_vector = dict()
        self.ham_vector = dict()
        
        self.spam_frequencies = dict()
        self.ham_frequencies = dict()
        
    def fit(self, X, y): 
        self.vectorizer.fit(X)
        self.ham_vectorizer.fit(X_train[y_train == 0])
        self.spam_vectorizer.fit(X_train[y_train == 1])
        
        for w, c in self.vectorizer.vocabulary_.items(): 
            float_c = float(c)
            spam_freq = self.spam_vectorizer.vocabulary_.get(w, 0) / float_c
            try:
                ham_freq = self.ham_vectorizer.vocabulary_.get(w, 0) / float_c
            except ZeroDivisionError:
                ham_freq = 0
            
            if spam_freq == 0:
                spam_freq = 0.00001
            if ham_freq == 0:
                ham_freq = 0.00001
            
            self.spam_frequencies[w] = spam_freq
            self.ham_frequencies[w] = ham_freq
            
            self.spam_vector[w] = np.log(spam_freq)
            self.ham_vector[w] = np.log(ham_freq)
            
        self.analyzer = self.vectorizer.build_analyzer()

    def predict(self, X): 
        probs = self.predict_probabilities(X)
        return np.argmax(probs, axis=1)
    
    def predict_probabilities(self, X):
        probs = np.zeros((X.shape[0], 2))
        for i, example in enumerate(X): 
            e = self.analyzer(example)
            
            spam_prob = 0
            ham_prob = 0
            for word in e: 
                spam_prob += self.spam_vector.get(word, 0)
                ham_prob += self.ham_vector.get(word, 0)
                        
            probs[i, 0] = np.exp(ham_prob) * self.prior_prob
            probs[i, 1] = np.exp(spam_prob) * self.prior_prob
        return probs


In [317]:
classifier = SpamNaiveBayes(n_gram=(1, 2))

classifier.fit(X_train, y_train)



In [318]:
y_pred_train = classifier.predict(X_train)
print('Train Accuracy', accuracy_score(y_train, y_pred_train))
print('Train F1 score', f1_score(y_train, y_pred_train))

y_pred_test = classifier.predict(X_test)
print('Test Accuracy', accuracy_score(y_test, y_pred_test))
print('Test F1 score', f1_score(y_test, y_pred_test))


Train Accuracy 1.0
Train F1 score 1.0
Test Accuracy 0.9754784688995215
Test F1 score 0.900726392251816


In [319]:
confusion_matrix(y_train, y_pred_train)

array([[3377,    0],
       [   0,  523]])

In [320]:
confusion_matrix(y_test, y_pred_test)

array([[1445,    3],
       [  38,  186]])

In [332]:
import random
from sklearn.utils import shuffle

words = []

for i in range(10):  
    n = random.random()
    s = 0
    for w, f in shuffle(list(classifier.spam_frequencies.items())):
        s += f
        if s >= n: 
            words.append(w)
            break
            
print(words)

['25p free', 'question this', '0845 021', 'join take', 'send', '6031 between', 'god ve', '542 0825', 'subscribed textcomp', 'six chances']
