In [37]:
import nltk
nltk.download('stopwords')

import itertools
from tqdm import tqdm
import pandas as pd
import string
import numpy as np
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

english_stop_words = stopwords.words("english")

data = pd.read_csv("enronSpamSubset.csv")
train_data, test_data = train_test_split(data)




[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/silviu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [52]:
class CLASIFICATORUL_INTELIGENT():
    def __init__(self, data, no_classes=2):
        self.data = data
        self.no_classes = no_classes
        self.prob_map = {i: {} for i in range(no_classes)}
        self.spam_words = []
        self.ham_words = []
        
    def prepare_data(self):
        for row_index, row in tqdm(self.data.iterrows()):
            body = row["Body"]
            label = row["Label"]
            
            words = self.clean_body(body)
            
            if label == 1: # verif daca este spam
                self.spam_words.append(words)
            else:
                self.ham_words.append(words)

        self.print_stats()
        self.spam_words = list(itertools.chain.from_iterable(self.spam_words))
        self.ham_words = list(itertools.chain.from_iterable(self.ham_words))
        
    def predict(self, body):
        words = self.clean_body(body)
        
        ham_log_prob = self._handle_predict(words, 0)
        spam_log_prob = self._handle_predict(words, 1)

        return np.argmax([ham_log_prob, spam_log_prob])
        
        
    def _handle_predict(self, words, class_index):
        log_probs = []
        for word in words:
            if word in self.prob_map[class_index]:
                proba = self.prob_map[class_index][word]
            else:
                proba = 1e-8
            
            log_probs.append(np.log(proba))
        return np.sum(log_probs)
        
                
    def fit(self):
        smoothing_factor = len(np.unique(self.spam_words + self.ham_words))
        
        self._handle_fit(self.spam_words, 1, smoothing_factor)
        self._handle_fit(self.ham_words, 0, smoothing_factor)
        
    def _handle_fit(self, words, class_index, smoothing_factor):
        from collections import Counter
        words_counter = Counter(words)
        for word in words_counter:
            self.prob_map[class_index][word] = (words_counter[word] + 1)/(len(words) + smoothing_factor)
    
    def print_stats(self):
        self._stat_helper(self.spam_words, "<SPAM>")
        self._stat_helper(self.ham_words, "<HAM>")
        
    def _stat_helper(self, matrix, class_name):
        lens = list(map(lambda x: len(x), matrix))
        print(f"FACEM STAT PENTRU: {class_name}")
        print(f"Media de cuvinte este: {np.mean(lens)}")
        print("==========================================")
        
    def clean_body(self, body):
        words = wordpunct_tokenize(body) # avem cuvinte
        words = self.make_lower(words)
        words = self.remove_stop_words(words)
        words = self.remove_punctuation(words)
        words = self.replace_num(words)
        
        return words
        
    def make_lower(self, words):
        return [word.lower() for word in words]
    
    def remove_stop_words(self, words):
        return [word for word in words if word not in english_stop_words]
    
    def remove_punctuation(self, words):
        return [word for word in words if word not in string.punctuation]
    
    def replace_num(self, words):
        return ["<SUPERBET>" if word.isdigit() else word for word in words]
    

In [53]:
clasificatorul = CLASIFICATORUL_INTELIGENT(data, 2)
clasificatorul.prepare_data()
clasificatorul.fit()

10000it [00:06, 1508.77it/s]


FACEM STAT PENTRU: <SPAM>
Media de cuvinte este: 139.0178
FACEM STAT PENTRU: <HAM>
Media de cuvinte este: 181.0638


In [79]:
clasificatorul.predict("capitalism")

0

In [77]:
def evalueaza_model(model, test_data):
    from sklearn.metrics import classification_report
    predicted_labels = []
    true_labels = []
    
    for row_index, row in test_data.iterrows():
        
        body = row["Body"]
        label = row["Label"]
        
        true_labels.append(label)
        predicted_labels.append(model.predict(body))
    
    print(classification_report(true_labels, predicted_labels))
    
evalueaza_model(clasificatorul, test_data)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1241
           1       1.00      0.99      0.99      1259

    accuracy                           0.99      2500
   macro avg       0.99      0.99      0.99      2500
weighted avg       0.99      0.99      0.99      2500

