### Imports

In [5]:
from os import walk
from os.path import join
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
import numpy as np

%matplotlib

Using matplotlib backend: Qt5Agg


###  Constants

In [2]:
# --- Raw e-mail corpus and the serialized text data -------------------------
EXAMPLE_FILE = "SpamData/01_Processing/practice_email.txt"
SPAM_PATH_1 = "SpamData/01_Processing/spam_assassin_corpus/spam_1"
SPAM_PATH_2 = "SpamData/01_Processing/spam_assassin_corpus/spam_2"
EASY_NONSPAM_PATH_1 = "SpamData/01_Processing/spam_assassin_corpus/easy_ham_1"
EASY_NONSPAM_PATH_2 = "SpamData/01_Processing/spam_assassin_corpus/easy_ham_2"
DATA_JSON = "SpamData/01_Processing/email-text-data.json"

# --- Per-token probability files --------------------------------------------
TOKEN_SPAM_PROB_FILE = 'SpamData/03_Testing/prob-spam.txt'
TOKEN_HAM_PROB_FILE = 'SpamData/03_Testing/prob-nonspam.txt'
TOKEN_ALL_PROB_FILE = 'SpamData/03_Testing/prob-all-tokens.txt'

# --- Test-set feature matrix and target files -------------------------------
TEST_FEATURE_MATRIX = 'SpamData/03_Testing/test-features.txt'
TEST_TARGET_FILE = 'SpamData/03_Testing/test-target.txt'

# Number of most frequent tokens kept in the vocabulary.
VOCAB_SIZE = 2500

### Email Body Generator Function

In [3]:
def email_body_generator(path):
    """Yield ``(file_name, email_body)`` for every file found under ``path``.

    The directory tree is walked recursively. Each file is read as latin-1
    text; every line up to and including the first blank line (the separator
    between headers and body) is skipped, and the remaining lines form the
    body.

    Args:
        path: Root directory to search for e-mail files.

    Yields:
        tuple: The bare file name and the body text. Body lines keep their own
        line endings and are additionally joined with a newline character, so
        line breaks appear doubled in the result.
    """
    for root,_,file_names in walk(path):
        for file_name in file_names:
            file_path=join(root,file_name)
            lines=[]
            is_body=False
            # The context manager closes the handle even if reading raises.
            with open(file_path,encoding='latin-1') as stream:
                for line in stream:
                    if is_body:
                        lines.append(line)
                    elif line=='\n':
                        # First blank line: everything after it is the body.
                        is_body=True
            yield file_name,'\n'.join(lines)

### DataFrame Generator Function

In [4]:
def df_generator(path, classification):
    """Collect every e-mail body under ``path`` into a labelled DataFrame.

    Args:
        path: Directory handed to ``email_body_generator``.
        classification: Value stored in the CATEGORY column of every row.

    Returns:
        DataFrame indexed by file name with columns CATEGORY and MESSAGE.
    """
    names, records = [], []
    for name, body in email_body_generator(path):
        names.append(name)
        records.append({"CATEGORY": classification, "MESSAGE": body})
    return pd.DataFrame(records, index=names)

### Making Dataset (Spam,Ham,All mails)

In [6]:
# CATEGORY 1 marks spam. DataFrame.append was removed in pandas 2.0, so both
# spam folders are combined with pd.concat instead.
spam_emails=pd.concat([df_generator(SPAM_PATH_1,1),df_generator(SPAM_PATH_2,1)])

In [7]:
# CATEGORY 0 marks legitimate mail ("ham"). DataFrame.append was removed in
# pandas 2.0, so both ham folders are combined with pd.concat instead.
ham_emails=pd.concat([df_generator(EASY_NONSPAM_PATH_1,0),df_generator(EASY_NONSPAM_PATH_2,0)])

In [8]:
# Stack the spam rows on top of the ham rows into a single frame.
data = pd.concat(objs=[spam_emails, ham_emails], axis=0)

### Replacing File Names with Document IDs as the Index

In [10]:
# Replace the file-name index with consecutive integer document ids.
document_ids = range(len(data.index))
data = data.assign(DOC_ID=document_ids).set_index('DOC_ID')

### Removing Zero-Length Messages

In [12]:
# Disabled: this would drop the rows whose index label is 'cmds'.
# NOTE(review): when the cells run top to bottom the index has already been
# replaced by integer DOC_IDs, so a label-based drop of 'cmds' would raise
# KeyError if re-enabled here — confirm whether it belongs before that step.
# data.drop(['cmds'],inplace=True)

### Converting Data to JSON

In [13]:
# Persist the labelled messages as JSON at DATA_JSON.
data.to_json(path_or_buf=DATA_JSON)

### Spam and Ham Data Visualization

In [14]:
# value_counts() is indexed by the CATEGORY label itself: 1 is spam and 0 is
# legitimate mail, so each count must be looked up under its own label.
category_counts=data.CATEGORY.value_counts()
amount_of_spam=category_counts[1]
amount_of_ham=category_counts[0]
category_names=['SPAM','LEGIT']
sizes=[amount_of_spam,amount_of_ham]
custom_colors=['#55efc4','#ffeaa7']
plt.figure(figsize=(2,2),dpi=227)
plt.pie(sizes,labels=category_names,textprops={'fontsize':8},startangle=90,autopct='%1.0f%%',colors=custom_colors,explode=[0,0.1])
plt.show()

## Cleaning and Data Pre-Processing

### Downloading NLTK for removing StopWords and Punctuations

In [15]:
# Fetch the tokenizer model and the stop-word corpus, then cache the English
# stop words as a set for fast membership tests.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shantanu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shantanu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Function To Clean Messages

In [16]:
def clean_msg(message,stemmer=PorterStemmer(),stop_words=set(stopwords.words('english'))):
    """Strip markup from a message and return its stemmed, filtered tokens.

    Args:
        message: Raw message text, parsed with the 'html.parser' backend.
        stemmer: Object whose ``stem`` method is applied to each kept token.
        stop_words: Collection of tokens to discard.

    Returns:
        list: Stems of the lower-cased tokens that are purely alphabetic and
        not in ``stop_words``, in their original order.
    """
    plain_text = BeautifulSoup(message, 'html.parser').get_text()
    tokens = word_tokenize(plain_text.lower())
    return [
        stemmer.stem(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]


In [17]:
# One list of cleaned tokens per message.
nested_list = data['MESSAGE'].apply(clean_msg)
# Flatten to a single token list and count how often each stem occurs.
flat_list = [token for tokens in nested_list for token in tokens]
unique_words = pd.Series(flat_list).value_counts()

" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup


In [18]:
# Index labels of the spam (CATEGORY 1) and ham (CATEGORY 0) rows.
spam_doc_ids = data.loc[data['CATEGORY'] == 1].index
ham_doc_ids = data.loc[data['CATEGORY'] == 0].index

In [19]:
# Split the per-message token lists by class.
nested_list_spam = nested_list.loc[spam_doc_ids]
nested_list_ham = nested_list.loc[ham_doc_ids]

In [20]:
# Flatten each class's token lists, then count token frequencies per class.
flat_list_ham = [token for tokens in nested_list_ham for token in tokens]
flat_list_spam = [token for tokens in nested_list_spam for token in tokens]
spammy_words = pd.Series(flat_list_spam).value_counts()
normal_words = pd.Series(flat_list_ham).value_counts()

In [21]:
# value_counts() sorts by descending frequency, so the first VOCAB_SIZE
# entries are the most common stems.
frequent_words = unique_words.iloc[:VOCAB_SIZE]

In [22]:
# Size the ids from the words actually available: frequent_words has fewer
# than VOCAB_SIZE entries when the corpus is small, and a fixed-length index
# would then raise a length-mismatch error.
word_ids=list(range(len(frequent_words)))
vocab=pd.DataFrame({'VOCAB_WORD':frequent_words.index.values},index=word_ids)
vocab.index.name='WORD_ID'

### Creating Word Column Dataset

In [23]:
# One row per message, one token per column.
word_column_df = pd.DataFrame.from_records(data=nested_list.tolist())

### Splitting Training and Testing Data

In [24]:
# Hold out 30% of the messages for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    word_column_df,
    data.CATEGORY,
    test_size=0.3,
    random_state=42,
)

In [25]:
# Label both feature-frame indexes as document ids.
X_train.index.name = 'DOC_ID'
X_test.index.name = 'DOC_ID'

### Creating A Word Index

In [26]:
# Index of vocabulary words; a word's position is its WORD_ID.
word_index = pd.Index(vocab['VOCAB_WORD'])

### Creating Sparse Matrix

In [27]:
def make_sparse_matrix(df,index_words,lables):
    """Build a long-format table with one row per vocabulary-token occurrence.

    Args:
        df: DataFrame with one row per document and one token per cell; its
            index supplies the document ids.
        index_words: pandas Index of vocabulary words; a word's location in
            it becomes the WORD_ID.
        lables: Series mapping a document id to its category label.

    Returns:
        DataFrame with columns LABEL, DOC_ID, OCCURENCE and WORD_ID holding
        one row (OCCURENCE is always 1) for every cell of ``df`` whose value
        is in ``index_words``. When nothing matches, an empty frame with those
        same columns is returned so later group-bys still find their keys.
    """
    column_order=['LABEL','DOC_ID','OCCURENCE','WORD_ID']
    # Resolve every vocabulary word's id once instead of once per occurrence.
    word_ids={word:index_words.get_loc(word) for word in set(index_words)}
    dict_list=[]
    # itertuples yields (index, cell, cell, ...) and avoids per-cell .iat calls.
    for doc_id,*words in df.itertuples(index=True,name=None):
        hits=[word_ids[word] for word in words if word in word_ids]
        if not hits:
            continue
        # Look the label up only for documents that contribute rows.
        category=lables.at[doc_id]
        for word_id in hits:
            dict_list.append({'LABEL':category,'DOC_ID':doc_id,'OCCURENCE':1,'WORD_ID':word_id})
    return pd.DataFrame(dict_list,columns=column_order)

### Train Data Sparse Matrix

In [28]:
%%time
# Token-occurrence rows for the training documents, labelled from y_train.
sparse_train_matrix=make_sparse_matrix(X_train,word_index,y_train)

Wall time: 3min 34s


In [29]:
# Sum the per-occurrence rows into one count per (document, word, label),
# flatten the group keys back into columns, and export as a NumPy array.
trained_group = (
    sparse_train_matrix
    .groupby(['DOC_ID', 'WORD_ID', 'LABEL'])
    .sum()
    .reset_index()
)
sparse_train_data = trained_group.to_numpy()

### Test Data Sparse Matrix

In [31]:
%%time
# Token-occurrence rows for the test documents, labelled from y_test.
sparse_test_matrix=make_sparse_matrix(X_test,word_index,y_test)

Wall time: 1min 23s


In [32]:
# Sum the per-occurrence rows into one count per (document, word, label),
# flatten the group keys back into columns, and export as a NumPy array.
test_group = (
    sparse_test_matrix
    .groupby(['DOC_ID', 'WORD_ID', 'LABEL'])
    .sum()
    .reset_index()
)
sparse_test_data = test_group.to_numpy()

### Creating A Full Matrix From Sparse Matrix

In [33]:
def make_full_matrix(sparse_matrix,nr_words,doc_idx=0,word_idx=1,cat_idx=2,freq_idx=3):
    """Expand a sparse (doc, word, label, count) array into a dense table.

    Args:
        sparse_matrix: 2-D NumPy array with one row per (document, word) pair.
        nr_words: Number of word-id columns (0 .. nr_words-1) to create.
        doc_idx: Column of ``sparse_matrix`` holding the document id.
        word_idx: Column holding the word id.
        cat_idx: Column holding the category label.
        freq_idx: Column holding the occurrence count.

    Returns:
        DataFrame indexed by DOC_ID (one row per distinct document id) with a
        CATEGORY column followed by one count column per word id; words that
        do not occur in a document are 0.
    """
    # Honour doc_idx/nr_words instead of assuming column 0 and 2500 words.
    doc_ids=np.unique(sparse_matrix[:,doc_idx])
    column_names=['CATEGORY']+list(range(nr_words))
    # Start from an integer zero matrix rather than filling an object-dtype
    # frame of NaNs afterwards.
    full_matrix=pd.DataFrame(0,index=pd.Index(doc_ids,name='DOC_ID'),columns=column_names)
    for row in sparse_matrix:
        doc_nr=row[doc_idx]
        full_matrix.at[doc_nr,'CATEGORY']=row[cat_idx]
        full_matrix.at[doc_nr,row[word_idx]]=row[freq_idx]
    return full_matrix

In [34]:
%time
full_train_data=make_full_matrix(sparse_train_data,2500)

Wall time: 0 ns


In [41]:
# Dense test-set matrix sized by the shared VOCAB_SIZE constant.
full_test_data=make_full_matrix(sparse_test_data,VOCAB_SIZE)
# NOTE: X_test / y_test are rebound here from the train_test_split outputs to
# NumPy arrays (word counts and labels), so the earlier cells that consume the
# split outputs cannot be re-run after this one without redoing the split.
X_test=full_test_data.loc[:,full_test_data.columns!="CATEGORY"].to_numpy()
y_test=full_test_data.CATEGORY.to_numpy()

### Mathematical Calculations for Prediction

In [35]:
# Every column except CATEGORY is a per-word count.
full_features = full_train_data.loc[:, full_train_data.columns != 'CATEGORY']
# Vocabulary words per e-mail, then totals overall and per class.
email_lengths = full_features.sum(axis=1)
total_wc = email_lengths.sum()
spam_wc = email_lengths.loc[full_train_data['CATEGORY'] == 1].sum()
ham_wc = email_lengths.loc[full_train_data['CATEGORY'] == 0].sum()

In [36]:
# Per-word totals within each class, with 1 added to every word's count.
trained_spam_tokens = full_features.loc[full_train_data['CATEGORY'] == 1]
trained_ham_tokens = full_features.loc[full_train_data['CATEGORY'] == 0]
summed_span_tokens = 1 + trained_spam_tokens.sum(axis=0)
summed_han_tokens = 1 + trained_ham_tokens.sum(axis=0)

### Probability Of Tokens

In [37]:
# The summed_*_tokens counts already carry +1 per word (add-one smoothing), so
# each denominator must grow by the number of words for the conditional
# probabilities to sum to 1.
nr_tokens=full_features.shape[1]
prob_spam_tokens=summed_span_tokens/(spam_wc+nr_tokens)
prob_ham_tokens=summed_han_tokens/(ham_wc+nr_tokens)
# NOTE(review): this one is unsmoothed, so a word absent from the training
# rows gets probability 0 (and log(0) = -inf wherever its log is taken).
prob_tokens_all=full_features.sum(axis=0)/total_wc


prob_token_spam=prob_spam_tokens.to_numpy()
prob_token_ham=prob_ham_tokens.to_numpy()
prob_token_all=prob_tokens_all.to_numpy()

In [43]:
# CATEGORY is 1 for spam, so its mean is the fraction of spam training mails.
prob_spam=full_train_data.CATEGORY.sum()/full_train_data.CATEGORY.size
print(prob_spam)
# Use the prior measured on this training split rather than a pasted-in
# constant that drifts from the data.
PROB_SPAM=prob_spam

0.3109118086696562


In [44]:
# Log-space score per test e-mail for each class:
#   sum_w count_w * (log P(w | class) - log P(w)) + log P(class)
# The overall token probabilities are stored in prob_token_all.
log_token_all=np.log(prob_token_all)
joint_log_spam=X_test.dot(np.log(prob_token_spam)-log_token_all)+np.log(PROB_SPAM)
joint_log_ham=X_test.dot(np.log(prob_token_ham)-log_token_all)+np.log(1-PROB_SPAM)

### Predictions

In [45]:
# True where the spam score beats the ham score.
prediction = np.greater(joint_log_spam, joint_log_ham)

In [47]:
# Show the first five predictions as 0/1 instead of False/True.
prediction[:5].astype(int)

array([1, 1, 1, 0, 0])