## Importation des bibliothèques :

In [58]:
import numpy as np
import pandas as pd
from os import walk #generate the file names in a directory tree
#it returns a tuple of (root => return directories  ,dirs=> sub directories from the root , files=> files from the subdir specified )
from os.path import join
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer #lib used to reduce a word to its base word

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, roc_auc_score
import re #regular expression module 
from sklearn.metrics import confusion_matrix, accuracy_score,f1_score,precision_score,recall_score

from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

In [59]:
# Mount Google Drive (Colab only) so that the dataset folders referenced
# below under /content/drive are readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Afin de récupérer les exemples d'e-mails, on précise le chemin des deux types :

In [60]:

# Directories holding the raw e-mail files, one folder per class.
easy_ham_path="/content/drive/MyDrive/Colab Notebooks/Projet/easy_ham" # folder of legitimate (ham) e-mails
spam_path="/content/drive/MyDrive/Colab Notebooks/Projet/spam" # folder of spam e-mails

# Integer labels stored in the 'Class' column of the e-mail DataFrames.
spam_class = 1  # label given to spam e-mails
ham_class = 0# label given to ham e-mails


## Récupération du corps (body) seulement :
Le corps d'un mail commence habituellement après la première ligne vide, qui le sépare des en-têtes ; c'est ce repère que nous allons utiliser pour le récupérer.

In [61]:
#for each email get the body only  
def get_email_body(path):
    """Yield ``(file_name, body)`` for every file found under ``path``.

    The directory tree is walked recursively with ``os.walk``. Each file is
    decoded as latin-1; everything after the first line that contains only a
    newline is treated as the body, and the lines before it are skipped.

    Args:
        path: Root directory containing the e-mail files.

    Yields:
        tuple[str, str]: The bare file name (without its directory) and the
        body text. The body is ``''`` when the file has no empty line.
    """
    for root, _dirs, files in walk(path):
        for file in files:
            # Join the root with the file name to get the full path.
            filepath = join(root, file)
            body_lines = []
            is_body = False
            # The context manager closes the handle as soon as the file has
            # been read; the original opened it and never closed it.
            with open(filepath, encoding='latin-1') as stream:
                for line in stream:
                    if is_body:
                        body_lines.append(line)
                    elif line == '\n':
                        # First empty line reached: the body starts next.
                        is_body = True

            # Each line still carries its own trailing newline, so joining on
            # '\n' doubles the line breaks. Preserved for backward
            # compatibility with existing callers.
            yield file, '\n'.join(body_lines)


## Regrouper les mails dans une seule dataframe avec leurs classes

In [62]:
# put everything in a dataframe (array)
def group_emails_df(path, classification):
    """Build a DataFrame of e-mail bodies labelled with ``classification``.

    Every ``(file, body)`` pair yielded by ``get_email_body(path)`` becomes
    one row with the columns ``Body`` and ``Class``; the row index is the
    file name.
    """
    # Materialise the generator once so names and rows stay aligned.
    emails = list(get_email_body(path))
    file_names = [file_name for file_name, _ in emails]
    rows = [{'Body': text, 'Class': classification} for _, text in emails]
    return pd.DataFrame(rows, index=file_names)

In [63]:
#Spam Folders
spam_emails = group_emails_df(spam_path,spam_class)
#Ham Folders
ham_emails = group_emails_df(easy_ham_path,ham_class)

In [64]:
#print to see one email only 
#one_email=spam_emails.loc['0001.bfc8d64d12b325ff385cca8d07b84288']
#one_email.Body

In [65]:
# Stack the spam rows on top of the ham rows into one labelled corpus.
data = pd.concat([spam_emails,ham_emails])

## Partie NLP:
Nettoyage des emails.

In [66]:
#Nettoyage d’emails

# Module-level stemmer left in place in case other cells refer to it;
# clean_text() builds its own instance.
stemmer = PorterStemmer()


def clean_text(email):
    """Normalise raw e-mail bodies into space-separated, stemmed tokens.

    For each body the function strips HTML markup, lower-cases the text,
    replaces URLs, e-mail addresses, digit runs and dollar signs with the
    placeholder words ``httpaddr``, ``emailaddr``, ``number`` and ``dollar``,
    drops every non-alphanumeric character, removes isolated single letters
    and finally applies Porter stemming to each remaining word.

    Args:
        email: Iterable of raw body strings (a list or a pandas Series).

    Returns:
        list[str]: One cleaned string per input body, in input order.
    """
    porter = PorterStemmer()
    documents = []

    # Iterate over the values rather than ``email[i]``: an integer key on a
    # Series with a string index relies on a positional fallback that pandas
    # has deprecated.
    for raw in email:
        # Get rid of all HTML tags by extracting the text only, then lowercase.
        content = BeautifulSoup(raw, 'html.parser').get_text().lower()

        # Placeholder substitutions. Patterns are raw strings so that '\s'
        # is not an invalid string escape.
        content = re.sub(r'(http|https)://[^\s]*', 'httpaddr', content)
        content = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', content)
        content = re.sub(r'[0-9]+', 'number', content)
        content = re.sub(r'[$]+', 'dollar', content)

        # Replace every run of special characters by a single space.
        content = re.sub(r'\W+', ' ', content)
        content = re.sub(r'[^a-zA-Z0-9]+', ' ', content)

        # Remove single letters surrounded by whitespace.
        content = re.sub(r'\s+[a-zA-Z]\s+', ' ', content)

        # Remove a single letter at the very start. The original pattern
        # escaped the caret (r'\^...'), so it searched for a literal '^'
        # and could never match.
        content = re.sub(r'^[a-zA-Z]\s+', ' ', content)

        # Stemming (not lemmatization). split() already collapses runs of
        # whitespace, so no separate space-squeezing pass is needed.
        words = content.split()
        documents.append(' '.join(porter.stem(word) for word in words))

    return documents

In [84]:


# Clean every e-mail body and overwrite the 'Body' column with the result.
# NOTE(review): this mutates `data` in place, so re-running the cell cleans
# already-cleaned text; rebuild `data` from the concat cell before re-running.
Msg = clean_text(data.Body)
data.Body = Msg



## Traitement et vectorisation des emails:

In [68]:
def create_vocabList(num, bodies=None, path='vocab.txt'):
    """Write the words that occur at least ``num`` times to a vocabulary file.

    Occurrences are counted over all ``bodies`` (each split on whitespace).
    Qualifying words are written in order of first appearance, each followed
    by a single space.

    Args:
        num: Minimum total number of occurrences for a word to be kept.
        bodies: Iterable of cleaned e-mail strings. Defaults to the ``Body``
            values of the rows of the global ``data`` whose ``Class`` equals
            ``spam_class``.
        path: Output file. Defaults to ``'vocab.txt'`` in the current
            working directory.

    Returns:
        None. The vocabulary is only written to ``path``.
    """
    if bodies is None:
        bodies = data[data['Class'] == spam_class].Body

    counts = {}
    for content in bodies:
        for word in content.split():
            counts[word] = counts.get(word, 0) + 1

    # NOTE(review): getVocabList() reads an absolute Drive path by default;
    # that is the file written here only if the working directory is that
    # folder. Confirm both functions point at the same vocab.txt.
    #
    # The original wrote ``stream.close`` without parentheses, so the file
    # was never explicitly closed; the context manager guarantees it.
    with open(path, 'w') as stream:
        for word, count in counts.items():
            if count >= num:
                stream.write(word)
                stream.write(" ")
            
            
    

In [69]:
def getVocabList(path='/content/drive/MyDrive/Colab Notebooks/Projet/vocab.txt'):
    """Read a whitespace-separated vocabulary file into a word -> index dict.

    Indices are assigned in order of first appearance, starting at 0, so
    they always cover exactly ``range(len(vocabList))``.

    Args:
        path: Vocabulary file to read. Defaults to the vocab.txt stored in
            the project folder on the mounted Drive.

    Returns:
        dict[str, int]: Mapping from each distinct word to its index.
    """
    vocabList = {}
    with open(path, 'r') as vocab:
        for line in vocab:
            for w in line.split():
                # The original used ``words.index(w)``, which is quadratic,
                # restarts at 0 on every line (index collisions) and leaves
                # gaps after a repeated word, so an index could exceed
                # len(vocabList). Numbering by current dict size keeps the
                # indices unique and contiguous.
                vocabList.setdefault(w, len(vocabList))

    return vocabList

In [70]:
#get the vocab list 
#split the email into an array of words
#for each word , if it exists in vocab list then store its index in the word_indices array 
def tokenize(email, vocabList=None):
    """Map the words of a cleaned e-mail to their vocabulary indices.

    Args:
        email: Cleaned e-mail text; it is split on whitespace.
        vocabList: Optional word -> index dict. When omitted it is loaded
            with ``getVocabList()``; pass it explicitly to avoid re-reading
            the vocabulary file on every call.

    Returns:
        list[int]: One index per in-vocabulary word, in order of appearance
        (repeats included). Out-of-vocabulary words are skipped.
    """
    # ``is None`` rather than truthiness so that an explicitly passed empty
    # vocabulary is honoured instead of triggering a file read.
    if vocabList is None:
        vocabList = getVocabList()
    return [vocabList[word] for word in email.split() if word in vocabList]
        
        

In [71]:

def extraction(email, vocabList=None):
    """Encode a cleaned e-mail as a binary bag-of-words row vector.

    Args:
        email: Cleaned e-mail text; it is split on whitespace.
        vocabList: Optional word -> index dict whose indices lie in
            ``range(len(vocabList))``. Loaded with ``getVocabList()`` when
            omitted.

    Returns:
        numpy.ndarray: Float array of shape ``(1, len(vocabList))`` holding
        1 at the index of every vocabulary word present in ``email`` and 0
        elsewhere.
    """
    if vocabList is None:
        vocabList = getVocabList()

    x = np.zeros((1, len(vocabList)))

    # Same lookup as tokenize(), done here so the vocabulary file is read
    # once per call instead of twice (the original loaded it here and again
    # inside tokenize()).
    for word in email.split():
        idx = vocabList.get(word)
        if idx is not None:
            # Assign 1 to index idx in x.
            x[0, idx] = 1

    return x



In [72]:
# Write vocab.txt with the words that occur at least 100 times in the spam e-mails.
create_vocabList(100)

In [73]:
# Load the vocabulary, then try the pipeline on one e-mail:
# `idc` holds its vocabulary indices and `x` its feature vector.
vocabList= getVocabList()
idc=tokenize(data.loc['0001.bfc8d64d12b325ff385cca8d07b84288'].Body)
x=extraction(data.loc['0001.bfc8d64d12b325ff385cca8d07b84288'].Body)
# Vocabulary size (shown as the cell output).
len(vocabList)

266

In [74]:
#turn all emails into vectors 
def all_emails_extra(data):
    """Turn a collection of cleaned e-mails into a 2-D feature array.

    Args:
        data: Iterable of cleaned e-mail strings (a list or a pandas Series).

    Returns:
        numpy.ndarray: Array of shape ``(len(data), vocabulary_size)`` with
        one ``extraction()`` row per e-mail, in input order. An empty
        ``(0, 0)`` array is returned when ``data`` is empty.
    """
    # Iterate over the values instead of ``data[i]``: an integer key on a
    # Series with a string index relies on a positional fallback that pandas
    # has deprecated. extraction() returns a (1, n) array; keep its only row.
    rows = [extraction(email)[0] for email in data]
    if not rows:
        return np.zeros((0, 0))

    # Stacking 1-D rows gives (N, n) directly. The original built an
    # (N, 1, n) array and depended on np.asmatrix squeezing the middle axis;
    # np.matrix is discouraged by NumPy and rejected as estimator input by
    # recent scikit-learn releases, so a plain ndarray of the same shape is
    # returned instead.
    return np.array(rows)
# Feature matrix for the whole corpus: one row per cleaned e-mail body.
x=all_emails_extra(data.Body)

In [75]:

# Sanity check: (number of e-mails, vocabulary size).
print(x.shape)



(3052, 266)


## Partie Tests:
Ici, nous allons tester les différents modèles vus durant le semestre.

In [76]:
# Hold out 30% of the e-mails for testing; random_state=0 fixes the shuffle
# so the split is the same on every run.
X_train, X_test,y_train,y_test = train_test_split(x,data.Class, test_size =0.3, random_state=0)

#X_test.shape



## SVM

In [77]:
# NOTE(review): C is assigned but never handed to SVC, so the classifier is
# trained with its default regularisation; confirm whether C=C was intended.
C = 0.1

# Linear-kernel SVM: fit on the training split, then predict the test split.
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)
y_predicted_SVM = clf.predict(X_test)

# Per-class precision / recall / F-score / support, kept for inspection.
scores_SVM = precision_recall_fscore_support(y_test, y_predicted_SVM)

# Test-set accuracy followed by F1, precision and recall.
print("Accuracy of SVM is:", round(clf.score(X_test, y_test), 4))
print('F1 score::', f1_score(y_test, y_predicted_SVM))
print('precision score:', precision_score(y_test, y_predicted_SVM))
print('recall_score:', recall_score(y_test, y_predicted_SVM))

Accuracy of SVM is: 0.9716
F1 score:: 0.9121621621621622
precision score: 0.9246575342465754
recall_score: 0.9




In [None]:
# ROC curve of the linear SVM fitted in the previous cell. The original
# referenced `test_y` and `test_prediction_scores`, which no preceding cell
# defines (NameError); use the held-out labels and the SVM's decision-function
# values as scores instead.
fpr, tpr, thresholds = roc_curve(y_test, clf.decision_function(X_test))


# MLPClassifier

In [78]:
from sklearn.neural_network import MLPClassifier


In [79]:
# Multi-layer perceptron with scikit-learn's default settings.
# random_state is pinned to 0, as for the train/test split and the logistic
# regression, so that the reported scores are reproducible from run to run.
NN = MLPClassifier(random_state=0)
NN.fit(X_train, y_train)
y_pred = NN.predict(X_test)

# Accuracy as a percentage, plus the confusion matrix of the test split.
accuracy = accuracy_score(y_test,y_pred)*100
confusion_mat = confusion_matrix(y_test,y_pred)
print("Accuracy for Neural Network is:",accuracy)
print("Confusion Matrix")
print(confusion_mat)
print('F1 score::',f1_score(y_test, y_pred))
print('precision score:',precision_score(y_test, y_pred))
print('recall_score:',recall_score(y_test, y_pred))






Accuracy for Neural Network is: 97.27074235807859
Confusion Matrix
[[755  11]
 [ 14 136]]
F1 score:: 0.9158249158249158
precision score: 0.9251700680272109
recall_score: 0.9066666666666666




# Gaussian Naive Bayes

In [80]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes: fit on the training split, then predict the test split.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Test-set score followed by F1, precision and recall.
print(gnb.score(X_test, y_test))
print('F1 score::', f1_score(y_test, y_pred))
print('precision score:', precision_score(y_test, y_pred))
print('recall_score:', recall_score(y_test, y_pred))

0.8897379912663755
F1 score:: 0.705539358600583
precision score: 0.6269430051813472
recall_score: 0.8066666666666666




In [83]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes: fit on the training split, then predict the test split.
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
y_pred = mnb.predict(X_test)

# Test-set score followed by F1, precision and recall.
print(mnb.score(X_test, y_test))
print('F1 score::', f1_score(y_test, y_pred))
print('precision score:', precision_score(y_test, y_pred))
print('recall_score:', recall_score(y_test, y_pred))

0.9596069868995634
F1 score:: 0.8802588996763754
precision score: 0.8553459119496856
recall_score: 0.9066666666666666




## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression directly on the 0/1 class labels.
# NOTE(review): LinearRegression is a regressor, so score() here is presumably
# a regression score (R^2) rather than an accuracy; confirm before comparing
# it with the classifiers' scores.
lReg = LinearRegression().fit(X_train, y_train)
lReg.score(X_test, y_test)




0.6579639689805565

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with a fixed seed. Note that this rebinds `clf`,
# which previously held the SVM.
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# F1, precision and recall on the test split.
print('F1 score::', f1_score(y_test, y_pred))
print('precision score:', precision_score(y_test, y_pred))
print('recall_score:', recall_score(y_test, y_pred))

# Test-set score, displayed as the cell output.
clf.score(X_test, y_test)


F1 score:: 0.9278350515463918
precision score: 0.9574468085106383
recall_score: 0.9




0.9770742358078602