In [2]:

import pandas as pd
import string
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import VotingClassifier
import seaborn as sns
from matplotlib import rcParams
import warnings
warnings.filterwarnings('ignore')
 

In [3]:
 
class Data_Preprocessing:
    """Load the labelled e-mail CSV, clean its text and turn it into model features.

    Typical flow: ``read_data`` -> ``feature_creation`` (or
    ``featcreation_countvector``) -> ``split_train_test``.

    NOTE: the loaded DataFrame is stored in the *module-level* global ``emails``
    (see ``read_data`` / ``split_train_test``), not on the instance, so every
    instance shares the most recently loaded dataset.
    """

    # Declared here but not read or written by any method of this class; the
    # methods use the module-level ``emails`` global instead.
    emails= pd.DataFrame()

    # Lazily-built caches shared by all instances. Building the stop-word set
    # and the stemmer once (instead of once per token) is what keeps
    # ``text_process`` / ``stemmer`` fast on a whole corpus.
    _stop_words = None
    _stemmer = None

    def __init__(self):
        """Print a start-up banner; no data is loaded until ``read_data``."""
        print('Object created......Data Preprocessing starts')
        print('----------------------------------------------------------------')

    @classmethod
    def _english_stop_words(cls):
        """Return the NLTK English stop words as a frozenset, loading them once."""
        if cls._stop_words is None:
            cls._stop_words = frozenset(stopwords.words('english'))
        return cls._stop_words

    @classmethod
    def _english_stemmer(cls):
        """Return a shared English SnowballStemmer, creating it on first use."""
        if cls._stemmer is None:
            cls._stemmer = SnowballStemmer("english")
        return cls._stemmer

    def read_data(self,input_dataset):
        """Read the dataset and return the cleaned e-mail texts.

        Parameters
        ----------
        input_dataset : path or buffer accepted by ``pandas.read_csv``
            CSV file decoded as latin-1; must contain ``Email`` and ``Label``
            columns.

        Returns
        -------
        pandas.Series
            Copy of the ``Email`` column with punctuation and English stop
            words removed (see ``text_process``).

        Side effects
        ------------
        Stores the raw DataFrame in the module-level global ``emails`` and
        prints progress information.
        """
        global emails

        print('Reading Data from the csv file')
        emails= pd.read_csv(input_dataset, encoding='latin-1')

        print('Prints the first 5 rows of the dataframe')
        print(emails.head())
        print('----------------------------------------------------------------')

        print('Number of emails in each label')
        print(emails.Label.value_counts())
        print('----------------------------------------------------------------')

        print('A copy of the Email content is created')
        # Work on a copy so the raw column in ``emails`` stays untouched.
        text_feat= emails['Email'].copy()
        print('----------------------------------------------------------------')

        print('Calling the text_process function to remove punctuation and stopwords.')
        print('----------------------------------------------------------------')
        print('This might take few minutes')
        text_feat= text_feat.apply(self.text_process)
        print('\n')
        print(text_feat.head())

        return text_feat

    def text_process(self, text):
        """Strip punctuation and English stop words from one e-mail body.

        Parameters
        ----------
        text : str
            Raw e-mail text. Missing values (``None`` / NaN) are treated as an
            empty string and other non-string values are converted with
            ``str`` instead of raising ``AttributeError``.

        Returns
        -------
        str
            Remaining tokens joined by single spaces, original casing kept.
        """
        if not isinstance(text, str):
            text = "" if pd.isna(text) else str(text)

        # Delete every character listed in string.punctuation.
        text = text.translate(str.maketrans('', '', string.punctuation))

        # Set membership replaces re-reading the stop-word corpus per token.
        stop_words = self._english_stop_words()
        return " ".join(word for word in text.split() if word.lower() not in stop_words)

    def stemmer(self, text):
        """Stem every whitespace-separated token of ``text``.

        Returns the stems joined by spaces; as in the original implementation
        each stem is followed by a space, so a non-empty result ends with one.
        """
        stem = self._english_stemmer().stem
        return "".join(stem(token) + " " for token in text.split())

    def feature_creation(self, text_feat):
        """Fit a TF-IDF vectorizer (English stop words) on ``text_feat``.

        Returns the document-term matrix produced by ``fit_transform``. The
        fitted vectorizer itself is local to this call and is not returned.
        """
        print('Initialize the TfIdfVectorizer')
        vectorizer= TfidfVectorizer(stop_words='english')
        features = vectorizer.fit_transform(text_feat)
        print('***********Features created successfully*******************')
        print('--------------------------------')
        print('Features: ', features.shape)
        print('\n')
        return features

    def featcreation_countvector(self, text_feat):
        """Fit a default CountVectorizer on ``text_feat`` and return the count matrix."""
        print('Initialize the count vector')
        vector_count= CountVectorizer()
        features_count= vector_count.fit_transform(text_feat)
        print('Features using count vectorizer created successfully')
        print('----------------------------------')
        print('Features_count: ', features_count.shape)
        print('\n')
        return features_count

    def split_train_test(self, features):
        """Split ``features`` and the dataset labels 70/30 into train and test parts.

        Labels come from the ``Label`` column of the module-level ``emails``
        DataFrame, so ``read_data`` must have been called first and
        ``features`` must have one row per e-mail in that frame.

        Returns
        -------
        tuple
            ``(features_train, features_test, labels_train, labels_test)``.
        """
        global emails
        # Fixed random_state keeps the split reproducible between runs.
        features_train, features_test, labels_train, labels_test = train_test_split(
            features, emails['Label'], test_size=0.3, random_state=111
        )
        print('Features_train: ', features_train.shape)
        print('Features_test: ', features_test.shape)
        print('Labels_train: ', labels_train.shape)
        print('Labels_test: ', labels_test.shape)
        print('\n')
        return features_train, features_test, labels_train, labels_test


In [4]:
def text_process_one_email(self, text):
    """Tokenise a single e-mail, dropping punctuation and English stop words.

    Parameters
    ----------
    self : unused
        Kept only for signature compatibility; the function never reads it.
    text : str
        Raw e-mail text.

    Returns
    -------
    list of str
        Remaining tokens in their original order and casing.
    """
    # Delete every character listed in string.punctuation.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Load the stop words once per call; the original reloaded the list for
    # every token and scanned it linearly.
    stop_words = set(stopwords.words('english'))
    return [word for word in text.split() if word.lower() not in stop_words]

In [8]:

# Relative path of the CSV file that is later passed to read_data().
input_dataset = "./Datasets/final_dataset.csv"


In [9]:

# Instantiate the preprocessing helper (its constructor only prints a start-up banner).
Processed_dataset = Data_Preprocessing()


Object created......Data Preprocessing starts
----------------------------------------------------------------


In [10]:

# Load the CSV, then strip punctuation and English stop words from every e-mail.
text_feat = Processed_dataset.read_data(input_dataset)

Reading Data from the csv file
Prints the first 5 rows of the dataframe
                                               Email   Label  Length
0   cvs of candidates for rac support role  these...  NORMAL     126
1   http : / / www . joelpittet . com  hello ,  i...    SPAM     940
2   market internet access - no investment needed...    SPAM     294
3    Greetings!!   I come to you with a sincere h...   FRAUD    2878
4   the best possible mortgage  has your mortgage...    SPAM     568
----------------------------------------------------------------
Number of emails in each label
NORMAL    1000
SPAM      1000
FRAUD     1000
Name: Label, dtype: int64
----------------------------------------------------------------
A copy of the Email content is created
----------------------------------------------------------------
Calling the text_process function to remove punctuation and stopwords.
----------------------------------------------------------------
This might take few minutes


0    cvs can

In [31]:
def text_process(text):
    """Strip punctuation and English stop words from ``text``.

    Parameters
    ----------
    text : str
        Raw text, e.g. one e-mail body.

    Returns
    -------
    str
        Remaining tokens joined by single spaces, original casing kept.
    """
    # Delete every character listed in string.punctuation.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Load the stop words once per call; the original reloaded the list for
    # every token and scanned it linearly.
    stop_words = set(stopwords.words('english'))
    return " ".join(word for word in text.split() if word.lower() not in stop_words)

In [32]:
import pickle

# Re-fit the TF-IDF vocabulary on the cleaned corpus so new text can be
# vectorized. Only the fitted state is needed, so fit() replaces the original
# fit_transform() whose result was discarded.
# NOTE(review): the pickled classifiers presumably were trained on features
# built from this same corpus with the same vectorizer settings; if not, the
# feature columns will not line up -- confirm, or persist the training
# vectorizer next to the models.
vectorizer = TfidfVectorizer(stop_words='english')
vectorizer.fit(text_feat)


def _load_model(path):
    """Unpickle one trained classifier and close the file handle afterwards."""
    # SECURITY: pickle.load can execute arbitrary code contained in the file;
    # only load model files from a trusted source.
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)


abc_model = _load_model('model/abc.pkl')
bc_model = _load_model('model/bc.pkl')
dtc_model = _load_model('model/dtc.pkl')
etc_model = _load_model('model/etc.pkl')
knc_model = _load_model('model/knc.pkl')
mnb_model = _load_model('model/mnb.pkl')
rfc_model = _load_model('model/rfc.pkl')
svc_model = _load_model('model/svc.pkl')
lrc_model = _load_model('model/lrc.pkl')

In [None]:
# Read one message from the user and clean it with text_process().
X = text_process(input("Type anything"))
# transform() expects an iterable of documents, hence the one-element list.
Y = vectorizer.transform([X])

# Run the vectorized message through every loaded classifier.
predict_out_abc = abc_model.predict(Y)  # AdaBoost Classifier
predict_out_bc = bc_model.predict(Y)    # Bagging Classifier
predict_out_dtc = dtc_model.predict(Y)  # Decision Tree Classifier
predict_out_etc = etc_model.predict(Y)  # ExtraTrees Classifier
predict_out_knc = knc_model.predict(Y)  # K-Nearest Neighbour
predict_out_mnb = mnb_model.predict(Y)  # Multinomial Naive Bayes
predict_out_rfc = rfc_model.predict(Y)  # Random Forest Classifier
predict_out_svc = svc_model.predict(Y)  # Support Vector Machine
predict_out_lrc = lrc_model.predict(Y)  # Logistic Regression

# One prediction per line, in the same order as the assignments above.
print(
    predict_out_abc,
    predict_out_bc,
    predict_out_dtc,
    predict_out_etc,
    predict_out_knc,
    predict_out_mnb,
    predict_out_rfc,
    predict_out_svc,
    predict_out_lrc,
    sep="\n",
)