# Email dataset 

# Importing the Libraries for ML work 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from gensim.models import Word2Vec
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download stopwords if not already downloaded
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
data=pd.read_csv('D:\Model_Deployement\Dataset\Phishing_Email.csv',index_col=False)

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1,the other side of * galicismos * * galicismo *...,Safe Email
2,2,re : equistar deal tickets are you still avail...,Safe Email
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email


In [5]:
record=data['Email Text'][0]

In [8]:
data.isna().sum()

Unnamed: 0    0
Email Text    0
Email Type    0
dtype: int64

In [7]:
# replace the na with empty space
data['Email Text'].fillna('',inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Email Text'].fillna('',inplace=True)


Apply the preprocessing on the text data for more refining the feature 

In [9]:
# function for clean the data 
# function for clean the data 
ps=PorterStemmer()

def clean(data):
    data = re.sub(r'[^a-zA-Z\s]', '', data)
    data = data.lower()
    stop_words = set(stopwords.words('english'))
    words = data.split()
    words = [word for word in words if word not in stop_words]
    words = [ps.stem(word) for word in words]
    return words  # returns a list of tokens
    

In [10]:
cleaned_record=clean(record)

In [11]:
# applying the clean data 
cleaned_record

['disc',
 'uniformitarian',
 'sex',
 'lang',
 'dick',
 'hudson',
 'observ',
 'us',
 'use',
 'aughter',
 'voc',
 'thoughtprovok',
 'sure',
 'fair',
 'attribut',
 'son',
 'treat',
 'like',
 'senior',
 'rel',
 'one',
 'thing',
 'nt',
 'normal',
 'use',
 'brother',
 'way',
 'aughter',
 'hard',
 'imagin',
 'natur',
 'class',
 'compris',
 'senior',
 'rel',
 'exclud',
 'brother',
 'anoth',
 'seem',
 'differ',
 'imagin',
 'distinct',
 'seem',
 'senior',
 'rel',
 'term',
 'use',
 'wider',
 'varieti',
 'context',
 'e',
 'g',
 'call',
 'distanc',
 'get',
 'someon',
 'attent',
 'henc',
 'begin',
 'utter',
 'wherea',
 'seem',
 'natur',
 'utter',
 'like',
 'ye',
 'son',
 'hand',
 'son',
 'one',
 'like',
 'son',
 'son',
 'help',
 'although',
 'perhap',
 'latter',
 'one',
 'complet',
 'imposs',
 'alexi',
 'mr']

In [None]:
# Apply the clean function to all emails to get tokens
data['tokens'] = data['Email Text'].apply(clean)

In [None]:

# Train Word2Vec model on the tokenized emails
w2v_model = Word2Vec(sentences=data['tokens'], vector_size=100, window=5, min_count=1, workers=4)

# Function to get average vector for an email
def get_email_vector(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

# Apply the vectorization to each email
sentences = data['tex'].tolist()
data['email_vector'] = data['tokens'].apply(lambda x: get_email_vector(x, w2v_model))

# Show the first email's vector
data['email_vector'].iloc[0]

array([-0.6678477 ,  0.22126657,  0.21499443,  0.09209052,  0.2987265 ,
       -0.7009883 , -0.01120482,  0.34883332, -0.3345779 ,  0.25105402,
        0.04017329, -0.30243143,  0.36628243,  1.0737046 , -0.04614336,
       -0.9678252 ,  0.21168451, -0.5687821 , -0.06632439, -0.9823827 ,
        0.1740822 ,  0.40818503,  0.12804006, -0.6247795 , -0.44254443,
       -0.33016866, -0.20093495, -0.1483393 , -0.7685666 , -0.62716454,
       -0.42291382,  0.5179392 , -0.25876224,  0.27129355, -0.30164707,
        0.5108289 , -0.16620749, -0.34609273, -0.4565303 , -0.8155032 ,
       -0.31691018, -0.2911172 ,  0.13368687,  0.50038874,  0.99702036,
        0.57966775, -0.55134386,  0.5217776 ,  0.66396356,  0.5022494 ,
       -0.49617875, -0.58116686,  0.1696924 ,  0.2763639 , -0.94695497,
        0.21582384,  0.18826862, -0.24038428,  0.15748498, -0.37939388,
        1.2337221 ,  0.15604076, -0.22079156, -0.71186864, -0.64174193,
        0.11655094, -0.63188636,  0.00459586, -1.1010785 ,  0.30

# Target feature 

Data set is silghtly imbalannced as the number of the phising mail are very less compared to the safe emails


In [18]:
data['Email Type'].value_counts(normalize=True)

Email Type
Safe Email        0.607078
Phishing Email    0.392922
Name: proportion, dtype: float64

In [None]:

encoder = OneHotEncoder(sparse_output=False)
email_type_encoded = encoder.fit_transform(data[['Email Type']])

# Convert to DataFrame for easy viewing
email_type_encoded_df = pd.DataFrame(email_type_encoded, columns=encoder.get_feature_names_out(['Email Type']))

# Concatenate with the original data if needed
data = pd.concat([data.reset_index(drop=True), email_type_encoded_df.reset_index(drop=True)], axis=1)

# Show the result
data.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type,tokens,email_vector,Email Type_Phishing Email,Email Type_Safe Email
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email,"[disc, uniformitarian, sex, lang, dick, hudson...","[-0.6678477, 0.22126657, 0.21499443, 0.0920905...",0.0,1.0
1,1,the other side of * galicismos * * galicismo *...,Safe Email,"[side, galicismo, galicismo, spanish, term, na...","[-0.7129134, 0.6111404, 0.38503578, -0.0425917...",0.0,1.0
2,2,re : equistar deal tickets are you still avail...,Safe Email,"[equistar, deal, ticket, still, avail, assist,...","[-0.90638566, 0.925917, -0.4976956, 1.7943083,...",0.0,1.0
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email,"[hello, hot, lil, horni, toy, one, dream, open...","[-0.4146968, 0.12298537, -0.43686053, 0.212549...",1.0,0.0
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email,"[softwar, incred, low, price, lower, draperi, ...","[-0.66229075, 0.42618254, 0.044395894, 0.27359...",1.0,0.0


In [20]:
data.drop(columns=['Email Type_Safe Email'],axis=1,inplace=True)

In [21]:
data.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type,tokens,email_vector,Email Type_Phishing Email
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email,"[disc, uniformitarian, sex, lang, dick, hudson...","[-0.6678477, 0.22126657, 0.21499443, 0.0920905...",0.0
1,1,the other side of * galicismos * * galicismo *...,Safe Email,"[side, galicismo, galicismo, spanish, term, na...","[-0.7129134, 0.6111404, 0.38503578, -0.0425917...",0.0
2,2,re : equistar deal tickets are you still avail...,Safe Email,"[equistar, deal, ticket, still, avail, assist,...","[-0.90638566, 0.925917, -0.4976956, 1.7943083,...",0.0
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email,"[hello, hot, lil, horni, toy, one, dream, open...","[-0.4146968, 0.12298537, -0.43686053, 0.212549...",1.0
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email,"[softwar, incred, low, price, lower, draperi, ...","[-0.66229075, 0.42618254, 0.044395894, 0.27359...",1.0


In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18650 entries, 0 to 18649
Data columns (total 6 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Unnamed: 0                 18650 non-null  int64  
 1   Email Text                 18650 non-null  object 
 2   Email Type                 18650 non-null  object 
 3   tokens                     18650 non-null  object 
 4   email_vector               18650 non-null  object 
 5   Email Type_Phishing Email  18650 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 874.3+ KB


In [23]:
data.rename(columns={'email_vector':'Vector Embedding of email','Email Type_Phishing Email':'Email Type-safe-1_phishing-0'},inplace=True)

In [24]:
data.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type,tokens,Vector Embedding of email,Email Type-safe-1_phishing-0
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email,"[disc, uniformitarian, sex, lang, dick, hudson...","[-0.6678477, 0.22126657, 0.21499443, 0.0920905...",0.0
1,1,the other side of * galicismos * * galicismo *...,Safe Email,"[side, galicismo, galicismo, spanish, term, na...","[-0.7129134, 0.6111404, 0.38503578, -0.0425917...",0.0
2,2,re : equistar deal tickets are you still avail...,Safe Email,"[equistar, deal, ticket, still, avail, assist,...","[-0.90638566, 0.925917, -0.4976956, 1.7943083,...",0.0
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email,"[hello, hot, lil, horni, toy, one, dream, open...","[-0.4146968, 0.12298537, -0.43686053, 0.212549...",1.0
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email,"[softwar, incred, low, price, lower, draperi, ...","[-0.66229075, 0.42618254, 0.044395894, 0.27359...",1.0


# Feature and Target vector 

In [25]:
Feature = data['Vector Embedding of email']
Target=data['Email Type-safe-1_phishing-0']

# Save the feature and targer data

In [26]:
data.to_csv('Processed_data.csv')

In [27]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18650 entries, 0 to 18649
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Unnamed: 0                    18650 non-null  int64  
 1   Email Text                    18650 non-null  object 
 2   Email Type                    18650 non-null  object 
 3   tokens                        18650 non-null  object 
 4   Vector Embedding of email     18650 non-null  object 
 5   Email Type-safe-1_phishing-0  18650 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 874.3+ KB


In [28]:
# train test spilt
X_train,X_test,y_train,y_test=train_test_split(Feature,Target,train_size=0.8,shuffle=True)

In [29]:
X_train.shape

(14920,)

In [30]:
X_test.shape

(3730,)

In [31]:
X_train = np.vstack(X_train)
X_test= np.vstack(X_test)


In [32]:
X_train

array([[-0.73934144,  1.65815508,  0.37823188, ..., -1.05790412,
        -0.34428424,  0.58607697],
       [-0.44345942,  0.40076584,  0.11992031, ..., -1.03830945,
         0.10230283,  0.26108736],
       [-0.08985421,  0.11782513,  0.13377899, ..., -0.22693452,
        -0.01546864,  0.02803038],
       ...,
       [-0.89855182,  0.98663414,  0.69710547, ..., -0.34088913,
         0.83277804,  0.26008883],
       [-0.75611758,  0.98667288,  1.08522797, ..., -0.57572919,
         0.58420241, -0.21412207],
       [-0.07135737,  0.24142891,  0.01019724, ..., -0.42663091,
        -0.20971957,  0.05674595]])

In [36]:
import pickle
def load_model(file):
    with open(file,'rb') as f:
        model=pickle.load(f)

    return model

xgmodel=load_model('Final_model\model.pkl')

In [47]:
type(X_test[0])

numpy.ndarray

In [43]:
from sklearn.metrics import classification_report
y_pred=xgmodel.predict(X_test)
report=classification_report(y_test,y_pred,output_dict=True)
print(f"{report['accuracy']*100} %")

95.09383378016085 %


In [54]:
X_test[0].shape

(100,)

# Training the model on the dataset with Hyperparameter tuning 

In [None]:
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
import mlflow
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import mlflow.sklearn

In [None]:
models={
    'catboost': {
        'model': CatBoostClassifier(
            iterations=200,
            learning_rate=0.05,
            depth=5,
            verbose=False,
            random_state=42
        ),
        'params': {
            'iterations': [100, 200],
            'learning_rate': [0.01, 0.05, 0.1],
            'depth': [3, 5, 7]
        }
    },
    'lightgbm': {
        'model': LGBMClassifier(
            n_estimators=200,
            learning_rate=0.05,
            max_depth=5,
            random_state=42
        ),
        'params': {
            'n_estimators': [100, 200],
            'learning_rate': [0.01, 0.05, 0.1],
            'max_depth': [3, 5, 7],
            'num_leaves': [15, 31, 63]
        }
    },
    'XGBoost': {
        'model': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        'params': {
            'n_estimators': [50, 100],
            'max_depth': [3, 5],
            'learning_rate': [0.1]
        }
    },
    'RandomForest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [50, 100],
            'max_depth': [4, 8]
        }
    },
    'SVC': {
        'model': SVC(),
        'params': {
            'C': [0.1, 1],
            'kernel': ['linear']
        }
    },
    'Decision_Tree':{
        'model':DecisionTreeClassifier(),
        'params':{
            'max_depth':[5,8],
        }
    },
   'LogisticRegression': {
        'model': LogisticRegression(max_iter=1000),
        'params': {
            'C': [0.01, 0.1, 1],
            'penalty': ['l2'],
            'solver': ['lbfgs']
        }
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3, 5, 7],
            'weights': ['uniform', 'distance']
        }
    }
}

In [None]:

mlflow.set_experiment('Hyper-Parametertuning of Classifier_with_Metrices')
mlflow.set_registry_uri('http://127.0.0.1:5000/')

for model_name,clf_model in models.items():
    with mlflow.start_run(run_name=model_name+' Hyperparameter tuning'):
        classifier=GridSearchCV(clf_model['model'],clf_model['params'],cv=5,scoring='accuracy')
        classifier.fit(X_train,y_train)
        y_pred=classifier.predict(X_test)
        report=classification_report(y_test,y_pred,output_dict=True)
        mlflow.log_param("model_name",model_name)

        #log parameters
        for param_name, param_value in classifier.best_params_.items():
            mlflow.log_param(param_name,param_value)
        
        mlflow.log_metrics({
        'accuracy': report['accuracy'],
        'recall_class_0': report['0.0']['recall'],
        'recall_class_1': report['1.0']['recall'],
        'f1_score_macro': report['macro avg']['f1-score']
         })
        # Log best model
        mlflow.sklearn.log_model(classifier.best_estimator_, model_name + "_model")
        

        print(f"{model_name} logged in MLflow with accuracy: {classifier.best_score_:.4f}")

