In [9]:
import pandas as pd

# Load the dataset. The path is relative, so Phishing_Email.csv must sit in
# the notebook's working directory.
data = pd.read_csv('Phishing_Email.csv')
# Preview the first rows to check the columns used later:
# 'Email Text' (features) and 'Email Type' (labels).
# NOTE(review): the 'Unnamed: 0' column in the preview looks like a row index
# that was written into the CSV; passing index_col=0 to read_csv would
# probably absorb it - confirm before changing.
print(data.head())

   Unnamed: 0                                         Email Text  \
0           0  re : 6 . 1100 , disc : uniformitarianism , re ...   
1           1  the other side of * galicismos * * galicismo *...   
2           2  re : equistar deal tickets are you still avail...   
3           3  \nHello I am your hot lil horny toy.\n    I am...   
4           4  software at incredibly low prices ( 86 % lower...   

       Email Type  
0      Safe Email  
1      Safe Email  
2      Safe Email  
3  Phishing Email  
4  Phishing Email  


In [10]:

#Preprocess the Data


def preprocess_data(data):
    """Return a cleaned version of the email DataFrame.

    Two steps are applied, in this order:

    1. Rows with a missing value in any column are dropped.
    2. The 'Email Text' column is converted to lowercase.

    Args:
        data: DataFrame containing at least an 'Email Text' column of strings.

    Returns:
        A new DataFrame holding only the complete rows, with lower-cased
        'Email Text'. Surviving rows keep their original index labels and
        the column order is unchanged.
    """
    complete_rows = data.dropna()
    lowered_text = complete_rows['Email Text'].str.lower()
    # assign() replaces the existing column in place within a new frame,
    # so the column order stays the same.
    return complete_rows.assign(**{'Email Text': lowered_text})

# Run the cleaning step on the loaded dataset and keep the result under its
# own name; every later cell reads from processed_data.
processed_data = preprocess_data(data)
# NOTE(review): this prints the whole frame (pandas truncates the middle rows);
# processed_data.head() would be enough for a sanity check.
print(processed_data)

       Unnamed: 0                                         Email Text  \
0               0  re : 6 . 1100 , disc : uniformitarianism , re ...   
1               1  the other side of * galicismos * * galicismo *...   
2               2  re : equistar deal tickets are you still avail...   
3               3  \nhello i am your hot lil horny toy.\n    i am...   
4               4  software at incredibly low prices ( 86 % lower...   
...           ...                                                ...   
18645       18646  date a lonely housewife always wanted to date ...   
18646       18647  request submitted : access request for anita ....   
18647       18648  re : important - prc mtg hi dorn & john , as y...   
18648       18649  press clippings - letter on californian utilit...   
18649       18650                                              empty   

           Email Type  
0          Safe Email  
1          Safe Email  
2          Safe Email  
3      Phishing Email  
4      Phishing

In [16]:

#Split the data


from sklearn.model_selection import train_test_split

# Features are the cleaned email bodies; labels are the 'Email Type' column,
# whose values are 'Safe Email' or 'Phishing Email'.
x = processed_data['Email Text']
y = processed_data['Email Type']

# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state fixes the seed of the shuffle, so every run of this cell
# produces exactly the same train/test partition (reproducible results).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Print the shapes of the resulting datasets to verify the split sizes.
print(f"Training data shape: {x_train.shape}, Training labels shape: {y_train.shape}")
print(f"Testing data shape: {x_test.shape}, Testing labels shape: {y_test.shape}")


Training data shape: (14907,), Training labels shape: (14907,)
Testing data shape: (3727,), Testing labels shape: (3727,)


In [20]:

#Convert the text data into a numerical format that can be used for the model

from sklearn.feature_extraction.text import TfidfVectorizer 
# TfidfVectorizer converts raw text into numerical TF-IDF features.

vectorizer = TfidfVectorizer()
# A TfidfVectorizer object with default settings.

# fit       = analyzes the training data: learns the vocabulary and calculates
#             the IDF weights.
# transform = applies the learned vocabulary and weights to convert text into
#             a numerical matrix (one row per email, one column per word,
#             each cell holding that word's TF-IDF score).
# The test data is only transformed, never fitted: the vocabulary and weights
# must be learned from the training data alone, but the test emails still have
# to be converted into a matrix with the same columns.
x_train_tfidf = vectorizer.fit_transform(x_train)  # TF-IDF matrix of the training emails
x_test_tfidf = vectorizer.transform(x_test) 

# NOTE: the triple-quoted string below is a bare expression, not a comment.
# Because it is the last expression in the cell, Jupyter echoes its repr as
# the cell output.

"""
What is the TF-IDF
It is the Term-Frequency(TF) and Inverse Document Frequency (IDF) Calculation

A High TF-IDF means the word appears frequently in a specific document but not compare to ther others so the word will be 
more important and carry meaningful context that can help differentiate one classe from another (Phishing vs Safe Email)

TF = measures how often a word appears in a document
High = word is significant for the particular document

IDF = measures how common or rare a word is across all documents
If word appears in many docs than IDf value decreases vs is word is rare in the dataset, IDF value if high

So having a high TF-IDF are good for classification

"""



'\nWhat is the TF-IDF\nIt is the Term-Frequency(TF) and Inverse Document Frequency (IDF) Calculation\n\nA High TF-IDF means the word appears frequently in a specific document but not compare to ther others so the word will be \nmore important and carry meaningful context that can help differentiate one classe from another (Phishing vs Safe Email)\n\nTF = measures how often a word appears in a document\nHigh = word is significant for the particular document\n\nIDF = measures how common or rare a word is across all documents\nIf word appears in many docs than IDf value decreases vs is word is rare in the dataset, IDF value if high\n\nSo having a high TF-IDF are good for classification\n\n'

In [21]:

#Train the model

from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes classifier, created with its default settings.
model = MultinomialNB()
# Fit on the TF-IDF matrix of the training emails and their labels only;
# the test split is kept aside for the evaluation cell.
model.fit(x_train_tfidf, y_train)




In [22]:

#Evaluate the model

from sklearn.metrics import accuracy_score, classification_report

# Predict a label for every held-out test email.
y_pred = model.predict(x_test_tfidf)
# Overall accuracy of the predictions against the true test labels.
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
# Per-class precision, recall, F1-score and support for the two labels.
print(classification_report(y_test, y_pred))

Accuracy: 0.8921384491548162
                precision    recall  f1-score   support

Phishing Email       0.97      0.76      0.85      1518
    Safe Email       0.86      0.98      0.92      2209

      accuracy                           0.89      3727
     macro avg       0.91      0.87      0.88      3727
  weighted avg       0.90      0.89      0.89      3727



In [23]:

#Save the model using Pickle


# pickle can serialize (convert an object into a byte stream) and deserialize
# (convert a byte stream back into an object) Python objects. That is useful
# for trained machine learning models: they can be loaded later without
# having to retrain them.
import pickle

# Both artifacts are saved: the classifier, and the fitted vectorizer, because
# new emails must be transformed with the same TF-IDF vocabulary and weights.
for pkl_path, artifact in (
    ('phishing_detection_model.pkl', model),
    ('tfidf_vectorizer.pkl', vectorizer),
):
    # 'wb' opens the file in write-binary mode.
    with open(pkl_path, 'wb') as file:
        # pickle.dump takes the object to serialize first, then the open file
        # object the serialized bytes are written to.
        pickle.dump(artifact, file)
    

In [24]:


#Loading the Saved Model:

# Each block below opens a .pkl file in read-binary mode ('rb'), which is the
# mode needed for a serialized (pickled) object.
# pickle.load reads the byte stream from the file and deserializes it back
# into a Python object, which is assigned to the variable on the left.
#
# The same is done for the vectorizer.
#
# Afterwards loaded_vectorizer can transform the text of new emails into the
# TF-IDF format, and loaded_model can predict whether those emails are
# phishing or safe.
#
# NOTE: unpickling can execute arbitrary code, so only load .pkl files that
# come from a trusted source (here: the files written by the save cell).

# Load the model
with open('phishing_detection_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Load the vectorizer
with open('tfidf_vectorizer.pkl', 'rb') as file:
    loaded_vectorizer = pickle.load(file)



In [27]:
# Testing the Model with New Data

def predict_email(email_text, vectorizer=None, model=None):
    """Classify a single email and return its predicted label.

    Args:
        email_text: The raw email body as a string.
        vectorizer: Optional fitted vectorizer exposing ``transform``. When
            omitted, the notebook-level ``loaded_vectorizer`` is used.
        model: Optional fitted classifier exposing ``predict``. When omitted,
            the notebook-level ``loaded_model`` is used.

    Returns:
        The first (and only) element of the model's prediction for this
        email, i.e. one of the labels the model was trained on.

    Raises:
        NameError: If an argument is omitted and the corresponding
            ``loaded_*`` object has not been created yet.
    """
    # Fall back to the objects restored from disk so existing one-argument
    # calls keep working; explicit arguments let the function be reused or
    # tested with any other fitted pair.
    if vectorizer is None:
        vectorizer = loaded_vectorizer
    if model is None:
        model = loaded_model

    # Apply the same lower-casing that was applied to the training text.
    normalized_text = email_text.lower()

    # The vectorizer expects a collection of documents, hence the
    # one-element list.
    email_tfidf = vectorizer.transform([normalized_text])

    return model.predict(email_tfidf)[0]

# Example: classify a new, hand-written email that was not part of the dataset
new_email = "Sign up now to get a FREE prize! Click here: http://example.com"
print(f'Prediction: {predict_email(new_email)}')  # Prints the predicted label: 'Safe Email' or 'Phishing Email'


Prediction: Phishing Email
