In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer # 
from sklearn.pipeline import Pipeline 
from sklearn.ensemble import RandomForestClassifier # 
from sklearn.preprocessing import StandardScaler 
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv("phishing email detection system/phishing_email_main_columns.csv")

In [3]:
df.head(10)

Unnamed: 0,Sender_Name,Sender_Email,Recipient_Email,Subject,Email_Body,Timestamp,Has_Attachment,Attachment_Type,URL_Count,URL_Domains,Email_Client,Language,Label
0,Jason Lindsey,susanrobinson@yahoo.com,mjackson@burton.com,Particular travel land participant federal.,Could forward occur quickly tend today focus. ...,2025-03-27 07:36:36,True,zip,4,"oconnor.com, leon.org, scott-mckinney.org, mor...",Apple Mail,en,legitimate
1,Jessica Walter,harrisandrew@smith.org,sthompson@gmail.com,Matter because create later southern board one.,Apply she push loss right candidate attention....,2024-10-21 00:03:52,True,none,7,"simon-martinez.biz, french-holt.net, richmond....",Outlook,en,phishing
2,Willie Ingram,rmedina@johnson.com,gordonamy@hart.com,Piece phone tend form fish white.,Shoulder seek black wall. Have between minute ...,2024-11-15 04:20:56,True,none,9,"moses.info, joseph.com, cox.com, fernandez-bon...",Apple Mail,es,phishing
3,Misty Stevenson,leonallison@brown-anderson.net,catherine45@ramos.com,Music evening huge put.,Someone poor professor quite forward free. Las...,2024-11-02 09:17:45,False,none,8,"burns.net, liu.org, odom.net, evans.info, hern...",Thunderbird,en,phishing
4,Julian Cline,chavezbrandon@hotmail.com,ricekaitlyn@yahoo.com,I third them vote direction.,Group bit us force can true. Establish adult b...,2025-03-10 00:03:16,True,none,10,"mitchell.com, wolf-snow.info, fitzgerald.com, ...",Outlook,zh,legitimate
5,Steven Hernandez,rogersamanda@giles-bush.org,smithelaine@white.com,Thank college none race material.,Hear history another view memory you. Maintain...,2024-05-14 22:54:01,True,pdf,1,collins.com,Outlook,de,legitimate
6,Erika Washington,sodom@yahoo.com,joshua52@yahoo.com,Team less enough adult seven.,Every treat create room style green. Tough tri...,2024-11-25 14:25:41,False,none,8,"hartman.com, brown-palmer.net, callahan.info, ...",Thunderbird,fr,legitimate
7,Debbie Lopez,williamsanchez@hotmail.com,jennagentry@morse-lynch.info,Explain high actually guess lose almost.,Different force where nation. Modern relations...,2024-11-25 01:36:35,True,zip,10,"smith.info, lara-green.net, nichols.com, barbe...",Thunderbird,de,legitimate
8,Jacob Kent,brownrebecca@armstrong.biz,charlesgarcia@mitchell.com,Fall choice simply step protect.,Beyond official go impact half name condition....,2025-01-07 08:59:59,True,xlsx,0,,Thunderbird,en,phishing
9,Virginia Coleman,hayley17@hotmail.com,turnerangie@hotmail.com,Body reduce need wish.,Soon great about baby middle. Wind share inclu...,2024-11-18 21:42:22,False,none,9,"brown-young.com, reed-escobar.com, snyder.biz,...",Thunderbird,en,phishing


In [4]:
df.shape

(500, 13)

In [5]:
df.dtypes

Sender_Name        object
Sender_Email       object
Recipient_Email    object
Subject            object
Email_Body         object
Timestamp          object
Has_Attachment       bool
Attachment_Type    object
URL_Count           int64
URL_Domains        object
Email_Client       object
Language           object
Label              object
dtype: object

In [6]:
# Checking the null value

In [7]:
df.isnull().sum()

Sender_Name         0
Sender_Email        0
Recipient_Email     0
Subject             0
Email_Body          0
Timestamp           0
Has_Attachment      0
Attachment_Type     0
URL_Count           0
URL_Domains        46
Email_Client        0
Language            0
Label               0
dtype: int64

In [8]:
# Remove rows where 'domain' is null
df = df.dropna(subset=['URL_Domains'])


In [9]:
df['URL_Domains'] = df['URL_Domains'].fillna('missing')

print(df['URL_Domains'].isnull().sum())


0


In [10]:
# Feature Extraction 

### Email Header Feature Extraction

In [11]:
import socket
import tldextract
import ipwhois

# Extract domain from 'From' field
df['domain'] = df['Sender_Email'].apply(lambda x: tldextract.extract(x).registered_domain)

# Get IP address of sender domain (could fail if DNS not resolvable)
def get_ip(domain):
    try:
        return socket.gethostbyname(domain)
    except:
        return None

df['sender_ip'] = df['domain'].apply(get_ip)

# Get domain age using whois (optional - can be slow)
def get_domain_age(domain):
    try:
        w = whois.whois(domain)
        return (pd.Timestamp.now() - pd.to_datetime(w.creation_date)).days
    except:
        return None

df['domain_age_days'] = df['domain'].apply(get_domain_age)


In [12]:
# URL Feature Extraction

In [13]:
import re
from urllib.parse import urlparse

# Extract URLs from email body
url_pattern = r'(https?://[^\s]+)'
df['urls'] = df['Email_Body'].apply(lambda x: re.findall(url_pattern, str(x)))

# Count number of URLs
df['num_urls'] = df['urls'].apply(len)

# Check for presence of suspicious domains
suspicious_keywords = ['bit.ly', 'tinyurl', 'paypal', 'login', 'verify']
df['suspicious_urls'] = df['urls'].apply(lambda urls: any(any(k in u for k in suspicious_keywords) for u in urls))


In [14]:
# Text-Based Feature Engineering (TF-IDF & Keywords)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Identify phishing keywords
phishing_keywords = ['urgent', 'verify', 'click here', 'limited time', 'account', 'password']
df['has_phishing_keywords'] = df['Email_Body'].apply(lambda x: any(k in str(x).lower() for k in phishing_keywords))

# Apply TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=100)
tfidf_matrix = vectorizer.fit_transform(df['Email_Body'].fillna(""))

# Convert TF-IDF features to DataFrame and merge
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
df = pd.concat([df.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)


In [16]:
# Email Authentication Check (SPF, DKIM, DMARC)

In [17]:
import dns.resolver

def check_spf(domain):
    try:
        answers = dns.resolver.resolve(f"{domain}", 'TXT')
        for rdata in answers:
            if 'spf' in str(rdata).lower():
                return True
        return False
    except:
        return False

df['spf_valid'] = df['domain'].apply(check_spf)


In [18]:
check_spf('domain')

False

In [19]:
print(df['Label'].value_counts())


Label
phishing      230
legitimate    224
Name: count, dtype: int64


In [20]:
from sklearn.model_selection import train_test_split

X = df.drop('Label', axis=1)
y = df['Label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Then apply vectorization *only* on X_train['Email_Body']


In [21]:
# Automatically select only numeric columns
numeric_columns = X_train.select_dtypes(include='number').columns


In [22]:
scaler = StandardScaler()
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])


In [23]:
# Example:
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_columns),
    ('text', TfidfVectorizer(max_features=100), 'Email_Body') 
])                                                              

# Full pipeline with model
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('classifier', RandomForestClassifier())
])

pipeline.fit(X_train, y_train)


In [26]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier


In [28]:
from sklearn.ensemble import VotingClassifier
ensemble = VotingClassifier(estimators=[
    ('lr', LogisticRegression()),
    ('rf', RandomForestClassifier()),
    ('xgb', XGBClassifier())
], voting='soft')


In [29]:
from sklearn.model_selection import GridSearchCV

params = {'n_estimators': [50, 100], 'max_depth': [None, 10, 20]}
grid = GridSearchCV(RandomForestClassifier(), param_grid=params, cv=5)
grid.fit(X_train[numeric_columns], y_train)


In [30]:
rf = RandomForestClassifier()

In [31]:
rf.fit(X_train[numeric_columns], y_train)

In [32]:
y_pred = rf.predict(X_test[numeric_columns])

In [33]:
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[21 26]
 [25 19]]
              precision    recall  f1-score   support

  legitimate       0.46      0.45      0.45        47
    phishing       0.42      0.43      0.43        44

    accuracy                           0.44        91
   macro avg       0.44      0.44      0.44        91
weighted avg       0.44      0.44      0.44        91



In [34]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.43956043956043955