In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw dataset from the working directory.
try:
    df = pd.read_csv('phishing_email.csv')
except FileNotFoundError as err:
    # Calling exit() inside a notebook asks the kernel to shut down, which
    # throws away all state. Re-raising keeps the kernel alive and stops
    # "Run All" at this cell with the same hint for the reader.
    raise FileNotFoundError(
        "Error: Make sure the CSV file is in the same directory as your script."
    ) from err


In [None]:
# Rows with a missing label cannot be used for supervised training.
# Rebind the name instead of using inplace=True so the frame is not mutated
# behind the back of cells that already displayed it.
df = df.dropna(subset=['label'])

In [None]:
# Plain-text preview of the first five rows.
print(df.head(5))

                                       text_combined  label
0  hpl nom may 25 2001 see attached file hplno 52...      0
1  nom actual vols 24 th forwarded sabrae zajac h...      0
2  enron actuals march 30 april 1 201 estimated a...      0
3  hpl nom may 30 2001 see attached file hplno 53...      0
4  hpl nom june 1 2001 see attached file hplno 60...      0


In [None]:
# Column dtypes and non-null counts, followed by a visual separator line.
df.info()
print("\n", "=" * 40, "\n", sep="")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82486 entries, 0 to 82485
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   text_combined  82486 non-null  object
 1   label          82486 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.3+ MB




In [None]:

# Class balance: number of rows per label value.
df.loc[:, 'label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,42891
0,39595


In [None]:
# Model input is the raw email text; the target is the label column.
X, y = df['text_combined'], df['label']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report the size of each split. The first message has no placeholders, so it
# is a plain string rather than an f-string.
print("Data successfully loaded and split.")
print(f"Training set size: {len(X_train)} emails")
print(f"Testing set size: {len(X_test)} emails")


Data successfully loaded and split.
Training set size: 65988 emails
Testing set size: 16498 emails


In [None]:
import pandas as pd
import re
import string


# Lower-case terms looked for in each email. They are matched as substrings of
# the lower-cased text, so e.g. "update" also matches "updated".
suspicious_keywords = [
    "verify", "password", "urgent", "account", "login", "bank",
    "limited", "security", "update", "confirm"
]


def engineer_features(email_text):
    """Compute hand-crafted numeric features for a single email.

    Args:
        email_text: The email body. Any non-string value (e.g. NaN from a
            missing cell) is treated as an empty string.

    Returns:
        dict: Feature name -> value, always in this key order:
            word_count: number of whitespace-separated tokens.
            char_count: total number of characters.
            keyword_count: how many distinct entries of
                ``suspicious_keywords`` occur in the lower-cased text
                (each keyword counts at most once).
            link_count: occurrences of ``http://`` or ``https://``.
            uppercase_ratio: upper-case characters divided by char_count
                (0.0 for empty text).
            punctuation_count: characters found in ``string.punctuation``.
            number_count: digit characters.
    """
    if not isinstance(email_text, str):
        email_text = ""

    # Lower-case once up front instead of once per keyword.
    lowered_text = email_text.lower()
    char_count = len(email_text)
    uppercase_chars = sum(1 for char in email_text if char.isupper())

    # The insertion order of this dict defines the column order of the
    # feature frame built from it, so keep it stable.
    return {
        'word_count': len(email_text.split()),
        'char_count': char_count,
        'keyword_count': sum(
            1 for keyword in suspicious_keywords if keyword in lowered_text
        ),
        'link_count': len(re.findall(r'http[s]?://', email_text)),
        # Always a float so the value type does not depend on the input.
        'uppercase_ratio': uppercase_chars / char_count if char_count > 0 else 0.0,
        'punctuation_count': sum(
            1 for char in email_text if char in string.punctuation
        ),
        'number_count': sum(1 for char in email_text if char.isdigit()),
    }


# Sanity-check the extractor on the first email of the training split.
sample_email_text = X_train.iloc[0]
extracted_features = engineer_features(sample_email_text)

print(
    "--- Features extracted from a sample email ---",
    extracted_features,
    sep="\n",
)

--- Features extracted from a sample email ---
{'word_count': 42, 'char_count': 286, 'keyword_count': 0, 'link_count': 0, 'uppercase_ratio': 0.0, 'punctuation_count': 0, 'number_count': 28}


In [None]:

# Turn each split into a frame of engineered features, one row per email.
# The original row index is reused so rows stay aligned with y_train / y_test.
print("Applying feature engineering to the training set...")
X_train_features_list = X_train.apply(engineer_features)
X_train_features = pd.DataFrame(
    list(X_train_features_list),
    index=X_train.index,
)

print("Applying feature engineering to the testing set...")
X_test_features_list = X_test.apply(engineer_features)
X_test_features = pd.DataFrame(
    list(X_test_features_list),
    index=X_test.index,
)

print("\n--- First 5 rows of your new engineered training features ---")
print(X_train_features.head())

print("\nFeature engineering complete!")

Applying feature engineering to the training set...
Applying feature engineering to the testing set...

--- First 5 rows of your new engineered training features ---
       word_count  char_count  keyword_count  link_count  uppercase_ratio  \
36471          42         286              0           0              0.0   
7743          400        2504              1           0              0.0   
43256          21         168              0           0              0.0   
52280          16         120              0           0              0.0   
22663         490        3522              0           0              0.0   

       punctuation_count  number_count  
36471                  0            28  
7743                  13            99  
43256                  0            16  
52280                  0            16  
22663                  0            22  

Feature engineering complete!


In [None]:
import numpy as np
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler


print("Vectorizing email text with TF-IDF...")

# Cap the vocabulary at 2000 terms to keep the feature matrix small.
vectorizer = TfidfVectorizer(max_features=2000)

# Fit the vocabulary on the training split only; the test split is transformed
# with the already-fitted vectorizer so nothing from the test set leaks in.
X_train_text = vectorizer.fit_transform(X_train)
X_test_text = vectorizer.transform(X_test)

# Append the engineered columns to the right of the TF-IDF columns. The
# resulting column order is: TF-IDF terms, then X_train_features.columns.
print("Combining text features with engineered features...")
X_train_final = hstack([X_train_text, X_train_features.values])
X_test_final = hstack([X_test_text, X_test_features.values])

print("\n--- Training Baseline Model: Logistic Regression ---")
# The engineered counts (e.g. char_count in the thousands) are on a far larger
# scale than the TF-IDF weights; fitting on the raw matrix made the solver stop
# at its iteration limit. MaxAbsScaler rescales each column by its maximum
# absolute value and accepts sparse input. It lives inside the pipeline, so
# only the baseline sees scaled data and the random forest input is unchanged.
lr_model = make_pipeline(MaxAbsScaler(), LogisticRegression(max_iter=1000))
lr_model.fit(X_train_final, y_train)

lr_predictions = lr_model.predict(X_test_final)

# Report the baseline so the tuned model below has something to be compared to.
print("\n--- Evaluation: Baseline Logistic Regression ---")
print(classification_report(y_test, lr_predictions, target_names=['Safe Email', 'Phishing']))
print("Confusion Matrix:")
print(confusion_matrix(y_test, lr_predictions))

# Hyperparameter values to try: 2 * 3 * 3 = 18 combinations.
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [10, 20, None],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive grid search with 3-fold cross-validation on the training split.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

print("\n" + "="*40 + "\n")
print("--- Starting Hyperparameter Tuning with GridSearchCV ---")
grid_search.fit(X_train_final, y_train)

print(f"\nBest parameters found: {grid_search.best_params_}")
best_rf_model = grid_search.best_estimator_

# Final evaluation on the held-out test split.
rf_predictions = best_rf_model.predict(X_test_final)

print("\n--- Evaluation: Tuned Random Forest ---")
print(classification_report(y_test, rf_predictions, target_names=['Safe Email', 'Phishing']))
print("Confusion Matrix:")
print(confusion_matrix(y_test, rf_predictions))
print("\n" + "="*40 + "\n")
print("--- Top 15 Most Important Features for Random Forest Model ---")
# Feature names in the same column order used to build X_train_final.
text_feature_names = vectorizer.get_feature_names_out()
engineered_feature_names = X_train_features.columns.tolist()
all_feature_names = np.concatenate([text_feature_names, engineered_feature_names])

# Column indices sorted from most to least important.
importances = best_rf_model.feature_importances_
indices = np.argsort(importances)[::-1]

# Slicing cannot run past the end if there are ever fewer than 15 features.
for rank, feature_idx in enumerate(indices[:15], start=1):
    print(f"{rank}. Feature: {all_feature_names[feature_idx]} (Importance: {importances[feature_idx]:.4f})")

Vectorizing email text with TF-IDF...
Combining text features with engineered features...

--- Training Baseline Model: Logistic Regression ---


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(




--- Starting Hyperparameter Tuning with GridSearchCV ---
Fitting 3 folds for each of 18 candidates, totalling 54 fits

Best parameters found: {'max_depth': None, 'min_samples_leaf': 1, 'n_estimators': 100}

--- Evaluation: Tuned Random Forest ---
              precision    recall  f1-score   support

  Safe Email       0.99      0.98      0.98      7935
    Phishing       0.99      0.99      0.99      8563

    accuracy                           0.99     16498
   macro avg       0.99      0.99      0.99     16498
weighted avg       0.99      0.99      0.99     16498

Confusion Matrix:
[[7808  127]
 [ 114 8449]]


--- Top 15 Most Important Features for Random Forest Model ---
1. Feature: wrote (Importance: 0.0300)
2. Feature: aug (Importance: 0.0262)
3. Feature: number_count (Importance: 0.0260)
4. Feature: 2008 (Importance: 0.0237)
5. Feature: enron (Importance: 0.0229)
6. Feature: char_count (Importance: 0.0203)
7. Feature: word_count (Importance: 0.0140)
8. Feature: thanks (Importa

In [None]:
import joblib
# Persist the tuned classifier and the fitted TF-IDF vectorizer.
# NOTE(review): the classifier was trained on TF-IDF columns plus the
# engineered feature columns, so anything loading these files also needs the
# same feature-engineering step to build a compatible input matrix.
for artifact, target_path in (
    (best_rf_model, 'model.pkl'),
    (vectorizer, 'vectorizer.pkl'),
):
    joblib.dump(artifact, target_path)
print("Model and vectorizer saved as model.pkl and vectorizer.pkl")


Model and vectorizer saved as model.pkl and vectorizer.pkl
