In [1]:
# Imports
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# Load phish.csv and discard any row that is missing either the message
# body ('text') or the 'PHISH-GT' column, since both are required below.
data = (
    pd.read_csv('phish.csv')
    .dropna(subset=['text', 'PHISH-GT'])
)

# Preview the first few rows of the cleaned frame
data.head()

Unnamed: 0.1,Unnamed: 0,text,PHISH-GT,billing,account,generic,attachment,typos,click-link,grammar,login,urgency,phish,unsolicited,work
0,42,Please take a look at the attached and give me...,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,44,"Rod,\nI wanted to forward this to you. Arthur...",0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,45,\nI confirmed that since Enron does not deviat...,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,46,Good afternoon.\n\nI wanted to forward you one...,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,47,Kim:\n\nHere is a summary of the structure we ...,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [8]:
def shuffling(texts, labels, seed=1111):
    """Shuffle texts and labels together with one reproducible permutation.

    The same random permutation is applied to both inputs, so the i-th
    shuffled text still corresponds to the i-th shuffled label.

    Parameters
    ----------
    texts : array-like
        Samples to shuffle. Converted with ``np.asarray`` so that lists and
        pandas Series are indexed by position.
    labels : array-like
        One label per entry of ``texts``; must have the same length.
    seed : int, optional
        Seed of the permutation. Defaults to 1111, the value that used to be
        hard-coded, so existing two-argument calls get the same ordering.

    Returns
    -------
    tuple of numpy.ndarray
        ``(texts_shuffled, labels_shuffled)``.

    Raises
    ------
    ValueError
        If ``texts`` and ``labels`` do not have the same length.
    """
    texts = np.asarray(texts)
    labels = np.asarray(labels)
    if len(texts) != len(labels):
        raise ValueError(
            f"texts and labels must have the same length, "
            f"got {len(texts)} and {len(labels)}"
        )

    # A private RandomState produces the same permutation as calling
    # np.random.seed(seed) followed by np.random.permutation(...), but it
    # does not reseed the process-wide NumPy generator as a side effect.
    rng = np.random.RandomState(seed)
    ind_shuffled = rng.permutation(len(texts))
    return texts[ind_shuffled], labels[ind_shuffled]

def partitioning(texts_shuffled, labels_shuffled, test_fraction=0.2):
    """Split already-shuffled texts and labels into train and test parts.

    The first ``int(len(texts_shuffled) * test_fraction)`` entries form the
    test set and the remaining entries form the training set. No shuffling
    happens here; the inputs are sliced in the order given.

    Parameters
    ----------
    texts_shuffled : sequence
        Samples, in the order they should be split.
    labels_shuffled : sequence
        One label per sample; must have the same length as
        ``texts_shuffled``.
    test_fraction : float, optional
        Share of the data assigned to the test set, between 0 and 1
        inclusive. Defaults to 0.2 (20% for testing), the value that used to
        be hard-coded.

    Returns
    -------
    tuple
        ``(texts_train, labels_train, texts_test, labels_test)``.

    Raises
    ------
    ValueError
        If ``test_fraction`` is outside ``[0, 1]`` or the two inputs have
        different lengths.
    """
    # Written as a chained comparison so that NaN is rejected as well.
    if not 0.0 <= test_fraction <= 1.0:
        raise ValueError(
            f"test_fraction must be between 0 and 1, got {test_fraction!r}"
        )
    if len(texts_shuffled) != len(labels_shuffled):
        raise ValueError(
            f"texts and labels must have the same length, "
            f"got {len(texts_shuffled)} and {len(labels_shuffled)}"
        )

    # int() truncates, so the test set size is rounded down.
    num_test = int(len(texts_shuffled) * test_fraction)
    texts_test = texts_shuffled[:num_test]
    labels_test = labels_shuffled[:num_test]
    texts_train = texts_shuffled[num_test:]
    labels_train = labels_shuffled[num_test:]
    return texts_train, labels_train, texts_test, labels_test

In [11]:
# Shuffle the message texts together with their PHISH-GT labels, then split
# the shuffled arrays into the training and test sets.
texts_shuffled, labels_shuffled = shuffling(
    data['text'].values,
    data['PHISH-GT'].values,
)
(texts_train, labels_train,
 texts_test, labels_test) = partitioning(texts_shuffled, labels_shuffled)


In [15]:
# TF-IDF features: the vectorizer is fitted on the training texts only and
# the fitted vectorizer is then re-used to encode the test texts.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.7,
    min_df=5,
    max_features=1000,
)
X_train_tfidf = tfidf_vectorizer.fit_transform(texts_train)
X_test_tfidf = tfidf_vectorizer.transform(texts_test)

# Logistic regression fitted on the training features
# (fit() returns the fitted estimator itself).
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, labels_train)

# Predictions and accuracy, one split at a time
train_predictions = model.predict(X_train_tfidf)
training_accuracy = accuracy_score(labels_train, train_predictions)

test_predictions = model.predict(X_test_tfidf)
test_accuracy = accuracy_score(labels_test, test_predictions)

print(f"Training Accuracy: {training_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

Training Accuracy: 0.9909404659188956
Test Accuracy: 0.9861830742659758
