In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_curve, auc, precision_recall_curve, confusion_matrix
import matplotlib.pyplot as plt
#import seaborn as sns

tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
split_ratio = 0.8

# Load datasets
spam_test = pd.read_csv('./datasets/SpamHam/test.csv').fillna('')
spam_test['label'] = spam_test['label']


spam_ftrain = pd.read_csv('./datasets/SpamHam/train.csv').fillna('')
spam_ftrain['label'] = spam_ftrain['label']
spamsize = int(split_ratio*spam_ftrain.shape[0])
spam_train=spam_ftrain.iloc[:spamsize]
spam_valid=spam_ftrain.iloc[spamsize:]

urls_test = pd.read_csv('datasets/PhishingURLs/test.csv').fillna('')
urls_test['label'] = urls_test['label'].apply(lambda x: 1 if x == 1 else 0)

urls_ftrain = pd.read_csv('datasets/PhishingURLs/test.csv').fillna('')
urls_ftrain['label'] = urls_ftrain['label'].apply(lambda x: 1 if x == 1 else 0)
urlsize = int(split_ratio*spam_ftrain.shape[0])
urls_train=urls_ftrain.iloc[:urlsize]
urls_valid=urls_ftrain.iloc[urlsize:]

homebrew_data = pd.read_csv('HomebrewDataset.csv').fillna('')
homebrew_data['label'] = homebrew_data['label'].apply(lambda x: 1 if x==1 else 0)

# Split each dataset into training and testing sets
#spam_train, spam_test = train_test_split(spam_full, test_size=0.2, random_state=42)
#urls_train, urls_test = train_test_split(urls_full, test_size=0.2, random_state=42)


# Combine the training data from all datasets
#combined_train = pd.concat([spam_train, urls_train], ignore_index=True)
#combined_valid = pd.concat([spam_valid, urls_valid], ignore_index=True)
# Feature extraction for combined training data
X_train_S = tfidf_vectorizer.fit_transform(spam_train['text'])
y_train_S = spam_train['label']
X_valid_S = tfidf_vectorizer.fit_transform(spam_valid['text'])
y_valid_S = spam_valid['label']

X_train_U = tfidf_vectorizer.fit_transform(urls_train['text'])
y_train_U = spam_train['label']
X_valid_U = tfidf_vectorizer.fit_transform(urls_valid['text'])
y_valid_U = spam_valid['label']
# Train Logistic Regression model
#print(y_train)
#print(X_train)
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train_S, y_train_S)
model.fit(X_train_U, y_train_U)

def evaluate_model(X, y, model, dataset_name):
    predictions = model.predict(X)
    accuracy = accuracy_score(y, predictions)
    #fpr, tpr, _ = roc_curve(y.map({'Ham': 0, 'Spam': 1}), model.predict_proba(X)[:, 1])
    #roc_auc = auc(fpr, tpr)
    #precision, recall, _ = precision_recall_curve(y.map({'Ham': 0, 'Spam': 1}), model.predict_proba(X)[:, 1])
    #pr_auc = auc(recall, precision)
    conf_matrix = confusion_matrix(y, predictions)

    # Plotting
    #plt.figure(figsize=(10, 5))
    #plt.subplot(1, 2, 1)
    #plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    #plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    #plt.xlim([0.0, 1.0])
    #plt.ylim([0.0, 1.05])
    #plt.xlabel('False Positive Rate')
    #plt.ylabel('True Positive Rate')
    #plt.title(f'ROC Curve for {dataset_name}')
    #plt.legend(loc="lower right")

    #plt.subplot(1, 2, 2)
    #plt.plot(recall, precision, color='blue', lw=2, label='Precision-Recall curve (area = %0.2f)' % pr_auc)
    #plt.xlim([0.0, 1.0])
    #plt.ylim([0.0, 1.05])
    #plt.xlabel('Recall')
    #plt.ylabel('Precision')
    #plt.title(f'Precision-Recall Curve for {dataset_name}')
    #plt.legend(loc="lower left")
    #plt.show()

    #sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'])
    #plt.title(f'Confusion Matrix for {dataset_name}')
    #plt.xlabel('Predicted Label')
    #plt.ylabel('True Label')
    #plt.show()

    print(f"\n{dataset_name} Accuracy:", accuracy)
    print(f"{dataset_name} Classification Report:\n", classification_report(y, predictions))

X_spam_valid = tfidf_vectorizer.transform(spam_valid['text'])
Y_spam_valid = spam_valid['label']
#print(Y_spam_valid)
evaluate_model(X_spam_valid, Y_spam_valid, model, "Spam Validation Set")

X_urls_valid = tfidf_vectorizer.transform(urls_valid['text'])
y_urls_valid = urls_valid['label']
print(y_urls_valid)
evaluate_model(X_urls_valid, y_urls_valid, model, "URL Validation Set")



# Evaluate the models
X_spam_test = tfidf_vectorizer.transform(spam_test['text'])
y_spam_test = spam_test['label']
evaluate_model(X_spam_test, y_spam_test, model, "Spam Test Data")

X_urls_test = tfidf_vectorizer.transform(urls_test['text'])
y_urls_test = urls_test['label']
evaluate_model(X_urls_test, y_urls_test, model, "URLs Test Data")

X_homebrew_test = tfidf_vectorizer.transform(homebrew_data['text'])
y_homebrew_test = homebrew_data['label']
evaluate_model(X_homebrew_test, y_homebrew_test, model, "Homebrew Test Data")



Spam Validation Set Accuracy: 0.4951545745974428
Spam Validation Set Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.50      0.51     14411
           1       0.46      0.49      0.48     12728

    accuracy                           0.50     27139
   macro avg       0.49      0.49      0.49     27139
weighted avg       0.50      0.50      0.50     27139

108556    0
108557    1
108558    0
108559    1
108560    1
         ..
159995    1
159996    1
159997    0
159998    1
159999    0
Name: label, Length: 51444, dtype: int64

URL Validation Set Accuracy: 0.49739522587668145
URL Validation Set Classification Report:
               precision    recall  f1-score   support

           0       0.49      0.36      0.42     25631
           1       0.50      0.63      0.56     25813

    accuracy                           0.50     51444
   macro avg       0.50      0.50      0.49     51444
weighted avg       0.50      0.50      0.