In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Load the labelled URL dataset; the code below reads its 'url' and 'label' columns.
df = pd.read_csv('phish_url_label_train.csv')

# Character n-gram bag-of-words settings: 3- and 4-grams taken inside word
# boundaries ('char_wb'), at most 10000 vocabulary entries, and the original
# casing of each URL preserved.
vectorizer_options = {
    'analyzer': 'char_wb',
    'ngram_range': (3, 4),
    'max_features': 10000,
    'lowercase': False,
}
vectorizer = CountVectorizer(**vectorizer_options)

# Targets first, then learn the vocabulary and build the count matrix in one call.
y = df['label'].values
X = vectorizer.fit_transform(df['url'])

In [3]:
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

seed = 42  # single seed reused for the train/test split and the model

# The ID is interpolated into the file names passed to joblib.dump later in
# this cell, so normalise and validate it up front, before the slow training
# step: an empty ID would produce colliding names and a path separator would
# redirect the output outside the working directory.
model_id = input("Enter unique ID to label your saved model and vectorizer:").strip()
if not model_id or not all(ch.isalnum() or ch in "-_." for ch in model_id):
    raise ValueError(
        "Model ID must be non-empty and contain only letters, digits, "
        f"'-', '_' or '.'; got {model_id!r}"
    )

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

def evaluate_model(model, X_test, y_test):
    """Print accuracy, precision, recall and F1-score of ``model`` on held-out data.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict(X)``.
    X_test
        Feature matrix passed to ``model.predict``.
    y_test
        True labels compared against the predictions.

    Returns
    -------
    None
        The metrics are only printed, each with four decimal places.

    Notes
    -----
    Precision, recall and F1 are computed with scikit-learn's default
    arguments.
    """
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print("Model Validation Results:\n")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    # Use the same ':.4f' spec on every line: the earlier ': .4f' (space sign
    # flag) inserted an extra blank before non-negative values, so Recall and
    # F1-score were formatted differently from Accuracy and Precision.
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}\n")

# Build and train the classifier in one expression; fit() returns the
# estimator itself, so lr_model is the fitted model.
lr_model = LogisticRegression(
    solver='saga',
    max_iter=10000,
    random_state=seed,
    n_jobs=-1,
).fit(X_train, y_train)

# Predictions for the held-out rows, kept in the notebook namespace;
# evaluate_model() computes its own predictions internally.
y_pred = lr_model.predict(X_test)

# Print the validation report for the held-out split.
evaluate_model(lr_model, X_test, y_test)

# Persist the fitted model together with the vectorizer that produced its
# features, both tagged with the user-supplied ID.
joblib.dump(lr_model, f"logreg_model_{model_id}")
joblib.dump(vectorizer, f"vectorizer_{model_id}")

Model Validation Results:

Accuracy: 0.9632
Precision: 0.9503
Recall:  0.9191
F1-score:  0.9344



['vectorizer_R3']