In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import joblib

# Read the labelled URL dataset from disk and discard rows that have no URL.
data = pd.read_csv('data/malicious_phish.csv').dropna(subset=['url'])
# Coerce the remaining URLs to str before any string processing is applied.
data['url'] = data['url'].astype(str)

# Text preprocessing function for URLs
# Punctuation characters that act as token boundaries inside a URL; a set
# gives O(1) membership checks per character.
_URL_PUNCTUATION = frozenset(string.punctuation)
# Lazily filled cache so the NLTK stop-word corpus is read only once instead
# of once per URL.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def preprocess_url(url):
    """Normalise a URL into a space-separated string of word tokens.

    The URL is lower-cased, digits are removed, punctuation is turned into
    spaces, the result is tokenized with NLTK's ``word_tokenize`` and English
    stop words are dropped.

    Punctuation is replaced by a space rather than deleted: deleting it glues
    ``www.example.com/login`` into the single token ``wwwexamplecomlogin``,
    which leaves nothing for tokenization or stop-word removal to work on.

    Args:
        url: The URL text to clean (a ``str``).

    Returns:
        The remaining tokens joined by single spaces; may be an empty string.
    """
    url = url.lower()
    cleaned_chars = []
    for char in url:
        if char.isdigit():
            # Digits are dropped entirely, as before.
            continue
        # Separators such as '.', '/', '-' and '?' become token boundaries.
        cleaned_chars.append(' ' if char in _URL_PUNCTUATION else char)
    tokens = word_tokenize(''.join(cleaned_chars))
    stop_words = _english_stop_words()
    return ' '.join(word for word in tokens if word not in stop_words)

# Apply text preprocessing to the 'url' column
data['url'] = data['url'].apply(preprocess_url)

# Create a TF-IDF vectorizer to convert text data into feature vectors.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split below, so test-set vocabulary/IDF statistics influence the features.
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(data['url'].values)
y = data['type'].values

# Save the fitted vectorizer so the same features can be rebuilt at inference time
joblib.dump(vectorizer, 'malicious_url_vectorizer5.sav')

# Class labels in order of first appearance; computed once and shared by the
# metric calls below instead of re-scanning the column each time.
class_labels = data['type'].unique()

# Split the dataset into training and testing sets (adjust test_size as needed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Retained only so that code outside this section that reads `batch_size`
# keeps working; SVC itself trains on the full training set in one call.
batch_size = 64

# Initialize and train the SVM classifier.
# SVC does not implement partial_fit, so the previous mini-batch loop could
# never run (it also referenced an undefined `epochs` and called len() on a
# sparse matrix). Fit once on the whole training split instead. If incremental
# training is required, sklearn.linear_model.SGDClassifier(loss='hinge') is a
# linear SVM that does support partial_fit.
svm_classifier = SVC(probability=True, kernel='linear')
svm_classifier.fit(X_train, y_train)

# Make predictions on the test set
test_predictions = svm_classifier.predict(X_test)

# Calculate evaluation metrics (macro-averaged over every class label)
accuracy = accuracy_score(y_test, test_predictions)
precision = precision_score(y_test, test_predictions, average='macro', labels=class_labels)
recall = recall_score(y_test, test_predictions, average='macro', labels=class_labels)
f1 = f1_score(y_test, test_predictions, average='macro', labels=class_labels)

# Print evaluation metrics
print("SVM Classifier:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# Compute the confusion matrix (rows: true labels, columns: predicted labels,
# both ordered as in class_labels). It is only computed here, not plotted.
confusion = confusion_matrix(y_test, test_predictions, labels=class_labels)

# Save the trained SVM classifier
joblib.dump(svm_classifier, 'malicious_url_model5.sav')
