In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

In [31]:
from bs4 import BeautifulSoup
import re

In [2]:
# Load your dataset (you need to have a CSV file containing labeled URLs)
dataset = pd.read_csv('phishing_site_urls.csv')

In [3]:
# Preprocessing: Convert labels to numerical values (0 for 'good', 1 for 'bad')
label_mapping = {'good': 0, 'bad': 1}
dataset['Label'] = dataset['Label'].map(label_mapping)

In [4]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(dataset['URL'], dataset['Label'], test_size=0.2, random_state=42)

In [5]:
# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [10]:
# Train a Support Vector Machine (SVM) classifier
svm_classifier = SVC(kernel='linear', random_state=42)
svm_classifier.fit(X_train_tfidf, y_train)

In [11]:
# Make predictions on the test set
y_pred = svm_classifier.predict(X_test_tfidf)

In [23]:
import joblib

# Save the trained SVM classifier and vectorizer to files
joblib.dump(svm_classifier, 'svm_classifier_model.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

['tfidf_vectorizer.joblib']

In [24]:
# Load the saved SVM classifier and TF-IDF vectorizer
svm_classifier = joblib.load('svm_classifier_model.joblib')
vectorizer = joblib.load('tfidf_vectorizer.joblib')

In [25]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9726116920152091


In [26]:
# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['good', 'bad']))

Classification Report:
              precision    recall  f1-score   support

        good       0.97      0.97      0.97      7223
         bad       0.98      0.98      0.98      9609

    accuracy                           0.97     16832
   macro avg       0.97      0.97      0.97     16832
weighted avg       0.97      0.97      0.97     16832



In [20]:
# Learning Curve plotting
def plot_learning_curve(estimator, X, y, ylim=None, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.figure()
    plt.title("Learning Curve")
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training Examples")
    plt.ylabel("Score")
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training Score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-Validation Score")

    plt.legend(loc="best")
    return plt


In [None]:
plot_learning_curve(svm_classifier, X_train_tfidf, y_train, cv=3, n_jobs=-1)
plt.show()

In [32]:
def clean_text(text):
    # Function to clean the URL text by removing unwanted characters and converting to lowercase
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\W+', ' ', text.lower())
    return text.strip()

def extract_text_from_url(url):
    # Function to fetch text content from a given URL
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        text = soup.get_text(separator=' ')
        return text
    except:
        return None

In [33]:
def predict_url_category(url):
    # Function to predict whether the URL is "good" or "bad"
    cleaned_url_text = clean_text(url)
    url_text = extract_text_from_url(url)
    if url_text is not None:
        cleaned_text = clean_text(url_text)
        tfidf_features = vectorizer.transform([cleaned_text])
        prediction = svm_classifier.predict(tfidf_features)
        if prediction[0] == 0:
            return "good"
        elif prediction[0] == 1:
            return "bad"
    return "Invalid URL or Unable to Process"


In [34]:
# Example usage
input_url = "https://www.example.com"
result = predict_url_category(input_url)
print("URL Category:", result)

URL Category: Invalid URL or Unable to Process
