<a href="https://colab.research.google.com/github/shardul2512/AI-Interview-coach/blob/main/malicious%20url%20detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import pickle
import re
import os
from urllib.parse import urlparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier  # We can replace this with our best model
import kagglehub

# Download latest version
# Runs at module/cell execution time. Presumably returns the local directory
# that holds the downloaded dataset files (it is printed as such below).
path = kagglehub.dataset_download("sid321axn/malicious-urls-dataset")

# NOTE(review): in the code visible here `path` is only printed; training reads
# 'malicious_urls.csv' from the current working directory instead. Confirm
# whether the downloaded files are meant to be loaded from `path`.
print("Path to dataset files:", path)
# If you haven't trained a model yet, this will train one automatically
def train_model_if_needed():
    """Train and persist a classifier unless all saved artifacts already exist.

    The three artifacts are the model, the TF-IDF vectorizer and the scaler,
    each stored as a pickle file in the current working directory. If any one
    of them is missing, a new Random Forest is trained on 'malicious_urls.csv'
    (columns ``url`` and ``label``). When that CSV is missing as well, a tiny
    eight-row demonstration dataset is created, written to that path, and
    used for training.

    Returns:
        None. Progress is reported via ``print``.
    """
    model_file = 'best_malicious_url_model.pkl'
    vectorizer_file = 'url_vectorizer.pkl'
    scaler_file = 'url_scaler.pkl'

    # Nothing to do when every artifact is already on disk.
    if all(os.path.exists(p) for p in (model_file, vectorizer_file, scaler_file)):
        print("Trained model found. Ready to analyze URLs.")
        return

    print("No trained model found. Training a new model...")
    # Load sample dataset - replace with your actual dataset file
    try:
        df = pd.read_csv('malicious_urls.csv')
    except FileNotFoundError:
        print("Sample dataset not found. Creating a minimal dataset for demonstration.")
        safe_urls = ['google.com', 'facebook.com', 'twitter.com', 'microsoft.com']
        malicious_urls = [
            'free-v-bucks.com', 'get-free-bitcoins.net',
            'login-secure-paypal.com', 'verify-account-apple.com',
        ]
        # 0 for safe, 1 for malicious
        df = pd.DataFrame({
            'url': safe_urls + malicious_urls,
            'label': [0] * len(safe_urls) + [1] * len(malicious_urls),
        })
        df.to_csv('malicious_urls.csv', index=False)

    # Feature extraction also yields the fitted preprocessing objects.
    X, y, vectorizer, scaler = extract_features_for_training(df)

    # Fixed seed keeps the trained forest reproducible between runs.
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X, y)

    # Persist the model together with the preprocessing it was trained with.
    for target_file, artifact in (
        (model_file, model),
        (vectorizer_file, vectorizer),
        (scaler_file, scaler),
    ):
        with open(target_file, 'wb') as f:
            pickle.dump(artifact, f)

    print("Model trained and saved successfully.")

# Extract features for training
def extract_features_for_training(df):
    """Build the training matrix and fit the preprocessing objects.

    Args:
        df: DataFrame with a ``url`` column (text) and a ``label`` column.

    Returns:
        Tuple ``(X, y, vectorizer, scaler)`` where ``X`` is the dense TF-IDF
        matrix followed column-wise by the scaled hand-crafted features,
        ``y`` is ``df['label']`` as an array, and ``vectorizer`` / ``scaler``
        are the fitted ``TfidfVectorizer`` / ``StandardScaler``.
    """
    # Text representation of the raw URL strings, capped at 5000 terms.
    vectorizer = TfidfVectorizer(max_features=5000)
    tfidf_matrix = vectorizer.fit_transform(df['url'])

    # Hand-crafted numeric features, standardised with a scaler fit here.
    scaler = StandardScaler()
    scaled_handcrafted = scaler.fit_transform(engineer_url_features(df))

    # TF-IDF columns first, hand-crafted columns last.
    X = np.concatenate((tfidf_matrix.toarray(), scaled_handcrafted), axis=1)

    return X, df['label'].values, vectorizer, scaler

# Engineer features for a DataFrame of URLs
def engineer_url_features(df):
    """Compute hand-crafted numeric features for every URL in ``df['url']``.

    The features are built on a new frame (``DataFrame.assign``), so the
    caller's DataFrame is not modified.

    Args:
        df: DataFrame with a ``url`` column holding URL strings.

    Returns:
        2-D array with one row per URL and eleven columns, in this order:
        url_length, num_dots, num_digits, num_special_chars, has_https,
        has_http, has_www, num_hyphens, num_underscores, num_query_params,
        domain_length. The order matters: the scaler is fit on this layout
        during training and applied to it at prediction time.
    """
    urls = df['url']

    # Insertion order of this dict defines the column order of the result.
    features = {
        'url_length': urls.apply(len),
        'num_dots': urls.apply(lambda u: u.count('.')),
        'num_digits': urls.apply(lambda u: sum(c.isdigit() for c in u)),
        # Anything that is neither alphanumeric nor whitespace.
        'num_special_chars': urls.apply(
            lambda u: sum(not c.isalnum() and not c.isspace() for c in u)
        ),
        'has_https': urls.apply(lambda u: 1 if 'https://' in u else 0),
        # Plain http only: a URL containing 'https://' anywhere is excluded.
        'has_http': urls.apply(
            lambda u: 1 if 'http://' in u and 'https://' not in u else 0
        ),
        'has_www': urls.apply(lambda u: 1 if 'www.' in u else 0),
        'num_hyphens': urls.apply(lambda u: u.count('-')),
        'num_underscores': urls.apply(lambda u: u.count('_')),
        'num_query_params': urls.apply(count_query_params),
        'domain_length': urls.apply(lambda u: len(extract_domain(u))),
    }

    # assign() returns a new DataFrame, leaving the input frame as it was.
    enriched = df.assign(**features)
    return enriched[list(features)].values

# Count query parameters in URL
def count_query_params(url):
    """Return the number of non-empty ``&``-separated query parameters in a URL.

    URLs without a scheme are parsed as if prefixed with ``http://``. Empty
    segments (a trailing ``&`` or a doubled ``&&``) are not counted.

    Args:
        url: URL string, with or without a scheme.

    Returns:
        The parameter count, or 0 when there is no query string or the value
        cannot be parsed (malformed URL or non-string input).
    """
    try:
        parsed = urlparse(url)
        if not parsed.scheme:
            parsed = urlparse('http://' + url)
        return sum(1 for segment in parsed.query.split('&') if segment)
    except (ValueError, TypeError, AttributeError):
        # Best-effort feature: unparseable input counts as "no parameters".
        return 0

# Extract domain from URL
def extract_domain(url):
    """Return the network location (host, plus port if present) of a URL.

    A URL is treated as already having a scheme only when it starts with
    ``<scheme>://``; anything else is parsed as if prefixed with ``http://``.
    This keeps scheme-less ``host:port/path`` inputs from having their host
    taken for a scheme, which would yield an empty domain.

    Args:
        url: URL string, with or without a scheme.

    Returns:
        The netloc string, or ``url`` unchanged when it cannot be parsed
        (malformed URL or non-string input).
    """
    try:
        if not re.match(r'[A-Za-z][A-Za-z0-9+.\-]*://', url):
            url_to_parse = 'http://' + url
        else:
            url_to_parse = url
        return urlparse(url_to_parse).netloc
    except (ValueError, TypeError, AttributeError):
        # Best-effort feature: fall back to the raw input.
        return url

# Extract features for a single URL
def extract_features_for_url(url, vectorizer, scaler):
    """Build the single-row feature matrix for one URL.

    Args:
        url: URL string to featurize.
        vectorizer: Fitted vectorizer exposing ``transform``.
        scaler: Fitted scaler exposing ``transform``.

    Returns:
        2-D array with one row: the dense TF-IDF features followed by the
        scaled hand-crafted features.
    """
    # The feature engineering helper works on a DataFrame, so wrap the URL.
    single_row = pd.DataFrame({'url': [url]})

    tfidf_dense = vectorizer.transform([url]).toarray()
    scaled_handcrafted = scaler.transform(engineer_url_features(single_row))

    # TF-IDF columns first, hand-crafted columns last.
    return np.concatenate((tfidf_dense, scaled_handcrafted), axis=1)

# Normalize URL for analysis
def normalize_url(url):
    """Return ``url`` with a scheme, no surrounding whitespace, no trailing slash.

    Steps:
      * leading/trailing whitespace is removed;
      * ``http://`` is prepended unless the URL already starts with
        ``http://`` or ``https://`` (matched case-insensitively);
      * the scheme is lower-cased;
      * trailing slashes are removed from the part after ``://`` only, so the
        scheme separator itself is never truncated.

    Args:
        url: URL string as typed by the user.

    Returns:
        The normalized URL string.
    """
    url = url.strip()
    # Scheme names are case-insensitive, so 'HTTP://' must count as present.
    if not re.match(r'^https?://', url, flags=re.IGNORECASE):
        url = 'http://' + url
    # After the step above the first '://' is always the scheme separator.
    scheme, separator, rest = url.partition('://')
    return scheme.lower() + separator + rest.rstrip('/')

# Analyze URL and return result
def analyze_url(url):
    """Classify one URL with the persisted model and summarize the verdict.

    The URL is normalized, the three pickled artifacts are loaded from the
    current working directory, features are extracted, and the model is asked
    for both a label and class probabilities.

    NOTE(review): ``pickle.load`` can execute arbitrary code from the file;
    the artifacts are expected to be the ones written by
    ``train_model_if_needed``. Only load files you trust.
    NOTE(review): the URL is normalized here, while the visible training code
    featurizes the dataset URLs as-is. Confirm this difference is intended.

    Args:
        url: URL string to analyze.

    Returns:
        Dict with keys ``url`` (normalized URL), ``is_malicious`` (bool of the
        predicted label), ``probability`` (float taken from the second column
        of ``predict_proba``, which this code treats as the malicious class)
        and ``risk_level`` ("Low Risk" below 0.3, "Medium Risk" below 0.7,
        otherwise "High Risk").
    """
    normalized = normalize_url(url)

    # Load model, vectorizer and scaler, in that order.
    loaded = []
    for artifact_file in (
        'best_malicious_url_model.pkl',
        'url_vectorizer.pkl',
        'url_scaler.pkl',
    ):
        with open(artifact_file, 'rb') as handle:
            loaded.append(pickle.load(handle))
    model, vectorizer, scaler = loaded

    features = extract_features_for_url(normalized, vectorizer, scaler)

    predicted_label = model.predict(features)[0]
    malicious_probability = model.predict_proba(features)[0][1]

    # The first band whose upper bound exceeds the probability wins.
    for upper_bound, band in ((0.3, "Low Risk"), (0.7, "Medium Risk")):
        if malicious_probability < upper_bound:
            risk_level = band
            break
    else:
        risk_level = "High Risk"

    return {
        'url': normalized,
        'is_malicious': bool(predicted_label),
        'probability': float(malicious_probability),
        'risk_level': risk_level,
    }

# Simple command-line interface
def main():
    """Run the interactive command-line loop.

    Makes sure a trained model exists, then repeatedly prompts for a URL,
    analyzes it and prints the verdict until the user types ``exit`` or stdin
    is closed. Blank input is skipped. Any error raised while analyzing a URL
    is reported and the loop continues.
    """
    print("=== Malicious URL Detector ===")
    print("This tool analyzes URLs to detect potentially malicious websites.")

    # Check if we need to train a model first
    train_model_if_needed()

    while True:
        try:
            url = input("\nEnter a URL to analyze (or 'exit' to quit): ").strip()
        except EOFError:
            # stdin was closed (piped or non-interactive run): stop cleanly.
            print()
            break

        if url.lower() == 'exit':
            break
        if not url:
            # Nothing was typed; ask again instead of analyzing an empty URL.
            print("Please enter a URL.")
            continue

        # Top-level boundary: report any analysis failure and keep prompting.
        try:
            result = analyze_url(url)
        except Exception as e:
            print(f"Error analyzing URL: {str(e)}")
            continue

        is_malicious = result['is_malicious']
        # result['probability'] is the malicious-class probability; the
        # confidence shown must refer to the class actually reported.
        confidence = result['probability'] if is_malicious else 1.0 - result['probability']

        print("\nAnalysis Results:")
        print(f"URL: {result['url']}")
        print(f"Classification: {'MALICIOUS' if is_malicious else 'BENIGN'}")
        print(f"Confidence: {confidence*100:.2f}%")
        print(f"Risk Level: {result['risk_level']}")

        # Additional security advice
        if is_malicious:
            print("\nWARNING: This URL appears to be malicious. Do not visit this website.")
            print("It may be used for phishing, malware distribution, or other malicious activities.")
        else:
            print("\nThis URL appears to be benign, but always exercise caution when visiting unknown websites.")

    print("Thank you for using the Malicious URL Detector.")

# Start the interactive prompt only when this file is executed as the main
# module, not when it is imported.
if __name__ == "__main__":
    main()