LOGISTIC REGRESSION DETECTION

In [1]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Train and evaluate a binary (benign vs. malicious) URL classifier:
# load the labelled URL CSV, derive six lexical features per URL, collapse the
# four source labels into 0/1, fit a logistic regression on an 80/20 split and
# print accuracy plus the per-class classification report.

# Loading the dataset
# NOTE(review): absolute, machine-specific path -- edit DATA_PATH to point at
# your own copy of the CSV before running.
DATA_PATH = 'D:/Education/PYTHON/ML-v2/Datasets/malicious_phish.csv'
dataset = pd.read_csv(DATA_PATH)

# NOTE(review): the feature definitions below are kept exactly as they were,
# quirks included, because the single-URL `extract_features` helper used at
# prediction time later in this notebook computes the same quantities. Change
# both places together, otherwise the model is scored on features it was not
# trained on.

# Extracting domain-based features
# 'domain_length' is the number of non-overlapping "/<any character>" matches,
# not a character length.
dataset['domain_length'] = dataset['url'].apply(lambda x: len(re.findall('/.', x)))
dataset['subdomains'] = dataset['url'].apply(lambda x: x.count('.') - 1)
# re.match anchors at the start of the string, so only URLs that BEGIN with a
# dotted quad (i.e. no scheme prefix) get has_ip == 1.
dataset['has_ip'] = dataset['url'].apply(lambda x: 1 if re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', x) else 0)

# Extracting path-based features
# Both columns hold the same value (the number of '/' characters), so the count
# is computed once and reused.
slash_counts = dataset['url'].apply(lambda x: x.count('/'))
dataset['path_length'] = slash_counts
dataset['path_depth'] = slash_counts

# Extracting character-based features
# The string is iterated character by character, so the nine '/' separators are
# counted too: every '/' in a URL adds 9 to the total, every other listed
# character adds 1.
special_characters = '@/%/$/=/./_/?/:/~/'
dataset['special_chars_count'] = dataset['url'].apply(lambda x: sum(x.count(char) for char in special_characters))

# Encoding labels to numerical format (0 = benign, 1 = any malicious category)
label_mapping = {'benign': 0, 'defacement': 1, 'phishing': 1, 'malware': 1}
encoded_labels = dataset['type'].map(label_mapping)
# Series.map yields NaN for anything missing from the mapping; fail here with
# the offending labels instead of letting NaN targets reach the estimator.
unmapped_rows = encoded_labels.isna()
if unmapped_rows.any():
    unexpected_labels = sorted(dataset.loc[unmapped_rows, 'type'].astype(str).unique())
    raise ValueError(f"Unmapped or missing values in the 'type' column: {unexpected_labels}")
dataset['type'] = encoded_labels.astype('int64')

# Combine features
# Column order matters: single-URL feature rows built later must use the same order.
FEATURE_COLUMNS = ['domain_length', 'subdomains', 'has_ip', 'path_length', 'path_depth', 'special_chars_count']
X = dataset[FEATURE_COLUMNS].values
y = dataset['type'].values

# Splitting the dataset into training and testing sets (80% / 20%, fixed seed
# so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Logistic Regression model (library defaults)
logistic_regression = LogisticRegression()

# Fit the model
logistic_regression.fit(X_train, y_train)

# Predictions on the testing set
y_pred = logistic_regression.predict(X_test)

# Evaluation
print("Accuracy:", logistic_regression.score(X_test, y_test))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.7830219826626433

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.89      0.84     85778
           1       0.73      0.58      0.64     44461

    accuracy                           0.78    130239
   macro avg       0.77      0.73      0.74    130239
weighted avg       0.78      0.78      0.78    130239



1. The labels are encoded numerically: 0 represents benign URLs and 1 represents malicious URLs (defacement, phishing, and malware combined).
2. Logistic regression, which is suitable for binary classification tasks, is used to predict whether a URL is malicious based on its extracted features.
3. The model is trained on the training set (80% of the data) and evaluated on the held-out testing set (20%).
4. Evaluation uses accuracy and the classification report, which includes per-class precision, recall, and F1-score.

In [3]:
import re
import numpy as np

# Turn a single URL string into the six-number feature row used for prediction
def extract_features(url):
    """Compute the lexical feature row for one URL.

    Args:
        url: The URL as a string.

    Returns:
        A list of six integers, in this order:
        ``[domain_length, subdomains, has_ip, path_length, path_depth,
        special_chars_count]`` where

        * domain_length -- number of non-overlapping matches of the regex
          ``/.`` (a slash followed by any character);
        * subdomains -- number of ``.`` characters minus one;
        * has_ip -- 1 if the string *starts* with a dotted quad of 1-3 digit
          groups (``re.match`` anchors at position 0), otherwise 0;
        * path_length and path_depth -- both the number of ``/`` characters;
        * special_chars_count -- sum of ``url.count(c)`` over every character
          of ``'@/%/$/=/./_/?/:/~/'``; ``/`` occurs nine times in that string,
          so each slash in the URL contributes 9.
    """
    ip_pattern = r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b'
    counted_chars = '@/%/$/=/./_/?/:/~/'

    # One slash count serves both path features.
    slash_total = url.count('/')

    return [
        sum(1 for _ in re.finditer('/.', url)),
        url.count('.') - 1,
        int(re.match(ip_pattern, url) is not None),
        slash_total,
        slash_total,
        sum(map(url.count, counted_chars)),
    ]

# Candidate URL to score with the classifier fitted earlier in the notebook
new_url = "https://www.kaspersky.co.in/"

# Build the feature row and wrap it in an outer list so the array is 2-D
# (one row, one column per feature), matching what predict() is given
new_url_features = extract_features(new_url)
new_url_features_array = np.array([new_url_features])

# predict() returns one label per input row; there is exactly one row here
prediction = logistic_regression.predict(new_url_features_array)

# Label 0 is reported as benign, anything else as malicious
verdict_message = (
    "The URL is classified as benign."
    if prediction[0] == 0
    else "The URL is classified as malicious."
)
print(verdict_message)


The URL is classified as malicious.
