My Second Notebook

Code #2

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import re


In [2]:

# Load the dataset
dataset = pd.read_csv('D:/Education/PYTHON/ML-v2/Datasets/malicious_phish.csv')

In [4]:
# Preprocessing


In [3]:
# Extracting domain-based features
dataset['domain_length'] = dataset['url'].apply(lambda x: len(re.findall('/.', x)))
dataset['subdomains'] = dataset['url'].apply(lambda x: x.count('.') - 1)
dataset['tld'] = dataset['url'].apply(lambda x: x.split('.')[-1])
dataset['has_ip'] = dataset['url'].apply(lambda x: 1 if re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', x) else 0)

# Extracting path-based features
dataset['path_length'] = dataset['url'].apply(lambda x: len(re.findall('/', x)))
dataset['path_depth'] = dataset['url'].apply(lambda x: x.count('/'))

# Extracting character-based features
special_characters = '@/%/$/=/./_/?/:/~/'
dataset['special_chars_count'] = dataset['url'].apply(lambda x: sum(x.count(char) for char in special_characters))

# Encoding categorical labels to numerical format
label_mapping = {'benign': 0, 'defacement': 1, 'phishing': 2, 'malware': 3}
dataset['type'] = dataset['type'].map(label_mapping)

# Feature Extraction using TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_text_features = tfidf_vectorizer.fit_transform(dataset['url'])

# Combining text features with extracted features
X_numeric_features = dataset[['domain_length', 'subdomains', 'has_ip', 'path_length', 'path_depth', 'special_chars_count']].values
X = pd.concat([pd.DataFrame(X_text_features.toarray()), pd.DataFrame(X_numeric_features)], axis=1)
y = dataset['type']



In [6]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
rf_classifier.fit(X_train, y_train)

# Predictions on the testing set
y_pred = rf_classifier.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.9766045500963613

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     85778
           1       0.98      0.99      0.99     19104
           2       0.95      0.90      0.92     18836
           3       0.99      0.95      0.97      6521

    accuracy                           0.98    130239
   macro avg       0.97      0.96      0.97    130239
weighted avg       0.98      0.98      0.98    130239



Code #3

In [4]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Loading the dataset
dataset = pd.read_csv('D:/Education/PYTHON/ML-v2/Datasets/malicious_phish.csv')

# Extracting domain-based features
dataset['domain_length'] = dataset['url'].apply(lambda x: len(re.findall('/.', x)))
dataset['subdomains'] = dataset['url'].apply(lambda x: x.count('.') - 1)
dataset['tld'] = dataset['url'].apply(lambda x: x.split('.')[-1])
dataset['has_ip'] = dataset['url'].apply(lambda x: 1 if re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', x) else 0)

# Extracting path-based features
dataset['path_length'] = dataset['url'].apply(lambda x: len(re.findall('/', x)))
dataset['path_depth'] = dataset['url'].apply(lambda x: x.count('/'))

# Extracting character-based features
special_characters = '@/%/$'
dataset['special_chars_count'] = dataset['url'].apply(lambda x: sum(x.count(char) for char in special_characters))

# Encoding categorical labels to numerical format
label_mapping = {'benign': 0, 'defacement': 1, 'phishing': 2, 'malware': 3}
dataset['type'] = dataset['type'].map(label_mapping)

# Feature Extraction using TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_text_features = tfidf_vectorizer.fit_transform(dataset['url'])

# Combining text features with extracted features
X_numeric_features = dataset[['domain_length', 'subdomains', 'has_ip', 'path_length', 'path_depth', 'special_chars_count']].values
X = pd.concat([pd.DataFrame(X_text_features.toarray()), pd.DataFrame(X_numeric_features)], axis=1)
y = dataset['type']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
rf_classifier.fit(X_train, y_train)

# Predictions on the testing set
y_pred = rf_classifier.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Example new URLs
new_urls = pd.DataFrame({'url': ['http://www.google.com', 'https://example.com', 'http://www.phishing-site.com']})

# Define a function to extract features from new URLs
def extract_features(new_urls):
    new_urls['domain_length'] = new_urls['url'].apply(lambda x: len(re.findall('/.', x)))
    new_urls['subdomains'] = new_urls['url'].apply(lambda x: x.count('.') - 1)
    new_urls['has_ip'] = new_urls['url'].apply(lambda x: 1 if re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', x) else 0)
    new_urls['path_length'] = new_urls['url'].apply(lambda x: len(re.findall('/', x)))
    new_urls['path_depth'] = new_urls['url'].apply(lambda x: x.count('/'))
    special_characters = '@/%/$'
    new_urls['special_chars_count'] = new_urls['url'].apply(lambda x: sum(x.count(char) for char in special_characters))
    return new_urls

# Extract features from new URLs
new_urls = extract_features(new_urls)

# TF-IDF vectorization
X_text_features_new = tfidf_vectorizer.transform(new_urls['url'])

# Combine text features with extracted features
X_numeric_features_new = new_urls[['domain_length', 'subdomains', 'has_ip', 'path_length', 'path_depth', 'special_chars_count']].values
X_new = pd.concat([pd.DataFrame(X_text_features_new.toarray()), pd.DataFrame(X_numeric_features_new)], axis=1)

# Predictions on the new URLs
y_pred_new = rf_classifier.predict(X_new)

# Decode numerical labels to original categories
reverse_label_mapping = {v: k for k, v in label_mapping.items()}
predicted_labels = [reverse_label_mapping[label] for label in y_pred_new]

print("\nPredictions for new URLs:")
for url, pred_label in zip(new_urls['url'], predicted_labels):
    print(f"URL: {url} - Predicted Label: {pred_label}")


Accuracy: 0.9756678107172199

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     85778
           1       0.98      0.99      0.99     19104
           2       0.94      0.90      0.92     18836
           3       0.99      0.95      0.97      6521

    accuracy                           0.98    130239
   macro avg       0.97      0.96      0.97    130239
weighted avg       0.98      0.98      0.98    130239


Predictions for new URLs:
URL: http://www.google.com - Predicted Label: phishing
URL: https://example.com - Predicted Label: phishing
URL: http://www.phishing-site.com - Predicted Label: phishing


Code #4 (SVMs)

In [5]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Loading the dataset
dataset = pd.read_csv('D:/Education/PYTHON/ML-v2/Datasets/malicious_phish.csv')

# Extracting domain-based features
dataset['domain_length'] = dataset['url'].apply(lambda x: len(re.findall('/.', x)))
dataset['subdomains'] = dataset['url'].apply(lambda x: x.count('.') - 1)
dataset['tld'] = dataset['url'].apply(lambda x: x.split('.')[-1])
dataset['has_ip'] = dataset['url'].apply(lambda x: 1 if re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', x) else 0)

# Extracting path-based features
dataset['path_length'] = dataset['url'].apply(lambda x: len(re.findall('/', x)))
dataset['path_depth'] = dataset['url'].apply(lambda x: x.count('/'))

# Extracting character-based features
special_characters = '@/%/$/=/./_/?/:/~/'
dataset['special_chars_count'] = dataset['url'].apply(lambda x: sum(x.count(char) for char in special_characters))

# Encoding categorical labels to numerical format
label_mapping = {'benign': 0, 'defacement': 1, 'phishing': 2, 'malware': 3}
dataset['type'] = dataset['type'].map(label_mapping)

# Feature Extraction using TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_text_features = tfidf_vectorizer.fit_transform(dataset['url'])

# Combining text features with extracted features
X_numeric_features = dataset[['domain_length', 'subdomains', 'has_ip', 'path_length', 'path_depth', 'special_chars_count']].values
X = pd.concat([pd.DataFrame(X_text_features.toarray()), pd.DataFrame(X_numeric_features)], axis=1)
y = dataset['type']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
rf_classifier.fit(X_train, y_train)

# Predictions on the testing set
y_pred = rf_classifier.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Accuracy: 0.9766045500963613

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     85778
           1       0.98      0.99      0.99     19104
           2       0.95      0.90      0.92     18836
           3       0.99      0.95      0.97      6521

    accuracy                           0.98    130239
   macro avg       0.97      0.96      0.97    130239
weighted avg       0.98      0.98      0.98    130239



In [8]:
# Example new URLs
new_urls = pd.DataFrame({'url': ['https://www.kaspersky.co.in/', 'https://us.norton.com/', 'https://t.me/+yK4o5P3HqhxhYzU8']})

# Define a function to extract features from new URLs
def extract_features(new_urls):
    new_urls['domain_length'] = new_urls['url'].apply(lambda x: len(re.findall('/.', x)))
    new_urls['subdomains'] = new_urls['url'].apply(lambda x: x.count('.') - 1)
    new_urls['has_ip'] = new_urls['url'].apply(lambda x: 1 if re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', x) else 0)
    new_urls['path_length'] = new_urls['url'].apply(lambda x: len(re.findall('/', x)))
    new_urls['path_depth'] = new_urls['url'].apply(lambda x: x.count('/'))
    special_characters = '@/%/$/=/./_/?/:/~/'
    new_urls['special_chars_count'] = new_urls['url'].apply(lambda x: sum(x.count(char) for char in special_characters))
    return new_urls




In [9]:
# Extract features from new URLs
new_urls = extract_features(new_urls)

# TF-IDF vectorization
X_text_features_new = tfidf_vectorizer.transform(new_urls['url'])

# Combine text features with extracted features
X_numeric_features_new = new_urls[['domain_length', 'subdomains', 'has_ip', 'path_length', 'path_depth', 'special_chars_count']].values
X_new = pd.concat([pd.DataFrame(X_text_features_new.toarray()), pd.DataFrame(X_numeric_features_new)], axis=1)

# Predictions on the new URLs
y_pred_new = rf_classifier.predict(X_new)

# Decode numerical labels to original categories
reverse_label_mapping = {v: k for k, v in label_mapping.items()}
predicted_labels = [reverse_label_mapping[label] for label in y_pred_new]

print("\nPredictions for new URLs:")
for url, pred_label in zip(new_urls['url'], predicted_labels):
    print(f"URL: {url} - Predicted Label: {pred_label}")


Predictions for new URLs:
URL: https://www.kaspersky.co.in/ - Predicted Label: phishing
URL: https://us.norton.com/ - Predicted Label: phishing
URL: https://t.me/+yK4o5P3HqhxhYzU8 - Predicted Label: phishing
