My First Notebook

In [1]:
import os
import re

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split


# Loading my dataset

In [6]:
# Loading the dataset
# The CSV location can be overridden with the MALICIOUS_PHISH_CSV environment
# variable so the notebook is not tied to one machine; the original local path
# remains the fallback, so existing runs behave as before.
DATASET_PATH = os.environ.get(
    'MALICIOUS_PHISH_CSV',
    'D:/Education/PYTHON/ML-v2/Datasets/malicious_phish.csv',
)

dataset = pd.read_csv(DATASET_PATH)

# Fail fast if the columns read by the preprocessing cell ('url' for the
# features, 'type' for the labels) are not present in the file.
missing_columns = {'url', 'type'} - set(dataset.columns)
if missing_columns:
    raise KeyError(f"Dataset is missing required column(s): {sorted(missing_columns)}")


In [None]:
# Preprocessing

In [7]:
# Extracting domain-based features
# The feature definitions are intentionally left exactly as they were: the
# inference helpers later in the notebook recompute the same columns, and the
# two sides must agree.
url_series = dataset['url']

# NOTE(review): despite the name, this counts non-overlapping "/<any char>"
# matches in the URL; it does not measure the length of the domain.
dataset['domain_length'] = url_series.apply(lambda x: len(re.findall('/.', x)))
dataset['subdomains'] = url_series.apply(lambda x: x.count('.') - 1)
dataset['tld'] = url_series.apply(lambda x: x.split('.')[-1])
# NOTE(review): re.match only matches at the start of the string, so this flags
# URLs that *begin* with a dotted quad; "http://1.2.3.4/" is not flagged.
dataset['has_ip'] = url_series.apply(lambda x: 1 if re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', x) else 0)

# Extracting path-based features
# Both columns hold the number of '/' characters, so it is counted once.
slash_count = url_series.apply(lambda x: x.count('/'))
dataset['path_length'] = slash_count
dataset['path_depth'] = slash_count

# Extracting character-based features
# '/' appears twice in this string, so every slash contributes 2 to the count.
special_characters = '@/%/$'
dataset['special_chars_count'] = url_series.apply(lambda x: sum(x.count(char) for char in special_characters))

# Encoding categorical labels to numerical format
label_mapping = {'benign': 0, 'defacement': 1, 'phishing': 2, 'malware': 3}
# Only encode while the column still holds the raw labels. Mapping an already
# encoded column would turn every value into NaN, which made this cell unsafe
# to re-run.
if not pd.api.types.is_numeric_dtype(dataset['type']):
    encoded_type = dataset['type'].map(label_mapping)
    unmapped_rows = encoded_type.isna()
    if unmapped_rows.any():
        # Surface unexpected labels here instead of as NaN targets at fit time.
        unknown_labels = sorted(set(dataset.loc[unmapped_rows, 'type'].astype(str)))
        raise ValueError(f"Unmapped labels in 'type' column: {unknown_labels}")
    dataset['type'] = encoded_type.astype('int64')

# Feature Extraction using TF-IDF vectorization
# NOTE(review): the vectorizer is fit on every row of `dataset`, before any
# train/test split, so vocabulary and IDF weights also see rows that may later
# be held out. Consider fitting on the training rows only.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_text_features = tfidf_vectorizer.fit_transform(dataset['url'])

# Combining text features with extracted features
# The numeric block is appended to the right of the densified TF-IDF columns;
# the inference code must build its rows in the same column order.
X_numeric_features = dataset[['domain_length', 'subdomains', 'has_ip', 'path_length', 'path_depth', 'special_chars_count']].values
X = pd.concat([pd.DataFrame(X_text_features.toarray()), pd.DataFrame(X_numeric_features)], axis=1)
y = dataset['type']




In [13]:

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [14]:
# Random Forest with 100 trees and a fixed seed
rf_classifier = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)



In [15]:
# Fit the forest on the training partition
rf_classifier.fit(X=X_train, y=y_train)



In [16]:

# Predicted class codes for the held-out rows
y_pred = rf_classifier.predict(X=X_test)



In [17]:
# Evaluation: overall accuracy first, then the per-class precision/recall report
print("Accuracy:", accuracy_score(y_test, y_pred))
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred),
    sep="\n",
)


Accuracy: 0.9756678107172199

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     85778
           1       0.98      0.99      0.99     19104
           2       0.94      0.90      0.92     18836
           3       0.99      0.95      0.97      6521

    accuracy                           0.98    130239
   macro avg       0.97      0.96      0.97    130239
weighted avg       0.98      0.98      0.98    130239



TESTING 01: batch prediction on a few example URLs

In [23]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Define a function to extract features from new URLs
def extract_features(new_urls):
    """Return a copy of ``new_urls`` with the hand-crafted URL feature columns added.

    The feature definitions match the ones applied to the training data in the
    preprocessing cell, so the resulting columns can be fed to the model
    trained there. The input frame is not modified.

    Parameters
    ----------
    new_urls : pandas.DataFrame
        Must contain a ``'url'`` column of strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus ``domain_length``,
        ``subdomains``, ``has_ip``, ``path_length``, ``path_depth`` and
        ``special_chars_count``.

    Raises
    ------
    KeyError
        If ``new_urls`` has no ``'url'`` column.
    """
    if 'url' not in new_urls.columns:
        raise KeyError("extract_features expects a DataFrame with a 'url' column")

    # Work on a copy so the caller's frame is left untouched.
    features = new_urls.copy()
    urls = features['url']

    # NOTE(review): despite the name, this counts non-overlapping
    # "/<any char>" matches; it is not the length of the domain.
    features['domain_length'] = urls.apply(lambda x: len(re.findall('/.', x)))
    features['subdomains'] = urls.apply(lambda x: x.count('.') - 1)
    # NOTE(review): re.match only matches at the start of the string, so a URL
    # with a scheme prefix (e.g. "http://1.2.3.4/") is not flagged.
    features['has_ip'] = urls.apply(lambda x: 1 if re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', x) else 0)

    # 'path_length' and 'path_depth' are both the number of '/' characters.
    slash_count = urls.apply(lambda x: x.count('/'))
    features['path_length'] = slash_count
    features['path_depth'] = slash_count

    # '/' is listed twice, so each slash contributes 2 to the count.
    special_characters = '@/%/$'
    features['special_chars_count'] = urls.apply(lambda x: sum(x.count(char) for char in special_characters))
    return features



In [24]:
# A few hand-picked URLs to run through the feature pipeline
new_urls = pd.DataFrame(
    {
        'url': [
            'http://www.google.com',
            'https://example.com',
            'http://www.phishing-site.com',
        ]
    }
)



In [25]:
# Add the hand-crafted feature columns to the example URLs
new_urls = extract_features(new_urls=new_urls)


In [26]:
# Vectorize the new URLs with the already-fitted TF-IDF vocabulary (transform only, no refit)
X_text_features_new = tfidf_vectorizer.transform(new_urls.loc[:, 'url'])



In [27]:
# Assemble the model input: dense TF-IDF columns on the left, numeric features on the right
X_numeric_features_new = new_urls[
    [
        'domain_length',
        'subdomains',
        'has_ip',
        'path_length',
        'path_depth',
        'special_chars_count',
    ]
].values
X_new = pd.concat(
    [
        pd.DataFrame(X_text_features_new.toarray()),
        pd.DataFrame(X_numeric_features_new),
    ],
    axis=1,
)


In [32]:

# Class codes predicted for the example URLs
y_pred_new = rf_classifier.predict(X=X_new)

NameError: name 'rf_classifier' is not defined

In [29]:
# Invert label_mapping (code -> category name) and translate each predicted code
reverse_label_mapping = dict((code, name) for name, code in label_mapping.items())
predicted_labels = []
for label in y_pred_new:
    predicted_labels.append(reverse_label_mapping[label])

NameError: name 'y_pred_new' is not defined

In [31]:
# Report each example URL next to its decoded prediction
print("Predictions for new URLs:")
url_label_pairs = zip(new_urls['url'], predicted_labels)
for url, pred_label in url_label_pairs:
    print(f"URL: {url} - Predicted Label: {pred_label}")

Predictions for new URLs:


NameError: name 'predicted_labels' is not defined

TESTING 02: single-URL prediction helper

In [20]:
# Function to predict the category of a new URL
def predict_url_category(url):
    """Predict the numeric class code for a single URL.

    The URL is turned into the same feature layout the classifier was trained
    on: the TF-IDF columns from the fitted ``tfidf_vectorizer`` followed by the
    six hand-crafted numeric features, in the training column order.

    Parameters
    ----------
    url : str
        The URL to classify. Surrounding whitespace is ignored.

    Returns
    -------
    The first (and only) element of ``rf_classifier.predict`` for this URL,
    i.e. one of the integer codes used as values in ``label_mapping``.
    """
    # Only trim whitespace. The previous lowercasing and character stripping
    # removed ':', '?', '=', '@', '%', '$' and '_' before featurisation, which
    # changed both the TF-IDF tokens and the special-character count relative
    # to how the training rows were processed. The vectorizer is built without
    # overriding its lowercasing, so case is already handled there.
    url = url.strip()

    # Extract domain-based features (same definitions as the training cell)
    domain_length = len(re.findall('/.', url))
    subdomains = url.count('.') - 1
    # NOTE(review): re.match only matches at the start of the string, so URLs
    # with a scheme prefix are never flagged; kept to stay consistent with the
    # training features.
    has_ip = int(re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', url) is not None)

    # Extract path-based features
    path_length = len(re.findall('/', url))
    path_depth = url.count('/')

    # Extract character-based features ('/' is listed twice, as in training)
    special_characters = '@/%/$'
    special_chars_count = sum(url.count(char) for char in special_characters)

    # Combine the extracted features with the TF-IDF vectorizer.
    # The numeric features must form ONE row of six columns. The earlier
    # transpose produced six rows of one column, so the concat below yielded a
    # NaN-padded frame with the wrong number of features.
    X_text_features = tfidf_vectorizer.transform([url])
    X_numeric_features = np.array([[domain_length, subdomains, has_ip, path_length, path_depth, special_chars_count]])
    X = pd.concat([pd.DataFrame(X_text_features.toarray()), pd.DataFrame(X_numeric_features)], axis=1)

    # Predict the category using the trained model
    y_pred = rf_classifier.predict(X)

    return y_pred[0]



In [22]:

# Try the single-URL helper on one suspicious-looking link
url = "https://l.wl.co/l?u=https://clcr.me/nOkXmN?fbclid=o8mwcVP6SDWvficRZbUn38dAKR_FOR_YOU_MY_PH0T0S_AND_V1D30S_FREE_COME_HERE_GUYS_LETS_MEET_HERE_ONLY"
category = predict_url_category(url=url)
print("The predicted category for", url, "is:", category)


NameError: name 'rf_classifier' is not defined