<a href="https://colab.research.google.com/github/saradelasota/TFM_repository/blob/main/TFM_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TFM

In [None]:
# IMPORT LIBRAIES
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
import time


#TREE
from sklearn.tree import DecisionTreeClassifier
#RandomForest
from sklearn.ensemble import RandomForestClassifier


In [None]:
model_evaluations={}

## DATA PREPARATION

In [None]:
# Read the CSV file
df = pd.read_csv('dataset_B_05_2020.csv')  # Replace 'your_dataset.csv' with the path to your dataset


The data set contains 87 extracted features, the column with the url, and the status column. Features are from three different classes: 56 extracted from the structure and syntax of URLs, 24 extracted from the content of their correspondent pages and 7 are extracetd by querying external services.

In [None]:
print(len(df.columns))
df.columns

89


Index(['url', 'length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_hyphens',
       'nb_at', 'nb_qm', 'nb_and', 'nb_or', 'nb_eq', 'nb_underscore',
       'nb_tilde', 'nb_percent', 'nb_slash', 'nb_star', 'nb_colon', 'nb_comma',
       'nb_semicolumn', 'nb_dollar', 'nb_space', 'nb_www', 'nb_com',
       'nb_dslash', 'http_in_path', 'https_token', 'ratio_digits_url',
       'ratio_digits_host', 'punycode', 'port', 'tld_in_path',
       'tld_in_subdomain', 'abnormal_subdomain', 'nb_subdomains',
       'prefix_suffix', 'random_domain', 'shortening_service',
       'path_extension', 'nb_redirection', 'nb_external_redirection',
       'length_words_raw', 'char_repeat', 'shortest_words_raw',
       'shortest_word_host', 'shortest_word_path', 'longest_words_raw',
       'longest_word_host', 'longest_word_path', 'avg_words_raw',
       'avg_word_host', 'avg_word_path', 'phish_hints', 'domain_in_brand',
       'brand_in_subdomain', 'brand_in_path', 'suspecious_tld',
       'statistical_report', 

In [None]:
df.head()

Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,http://www.crestonwood.com/router.php,37,19,0,3,0,0,0,0,0,...,0,1,0,45,-1,0,1,1,4,legitimate
1,http://shadetreetechnology.com/V4/validation/a...,77,23,1,1,0,0,0,0,0,...,1,0,0,77,5767,0,0,1,2,phishing
2,https://support-appleld.com.secureupdate.duila...,126,50,1,4,1,0,1,2,0,...,1,0,0,14,4004,5828815,0,1,0,phishing
3,http://rgipt.ac.in,18,11,0,2,0,0,0,0,0,...,1,0,0,62,-1,107721,0,0,3,legitimate
4,http://www.iracing.com/tracks/gateway-motorspo...,55,15,0,2,2,0,0,0,0,...,0,1,0,224,8175,8725,0,0,6,legitimate


In [None]:
content_features = ['nb_hyperlinks','ratio_intHyperlinks','ratio_extHyperlinks','ratio_nullHyperlinks',
    'nb_extCSS','ratio_intRedirection','ratio_extRedirection','ratio_intErrors','ratio_extErrors',
    'login_form','external_favicon','links_in_tags','submit_email','ratio_intMedia','ratio_extMedia',
    'sfh','iframe','popup_window','safe_anchor','onmouseover','right_clic','empty_title',
    'domain_in_title','domain_with_copyright']

external_features = ['whois_registered_domain','domain_registration_length','domain_age',
    'web_traffic','dns_record','google_index','page_rank']

df.drop(content_features, axis=1, inplace=True)
df.drop(external_features, axis=1, inplace=True)

print(len(df.columns))


58


In [None]:
# Define a dictionary to map string values to binary values
status_mapping = {'phishing': 0, 'legitimate': 1}

# Map the 'status' variable using the dictionary
df['status'] = df['status'].map(status_mapping)


In [None]:
# Split the data into features (X) and target variable (y)
X = df.drop('url', axis=1)
X = X.drop('status', axis=1)  # Features
y = df['status']  # Target variable

# Random state
rs = 123

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rs)


In [None]:
# Select columns with object (string) data type
string_columns = X.select_dtypes(include=['object'])

# Check if there are any string columns
if not string_columns.empty:
    print("String columns:")
    print(string_columns.columns)
else:
    print("No string columns found.")


No string columns found.


## DECISION TREE

In [None]:
class DecisionTree:

    def __init__(self, max_depth=None, min_samples_split=2, max_features=None, criterion='entropy'):
        '''
        Initialize the decision tree
        :param max_depth: maximum depth of the tree
        :param min_samples_split: minimum number of samples required for a split
        :param random_state: random state
        :param criterion: measure function to use for best_partition, default to entropy
        '''

        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.criterion = criterion
        self.tree = None

    def fit(self, X_train, y_train):
        '''
        Fit the decision tree to the training data.
        :param X: training features
        :param y: training labels
        '''
        self.tree = DecisionTreeClassifier(
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            max_features=self.max_features,
            random_state= rs
        )
        self.tree.fit(X_train, y_train)

    def predict(self, X_train):
        '''
        Make predictions using the trained decision tree.
        :param X: features of the data to be predicted
        :return: array of predicted labels
        '''
        if self.tree is None:
            raise ValueError("Decision tree has not been trained yet. Call fit() first.")

        predictions = self.tree.predict(X_train)
        return predictions



## HPO for decision tree

In [71]:
# Define the parameters grid to search
param_grid = {
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'max_features': [None, 'sqrt', 'log2'],
    'criterion':['gini','entropy','log_loss']
}

# Define the scoring metric
scorer = make_scorer(accuracy_score)

# Instantiate the decision tree classifier
decision_tree = DecisionTreeClassifier(random_state=42)

# Instantiate GridSearchCV
grid_search = GridSearchCV(decision_tree, param_grid, scoring=scorer, cv=5)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best parameters found:", best_params)
print("Best accuracy score:", best_score)


Best parameters found: {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_split': 10}
Best accuracy score: 0.8694221352319127


### Evaluation of the method

In [None]:
# Instantiate the DecisionTree object with desired parameters
decision_tree_model = DecisionTree(max_depth=None, min_samples_split=10, max_features=None, criterion='entropy')

# Fit the model to the training data
decision_tree_model.fit(X_train, y_train)

start_time = time.time()

# Make predictions on the test data
predictions = decision_tree_model.predict(X_test)

# Calculate time taken
end_time = time.time()
time = end_time - start_time


# Evaluate the model
dt_accuracy = accuracy_score(y_test, predictions)
dt_precision = precision_score(y_test, predictions)
dt_recall = recall_score(y_test, predictions)
dt_f1 = f1_score(y_test, predictions)


print("Test accuracy:", accuracy)
print("Test precision:", dt_precision)
print("Test recall:", dt_recall)
print("Test f1 score:", dt_f1)

Test accuracy: 0.8797025371828522
Test precision: 0.8860640301318268
Test recall: 0.8593607305936073
Test f1 score: 0.8725081131200741


In [None]:
# Store results in the dictionary
model_evaluations['DecisionTree'] = {
        "accuracy": dt_accuracy,
        "precision": dt_precision,
        "recall": dt_recall,
        "f1": dt_f1,
        "predicting time": time}

In [None]:
model_evaluations

{'DecisionTree': {'accuracy': 0.8797025371828522,
  'precision': 0.8860640301318268,
  'recall': 0.8593607305936073,
  'f1': 0.8725081131200741,
  'predicting time': 0.0036630630493164062}}

## RANDOM FOREST

In [77]:
class RandomForestModel:

    def __init__(self, n_estimators=100, max_depth=None, criterion=None,max_features=None):
        '''
        Initialize the Random Forest model
        :param n_estimators: The number of trees in the forest
        :param max_depth: The maximum depth of the tree
        :param random_state: Controls both the randomness of the bootstrapping of the samples used when building trees
                              (if `bootstrap=True`) and the sampling of the features to consider when looking for the best split at each node.
        :param criterion:
        :param max_features:
        '''
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.criterion = criterion
        self.max_features = max_features
        self.rf = None


    def fit(self, X_train, y_train):
        '''
        Fit the Random Forest model to the training data
        :param X_train: The training features
        :param y_train: The training labels
        '''
        self.rf = RandomForestClassifier(n_estimators=self.n_estimators, criterion= self.criterion,
                                             max_depth=self.max_depth,max_features=self.max_features,
                                             random_state= rs)
        self.rf.fit(X_train, y_train)

    def predict(self, X_test):
        '''
        Make predictions using the trained Random Forest model
        :param X_test: The test features
        :return: Predicted labels
        '''
        if self.rf is None:
            raise ValueError("Random Forest model has not been trained yet. Call fit() first.")
        return self.rf.predict(X_test)


