In [2]:
import json

In [1]:
def append_evaluation_metrics_to_csv(model_name, evaluation_metrics,dataset_name,best_params,fit_time,predict_time, filename='model_evaluation_metrics.csv'):
    try:
        # Load existing CSV file
        df_metrics = pd.read_csv(filename)
    except FileNotFoundError:
        # If the file doesn't exist yet, create an empty DataFrame
        df_metrics = pd.DataFrame(columns=['Model_Name','Dataset','Parameters','Accuracy', 'Precision', 'Recall', 'F1_Score', 'Fit_Time','Predict_time'])

    # Convert the best_params dictionary to a JSON string
    best_params_str = json.dumps(best_params)


    # Create a DataFrame with the new metrics
    new_row = pd.DataFrame([[model_name,dataset_name,best_params_str, evaluation_metrics['accuracy'], evaluation_metrics['precision'],
                             evaluation_metrics['recall'], evaluation_metrics['f1_score'], fit_time,predict_time]],
                           columns=['Model_Name', 'Dataset','Parameters','Accuracy', 'Precision', 'Recall', 'F1_Score', 'Fit_Time','Predict_time'])

    # Append the new row to the existing DataFrame
    df_metrics = pd.concat([df_metrics, new_row], ignore_index=True)

    # Save the updated DataFrame to the CSV file
    df_metrics.to_csv(filename, index=False)


In [3]:
import pandas as pd
import time as t

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

from lightgbm import LGBMClassifier

#from evaluation import append_evaluation_metrics_to_csv


class LightGBM:

    def __init__(self, num_leaves=31, learning_rate=0.1, n_estimators=100, max_depth=-1, min_child_samples=20, random_state=123):
        '''
        Initialize the LightGBM classifier
        :param num_leaves: Maximum tree leaves for base learners
        :param learning_rate: Learning rate for training
        :param n_estimators: Number of boosting iterations
        :param max_depth: Maximum depth of the tree
        :param min_child_samples: Minimum number of data needed in a child
        :param random_state: Random state for reproducibility
        '''
        self.num_leaves = num_leaves
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_child_samples = min_child_samples
        self.random_state = random_state
        self.lgbm = None

    def fit(self, X_train, y_train):
        '''
        Fit the LightGBM classifier to the training data
        :param X_train: The training features
        :param y_train: The training labels
        '''
        self.lgbm = LGBMClassifier(num_leaves=self.num_leaves, learning_rate=self.learning_rate,
                                    n_estimators=self.n_estimators, max_depth=self.max_depth,
                                    min_child_samples=self.min_child_samples, random_state=self.random_state)
        self.lgbm.fit(X_train, y_train)

    def predict(self, X_test):
        '''
        Make predictions using the trained LightGBM classifier
        :param X_test: The test features
        :return: Predicted labels
        '''
        if self.lgbm is None:
            raise ValueError("LightGBM classifier has not been trained yet. Call fit() first.")
        return self.lgbm.predict(X_test)





In [6]:
df = pd.read_csv('df_selection2.csv')
#df = df.drop('Unnamed: 0',axis=1)
dataset_name = 'df_selection2.csv'
    # Split the data into features (X) and target variable (y)
X = df.drop('url', axis=1)
X = X.drop('status', axis=1)  # Features
y = df['status']  # Target variable

    # Random state
rs = 123

    # Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rs)


In [21]:

    # Define the hyperparameters grid to search
param_grid_lgbm = {
        'num_leaves': [10, 50, 100],             # Maximum tree leaves for base learners
        'learning_rate': [0.05, 0.1, 0.2],       # Learning rate for training
        'n_estimators': [100, 200],         # Number of boosting iterations
        'max_depth': [-1, 5, 10],                # Maximum depth of the tree
        'min_child_samples': [20, 30, 40],       # Minimum number of data needed in a child
    }
print(type(param_grid_lgbm))

    # Instantiate GridSearchCV
grid_search_lgbm = GridSearchCV(LGBMClassifier(random_state=rs), param_grid_lgbm, cv=5)

    # Fit the grid search to the data
grid_search_lgbm.fit(X_train, y_train)

print('hola')
    # Get the best parameters and best score
best_params = grid_search_lgbm.best_params_
best_score = grid_search_lgbm.best_score_

print("Best parameters found:", best_params)
print("Best accuracy score:", best_score)




[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
[LightGBM] [Info] Number of positive: 3696, number of negative: 3619
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001873 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1809
[LightGBM] [Info] Number of data points in the train set: 7315, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505263 -> initscore=0.021053
[LightGBM] [Info] Start training from score 0.021053
[LightGBM] [Info] Number of positive: 3696, number of negative: 3620
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001787 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1801
[LightGBM] [Info] Number of data points in 

In [22]:
# Instantiate the LightGBM object with desired parameters
lgbm_model = LightGBM(**best_params, random_state=rs)

start_time = t.time()

    # Fit the model to the training data
lgbm_model.fit(X_train, y_train)

# Calculate time taken
end_time = t.time()
fit_time = end_time - start_time

start_time= t.time()
# Make predictions on the test data
predictions = lgbm_model.predict(X_test)

    # Calculate time taken
end_time = t.time()
predict_time = end_time - start_time

[LightGBM] [Info] Number of positive: 4620, number of negative: 4524
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002231 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1863
[LightGBM] [Info] Number of data points in the train set: 9144, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505249 -> initscore=0.020998
[LightGBM] [Info] Start training from score 0.020998


In [23]:
# Evaluate the model
model_name = "LightGBM"
lgbm_accuracy = accuracy_score(y_test, predictions)
lgbm_precision = precision_score(y_test, predictions)
lgbm_recall = recall_score(y_test, predictions)
lgbm_f1 = f1_score(y_test, predictions)

print("Test accuracy:", lgbm_accuracy)
print("Test precision:", lgbm_precision)
print("Test recall:", lgbm_recall)
print("Test f1 score:", lgbm_f1)

    # Prepare evaluation metrics
evaluation_metrics = {
        'accuracy': lgbm_accuracy,
        'precision': lgbm_precision,
        'recall': lgbm_recall,
        'f1_score': lgbm_f1
    }
    # Call the function to append the new evaluation metrics to the existing CSV file
append_evaluation_metrics_to_csv(model_name, evaluation_metrics,dataset_name,best_params, fit_time,predict_time)


Test accuracy: 0.9203849518810149
Test precision: 0.9262371615312792
Test recall: 0.9059360730593607
Test f1 score: 0.9159741458910435


In [8]:
eval = pd.read_csv("model_evaluation_metrics.csv")
print(eval)

              Model_Name                   Dataset  \
0           DecisionTree  df_complete_unscaled.csv   
1           RandomForest  df_complete_unscaled.csv   
2   KNeighborsClassifier  df_complete_unscaled.csv   
3                    SVM  df_complete_unscaled.csv   
4           DecisionTree   df_lexical_unscaled.csv   
5           RandomForest   df_lexical_unscaled.csv   
6   KNeighborsClassifier   df_lexical_unscaled.csv   
7                    SVM   df_lexical_unscaled.csv   
8           DecisionTree         df_selection1.csv   
9           RandomForest         df_selection1.csv   
10  KNeighborsClassifier         df_selection1.csv   
11                   SVM         df_selection1.csv   
12          DecisionTree         df_selection2.csv   
13          RandomForest         df_selection2.csv   
14  KNeighborsClassifier         df_selection2.csv   
15                   SVM         df_selection2.csv   
16              LightGBM  df_complete_unscaled.csv   
17              LightGBM   d

# XGB

In [4]:
from xgboost import XGBClassifier

class XGBoost:

    def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, min_child_weight=1, gamma=0, subsample=1, colsample_bytree=1, random_state=123):
        '''
        Initialize the XGBoost classifier
        :param max_depth: Maximum depth of the tree
        :param learning_rate: Learning rate for training
        :param n_estimators: Number of boosting iterations
        :param min_child_weight: Minimum sum of instance weight needed in a child
        :param gamma: Minimum loss reduction required to make a further partition on a leaf node
        :param subsample: Subsample ratio of the training instances
        :param colsample_bytree: Subsample ratio of columns when constructing each tree
        :param random_state: Random state for reproducibility
        '''
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.min_child_weight = min_child_weight
        self.gamma = gamma
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.random_state = random_state
        self.xgb = None

    def fit(self, X_train, y_train):
        '''
        Fit the XGBoost classifier to the training data
        :param X_train: The training features
        :param y_train: The training labels
        '''
        self.xgb = XGBClassifier(max_depth=self.max_depth, learning_rate=self.learning_rate,
                                  n_estimators=self.n_estimators, min_child_weight=self.min_child_weight,
                                  gamma=self.gamma, subsample=self.subsample, colsample_bytree=self.colsample_bytree,
                                  random_state=self.random_state)
        self.xgb.fit(X_train, y_train)

    def predict(self, X_test):
        '''
        Make predictions using the trained XGBoost classifier
        :param X_test: The test features
        :return: Predicted labels
        '''
        if self.xgb is None:
            raise ValueError("XGBoost classifier has not been trained yet. Call fit() first.")
        return self.xgb.predict(X_test)


In [9]:
# Define the hyperparameters grid to search
param_grid_xgb = {
    'max_depth': [None, 3, 5, 10],                   # Maximum depth of the tree
    'learning_rate': [0.05, 0.1, 0.2],         # Learning rate for training
    'n_estimators': [100, 200],                # Number of boosting iterations
    'min_child_weight': [1, 5, 10],            # Minimum sum of instance weight needed in a child
    'gamma': [0, 0.1, 0.2],                    # Minimum loss reduction required to make a further partition on a leaf node
}

# Instantiate GridSearchCV
grid_search_xgb = GridSearchCV(XGBClassifier(random_state=rs), param_grid_xgb, cv=5)

# Fit the grid search to the data
grid_search_xgb.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search_xgb.best_params_
best_score = grid_search_xgb.best_score_

print("Best parameters found:", best_params)
print("Best accuracy score:", best_score)

Best parameters found: {'gamma': 0.2, 'learning_rate': 0.2, 'max_depth': None, 'min_child_weight': 1, 'n_estimators': 200}
Best accuracy score: 0.9125095561061574


In [10]:
# Instantiate the XGB object with desired parameters
xgb_model = XGBoost(**best_params, random_state=rs)

start_time = t.time()

# Fit the model to the training data
xgb_model.fit(X_train, y_train)

# Calculate time taken for training
end_time = t.time()
fit_time = end_time - start_time

start_time = t.time()
# Make predictions on the test data
predictions = xgb_model.predict(X_test)

# Calculate time taken for prediction
end_time = t.time()
predict_time = end_time - start_time


In [11]:
# Evaluate the model
model_name = "XGBoost"
xgb_accuracy = accuracy_score(y_test, predictions)
xgb_precision = precision_score(y_test, predictions)
xgb_recall = recall_score(y_test, predictions)
xgb_f1 = f1_score(y_test, predictions)

print("Test accuracy:", xgb_accuracy)
print("Test precision:", xgb_precision)
print("Test recall:", xgb_recall)
print("Test f1 score:", xgb_f1)

# Prepare evaluation metrics
evaluation_metrics = {
    'accuracy': xgb_accuracy,
    'precision': xgb_precision,
    'recall': xgb_recall,
    'f1_score': xgb_f1
}

# Call the function to append the new evaluation metrics to the existing CSV file
append_evaluation_metrics_to_csv(model_name, evaluation_metrics, dataset_name, best_params, fit_time, predict_time)


Test accuracy: 0.9212598425196851
Test precision: 0.9240037071362373
Test recall: 0.9105022831050228
Test f1 score: 0.9172033118675251
