In [None]:
# Mount Google Drive so files stored there are reachable from the Colab runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# Importing necessary libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve,roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import export_text
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report as cr
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
import lightgbm as lgb

# Loading the Dataset

In [None]:
# Location of the encoded dataset on the mounted Google Drive.
DATASET_PATH = "/content/drive/MyDrive/Colab Notebooks/ML_project/full_encoded_data_2_no_purpose.csv"
data = pd.read_csv(DATASET_PATH)



# Splitting the dataset

In [None]:
# Keep only the rows that have a target value, then separate target and features.
data_cleaned = data[data['Label'].notna()]
y = data_cleaned['Label']
X = data_cleaned.drop(columns = ['Label'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 78,
)

# Standardizing the data

In [None]:
# Learn the scaling statistics from the training split only, then apply the same
# transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_standardized = scaler.transform(X_train)
X_test_standardized = scaler.transform(X_test)

# Grid Search

In [None]:
# Hyper-parameter candidates for the decision tree.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [8, 20, None],
}

# 5-fold cross-validated search over every combination in param_grid.
grid_search = GridSearchCV(estimator = DecisionTreeClassifier(random_state = 78), param_grid = param_grid, cv = 5)
grid_search.fit(X_train_standardized, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# The search was fitted on standardized features, so the held-out data must be
# standardized as well; predicting on the raw X_test would compare unscaled values
# against split thresholds learned on scaled ones.
predictions = best_model.predict(X_test_standardized)

accuracy = accuracy_score(y_test, predictions)
print(f"Best Parameters: {best_params}")
print(f"Testing Accuracy: {accuracy * 100}%")

# Decision Tree

In [None]:
# Decision tree with fixed settings: gini criterion, depth capped at 8, seeded.
best_model = DecisionTreeClassifier(criterion = 'gini', max_depth = 8, random_state = 78)
best_model.fit(X_train_standardized, y_train)

# Predict on both splits so training and testing accuracy can be compared.
pred_train = best_model.predict(X_train_standardized)
predictions = best_model.predict(X_test_standardized)

acc_train = accuracy_score(y_train, pred_train)
test_acc = accuracy_score(y_test, predictions)

print(f"Training Accuracy: {acc_train * 100}%")
print(f"Testing Accuracy: {test_acc * 100}%")

Training Accuracy: 93.67821886043907%
Testing Accuracy: 93.71179344699004%


# Plot of Decision Tree

In [None]:
# Class labels as strings, in the order stored on the fitted tree.
class_names = [str(label) for label in best_model.classes_]

plt.figure(figsize = (20, 16))
# Pass the feature names as a plain list: some scikit-learn releases validate this
# argument strictly and reject a pandas Index.
tree.plot_tree(best_model, filled = True, feature_names = list(X.columns), class_names = class_names)
plt.show()

# Classification Report

In [None]:
# Display names for the two classes, given in sorted label order.
target = ['0','1']
# Pass the names through so the report rows are labelled with them
# (previously `target` was defined but never used).
print(cr(y_test, predictions, target_names = target))

# Function to plot Confusion Matrix

In [None]:
def plot_confusion_matrix(y_test, y_pred):
  """Plot a binary confusion matrix as a heatmap and print precision, recall and F1.

  The heatmap rows are the predicted label and the columns are the true label,
  both ordered Positive then Negative.

  Args:
    y_test: True class labels.
    y_pred: Predicted class labels, aligned with ``y_test``.

  Raises:
    ValueError: If the labels do not form a 2x2 (binary) confusion matrix.
  """
  conf_mat = confusion_matrix(y_test, y_pred)
  # The tn/fp/fn/tp unpacking below is only defined for a binary problem; fail with
  # a clear message instead of an opaque unpacking error.
  if conf_mat.shape != (2, 2):
    raise ValueError(
      f'plot_confusion_matrix expects a binary problem, got a confusion matrix of shape {conf_mat.shape}'
    )
  tn,fp,fn,tp = conf_mat.ravel()

  plt.figure(figsize = (8,6))
  sns.heatmap([[tp,fp],[fn,tn]],annot = True,fmt = 'd', cmap = 'plasma', xticklabels = ['Positive', 'Negative'], yticklabels = ['Positive', 'Negative'])
  plt.xlabel('True Label')
  plt.ylabel('Predicted Label')
  plt.title('Confusion Matrix')
  plt.show()

  # Guard every ratio: when a denominator is zero (e.g. nothing was predicted
  # positive) report 0.0 instead of producing NaN and a runtime warning.
  precision = tp/(tp + fp) if (tp + fp) else 0.0
  recall = tp/(tp + fn) if (tp + fn) else 0.0
  f1_score = 2 * (precision * recall)/(precision + recall) if (precision + recall) else 0.0

  print(f'Precision: {precision}')
  print(f'Recall: {recall}')
  print(f'F1 Score: {f1_score}')

# Confusion matrix for Decision Tree

In [None]:
# Confusion matrix and derived metrics for the decision tree's test predictions.
plot_confusion_matrix(y_test = y_test, y_pred = predictions)

# Function to plot ROC Curve

In [None]:
def plot_roc_curve(y_true, probabilities):
    """Draw the ROC curve for a set of scores and print the area under it.

    Args:
        y_true: True binary labels.
        probabilities: Scores for the positive class, aligned with ``y_true``.
    """
    # The thresholds returned by roc_curve are not needed for the plot.
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, probabilities)
    roc_auc = roc_auc_score(y_true, probabilities)

    plt.figure(figsize = (8, 6))
    plt.plot(false_pos_rate, true_pos_rate, color = 'orange', lw = 2, label = 'ROC curve')
    # Dashed diagonal drawn as a reference line.
    plt.plot([0, 1], [0, 1], color = 'blue', lw = 2, linestyle = '--')
    plt.title('ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc = 'lower right')
    plt.show()

    print(f'AUC: {roc_auc}')

# ROC Curve for Decision Tree

In [None]:
# Column 1 of predict_proba holds the scores for the second entry of classes_.
dt_class_scores = best_model.predict_proba(X_test_standardized)
probabilities = dt_class_scores[:, 1]
plot_roc_curve(y_true = y_test, probabilities = probabilities)

# Decision Tree parameters and properties

In [None]:
# Constructor settings of the decision tree as a name -> value dictionary.
tree_params = best_model.get_params(deep = True)
print("Decision Tree Parameters:", tree_params)

In [None]:
# Structural properties of the fitted tree.
fitted_tree = best_model.tree_
num_leaves = best_model.get_n_leaves()
num_nodes = fitted_tree.node_count
max_depth = fitted_tree.max_depth

for caption, value in (
    ("Number of Leaves:", num_leaves),
    ("Number of Nodes:", num_nodes),
    ("Maximum Depth:", max_depth),
):
    print(caption, value)

Number of Leaves: 125
Number of Nodes: 249
Maximum Depth: 8


# Important Features

In [None]:
# Pair every feature name with its importance in the fitted decision tree.
feature_importances = best_model.feature_importances_
feature_dict = dict(zip(X_train.columns, feature_importances))

# Most important feature first.
sorted_feature_importance = sorted(feature_dict.items(), key = lambda item: item[1], reverse = True)

print("Feature Importances:")
# Only the five most important features are listed.
for feature, importance in sorted_feature_importance[:5]:
    print(f"{feature}: {importance:.4f}")

# Random Forest

In [None]:
# Rebuild features and target from the rows that have a label.
clean_data = data[data['Label'].notna()]
y = clean_data['Label']
X = clean_data.drop(columns = ['Label'])

# Same 70/30 split and seed as used for the decision tree.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 78)

# Scaling statistics come from the training split only.
scaler = StandardScaler().fit(X_train)
X_train_standardized = scaler.transform(X_train)
X_test_standardized = scaler.transform(X_test)

rf_settings = dict(
    n_estimators = 150,
    max_depth = 8,
    min_samples_split = 2,
    min_samples_leaf = 1,
    random_state = 78,
)
random_forest_classifier = RandomForestClassifier(**rf_settings)
random_forest_classifier.fit(X_train_standardized, y_train)

# Predictions on both splits for the accuracy comparison below.
pred = random_forest_classifier.predict(X_train_standardized)
y_pred = random_forest_classifier.predict(X_test_standardized)

train_acc = accuracy_score(y_train, pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"Training Accuracy: {train_acc * 100}%")
print(f"Testing Accuracy: {accuracy * 100}%")

# Classification Report

In [None]:
# Display names for the two classes, given in sorted label order.
target = ['0','1']
# Pass the names through so the report rows are labelled with them
# (previously `target` was defined but never used).
print(cr(y_test, y_pred, target_names = target))

# Grid Search

In [None]:
# Hyper-parameter candidates for the random forest.
param_grid = {
    'n_estimators': [100,150,200],
    'max_depth': [6,8,10,None]
}

# Use a separate, unfitted estimator as the template for the search. GridSearchCV
# fits clones of it, so rebinding `random_forest_classifier` here would replace the
# already-fitted forest with an unfitted one and break the later cells that call
# predict_proba / feature_importances_ on `random_forest_classifier`.
rf_search_template = RandomForestClassifier(random_state = 78)

# 5-fold cross-validated search over every combination in param_grid.
grid_search = GridSearchCV(estimator = rf_search_template, param_grid = param_grid, cv = 5)

grid_search.fit(X_train_standardized, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

predictions = best_model.predict(X_test_standardized)

accuracy = accuracy_score(y_test, predictions)
print(f"Best Parameters: {best_params}")
print(f"Testing Accuracy: {accuracy * 100}%")

# Confusion matrix for Random Forest

In [None]:
# Confusion matrix and derived metrics for the random forest's test predictions.
plot_confusion_matrix(y_test = y_test, y_pred = y_pred)

# ROC Curve for Random Forest

In [None]:
# Column 1 of predict_proba holds the scores for the second entry of classes_.
# Requires `random_forest_classifier` to be a fitted estimator.
rf_class_scores = random_forest_classifier.predict_proba(X_test_standardized)
prob = rf_class_scores[:, 1]
plot_roc_curve(y_true = y_test, probabilities = prob)

# Random Forest properties

In [None]:
# Report the main settings of the random forest estimator.
print("Number of Trees (n_estimators):", random_forest_classifier.n_estimators)
for setting_name in ("max_depth", "min_samples_split", "min_samples_leaf"):
    print(f"{setting_name}:", getattr(random_forest_classifier, setting_name))

# Important Features

In [None]:
# Pair every feature name with its importance in the random forest.
feature_importances = random_forest_classifier.feature_importances_
feature_dict = dict(zip(X_train.columns, feature_importances))

# Most important feature first.
sorted_feature_importance = sorted(feature_dict.items(), key = lambda item: item[1], reverse = True)

print("Feature Importances:")
# Only the five most important features are listed.
for feature, importance in sorted_feature_importance[:5]:
    print(f"{feature}: {importance:.4f}")

# XGBoost

# Grid Search

In [None]:
# Candidate settings; subsample and colsample_bytree each have a single value (0.8).
param_grid = {
    'learning_rate': [0.1, 0.4],
    'max_depth': [8, 20, 32],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'n_estimators': [200, 300]
}

xgb_model = XGBClassifier(random_state = 78)

# 3-fold cross-validated search scored by accuracy.
grid_search = GridSearchCV(
    estimator = xgb_model,
    param_grid = param_grid,
    scoring = 'accuracy',
    cv = 3,
    verbose = 1,
)
grid_search.fit(X_train_standardized, y_train)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Evaluate the best estimator found by the search on the held-out split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_standardized)

acc = accuracy_score(y_test, y_pred)
print(f"Testing Accuracy: {acc * 100}%")

In [None]:
# Rebuild features and target from the rows that have a label.
data_cleaned = data[data['Label'].notna()]
y = data_cleaned['Label']
X = data_cleaned.drop(columns = ['Label'])

# Same 70/30 split and seed as the earlier models.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 78)

# Scaling statistics come from the training split only.
scaler = StandardScaler().fit(X_train)
X_train_standardized = scaler.transform(X_train)
X_test_standardized = scaler.transform(X_test)

xgb_settings = dict(
    booster = 'gbtree',
    learning_rate = 0.1,
    max_depth = 20,
    subsample = 0.8,
    colsample_bytree = 0.8,
    n_estimators = 200,
    random_state = 78,
)
model = XGBClassifier(**xgb_settings)
model.fit(X_train_standardized, y_train)

# Training accuracy is reported first, then accuracy on the held-out split.
pred = model.predict(X_train_standardized)
train_acc = accuracy_score(y_train, pred)
print(f"Training Accuracy: {train_acc * 100}%")

y_pred = model.predict(X_test_standardized)
test_acc = accuracy_score(y_test, y_pred)
print(f"Testing Accuracy: {test_acc * 100}%")

Training Accuracy: 99.86158075863554%
Testing Accuracy: 93.63486376771051%


# Classification Report

In [None]:
# Display names for the two classes, given in sorted label order.
target = ['0','1']
# Pass the names through so the report rows are labelled with them
# (previously `target` was defined but never used).
print(cr(y_test, y_pred, target_names = target))

# Confusion Matrix for XGBoost

In [None]:
# Confusion matrix and derived metrics for the XGBoost test predictions.
plot_confusion_matrix(y_test = y_test, y_pred = y_pred)

# ROC Curve for XGBoost

In [None]:
# Column 1 of predict_proba holds the scores for the second class.
xgb_class_scores = model.predict_proba(X_test_standardized)
prob = xgb_class_scores[:, 1]
plot_roc_curve(y_true = y_test, probabilities = prob)

# Important Features

In [None]:
# Pair every feature name with its importance in the XGBoost model.
feature_importances = model.feature_importances_
feature_dict = dict(zip(X_train.columns, feature_importances))

# Most important feature first.
sorted_feature_importance = sorted(feature_dict.items(), key = lambda item: item[1], reverse = True)

print("Feature Importances:")
# Only the five most important features are listed.
for feature, importance in sorted_feature_importance[:5]:
    print(f"{feature}: {importance:.4f}")

# MLP

In [None]:
# Rebuild features and target from the rows that have a label.
data_cleaned = data[data['Label'].notna()]
y = data_cleaned['Label']
X = data_cleaned.drop(columns = ['Label'])

# Same 70/30 split and seed as the earlier models.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 78,
)

# Scaling statistics come from the training split only.
scaler = StandardScaler().fit(X_train)
X_train_standardized = scaler.transform(X_train)
X_test_standardized = scaler.transform(X_test)

In [None]:
# Multi-layer perceptron with six progressively narrower hidden layers.
# random_state fixes the weight initialisation, the early-stopping validation split
# and the batch order, so repeated runs give the same result (every other model in
# this notebook is seeded with 78 as well).
mlp = MLPClassifier(hidden_layer_sizes = (64,32,16,8,4,2),
                      activation = 'relu',
                      early_stopping = True,
                      max_iter = 200,
                      batch_size = 128,
                      solver = 'adam',
                      verbose = True,
                      learning_rate_init = 1e-4,
                      random_state = 78,
                      )

# fit() returns the fitted estimator itself, so `history` is the same object as `mlp`.
history = mlp.fit(X_train_standardized, y_train)

# Loss recorded after each training iteration.
training_loss = history.loss_curve_

plt.figure(figsize = (11, 7))
plt.plot(training_loss, label = 'Training Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

train_predictions = mlp.predict(X_train_standardized)
test_predictions = mlp.predict(X_test_standardized)

train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

print()
print(f"Training Accuracy: {train_accuracy}")
print(f"Testing Accuracy: {test_accuracy}")

# Classification Report

In [None]:
# Display names for the two classes, given in sorted label order.
target = ['0','1']
# Pass the names through so the report rows are labelled with them
# (previously `target` was defined but never used).
print(cr(y_test, test_predictions, target_names = target))

# Confusion matrix for MLP

In [None]:
# Confusion matrix and derived metrics for the MLP's test predictions.
plot_confusion_matrix(y_test = y_test, y_pred = test_predictions)

# ROC Curve for MLP

In [None]:
# Column 1 of predict_proba holds the scores for the second entry of classes_.
mlp_class_scores = mlp.predict_proba(X_test_standardized)
probabilities = mlp_class_scores[:, 1]
plot_roc_curve(y_true = y_test, probabilities = probabilities)

# LightGBM

In [None]:
# Wrap both splits as LightGBM datasets; the test dataset references the training one.
train_data = lgb.Dataset(X_train_standardized, label = y_train)
test_data = lgb.Dataset(X_test_standardized, label = y_test, reference = train_data)

# NOTE(review): no 'bagging_freq' is set here, so 'bagging_fraction' may have no
# effect - confirm against the LightGBM parameter documentation.
params = dict(
    objective = 'binary',
    metric = 'binary_logloss',
    boosting_type = 'gbdt',
    num_leaves = 31,
    learning_rate = 0.1,
    feature_fraction = 0.8,
    bagging_fraction = 0.8,
)

num_round = 200
bst = lgb.train(params, train_data, num_boost_round = num_round, valid_sets = [test_data])

# Model scores for both splits.
y_train_pred = bst.predict(X_train_standardized, num_iteration = bst.best_iteration)
y_test_pred = bst.predict(X_test_standardized, num_iteration = bst.best_iteration)

# Scores of 0.5 or more are labelled 1, everything else 0.
y_train_pred_binary = (y_train_pred >= 0.5).astype(int).tolist()
y_test_pred_binary = (y_test_pred >= 0.5).astype(int).tolist()

train_accuracy = accuracy_score(y_train, y_train_pred_binary)
print(f"Training Accuracy: {train_accuracy}")

test_accuracy = accuracy_score(y_test, y_test_pred_binary)
print(f"Test Accuracy: {test_accuracy}")

[LightGBM] [Info] Number of positive: 145183, number of negative: 588819
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.061788 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4537
[LightGBM] [Info] Number of data points in the train set: 734002, number of used features: 93
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.197796 -> initscore=-1.400124
[LightGBM] [Info] Start training from score -1.400124
Training Accuracy: 0.936892542527132
Test Accuracy: 0.9371624392430374


# Classification Report

In [None]:
# Display names for the two classes, given in sorted label order.
target = ['0','1']
# Pass the names through so the report rows are labelled with them
# (previously `target` was defined but never used).
print(cr(y_test, y_test_pred_binary, target_names = target))

# Confusion Matrix for LightGBM

In [None]:
# Confusion matrix and derived metrics for the LightGBM test predictions.
plot_confusion_matrix(y_test = y_test, y_pred = y_test_pred_binary)

# ROC Curve for LightGBM

In [None]:
# The booster was trained with the 'binary' objective, so Booster.predict already
# returns probabilities (raw margins are only returned with raw_score=True).
# Applying a sigmoid on top of them squashed every score into roughly [0.5, 0.73],
# so the predictions are used as they are.
# `y_test_raw_pred` keeps its original name for compatibility even though it holds
# probabilities, not raw scores.
y_test_raw_pred = bst.predict(X_test_standardized, num_iteration = bst.best_iteration)
y_test_prob = y_test_raw_pred
plot_roc_curve(y_test,y_test_prob)

# Important Features

In [None]:
# Bar chart of the five top-ranked features, using the 'split' importance type.
lgb.plot_importance(
    bst,
    importance_type = 'split',
    max_num_features = 5,
    figsize = (10, 6),
)
plt.show()