In [21]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV ,KFold
import pandas as pd
from sklearn.metrics import ConfusionMatrixDisplay,confusion_matrix
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours
from sklearn.metrics import classification_report
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.calibration import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc

In [3]:
df = pd.read_csv('/Users/shahadaleissa/Downloads/Code/Data/colon-dataset-processed.csv')
le = LabelEncoder()
df['Class'] = le.fit_transform(df['Class'])
df

Unnamed: 0,Class,Age,p16540_C/C,p16540_G/C,p16540_G/G,p16580_C/C,p16580_C/T,p16580_T/T,mdm2_G/G,mdm2_G/T,mdm2_T/T,GAL3_A/A,GAL3_C/A,GAL3_C/C,TIM1_C/C,TIM1_G/C,TIM1_G/G
0,1,49,False,True,False,True,False,False,True,False,False,False,True,False,False,True,False
1,1,49,False,True,False,True,False,False,True,False,False,True,False,False,False,True,False
2,1,49,False,True,False,True,False,False,False,True,False,False,True,False,True,False,False
3,1,36,False,True,False,True,False,False,False,False,True,False,True,False,True,False,False
4,1,49,True,False,False,False,True,False,False,False,True,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,0,81,False,True,False,True,False,False,False,False,True,False,False,True,False,False,True
111,0,73,True,False,False,True,False,False,False,False,True,False,False,True,False,False,True
112,0,56,True,False,False,True,False,False,False,True,False,False,False,True,False,True,False
113,0,74,False,True,False,False,True,False,True,False,False,False,False,True,False,False,True


In [4]:
X=df.drop('Class',axis=1)
y=df['Class']

In [5]:

#split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
random_state=123

In [6]:
#overampling
sm = SMOTE(random_state=random_state)
X_oversampled, y_oversampled = sm.fit_resample(X, y)

X_train_oversampled, X_test_oversampled, y_train_oversampled, y_test_oversampled = train_test_split(X_oversampled, y_oversampled, test_size=0.2, random_state=123)


In [28]:
#underampling
# rus = RandomUnderSampler(random_state=random_state)
rus = SMOTEENN(random_state=random_state, sampling_strategy='all')
X_undersampled, y_undersampled = rus.fit_resample(X, y)
X_train_undersampled, X_test_undersampled, y_train_undersampled, y_test_undersampled = train_test_split(X_undersampled, y_undersampled, test_size=0.2, random_state=123)

In [90]:
from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights

array([0.88461538, 1.15      ])

In [1]:

def model_training(X_train, X_test, y_train, y_test, random_state=42):
    # List of models to train
    models = [
        AdaBoostClassifier(random_state=random_state),
        ExtraTreesClassifier(random_state=random_state, n_jobs=-1, class_weight='balanced'), # Parallel processing enabled
        HistGradientBoostingClassifier(random_state=random_state, class_weight='balanced'),
        LGBMClassifier(random_state=random_state, n_jobs=-1, class_weight='balanced'), # Parallel processing enabled
        LogisticRegression(random_state=random_state, n_jobs=-1, class_weight='balanced', max_iter=1000), # Parallel processing enabled
        RandomForestClassifier(max_depth=10, random_state=random_state, n_estimators=100, n_jobs=-1, class_weight='balanced'), # Parallel processing enabled
        SVC(random_state=random_state, kernel='linear', C=1, class_weight='balanced'),
        XGBClassifier(random_state=random_state)
    ]

    # Create an empty DataFrame to store results
    results_df = pd.DataFrame(columns=['Model', 'Testing Accuracy'])

    for model in models:
    
        total_accuracy = 0
        total_training_accuracy = 0
        number_of_iterations = 5

        for i in range(number_of_iterations):
            model.fit(X_train, y_train)
            training_accuracy = model.score(X_train, y_train)
            predictions = model.predict(X_test)
            accuracy = accuracy_score(y_test, predictions)  
            total_accuracy += accuracy
            total_training_accuracy += training_accuracy

        average_accuracy = total_accuracy / number_of_iterations
        average_training_accuracy = total_training_accuracy / number_of_iterations

        
        # Evaluate the model
        precision =precision_score(y_test, predictions)
        recall= recall_score(y_test, predictions)
        cm=confusion_matrix(y_test, predictions)
        f1= f1_score(y_test, predictions)
        
        # Append results to the DataFrame
        temp_df = pd.DataFrame({'Model': [model.__class__.__name__], 'Testing Accuracy': [average_accuracy], 'Training Accuracy': [average_training_accuracy], 'Precision': [precision], 'recall': [recall], 'f1':[f1] 'cm':[cm]})
        results_df = pd.concat([results_df, temp_df], ignore_index=True)


    # Sort and return the results DataFrame
    return results_df.sort_values(by=['Testing Accuracy'], ascending=False)


SyntaxError: invalid syntax. Perhaps you forgot a comma? (998054542.py, line 42)

In [17]:
def cm(df, ada_loc, et_loc, hgb_loc):

    print("Adaboost confusion matrix")
    print(df['cm'].loc[ada_loc])

    print("ExtraTrees confusion matrix")
    print(df['cm'].loc[et_loc])

    print("HGB confusion matrix")
    print(df['cm'].loc[hgb_loc])

In [18]:
original=model_training(X_train, X_test, y_train, y_test)

  results_df = pd.concat([results_df, temp_df], ignore_index=True)


[LightGBM] [Info] Number of positive: 40, number of negative: 52
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000204 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 38
[LightGBM] [Info] Number of data points in the train set: 92, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 40, number of negative: 52
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000514 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38
[LightGBM] [Info] Number of data points in the train set: 92, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start train

In [19]:
original

Unnamed: 0,Model,Testing Accuracy,Training Accuracy,Precision,recall,cm
0,AdaBoostClassifier,0.869565,0.934783,1.0,0.7,"[[13, 0], [3, 7]]"
1,ExtraTreesClassifier,0.782609,1.0,0.727273,0.8,"[[10, 3], [2, 8]]"
2,HistGradientBoostingClassifier,0.782609,0.923913,0.727273,0.8,"[[10, 3], [2, 8]]"
3,LGBMClassifier,0.782609,0.923913,0.727273,0.8,"[[10, 3], [2, 8]]"
4,LogisticRegression,0.782609,0.880435,0.727273,0.8,"[[10, 3], [2, 8]]"
6,SVC,0.782609,0.913043,0.727273,0.8,"[[10, 3], [2, 8]]"
7,XGBClassifier,0.782609,0.978261,0.727273,0.8,"[[10, 3], [2, 8]]"
5,RandomForestClassifier,0.73913,1.0,0.7,0.7,"[[10, 3], [3, 7]]"


In [11]:
cm(original,0,1,2)

Adaboost confusion matrix
[[13  0]
 [ 3  7]]
Xgboost confusion matrix
[[10  3]
 [ 2  8]]
HGB confusion matrix
[[10  3]
 [ 2  8]]


In [12]:
oversample= model_training(X_train_oversampled, X_test_oversampled, y_train_oversampled, y_test_oversampled)

  results_df = pd.concat([results_df, temp_df], ignore_index=True)


[LightGBM] [Info] Number of positive: 50, number of negative: 54
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000575 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 40
[LightGBM] [Info] Number of data points in the train set: 104, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 50, number of negative: 54
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000444 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 40
[LightGBM] [Info] Number of data points in the train set: 104, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of posi

In [13]:
oversample

Unnamed: 0,Model,Testing Accuracy,Training Accuracy,Precision,recall,cm
0,AdaBoostClassifier,0.884615,0.951923,0.928571,0.866667,"[[10, 1], [2, 13]]"
1,ExtraTreesClassifier,0.884615,1.0,0.875,0.933333,"[[9, 2], [1, 14]]"
2,HistGradientBoostingClassifier,0.846154,0.913462,0.866667,0.866667,"[[9, 2], [2, 13]]"
6,SVC,0.846154,0.903846,0.866667,0.866667,"[[9, 2], [2, 13]]"
7,XGBClassifier,0.846154,0.980769,0.866667,0.866667,"[[9, 2], [2, 13]]"
3,LGBMClassifier,0.807692,0.894231,0.857143,0.8,"[[9, 2], [3, 12]]"
4,LogisticRegression,0.807692,0.865385,0.8125,0.866667,"[[8, 3], [2, 13]]"
5,RandomForestClassifier,0.769231,1.0,0.8,0.8,"[[8, 3], [3, 12]]"


In [113]:
cm(oversample,0,1,2)

Adaboost confusion matrix
[[10  1]
 [ 2 13]]
ExtraTrees confusion matrix
[[ 9  2]
 [ 1 14]]
HGB confusion matrix
[[ 9  2]
 [ 2 13]]


In [31]:
undersample= model_training(X_train_undersampled, X_test_undersampled, y_train_undersampled, y_test_undersampled)

  results_df = pd.concat([results_df, temp_df], ignore_index=True)


[LightGBM] [Info] Number of positive: 33, number of negative: 36
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000486 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31
[LightGBM] [Info] Number of data points in the train set: 69, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 33, number of negative: 36
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000514 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31
[LightGBM] [Info] Number of data points in the train set: 69, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive

In [32]:
undersample

Unnamed: 0,Model,Testing Accuracy,Training Accuracy,Precision,recall,cm
0,AdaBoostClassifier,0.944444,1.0,1.0,0.916667,"[[6, 0], [1, 11]]"
1,ExtraTreesClassifier,0.944444,1.0,1.0,0.916667,"[[6, 0], [1, 11]]"
2,HistGradientBoostingClassifier,0.944444,1.0,1.0,0.916667,"[[6, 0], [1, 11]]"
3,LGBMClassifier,0.944444,1.0,1.0,0.916667,"[[6, 0], [1, 11]]"
4,LogisticRegression,0.944444,1.0,1.0,0.916667,"[[6, 0], [1, 11]]"
5,RandomForestClassifier,0.944444,1.0,1.0,0.916667,"[[6, 0], [1, 11]]"
6,SVC,0.944444,1.0,1.0,0.916667,"[[6, 0], [1, 11]]"
7,XGBClassifier,0.944444,1.0,1.0,0.916667,"[[6, 0], [1, 11]]"


In [33]:
cm(undersample,0,1,2)

Adaboost confusion matrix
[[ 6  0]
 [ 1 11]]
ExtraTrees confusion matrix
[[ 6  0]
 [ 1 11]]
HGB confusion matrix
[[ 6  0]
 [ 1 11]]
