In [2]:
%pip uninstall lightgbm
%pip install lightgbm

Found existing installation: lightgbm 4.1.0
Uninstalling lightgbm-4.1.0:
  Would remove:
    /usr/local/python/3.10.8/lib/python3.10/site-packages/lightgbm-4.1.0.dist-info/*
    /usr/local/python/3.10.8/lib/python3.10/site-packages/lightgbm/*
Proceed (Y/n)? ^C
[31mERROR: Operation cancelled by user[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# Alias for the lightgbm module; later cells refer to it by this name.
lgbModel = lgb

# --- Data ---------------------------------------------------------------
# Every column except 'Class' is used as a feature; 'Class' is encoded
# as healthy -> 0, diagnosed -> 1.
data = pd.read_csv('Data/colon-dataset-processed.csv')
y = data['Class'].map({'healthy': 0, 'diagnosed': 1}).values
X = data.drop('Class', axis=1).values

# Hold out 20% of the rows for evaluation (seeded so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- LightGBM datasets --------------------------------------------------
# The held-out Dataset references the training Dataset.
train_data = lgbModel.Dataset(X_train, label=y_train)
test_data = lgbModel.Dataset(X_test, label=y_test, reference=train_data)

# --- Training -----------------------------------------------------------
params = dict(
    objective='binary',
    metric='binary_error',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
)
num_round = 100

# Print the validation metric every 10 boosting rounds.
log_evaluation_callback = lgbModel.log_evaluation(period=10)

bst = lgbModel.train(
    params,
    train_data,
    num_round,
    valid_sets=[test_data],
    callbacks=[log_evaluation_callback],
)

# --- Evaluation ---------------------------------------------------------
# Predicted probabilities are thresholded at 0.5 to obtain class labels.
y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
y_pred_binary = [int(p >= 0.5) for p in y_pred]

accuracy = accuracy_score(y_test, y_pred_binary)
print(f'Accuracy: {accuracy * 100:.2f}%')



[LightGBM] [Info] Number of positive: 53, number of negative: 39
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000044 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 40
[LightGBM] [Info] Number of data points in the train set: 92, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.576087 -> initscore=0.306730
[LightGBM] [Info] Start training from score 0.306730
[10]	valid_0's binary_error: 0.217391
[20]	valid_0's binary_error: 0.217391
[30]	valid_0's binary_error: 0.217391
[40]	valid_0's binary_error: 0.217391
[50]	valid_0's binary_error: 0.217391
[60]	valid_0's binary_error: 0.217391
[70]	valid_0's binary_error: 0.217391
[80]	valid_0's binary_error: 0.217391
[90]	valid_0's binary_error: 0.217391
[100]	valid_0's binary_error: 0.217391
Accuracy: 78.26%


Hyperparameter optimization using GridSearchCV

In [4]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pandas as pd

# Load your data
data = pd.read_csv('Data/colon-dataset-processed.csv')
X = data.drop('Class', axis=1).values

# Encode the target; Series.map() turns any unmapped value into NaN, so fail
# loudly instead of training on NaN labels.
labels = data['Class'].map({'healthy': 0, 'diagnosed': 1})
if labels.isna().any():
    raise ValueError("Column 'Class' contains values other than 'healthy'/'diagnosed'")
y = labels.astype(int).values

# Define the model.
# Uses the `lgb` module imported in this cell, so the cell does not depend on
# a name defined in another cell.
clf = lgb.LGBMClassifier(
    objective='binary',
    metric='binary_error',
    boosting_type='gbdt',
    verbose=-1  # silence per-fit LightGBM logs (108 candidates x 5 folds)
)

# Define the parameter grid
param_grid = {
    'num_leaves': [7, 15, 31, 63],
    'learning_rate': [0.01, 0.05, 0.1],
    'feature_fraction': [0.6, 0.8, 1.0],
    'n_estimators': [50, 100, 200]
}

# Choose a cross-validation strategy
n_splits = 5
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Set up the GridSearchCV object
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid,
                           cv=kf, scoring='accuracy', verbose=1)

# Perform grid search
grid_search.fit(X, y)

# Best parameters and best score
print(f'Best parameters found: {grid_search.best_params_}')
print(f'Best cross-validation accuracy: {grid_search.best_score_:.2f}')

# Evaluate the best configuration with out-of-fold predictions.
# best_estimator_ has been refit on all of X, so calling predict(X) on it
# would score the model on its own training rows. cross_val_predict refits a
# clone per fold and predicts only the rows held out of that fold.
best_clf = grid_search.best_estimator_
y_pred = cross_val_predict(best_clf, X, y, cv=kf)
print('\nOverall Accuracy:', accuracy_score(y, y_pred))
print('Overall Confusion Matrix:')
print(confusion_matrix(y, y_pred))
print('\nClassification Report:')
print(classification_report(y, y_pred))


FileNotFoundError: [Errno 2] No such file or directory: '/content/colon-dataset-processed copy.csv'

Using 5-split stratified K-fold cross-validation: highest accuracy

In [None]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pandas as pd
import numpy as np

# Load your data
# Same relative path as the other cells in this notebook; the previous
# hard-coded '/content/...' location only exists on a Colab runtime.
data = pd.read_csv('Data/colon-dataset-processed.csv')
X = data.drop('Class', axis=1).values

# Encode the target; Series.map() turns any unmapped value into NaN, so fail
# loudly instead of training on NaN labels.
labels = data['Class'].map({'healthy': 0, 'diagnosed': 1})
if labels.isna().any():
    raise ValueError("Column 'Class' contains values other than 'healthy'/'diagnosed'")
y = labels.astype(int).values

# Define the LightGBM parameters
params = {
    'objective': 'binary',
    'metric': 'binary_error',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

# Choose a cross-validation strategy
n_splits = 5
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Arrays to store scores and predictions
cv_scores = []
confusion_matrices = []

# Perform cross-validation
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Create a LightGBM dataset (uses the `lgb` module imported in this cell,
    # so the cell does not rely on an alias defined in another cell)
    train_data = lgb.Dataset(X_train, label=y_train)

    # Train the model
    bst = lgb.train(params, train_data, num_boost_round=100)

    # Make predictions: threshold the predicted probabilities at 0.5
    y_pred = bst.predict(X_test)
    y_pred_binary = (y_pred >= 0.5).astype(int)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred_binary)
    cv_scores.append(accuracy)

    # Confusion Matrix
    # Pin the label set so every fold yields a 2x2 matrix and the element-wise
    # sum below is well-defined even if a class is missing from a fold.
    conf_mat = confusion_matrix(y_test, y_pred_binary, labels=[0, 1])
    confusion_matrices.append(conf_mat)

# Output the mean accuracy across all folds
print(f'CV Accuracy: {np.mean(cv_scores):.2f} ± {np.std(cv_scores):.2f}')

# Calculate and print the overall confusion matrix
overall_confusion_matrix = np.sum(confusion_matrices, axis=0)
print('Overall Confusion Matrix:')
print(overall_confusion_matrix)

# Print classification report for the last fold
# (y_test / y_pred_binary still hold the values from the final loop iteration)
print('\nClassification Report for the last fold:')
print(classification_report(y_test, y_pred_binary))
