### Modelling

In [7]:
import pandas as pd
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
data = pd.read_csv('../data/MRI_data_2.csv')
feats = data.columns[:-2]
X_train, X_test, y_train, y_test = train_test_split(data[feats], data['Diagnosis'],test_size=.2,random_state =123)
print(X_train.shape)
print(X_test.shape)
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)
pd.DataFrame(models)

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LGBMClassifier,0.86,0.87,,0.86,1.55
XGBClassifier,0.82,0.83,,0.82,1.61
ExtraTreesClassifier,0.81,0.83,,0.8,0.21
SVC,0.8,0.82,,0.8,0.11
NuSVC,0.8,0.82,,0.8,0.11
RandomForestClassifier,0.79,0.81,,0.79,0.68
QuadraticDiscriminantAnalysis,0.79,0.81,,0.79,0.06
LogisticRegression,0.78,0.78,,0.78,0.08
BaggingClassifier,0.75,0.77,,0.75,0.63
LinearSVC,0.76,0.76,,0.76,0.38


In [15]:
# Importing necessary libraries
import lightgbm as lgb
from sklearn.metrics import accuracy_score
import numpy as np

params = {
    'boosting_type': 'dart',         # Gradient Boosting Decision Tree
    'objective': 'multiclass',           # Binary classification
    'num_classes':3,
     'metric': 'multi_logloss',
    'num_leaves': 31,                # Maximum number of leaves in one tree
    'learning_rate': 0.05,           # Learning rate of boosting process
    'feature_fraction': 0.9,         # Percentage of features to be used per iteration
    'bagging_fraction': 0.8,         # Percentage of data to be bagged per iteration
    'bagging_freq': 5,               # Frequency for bagging
    'verbose': -1,                    # < 0: Fatal, = 0: Error (Warning), = 1: Info, > 1: Debug,
}

# Creating dataset for LightGBM
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

# Training the model
num_round = 1000  # Number of boosting rounds
bst = lgb.train(params, train_data, num_round, valid_sets=[test_data],  callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(50)])

# Making predictions
y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
# Find the class with the highest probability for each sample
y_pred_class = np.argmax(y_pred, axis=1)
# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred_class)
print("Accuracy:", accuracy)

[50]	valid_0's multi_logloss: 0.608383
[100]	valid_0's multi_logloss: 0.528154
[150]	valid_0's multi_logloss: 0.490461
[200]	valid_0's multi_logloss: 0.454068
[250]	valid_0's multi_logloss: 0.426952
[300]	valid_0's multi_logloss: 0.412225
[350]	valid_0's multi_logloss: 0.397104
[400]	valid_0's multi_logloss: 0.388652
[450]	valid_0's multi_logloss: 0.374993
[500]	valid_0's multi_logloss: 0.378371
[550]	valid_0's multi_logloss: 0.376556
[600]	valid_0's multi_logloss: 0.374549
[650]	valid_0's multi_logloss: 0.379638
[700]	valid_0's multi_logloss: 0.377471
[750]	valid_0's multi_logloss: 0.382144
[800]	valid_0's multi_logloss: 0.386162
[850]	valid_0's multi_logloss: 0.39196
[900]	valid_0's multi_logloss: 0.395388
[950]	valid_0's multi_logloss: 0.41289
[1000]	valid_0's multi_logloss: 0.423835
Accuracy: 0.8712871287128713


In [29]:
y_train.value_counts()

0    282
1    277
2    248
Name: Diagnosis, dtype: int64

In [27]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Define parameter distributions for random search
param_distributions = {
    'num_leaves': [20,30,40],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [50, 100, 200],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
}

# Initialize LightGBM classifier
lgb_clf = lgb.LGBMClassifier(objective='multiclass', num_classes=3,  verbose=-1, metric='multi_logloss', boosting='dart')
# Perform random search
random_search = RandomizedSearchCV(estimator=lgb_clf, param_distributions=param_distributions,
                                   n_iter=50, cv=2, n_jobs=-1, verbose=2)
random_search.fit(X_train, y_train)
# Get the best parameters
best_params = random_search.best_params_
print("Best parameters:", best_params)

Fitting 2 folds for each of 50 candidates, totalling 100 fits


KeyboardInterrupt: 

In [None]:
# Train the model with the best parameters
best_lgb_clf = lgb.LGBMClassifier(objective='multiclass', num_classes=3,  verbose=-1, metric='multi_logloss', boosting='dart', **best_params, callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(50)])
best_lgb_clf.fit(X_train, y_train)

# Make predictions
y_pred = best_lgb_clf.predict(X_test)
# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred_class)
print("Accuracy:", accuracy)