In [1]:
import pandas as pd
import numpy as np

# Importing Dataset

In [2]:
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
           'DiabetesPedigreeFunction', 'Age', 'Outcome']

df = pd.read_csv(url, names = columns)

In [3]:
import numpy as np

# Replace zero values with NaN in columns where zero is not a valid value
cols_with_missing_vals = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_missing_vals] = df[cols_with_missing_vals].replace(0, np.nan)

# Impute the missing values with the mean of the respective column
df.fillna(df.mean(), inplace=True)

# Basic Machine Learning Model



In [8]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [5]:
train_x, test_x, train_y, test_y = train_test_split(df.drop('Outcome', axis = 1), df['Outcome'], test_size = 0.2, random_state=42)

scaler = StandardScaler()
train_x_scalled = scaler.fit_transform(train_x)
test_x_scalled = scaler.transform(test_x)

# Model Evaluation

In [9]:
# Logitic Regression
model_logisticRegression = LogisticRegression()
model_logisticRegression.fit(train_x_scalled, train_y)
y_pred = q.predict(test_x_scalled)

print("Accuracy (Logistic Regression): ", cross_val_score(model_logisticRegression, train_x_scalled, train_y, cv=5, scoring='accuracy').mean())

# Random Forrest Classifier
model_RandomForrest = RandomForestClassifier()
model_RandomForrest.fit(train_x, train_y)
y_pred = model_RandomForrest.predict(test_x_scalled)

print("Accuracy (Random  Forrest): ", cross_val_score(model_RandomForrest, train_x_scalled, train_y, cv=5, scoring='accuracy').mean())

# Desicion tree Classifier
model_DecisionTree = DecisionTreeClassifier()
model_DecisionTree.fit(train_x, train_y)
y_pred = model_DecisionTree.predict(test_x_scalled)

print("Accuracy (Decision Tree): ", cross_val_score(model_DecisionTree, train_x_scalled, train_y, cv=5, scoring='accuracy').mean())

# Knearest Neighbour
model_KNN = KNeighborsClassifier()
model_KNN.fit(train_x, train_y)
y_pred = model_KNN.predict(test_x_scalled)

print("Accuracy (KNN): ", cross_val_score(model_KNN, train_x_scalled, train_y, cv=5, scoring='accuracy').mean())

Accuracy (Logistic Regression):  0.7655071304811408




Accuracy (Random  Forrest):  0.7671331467413035
Accuracy (Decision Tree):  0.6774756763961082
Accuracy (KNN):  0.744368919099027




# Hyper Parameter Tunning using Optuna

In [11]:
import optuna

In [12]:
# Defining Objective Model
def objective(trial):

  # Suggest Value for parameter
  n_estimators = trial.suggest_int('n_estimators', 100, 500)
  max_depth = trial.suggest_int('max_depth', 1, 10)
  min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
  min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
  max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])

  # Create Random Forrest Classifier with Suggested Hyperparameter
  model = RandomForestClassifier(
      n_estimators=n_estimators,
      max_depth=max_depth,
      min_samples_split=min_samples_split,
      min_samples_leaf=min_samples_leaf, max_features=max_features)

  # Perform 10 Fold cross validation and calculate accuracy
  accuracy = cross_val_score(model, train_x, train_y, cv=10, scoring='accuracy').mean()

  return accuracy

In [13]:
# Create study object and optimise the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2025-01-28 17:18:47,185] A new study created in memory with name: no-name-85af51e8-5f33-4bb7-b956-dc35d42ac85f
[I 2025-01-28 17:18:54,988] Trial 0 finished with value: 0.7800634584875727 and parameters: {'n_estimators': 173, 'max_depth': 4, 'min_samples_split': 4, 'min_samples_leaf': 10, 'max_features': None}. Best is trial 0 with value: 0.7800634584875727.
[I 2025-01-28 17:19:03,583] Trial 1 finished with value: 0.7718931782125859 and parameters: {'n_estimators': 237, 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 0 with value: 0.7800634584875727.
[I 2025-01-28 17:19:08,276] Trial 2 finished with value: 0.765309360126917 and parameters: {'n_estimators': 170, 'max_depth': 6, 'min_samples_split': 10, 'min_samples_leaf': 8, 'max_features': None}. Best is trial 0 with value: 0.7800634584875727.
[I 2025-01-28 17:19:14,670] Trial 3 finished with value: 0.7622686409307244 and parameters: {'n_estimators': 278, 'max_depth': 6, 'min_sam

In [20]:
# Print the best result
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best train Accuracy: ', study.best_trial.value)

# print the parameter which is used to get the best values
print(study.best_params)

Number of finished trials: 100
Best trial: {'n_estimators': 128, 'max_depth': 5, 'min_samples_split': 9, 'min_samples_leaf': 9, 'max_features': None}
Best train Accuracy:  0.783262823902697
{'n_estimators': 128, 'max_depth': 5, 'min_samples_split': 9, 'min_samples_leaf': 9, 'max_features': None}


# Training with best HyperParameter Found using Optuna

In [19]:

from sklearn.metrics import accuracy_score

# Train a random forrest classifier with best hyperparameter from optuna
model = RandomForestClassifier(**study.best_trial.params)
model.fit(train_x, train_y)
y_pred = model.predict(test_x)
print(f"Accuracy: {accuracy_score(test_y, y_pred):.2f}")

Accuracy: 0.75


# Different Optimiser

In [None]:
# Default optimiser is TPE sampler
study_2 = optuna.create_study(direction='maximize', sampler = optuna.samplers.RandomSampler())
# another study sampler could be GridSampler
study_2.optimize(objective, n_trials=20)

# Visualising Result

In [23]:
!pip install -U optuna plotly



In [25]:
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_contour, plot_slice, plot_edf

In [26]:
plot_optimization_history(study)

In [27]:
plot_param_importances(study)

In [None]:
plot_contour(study)

In [28]:
plot_slice(study)

In [29]:
plot_edf(study)

# Best of Optuna

In [42]:
def objecttive(trial):
  classifier_name = trial.suggest_categorical('classifier', ['RandomForest', 'LogisticRegression', 'KNN'])

  if classifier_name == 'RandomForest':
    # Random Forrest Parameter
    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    max_depth = trial.suggest_int('max_depth', 1, 10)

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth)
  elif classifier_name == 'LogisticRegression':
    # Logistic Regression Parameter
    penalty = trial.suggest_categorical('penalty', ['l2'])
    C = trial.suggest_float('C', 0.001, 10)

    model = LogisticRegression(
        penalty=penalty,
        C=C)
  elif classifier_name == 'KNN':
    # KNN Parameter
    n_neighbors = trial.suggest_int('n_neighbors', 1, 10)
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
    p = trial.suggest_int('p', 1, 2)

    model = KNeighborsClassifier(
        n_neighbors=n_neighbors,
        weights=weights,
        p=p)

  accuracy = cross_val_score(model, train_x, train_y, cv=10, scoring='accuracy').mean()

  return accuracy


In [43]:
# Create study object and Optimise it
study = optuna.create_study(direction='maximize')
study.optimize(objecttive, n_trials=20)

[I 2025-01-28 17:53:04,433] A new study created in memory with name: no-name-838de097-6d05-45c9-978b-0e90e9933252
[I 2025-01-28 17:53:11,554] Trial 0 finished with value: 0.6938392384981491 and parameters: {'classifier': 'RandomForest', 'n_estimators': 340, 'max_depth': 1}. Best is trial 0 with value: 0.6938392384981491.
[I 2025-01-28 17:53:13,489] Trial 1 finished with value: 0.7606028556319407 and parameters: {'classifier': 'RandomForest', 'n_estimators': 113, 'max_depth': 4}. Best is trial 1 with value: 0.7606028556319407.
[I 2025-01-28 17:53:13,566] Trial 2 finished with value: 0.713379164463247 and parameters: {'classifier': 'KNN', 'n_neighbors': 6, 'weights': 'distance', 'p': 2}. Best is trial 1 with value: 0.7606028556319407.

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documenta

In [44]:
# Find the best parameter
print(f"Best trial: {study.best_trial.params}")
print(f"Best accuracy: {study.best_trial.value}")

Best trial: {'classifier': 'LogisticRegression', 'penalty': 'l2', 'C': 3.3158561172748167}
Best accuracy: 0.7702802749867794


# Visualising Dataframe Formate

In [48]:
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_classifier,params_max_depth,params_n_estimators,params_n_neighbors,params_p,params_penalty,params_weights,state
0,0,0.693839,2025-01-28 17:53:04.439921,2025-01-28 17:53:11.554231,0 days 00:00:07.114310,,RandomForest,1.0,340.0,,,,,COMPLETE
1,1,0.760603,2025-01-28 17:53:11.557720,2025-01-28 17:53:13.489186,0 days 00:00:01.931466,,RandomForest,4.0,113.0,,,,,COMPLETE
2,2,0.713379,2025-01-28 17:53:13.493290,2025-01-28 17:53:13.566559,0 days 00:00:00.073269,,KNN,,,6.0,2.0,,distance,COMPLETE
3,3,0.77028,2025-01-28 17:53:13.568092,2025-01-28 17:53:13.797478,0 days 00:00:00.229386,3.315856,LogisticRegression,,,,,l2,,COMPLETE
4,4,0.760603,2025-01-28 17:53:13.799033,2025-01-28 17:53:18.675363,0 days 00:00:04.876330,,RandomForest,5.0,235.0,,,,,COMPLETE
5,5,0.739582,2025-01-28 17:53:18.681428,2025-01-28 17:53:18.831899,0 days 00:00:00.150471,,KNN,,,9.0,2.0,,uniform,COMPLETE
6,6,0.739609,2025-01-28 17:53:18.836669,2025-01-28 17:53:18.924582,0 days 00:00:00.087913,,KNN,,,9.0,2.0,,distance,COMPLETE
7,7,0.739609,2025-01-28 17:53:18.928332,2025-01-28 17:53:19.022604,0 days 00:00:00.094272,,KNN,,,9.0,2.0,,distance,COMPLETE
8,8,0.718218,2025-01-28 17:53:19.025203,2025-01-28 17:53:19.116429,0 days 00:00:00.091226,,KNN,,,3.0,1.0,,distance,COMPLETE
9,9,0.731491,2025-01-28 17:53:19.121715,2025-01-28 17:53:19.258645,0 days 00:00:00.136930,,KNN,,,10.0,2.0,,uniform,COMPLETE
