In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import scikitplot as skplt

nuclei_m1 = pd.read_csv("nuclei_M1_NucleiObj.csv")
nuclei_m2_nt = pd.read_csv("nuclei_M2_NT_NucleiObj.csv")

In [11]:
nuclei_m1['CellType'] = 'M1'
nuclei_m2_nt.loc[nuclei_m2_nt['ImageNumber'].isin([1,2,3]), 'CellType'] = 'M2'
nuclei_m2_nt.loc[nuclei_m2_nt['ImageNumber'].isin([4,5,6]), 'CellType'] = 'NT'

nucleus_dataset = nuclei_m1.merge(nuclei_m2_nt, how='outer')
nucleus_dataset.drop(['ImageNumber', 'ObjectNumber'], axis=1, inplace=True)

X_nucleus = nucleus_dataset.drop('CellType', axis=1)
y_nucleus = nucleus_dataset['CellType']

In [12]:
# Hyperparameter grid for each model
param_knn = { 'n_neighbors' : [5,7,9,11,13,15],
            'weights' : ['uniform','distance'],
            'metric' : ['minkowski','euclidean','manhattan']}
param_lreg = {'C' : np.logspace(-4, 4, 50),
            'penalty' : ['l1', 'l2'],
             'max_iter' : [15000]}
param_svm = {'C': [0.1, 1, 10, 100, 1000], 
            'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
            'kernel': ['rbf']} 
param_rf = {'max_depth':[3,5,10,None],
            'n_estimators':[10,100,200],
            'max_features':[1,3,5,7],
            'min_samples_leaf':[1,2,3],
            'min_samples_split':[1,2,3],
            'warm_start' : [True]
           }
param_gbr = {"learning_rate": [0.01, 0.05,0.1,0.2],
            "min_samples_split": np.linspace(0.1, 0.5, 4),
            "min_samples_leaf": np.linspace(0.1, 0.5, 4),
            "max_depth":[3,5,8],
            "max_features":["log2","sqrt"],
            "criterion": ["friedman_mse",  "mae"],
            "subsample":[0.5, 0.8, 1.0]
            }

# Models and corresponding hyperparameter grids
models = {"KNN": [KNeighborsClassifier(), param_knn],
          "Logistic Regression": [LogisticRegression(), param_lreg],
          "RandomForestClassifier": [RandomForestClassifier(), param_rf],
          "SVM": [SVC(), param_svm],
          "XGB": [GradientBoostingClassifier(), param_gbr]
         }

In [13]:
scaler = StandardScaler()
X_nucleus_scaled = scaler.fit_transform(X_nucleus)

kf = KFold(n_splits=5, shuffle=True, random_state=666)
scores = []

# Run the grid search for each pipeline and print the best parameters and score
for model_param in models.values():
    model_grid = GridSearchCV(estimator=model_param[0], param_grid=model_param[1], cv=kf, scoring='accuracy')
    model_grid.fit(X_nucleus_scaled, y_nucleus)
    best_params = grid.best_params_
    best_score = grid.best_score_
    scores.append((model_param[0], best_params, best_score))
    print(model_param[0], "highest score is", best_score)

models_scores = pd.DataFrame(scores, columns=['Model', 'Best_Parameters', 'Best_Score'])
models_scores.sort_values(by='Best_Score', ascending=False, ignore_index=True, inplace=True)
models_scores

Traceback (most recent call last):
  File "/home/ka-and/Desktop/Jupyter/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/ka-and/Desktop/Jupyter/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/home/ka-and/Desktop/Jupyter/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 353, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/home/ka-and/Desktop/Jupyter/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
  File "/home/ka-and/Desktop/Jupyter/lib/python3.10/site-packages/sklearn/utils/_response.py", line 85, in _get_response_values
    y_pred = prediction_method(X)
  File "/home/ka-and/Desktop/Jupyter/lib/python3.10/site-packages/sklearn/neighbors/_classification.py

KNeighborsClassifier() highest score is 0.9127315879423806


KeyboardInterrupt: 

In [None]:
# retrieve the best model
best_params = models_scores.loc[0, 'Best_Parameters']
best_model_name = models_scores.loc[0, 'Model']
best_model = models[best_model_name][0]
best_model.set_params(**best_params)

X_train, X_test, y_train, y_test = train_test_split(X_nucleus_scaled, y_nucleus, test_size = 0.25, random_state=0)

best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

skplt.metrics.plot_confusion_matrix(y_test, y_pred, normalize=False, title = 'Confusion Matrix')

In [None]:
kf = KFold(n_splits=20, shuffle=True, random_state=666)
cv_scores = cross_val_score(best_model, X_train, y_train, cv=kf)
plt.boxplot(cv_scores)
plt.show()