# **Improving Kyphosis Diagnosis with ML/DL: Classifying Patients as Having Kyphosis or Not**

## **Problem Statement**

Kyphosis is a spinal condition that can have a significant impact on patient health. In this notebook, we aim to develop a machine learning model that accurately classifies patients as having kyphosis or not, based on a small set of clinical features.
<center>

<img src="images/Kyphosis.png" width="500"/>

</center>

## Dataset Overview

*   The kyphosis dataset has 81 rows and 4 columns:

    1.   Kyphosis: the target, indicating whether kyphosis is present or absent
    2.   Age: the patient's age in months
    3.   Number: the number of vertebrae involved
    4.   Start: the number of the first vertebra operated on

## **Importing Libraries and Loading the dataset**

In [None]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff

%matplotlib inline

# Utils

In [None]:
def plot_confusion_matrix_plotly(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print the confusion matrix and display it as an annotated Plotly heatmap.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with `y_true`.
    classes : sequence or None
        Display names of the classes, in the row/column order of the
        confusion matrix. They are used for the axis tick labels when their
        count matches the matrix size; otherwise the generic
        'Negative' / 'Positive' labels are used.
    normalize : bool, default False
        When True, every row is divided by its total so that cells hold the
        fraction of each true class rather than raw counts.
    title : str or None, default None
        Figure title. When empty, a title describing the normalisation mode
        is used.
    cmap : optional
        Unused; the heatmap uses a fixed white-to-indigo Plotly colour scale.
        Kept only so existing calls that pass it keep working.
    """
    from sklearn.metrics import confusion_matrix

    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class that never occurs in y_true has an all-zero row; divide it
        # by 1 so it stays zero instead of turning into NaN.
        row_totals[row_totals == 0] = 1
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
        # Two decimals keep the cell annotations readable.
        cell_text = [[f'{value:.2f}' for value in row] for row in cm]
    else:
        print('Confusion matrix, without normalization')
        cell_text = [[str(value) for value in row] for row in cm]

    print(cm)

    # Prefer the caller-supplied class names; fall back to the generic
    # binary labels when none (or the wrong number) are given.
    if classes is not None and len(classes) == cm.shape[0]:
        class_names = [str(name) for name in classes]
    else:
        class_names = ['Negative', 'Positive']
    x_labels = [f'Predicted {name}' for name in class_names]
    y_labels = [f'Actual {name}' for name in class_names]
    colorscale = [[0, '#FFFFFF'], [1, '#4B0082']]

    fig = ff.create_annotated_heatmap(
        z=cm,
        x=x_labels,
        y=y_labels,
        annotation_text=cell_text,
        showscale=True,
        colorscale=colorscale,
        reversescale=False,
        font_colors=['#000000', '#FFFFFF'],
    )
    # Set the title and axis labels
    fig.update_layout(
        title=title,
        xaxis_title='Predicted Label',
        yaxis_title='True Label',
    )
    fig.show()

# Exploratory Data Analysis

In [None]:
# Load the raw data and reorder the columns so the three features come first
# and the target column ('Kyphosis') comes last.
df = pd.read_csv('kyphosis.csv')
df = df[['Age', 'Number', 'Start', 'Kyphosis']]
df.head()

In [None]:
# Encode the target as integers: 'absent' -> 0, 'present' -> 1.
# NOTE: Series.map yields NaN for values missing from the dict, so re-running
# this cell on the already-encoded column replaces every label with NaN.
df['Kyphosis'] = df['Kyphosis'].map({'absent':0, 'present':1})
df.head()

### Dataset description

In [None]:
# Summary statistics of every numeric column
df.describe()

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# Count the missing values in each column
df.isnull().sum()

### Data preprocessing

#### Visualizing Key Features in the dataset

In [None]:
# Pairwise scatter plots of the three features, coloured by the target,
# to see how the classes separate in each pair of dimensions.
import plotly.express as px
fig = px.scatter_matrix(df, dimensions=['Age', 'Number', 'Start'], color='Kyphosis')
fig.show()

In [None]:
# Correlation matrix of all columns (features and the encoded target),
# shown as a heatmap with the coefficient printed in every cell.
correlation = df.corr()
fig = px.imshow(correlation, text_auto=True)
fig.show()

In [None]:
# 3-D scatter plot of the three features, coloured by the target.
fig = px.scatter_3d(df, x='Age', y='Number', z='Start', color='Kyphosis', color_continuous_scale='Viridis')
fig.show()

In [None]:
# One box plot per numerical feature, split by target class, for outlier detection.
for feature in ('Age', 'Number', 'Start'):
    fig = px.box(df, x=feature, color='Kyphosis')
    fig.show()

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix: every column except the target.
X = df.drop('Kyphosis', axis=1)
# Target vector: the encoded Kyphosis label.
y = df['Kyphosis']


In [None]:
# Set aside the first 20 rows (in file order, not shuffled) as a hold-out set;
# the remaining rows are split into train/test in the next cell.
x_true = X[:20]
y_true = y[:20]

In [None]:
# Take every row after the 20-row hold-out set and renumber the index from 0.
xx = X[20:].reset_index(drop=True)
yy = y[20:].reset_index(drop=True)
# 70/30 train/test split with a fixed seed so the split is repeatable.
# NOTE(review): the split is not stratified on the target -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(xx, yy, test_size=0.3, random_state=42)

In [None]:
# use Smote to balance the dataset
from imblearn.over_sampling import SMOTE

def balance_dataset(X, y):
    """Oversample (X, y) with SMOTE (seed 42) and return the resampled pair."""
    sampler = SMOTE(random_state=42)
    return sampler.fit_resample(X, y)



# Resample the training split only; the test split is left untouched.
X_res, y_res = balance_dataset(X_train, y_train)

# check the balance of the dataset
y_res.value_counts()

# Modelling 

## Lazy Predict

In [None]:
# Benchmark a broad set of classifiers with LazyPredict: fit on the
# SMOTE-balanced training data and score on the untouched test split.
from lazypredict.Supervised import LazyClassifier

# predictions=True makes fit() return every model's test-set predictions as
# its second value; with the default (False) that second value is not a
# table of predictions.
clf = LazyClassifier(verbose=1, ignore_warnings=False, custom_metric=None, predictions=True)
models, predictions = clf.fit(X_res, X_test, y_res, y_test)

models

In [None]:
# Display the second value returned by clf.fit in the previous cell.
predictions

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix


# Baseline random forest with default hyperparameters, trained on the
# SMOTE-balanced training data. The seed is fixed (42, as for the split and
# SMOTE) so the reported metrics are reproducible between runs.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_res, y_res)

In [None]:
# Evaluate the baseline forest on the 20-row hold-out set (x_true / y_true).
rfc_pred = rfc.predict(x_true)

print(classification_report(y_true, rfc_pred))

In [None]:
# Confusion matrix (raw counts) of the baseline forest on the hold-out set.
plot_confusion_matrix_plotly(y_true, rfc_pred, classes=np.array(['absent', 'present']), normalize=False,
    title='Confusion matrix, without normalization')

In [None]:
# Hyperparameter optimisation of the random forest with a randomized search
from sklearn.model_selection import RandomizedSearchCV

# Candidate tree counts: 10 evenly spaced values between 200 and 250
n_estimators = np.linspace(start=200, stop=250, num=10).astype(int).tolist()
# Candidate tree depths: 10, 20, ..., 110, plus None for unlimited depth
max_depth = np.linspace(10, 110, num=11).astype(int).tolist() + [None]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample
bootstrap = [True, False]

# Search space sampled by RandomizedSearchCV
random_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

# 100 sampled parameter settings, 3-fold cross-validation, all CPU cores
rfc_random = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

rfc_random.fit(X_res, y_res)

In [None]:
# Report the outcome of the randomized search: the winning parameter setting,
# its mean cross-validated score, and the estimator refitted with it.
print("Best parameters : ",rfc_random.best_params_)
print("Best score : ",rfc_random.best_score_)
print("Best estimator",rfc_random.best_estimator_)

In [None]:
# Evaluate the tuned forest on the test split. The report and the confusion
# matrix must both use the same labels and predictions (y_test and
# rfc_random_pred), not the hold-out predictions of the untuned model.
rfc_random_pred = rfc_random.predict(X_test)
print(classification_report(y_test, rfc_random_pred))
plot_confusion_matrix_plotly(y_test, rfc_random_pred, classes=np.array(['absent', 'present']), normalize=False,
    title='Confusion matrix, without normalization')

In [None]:
import plotly.figure_factory as ff
# Imported under an alias so the sklearn function is never shadowed by a
# variable of the same name.
from sklearn.metrics import confusion_matrix as compute_confusion_matrix

x_labels = ['Predicted Negative', 'Predicted Positive']
y_labels = ['Actual Negative', 'Actual Positive']
# Compute the matrix from the tuned forest's test-set predictions instead of
# hard-coding counts, so the figure always matches the current run.
# labels=[0, 1] fixes the row/column order to (absent, present) and guarantees
# a 2x2 matrix even if one class is missing from the test split.
rf_cm = compute_confusion_matrix(y_test, rfc_random_pred, labels=[0, 1]).tolist()
colorscale = [[0, '#FFFFFF'], [1, '#4B0082']]

fig = ff.create_annotated_heatmap(
    z=rf_cm,
    x=x_labels,
    y=y_labels,
    showscale=True,
    colorscale=colorscale,
    reversescale=False,
    font_colors=['#000000', '#FFFFFF'],
)
# Set the title and axis labels
fig.update_layout(
    title='Confusion Matrix : Random Forest Classifier',
    xaxis_title='Predicted Label',
    yaxis_title='True Label',
)
fig.show()


# XGBoost 

In [None]:
!pip install xgboost

In [None]:
# Train an XGBoost classifier with default hyperparameters on the
# SMOTE-balanced training data.
from xgboost import XGBClassifier

xgb = XGBClassifier()
xgb.fit(X_res, y_res)

# Predict on the untouched test split.
xgb_pred = xgb.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, xgb_pred))
#print(confusion_matrix(y_test, xgb_pred))

In [None]:
import plotly.figure_factory as ff
# Imported under an alias: the plain name `confusion_matrix` may already have
# been rebound to a list by an earlier cell.
from sklearn.metrics import confusion_matrix as compute_confusion_matrix

x_labels = ['Predicted Negative', 'Predicted Positive']
y_labels = ['Actual Negative', 'Actual Positive']
# Compute the matrix from the XGBoost test-set predictions instead of
# hard-coding counts, so the figure always matches the current run.
# labels=[0, 1] fixes the row/column order to (absent, present) and guarantees
# a 2x2 matrix even if one class is missing from the test split.
xgb_cm = compute_confusion_matrix(y_test, xgb_pred, labels=[0, 1]).tolist()
colorscale = [[0, '#FFFFFF'], [1, '#4B0082']]

fig = ff.create_annotated_heatmap(
    z=xgb_cm,
    x=x_labels,
    y=y_labels,
    showscale=True,
    colorscale=colorscale,
    reversescale=False,
    font_colors=['#000000', '#FFFFFF'],
)
# Set the title and axis labels
fig.update_layout(
    title='Confusion Matrix : XGBoost Classifier',
    xaxis_title='Predicted Label',
    yaxis_title='True Label',
)
fig.show()
