In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Loading the required libraries in one place
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
sns.set(color_codes=True) # adds a nice background to the graphs
%matplotlib inline
from scipy.stats import ttest_ind, levene, shapiro
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import precision_recall_curve

In [None]:
# Read the two-class (normal / abnormal) Weka CSV into the master DataFrame
dataset_path = "/kaggle/input/biomechanical-features-of-orthopedic-patients/column_2C_weka.csv"
df_master = pd.read_csv(dataset_path, engine='c', encoding="utf-8")

# Summary statistics for every column, numeric or not
df_master.describe(include="all")

# **Data Cleaning**

In [None]:
# Report the (rows, columns) shape and the total number of cells of the master dataset
for caption, value in (("Shape of dataset: ", df_master.shape),
                       ("Size of dataset: ", df_master.size)):
    print(caption, value)

In [None]:
# Displaying common stats of all columns
# include="all" summarises non-numeric columns (count / unique / top / freq)
# alongside the usual numeric statistics.
df_master.describe(include="all")

In [None]:
# Checking attribute information to verify if there are any Null values
# info() prints each column's dtype and non-null count; a non-null count
# lower than the number of rows would indicate missing values.
df_master.info()

<font color=#B40404>**No Null Values found and target variable class has 2 unique values. Let's explore the dataset more to see what else we can find.**

# **Exploratory Data Analysis (EDA)**

**<font color=#B40404>First let us look at correlation between all independent variables**

In [None]:
# Displaying the correlation map of the numeric predictors.
# The string-valued "class" column is excluded explicitly: since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# when it meets one.
numeric_features = df_master.select_dtypes(include="number")
f, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(numeric_features.corr(), annot=True, cmap='Reds', ax=ax)
plt.show()

In [None]:
# Displaying all pairwise correlations in ascending order.
# Only numeric columns are correlated (the string "class" column would make
# DataFrame.corr() raise on pandas >= 2.0).
# drop_duplicates() removes the mirrored (a, b) / (b, a) entries, which share
# the same value, and collapses the diagonal of 1.0s into a single row.
df_master.select_dtypes(include="number").corr().unstack().sort_values().drop_duplicates()

**<font color=#B40404>Any value greater than 0.5 or less than -0.5 indicates a highly correlated pair. For example, Pelvic_incidence is highly correlated with every other attribute except Pelvic_radius. Pelvic_radius is not correlated with any attribute other than itself. Sacral_slope is correlated with all attributes other than Pelvic_radius and Pelvic_tilt. Apart from the pairs mentioned above, Lumbar_lordosis_angle is highly correlated with Sacral_slope.**

**<font color=#B40404>Before jumping into the test let us set our Null Hypothesis and Alternate Hypothesis.**

**Ho is μ(Normal) = μ(Abnormal)**
    
**Ha is μ(Normal) ≠ μ(Abnormal)**

In [None]:
# Creating a function which will return required outputs after performing welch_ttest
def welch_ttest(x, y):
    """Run Welch's unequal-variance t-test on two samples and print the result.

    Parameters
    ----------
    x, y : array-like of numbers
        The two independent samples (pandas Series, NumPy arrays or lists).

    Returns
    -------
    tuple
        ``(t, p, dof)``: the t statistic, the two-sided p-value and the
        Welch-Satterthwaite degrees of freedom. The same three values are
        also printed.
    """
    # Work on float arrays so the variance below is computed the same way
    # regardless of the input container.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    ## Welch-Satterthwaite Degrees of Freedom ##
    # ddof=1 is passed explicitly: the formula needs the *sample* variance,
    # and ndarray.var() defaults to the population variance (ddof=0).
    se2_x = x.var(ddof=1) / x.size
    se2_y = y.var(ddof=1) / y.size
    dof = (se2_x + se2_y)**2 / (se2_x**2 / (x.size-1) + se2_y**2 / (y.size-1))

    # Welch's Test
    t, p = ttest_ind(x, y, equal_var = False)

    # Displaying the results found
    print("\n",
          f"Welch's t-test= {t:.4f}", "\n",
          f"p-value = {p:.4f}", "\n",
          f"Welch-Satterthwaite Degrees of Freedom= {dof:.4f}")

    return t, p, dof

In [None]:
# Performing Welch's t test on normal and abnormal groups for all independent variables
# Every column except the last one (the "class" target) is tested in turn
for col in df_master.columns[:-1]:
    # Splitting the attribute into two groups by the dependent variable:
    # rows labelled "Normal" versus every other row
    group1 = df_master[col][df_master["class"]=="Normal"]
    group2 = df_master[col][df_master["class"]!="Normal"]

    # Printing newline for cosmetic purposes
    print("\n", col)

    # Shapiro-Wilk's null hypothesis is that the sample IS normally distributed,
    # so p < 0.05 means normality is rejected. The t-test is therefore run only
    # when normality is NOT rejected (p >= 0.05) for both groups.
    if (shapiro(group1)[1]>=0.05) and (shapiro(group2)[1]>=0.05):
        welch_ttest(group1, group2)
    else:
        print("\n At least one of the classes (Normal and Abnormal) is not normally distributed")

**<font color=#B40404>We can see that for Pelvic_incidence, Lumbar_lordosis_angle and Degree_spondylolisthesis we should reject our Null Hypothesis, and hence we can safely assume that the means of these attributes, when separated by the Normal and Abnormal labels of the dependent variable class, are different.**
    
**<font color=#B40404>Thus these attributes will do a good job of classifying whether a Class is Normal or Abnormal.**

**<font color=#B40404>Now let us apply Tukey's HSD test to compare the means between all labels of Class: Normal vs Abnormal.**
    
**<font color=#B40404>Ho is μ(Normal) = μ(Abnormal)**
**<font color=#B40404>Ha is μ(Normal) ≠ μ(Abnormal)**

**<font color=#B40404>Confidence of 95% or alpha as 0.05**

In [None]:
# Tukey's HSD test between the class groups, run once per independent variable
# (all columns except the trailing "class" target)
feature_names = df_master.columns[:-1]
for feature in feature_names:
    # Header line with surrounding blank lines, purely cosmetic
    print("\n", feature, "\n")
    # Compute and display the Tukey HSD result table at alpha = 0.05
    tukey_result = pairwise_tukeyhsd(df_master[feature], df_master["class"], alpha=0.05)
    print(tukey_result)

**<font color=#B40404>Let's look at a pairplot first, as it covers a multivariate analysis where the diagonal is a univariate analysis and the rest is a bivariate analysis.**

In [None]:
# Displaying a multivariate analysis
# Grid of pairwise plots over the dataset's columns, with points coloured by
# the "class" label so the two groups can be compared visually.
sns.pairplot(data=df_master, hue="class", palette="bright")

**<font color=#B40404>Now let's visualise if we can separate Normal and Abnormal through bivariate analysis.**

In [None]:
# Split the master frame into the "Normal" rows and all remaining rows,
# then print the column summary of each subset
is_normal = df_master["class"] == "Normal"
df_normal = df_master[is_normal]
df_abnormal = df_master[~is_normal]

print("Normal")
df_normal.info()
print()
print('Abnormal')
df_abnormal.info()

In [None]:
# One box-plot panel per class, to compare the attribute distributions side by side
class_boxes = sns.catplot(data=df_master, col="class", kind="box")
class_boxes.set_xticklabels(rotation=45)
# Fixed y-range applied to the panels (catplot shares the y-axis by default)
class_boxes.set(ylim=(-50, 175))

plt.show()

In [None]:
# Scatter plots for every unordered pair of attributes, one panel per class.
# Each attribute is paired only with the attributes that come after it, so every
# combination is drawn exactly once.
feature_cols = df_master.columns[:-1]
for position, y_feature in enumerate(feature_cols):
    for x_feature in feature_cols[position + 1:]:
        pair_grid = sns.FacetGrid(df_master, col="class")
        pair_grid.map(sns.scatterplot, x_feature, y_feature)
        plt.show()

# **Data pre-processing**

In [None]:
# Separating predictor variables: every column except the last one ("class")
X = df_master.iloc[:, :-1]

# Separating the target variable, stored in the "class" column
y = df_master["class"]

# Min-max scaling each attribute to [0, 1] so all attributes carry equal weight
# in the distance computation.
# The per-column min/max are requested explicitly with axis=0: np.min(X) / np.max(X)
# on a DataFrame forward axis=None, which on pandas >= 2.0 reduces over the whole
# frame and would scale every column by one global minimum and range.
# NOTE(review): these statistics are computed on the full dataset before the
# train/test split, so test rows influence the scaling - consider fitting on
# the training rows only.
col_min = X.min(axis=0)
col_range = X.max(axis=0) - col_min
X = (X - col_min) / col_range
X.describe()

In [None]:
# Map the string class labels to integer codes
le = preprocessing.LabelEncoder()
le.fit(y)
y = le.transform(y)

# Show the learned classes; the position of a label in this array is its integer code
le.classes_

In [None]:
# 80/20 train-test split, stratified on the target so both parts keep the class balance
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Print the class proportions of each part to verify they match the original dataset
for heading, labels in (("y_train proportions", y_train),
                        ("y_test proportions", y_test)):
    class_counts = np.unique(labels, return_counts=True)[1]
    print(heading)
    print(class_counts/len(labels))

**<font color=#B40404>Train and Test set is ready**

# **Grid Search**

In [None]:
# Number of folds for k-fold cross-validation
n_folds = 5

# Hyperparameter grid: Minkowski power p (1 or 2) and neighbour count (3 to 50)
parameters = dict(p=[1, 2], n_neighbors=range(3, 51))

# Base estimator whose hyperparameters are searched
knn = KNeighborsClassifier(metric="minkowski")

# Cross-validated search over the grid, fitted on the training data.
# Candidates are ranked by precision.
knn_grid = GridSearchCV(estimator=knn,
                        param_grid=parameters,
                        scoring='precision',
                        cv=n_folds)
knn_grid.fit(X_train, y_train)

# Cross-validation results of the grid search as a DataFrame
scores = pd.DataFrame(knn_grid.cv_results_)
scores

In [None]:
# printing the best cross-validated score and the hyperparameters that achieved it.
# The grid search was configured with scoring='precision', so best_score_ is a
# precision value, not an accuracy.
print('We can get precision of',knn_grid.best_score_,'using',knn_grid.best_params_)

# **Final Model**

In [None]:
# Create a function which will plot the ROC curve when false positive and true positive rates are fed to it
def plot_roc_curve(fpr, tpr):
    """Plot a ROC curve together with the diagonal reference line and show it.

    Parameters
    ----------
    fpr : sequence of float
        False positive rates, plotted on the x-axis.
    tpr : sequence of float
        True positive rates, plotted on the y-axis.
    """
    diagonal = [0, 1]
    # The curve itself, then the dashed diagonal from (0, 0) to (1, 1)
    plt.plot(fpr, tpr, label='ROC', color='orange')
    plt.plot(diagonal, diagonal, linestyle='--', color='darkblue')
    # Title and axis labels
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()

In [None]:
# Final KNN model, fitted on the training data.
# NOTE(review): n_neighbors=4 and p=1 are hard-coded - presumably copied from the
# grid-search result; confirm they still match knn_grid.best_params_.
classifier = KNeighborsClassifier(n_neighbors=4, p=1, metric="minkowski").fit(X_train, y_train)

# Predictions for the held-out test set
y_pred = classifier.predict(X_test)

# Confusion matrix for evaluation: rows are true labels, columns are predicted labels
cm_values = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm_values,
                     index=["T_Abnormal", "T_Normal"],
                     columns=["P_Abnormal", "P_Normal"])

df_cm

<font color=#B40404>**Let's evaluate our final model and see if we can improve the cutoff**

In [None]:
# Precision, recall and F1 score for the class in row/column 1 of the confusion matrix
true_positives = df_cm.iloc[1, 1]
predicted_positives = df_cm.iloc[:, 1].sum()   # column 1: everything predicted as class 1
actual_positives = df_cm.iloc[1, :].sum()      # row 1: everything that truly is class 1

precision = true_positives/predicted_positives
recall = true_positives/actual_positives
f_score = 2 * (precision * recall)/(precision+recall)

print("Precision: {}\nRecall: {}\nF Score: {}".format(precision, recall, f_score))

In [None]:
# Predicted probability of the positive class (column 1 of predict_proba)
positive_column = 1
y_prob = classifier.predict_proba(X_test)[:, positive_column]

# False positive rates, true positive rates and the matching thresholds
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_prob, pos_label=1)

# Draw the ROC curve
plot_roc_curve(fpr, tpr)

# Area under the ROC curve
roc_auc = metrics.auc(fpr, tpr)
print("Area under the curve: ", roc_auc)

In [None]:
# Calculating the area under the curve
print("Area under the curve: ", metrics.auc(fpr, tpr))

# Calculating the Precision, Recall and the threshold values.
# precision and recall hold one more entry than thresholds: the final
# (precision=1, recall=0) point has no associated threshold.
precision, recall, thresholds = precision_recall_curve(y_test, y_prob)

# Calculate the f score for all points. Where precision + recall == 0 the score
# is set to 0 instead of the NaN that 0/0 would give, because np.argmax treats
# NaN as the maximum and would select that point.
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum,
                   out=np.zeros_like(pr_sum, dtype=float), where=pr_sum > 0)

# Locate the index of the largest f score among the points that have a
# threshold, so thresholds[ix] is always a valid lookup
ix = np.argmax(fscore[:-1])

# Display the Best cutoff point based on the best f score
print('Best Threshold=%f, F-Score=%.3f' %(thresholds[ix], fscore[ix]))

# Plot the precision-recall curve for the model
# The no-skill baseline is the share of positive (label 1) samples in the test set
no_skill = len(y_test[y_test==1]) / len(y_test)
plt.plot([0,1], [no_skill,no_skill], linestyle='--', label='No Skill')
# The evaluated model is the KNN classifier, so the curve is labelled accordingly
plt.plot(recall, precision, marker='.', label='KNN')
plt.scatter(recall[ix], precision[ix], marker='o', color='black', label='Best')
# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
# show the plot
plt.show()

In [None]:
# Collect every metric array into one DataFrame and display it.
# Each array is wrapped in a Series so arrays of different lengths align on the
# index and are padded with NaN rather than raising.
metric_arrays = {'precision': precision,
                 'recall': recall,
                 'fscore': fscore,
                 'thresholds': thresholds}
df_scores_knn = pd.DataFrame({name: pd.Series(values) for name, values in metric_arrays.items()})
df_scores_knn

In [None]:
# Displaying the resulting confusion matrix at the best precision-to-recall cutoff

threshold = thresholds[ix]
# Predict class 1 wherever the positive-class probability reaches the cutoff
y_pred_at_threshold = (y_prob >= threshold).astype(int)
# Rows are true labels, columns are predicted labels. The labels use the same
# Abnormal / Normal naming as the confusion matrix built for the final model
# (the previous "Depositor" / "Asset" names did not belong to this dataset).
df_cm = pd.DataFrame(confusion_matrix(y_test, y_pred_at_threshold),
             index=["T_Abnormal", "T_Normal"], columns=["P_Abnormal", "P_Normal"])
df_cm