# IMPORTING NECESSARY LIBRARIES

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier

from sklearn.utils import resample
from scipy.stats import zscore
!pip install imblearn
from imblearn.over_sampling import SMOTE
from sklearn import metrics
from collections import Counter

from sklearn.metrics import r2_score, roc_auc_score, roc_curve, average_precision_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, plot_confusion_matrix

from sklearn.model_selection import GridSearchCV
import statsmodels.api as sm


# 1.  IMPORTING AND WAREHOUSING DATA

In [None]:
# Column labels used in place of the header row shipped in the CSV.
colnames = [
    'P_incidence', 'P_tilt', 'L_angle',
    'S_slope', 'P_radius', 'S_Degree',
    'Class',
]
# header=0 consumes the file's own header line; names=colnames supplies ours.
data1 = pd.read_csv(
    "/kaggle/input/biomechanical-features-of-orthopedic-patients/column_3C_weka.csv",
    header=0,
    names=colnames,
    index_col=False,
)
data1

# 2. DATA CLEANSING 

## A. Treating the Datatypes and correcting values wherever required:

In [None]:
# Summarise column dtypes and non-null counts before any type treatment.
data1.info()

In [None]:
# List the distinct labels currently stored in the target column.
print(data1['Class'].unique())

In [None]:
def classifier(x):
    """Encode a class label as an integer code.

    'Normal' -> 0, 'Hernia' -> 1, any other string label -> 2.
    Values that are already one of the codes 0, 1 or 2 are returned
    unchanged, so re-running the encoding cell on an already-encoded
    column does not collapse every row into class 2.
    Any other value falls back to 2, as before.
    """
    if isinstance(x, str):
        if x == 'Normal':
            return 0
        if x == 'Hernia':
            return 1
        return 2
    # Already-encoded value: pass it through so the mapping is idempotent.
    if x in (0, 1, 2):
        return int(x)
    return 2
# Overwrite the 'Class' column with the integer codes returned by classifier.
data1['Class'] = data1['Class'].apply(classifier)

In [None]:
# Re-check the distinct target values after the encoding step.
print(data1['Class'].unique())

## B. Treating outliers within Dataset and replacing them with appropriate values:

In [None]:
# Box plots of the six feature columns, used to spot outliers before capping.
data1.boxplot(
    figsize=(15, 5),
    column=['P_incidence', 'P_tilt', 'L_angle', 'S_slope', 'P_radius', 'S_Degree'],
)

In [None]:
# Show the 4th and 96th percentile of every column (the capping thresholds).
for q in (0.04, 0.96):
    print(data1.quantile(q))

In [None]:
# Capping plan: (column, lower quantile or None, upper quantile).
# Columns with a lower quantile are floored first, then capped from above;
# each quantile is computed from the column's values at that moment.
capping_plan = [
    ("P_incidence", None, 0.96),
    ("P_tilt", 0.04, 0.96),
    ("L_angle", None, 0.96),
    ("S_slope", None, 0.96),
    ("P_radius", 0.04, 0.96),
    ("S_Degree", None, 0.96),
]
for col, lo_q, hi_q in capping_plan:
    if lo_q is not None:
        floor = data1[col].quantile(lo_q)
        data1[col] = np.where(data1[col] < floor, floor, data1[col])
    ceiling = data1[col].quantile(hi_q)
    data1[col] = np.where(data1[col] > ceiling, ceiling, data1[col])

# Re-draw the box plots to confirm the effect of the capping.
data1.boxplot(
    figsize=(15, 5),
    column=['P_incidence', 'P_tilt', 'L_angle', 'S_slope', 'P_radius', 'S_Degree'],
)

# 3. DATA ANALYSIS AND VISUALISATION:

In [None]:
# Count of rows per encoded class, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(20, 6))
ax.set_title('Class split', color='red')
sns.countplot(data=data1, x='Class', ax=ax)

## A.Performing detailed statistical analysis on the data

Applying stats model to find p values

In [None]:
# Predictors: the first six columns of data1; target: the encoded 'Class' column.
x = data1.iloc[:,:6]
y = data1['Class']

In [None]:
# Fit an ordinary least squares model of Class on the six features (plus an
# intercept column added by add_constant) and print its summary table.
# NOTE(review): y holds the integer class codes, so OLS treats them as a
# numeric response; the p-values are a rough screening aid only.
x2 = sm.add_constant(x)
est = sm.OLS(y, x2)
est2 = est.fit()
print(est2.summary())

--> We find that P_radius and S_Degree emerge as the most significant parameters for predicting Class.

Finding Pearsons CorrelationCoefficients

In [None]:
# Pairwise correlation matrix of all columns in data1 (pandas default method).
cor =data1.corr()
cor

In [None]:
# Annotated heat map of the correlation matrix computed above.
f, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(cor, annot=True, cmap='cool', ax=ax)
plt.show()

-->From Heat Map we find that "S-Degree", "L - Angle" and "P-Incidence" have high correlation coefficients. 

Applying Pair Plots for Significant Variables to see whether the variables make the class apart 

## B.Multivariate, Bivariate and Univariate analysis

In [None]:
# Pairwise scatter/density grid of four selected features, coloured by class.
sns.pairplot(
    data=data1,
    hue='Class',
    vars=['P_incidence', 'L_angle', 'P_radius', 'S_Degree'],
    palette='bright',
)

The selected significant variables do separate the classes, at least for the Class 2 values, and the same is evident from the relation plots below.

In [None]:
# One scatter panel per class (col='Class') of S_Degree against P_radius.
# No `palette` is passed: without a `hue` variable seaborn ignores it.
sns.relplot(x="P_radius", y="S_Degree", col='Class', data=data1)

For the Normal class, P_radius lies between 120 and 135 and S_Degree lies below 10.

In [None]:
# One scatter panel per class (col='Class') of P_incidence against S_Degree.
# The former palette='warm' is dropped: it is not a valid palette name and,
# with no `hue` variable, seaborn ignores the argument anyway.
sns.relplot(x="S_Degree", y="P_incidence", col='Class', data=data1)

Normal Class Range for P incidence lies within 45 - 60 whereas the values outside this range falls under abnormal class

In [None]:
# One scatter panel per class (col='Class') of S_Degree against L_angle.
# The former palette='warm' is dropped: it is not a valid palette name and,
# with no `hue` variable, seaborn ignores the argument anyway.
sns.relplot(x="L_angle", y="S_Degree", col='Class', data=data1)

Normal Class Range for L angle lies within 25 - 50 whereas the values outside this range falls under abnormal class

In [None]:
# One histogram (24 bins, with KDE overlay) per feature, side by side.
hist_cols = ['P_incidence', 'P_tilt', 'L_angle', 'S_slope', 'P_radius', 'S_Degree']
fig, ax = plt.subplots(1, 6, figsize=(12, 4))
for panel, col_name in zip(ax, hist_cols):
    sns.histplot(data1[col_name], bins=24, kde=True, ax=panel)
    panel.set_title("DIST OF " + col_name)

plt.tight_layout()

In [None]:
# Per-column skewness, to complement the histograms above.
data1.skew()

The values of P incidence, P radius and s slope are normally distributed.
Almost all the values are multimodal.

# 4. DATA PRE - PROCESSING:

## A. Splitting the Predicting and Target variables with normalising the data. 

In [None]:
# Predictors are the first six columns; the target is the encoded class.
x = data1.iloc[:, :6]
y = data1['Class']

# Split BEFORE standardising so that the test rows do not influence the
# scaling statistics (standardising the full dataset first leaks test-set
# information into training). The row partition depends only on the number
# of rows and random_state, so it is the same partition as before.
xtrain_raw, xtest_raw, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=20)

# z-score parameters estimated on the training rows only.
# ddof=0 matches the population standard deviation used by scipy.stats.zscore.
train_mean = xtrain_raw.mean()
train_std = xtrain_raw.std(ddof=0)

xztrain = (xtrain_raw - train_mean) / train_std
xztest = (xtest_raw - train_mean) / train_std
# Full feature table on the same (train-derived) scale, kept under its old name.
xz = (x - train_mean) / train_std

In [None]:
# Number of training rows per class, to check the class balance.
counter = Counter (ytrain)
print(counter)

We find that class 2 is the majority class and the other two classes are minority classes. They will be balanced by oversampling with the SMOTE technique, since we do not want to lose target information by downsizing the majority class.

## B. Target Balancing and Train - Test Split of data. 

In [None]:
# Oversample the training split only (the test split is left untouched),
# then print the resampled shape and the per-class counts.
smote = SMOTE(random_state = 20)
xtrain1, ytrain1 = smote.fit_resample(xztrain, ytrain)
print(xtrain1.shape)
counter = Counter (ytrain1)
print (counter)

# 5. MODEL TRAINING, TESTING AND TUNING:

## A. Designing and training a KNN Classifier - K = 10 (sqrt(105))

In [None]:
# Baseline KNN with 10 neighbours, fitted on the SMOTE-resampled training data.
modelkn = KNeighborsClassifier(n_neighbors = 10)
modelkn.fit(xtrain1, ytrain1)

## B.Displaying the Accuracies for Train and Test Data

In [None]:
# Mean accuracy of the baseline model on the resampled training data and on
# the untouched test split.
print("The accuracy for train data is:", modelkn.score(xtrain1, ytrain1))
print("The accuracy for test data is:", modelkn.score(xztest, ytest))

## C. Displaying and explaining the Classification Report:

In [None]:
# Test-set predictions of the baseline model, followed by the per-class
# report, the raw confusion matrix, a labelled cross-tab and a plotted matrix.
ypred = modelkn.predict(xztest)
print("CLASSIFICATION REPORT: \n",classification_report(ytest,ypred))
print("CONFUSION MATRIX: \n",confusion_matrix(ytest,ypred))
print("CROSS TAB: \n", pd.crosstab(ytest, ypred, rownames=['True'], colnames=['Predicted'], margins=True))
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; ConfusionMatrixDisplay.from_estimator is its replacement.
plot_confusion_matrix(modelkn,xztest,ytest)

1. The model misclassifies around 20 entries; against an overall 217 entries this works out to about 10%. Note that these errors are counted on the test split only, so the error rate relative to the test set is higher.

2. The Recall values for classes 1 & 2 are above 80%, whereas the recall for class 0 is about 61%, which means that the Model is not biased on the majority class after balancing the dataset.

3. The model accuracy on the testing Dataset is above 70%.

## D. Automating the Task of finding the best K values:

In [None]:
# Sweep k = 1..49, recording train/test accuracy for every k and collecting
# the k values whose training accuracy exceeds 0.85.
mylist = np.arange(1, 50)
trsco = []   # training accuracy per k
tesco = []   # test accuracy per k
bestk = []   # [k, train accuracy, test accuracy] for k with train accuracy > 0.85
for k in mylist:
    # A dedicated name keeps the baseline classifier `modelkn` fitted earlier
    # intact, so cells that use it give the same result if re-run.
    knn_k = KNeighborsClassifier(n_neighbors=k)
    knn_k.fit(xtrain1, ytrain1)
    trscores = metrics.accuracy_score(ytrain1, knn_k.predict(xtrain1))
    tescores = metrics.accuracy_score(ytest, knn_k.predict(xztest))
    trsco.append(trscores)
    tesco.append(tescores)
    if trscores > 0.85:
        bestk.append([k, trscores, tescores])

# A comprehension avoids rebinding the notebook-level `x` and `y`
# (feature table and target) to loop scalars.
optk = [k_value for k_value, _, _ in bestk]
print("K values giving training scores more than 85% are:", optk)

The optimal k values for the accuracy of above 85% on the training data set are listed from the above code.

In [None]:
# Train and test accuracy as a function of k, from the sweep above.
plt.plot(mylist,trsco,'-o', label = "Train")
plt.plot(mylist,tesco,'-o', label = "Test")
plt.legend()
plt.show()

The above plot shows that the training and test scores converge to the same point, and the following observations are made:

1. The training accuracy scores continue to drop from K value of 1 and converge towards testing data scores.

2. The testing accuracy scores initially elevate to a level where it stabilises and then drops significantly.

3. We select the k values in this range where it stabilises for testing data and tune our model for better results.

## E. Tuning the Parameters for best recall values:

In [None]:
import warnings  # retained from the original cell; no blanket filter is installed

# Candidate hyper-parameters for the KNN classifier.
grid_params = {
    'n_neighbors': [5, 6, 7],
    'weights': ['uniform', 'distance'],
    'leaf_size': list(range(1, 20)),
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'metric': ['euclidean', 'manhattan'],
}

# scoring='recall' is defined for binary targets only; on this three-class
# target it cannot be computed and every candidate scores non-finite, so the
# search cannot rank anything. 'recall_macro' averages recall over the classes.
gs = GridSearchCV(KNeighborsClassifier(), grid_params, scoring='recall_macro', verbose=1, cv=3, n_jobs=-1)

# Tune on the resampled training data; fitting the search on the test split
# would use the held-out rows for model selection and bias the final scores.
gs_results = gs.fit(xtrain1, ytrain1)
print(gs_results.best_estimator_)
print(gs_results.best_params_)

Warnings are ignored here because the scores turn out to be non-finite for some parameter combinations.

In [None]:
# Tuned KNN configuration, fitted on the SMOTE-resampled training data.
modelkn1 = KNeighborsClassifier(
    n_neighbors=7,
    weights='uniform',
    algorithm='ball_tree',
    leaf_size=1,
    metric='euclidean',
)
modelkn1.fit(xtrain1, ytrain1)

Selecting the model with best parameters as above

In [None]:
# Mean accuracy of the tuned model on the resampled training data.
print(modelkn1.score(xtrain1, ytrain1))

The Training scores have improved from 86% to above 90%.

In [None]:
# Test-set predictions of the tuned model and their accuracy.
ypredte = modelkn1.predict(xztest)
tescores = accuracy_score(ypredte, ytest)
tescores

The test data accuracy scores have improved from 76% to above 83%.

## ROC AUC FOR PREDICTING THE ABNORMALITIES:

In [None]:
# Binary "abnormal vs normal" ROC curve built from the three-class model.
# Class probabilities are computed once; columns 1 and 2 are taken as the two
# abnormal classes (presumably matching codes 1 and 2 via modelkn1.classes_).
class_proba = modelkn1.predict_proba(xztest)
yproba1 = class_proba[:, 1]
yproba2 = class_proba[:, 2]
yproba12 = yproba1 + yproba2   # score for "any abnormal class"

# Collapse the target to binary: code 2 becomes 1, codes 0 and 1 are kept.
# A comprehension avoids rebinding the notebook-level `x`.
ytestnew = [1 if label == 2 else label for label in ytest]

fpr, tpr, thresholds = roc_curve(ytestnew, yproba12)
plt.plot([0,1],[0,1])              # chance diagonal
plt.plot(fpr,tpr, label='Knn')
plt.xlabel('fpr')
plt.ylabel('tpr')
# Title reports the model's actual neighbour count instead of a stale constant.
plt.title('Knn(n_neighbors=%d) ROC curve' % modelkn1.n_neighbors)
plt.show()

print("AREA UNDER THE CURVE IS:",roc_auc_score(ytestnew,yproba12))

In [None]:
# Per-class report, raw confusion matrix, labelled cross-tab and plotted
# matrix for the tuned model's test-set predictions.
print("CLASSIFICATION REPORT: \n",classification_report(ytest,ypredte))
print("CONFUSION MATRIX: \n",confusion_matrix(ytest,ypredte))
print("CROSS TAB: \n", pd.crosstab(ytest, ypredte, rownames=['True'], colnames=['Predicted'], margins=True))
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; ConfusionMatrixDisplay.from_estimator is its replacement.
plot_confusion_matrix(modelkn1,xztest,ytest,cmap = 'cool')

# 6. CONCLUSION AND IMPROVISATION:

## A. CONCLUSION:

The number of errors in the above model is less than 20 on an overall dataset of 217 entries, which works out to less than 10%, compared with above 10% before tuning.

The recall value for class 2 is about 89%, compared with 84% in the earlier model.

The model accuracy on the testing Dataset is about 82%

Hence we conclude that modelkn1, obtained after parameter tuning, is the best model for predicting abnormalities from the biomechanical features, i.e. for classifying against Hernia and Spondylolisthesis.

## B. IMPROVISATION:

1. Data Collection should have tried to achieve the target balancing initially itself.