In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plot
from scipy.stats import norm

from sklearn.preprocessing import StandardScaler as Zscore
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split as DataSplit
from sklearn.multiclass import OneVsRestClassifier

from sklearn.linear_model import LogisticRegression as LR
from sklearn.svm import SVC as SVM
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay

from keras.utils import np_utils
from scipy import stats


import warnings
warnings.filterwarnings("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Data load

In [None]:
# 2 classes
# Load the two-class CSV (column_2C_weka.csv) from the Kaggle input directory
# into `data`, then preview its first rows.
data = pd.read_csv('../input/biomechanical-features-of-orthopedic-patients/column_2C_weka.csv')
data.head()

In [None]:
# (number of rows, number of columns) of the raw table.
data.shape

In [None]:
# dtype of every column of the raw table.
data.dtypes

In [None]:
# Number of rows per label in the 'class' column, to check the class balance.
data["class"].value_counts() # Imbalanced dataset

# Data Encoding

In [None]:
# One-hot encode the string labels held in data['class'].
# LabelEncoder assigns integer codes following the sorted order of the distinct
# labels and exposes that order as `encoder.classes_`; the one-hot columns are
# named from it so the column names can never disagree with the codes.
encoder = LabelEncoder()
Y = data['class']
encoded_Y = encoder.fit_transform(Y)

# Row i of the identity matrix is the one-hot vector for code i.  float32 is the
# dtype keras' to_categorical produced; building it with numpy avoids relying on
# `keras.utils.np_utils`, which is no longer available in recent Keras releases.
one_hot = np.eye(len(encoder.classes_), dtype='float32')[encoded_Y]
y = pd.DataFrame(data=one_hot, columns=list(encoder.classes_))

# Preview the first and the last three encoded rows side by side.
y1 = y.head(3)
y2 = y.tail(3)
conc_y = pd.concat([y1, y2], axis=0, ignore_index=True)
conc_y

# Data split

In [None]:
# Hold out 33 % of the rows for testing.
# X still contains the string 'class' column at this point; it is dropped from
# X_train / X_test in a later cell.
# random_state pins the shuffle so that "Restart & Run All" reproduces the same
# split, and stratify keeps the label ratio of this imbalanced dataset the same
# in the train and test parts.
X = data
X_train, X_test, y_train, y_test = DataSplit(
    X, y, test_size=0.33, random_state=42, stratify=data['class']
)

# Re-attach the one-hot targets to each partition (rows are aligned on the index).
Trainset = pd.concat([X_train, y_train], axis=1)
Testset = pd.concat([X_test, y_test], axis=1)

In [None]:
# `human`: the training rows with the string 'class' label but without the
# one-hot target columns; this is the frame explored in the analysis section.
human = Trainset.drop(columns=['Abnormal', 'Normal'])

# The model inputs must not contain the label itself.
X_train = X_train.drop(columns=['class'])
X_test = X_test.drop(columns=['class'])

In [None]:
# First ten rows of the training frame used for the analysis.
human.head(10)

In [None]:
# Write the test features (the 'class' column was dropped above) to testset.csv
# in the current directory without the index, then preview the first ten rows.
X_test.to_csv('testset.csv',index=False)
X_test.head(10)

# 2. Data analysis
This section analyzes the training set only. The test set must remain unseen data.

In [None]:
# Summary statistics of the pelvic_incidence column in the training frame.
human["pelvic_incidence"].describe()

In [None]:
# Imbalanced dataset: bar chart of the number of training rows per class,
# followed by the exact counts as the cell output.
sns.countplot(data=human, x="class")
human["class"].value_counts()

In [None]:
# Column names of the training frame.
human.columns

# (1) Boxplot (instead of Histogram)

In [None]:
# Box plots of the first six columns of the training frame, split by class.
# The frame is melted to long format (one row per column/value pair) so that
# every column gets its own group of boxes on a shared axis.
plot.figure(figsize=(10, 10))

feature_names = list(human.columns[0:6])
melted_human = pd.melt(human, id_vars="class", value_vars=feature_names)
sns.boxplot(data=melted_human, x="variable", y="value", hue="class")
plot.grid()
plot.xticks(rotation=45)
plot.show()

# (2) Pair plot

In [None]:
# Pairwise relationships between the columns of the training frame, coloured by
# class: KDE contours below the diagonal, scatter plots above it, and per-class
# KDE curves on the diagonal.
sns.set(style="white")

# The frame is passed exactly once (positionally).  Passing it again as
# data=human raised "TypeError: got multiple values for argument 'data'".
g = sns.PairGrid(human, diag_sharey=False, hue="class")

# The misspelled `camp=` keyword is dropped: it never reached the colormap
# setting, and the contour colours come from the hue mapping here.
g.map_lower(sns.kdeplot)
g.map_upper(plot.scatter)
g.map_diag(sns.kdeplot, lw=3)

# add_legend() takes its entries from the hue levels actually plotted, so the
# labels cannot be misspelled or listed in the wrong order.
g.add_legend()

# Save only after the legend exists, otherwise the PNG is written without it.
g.savefig('Pairplot.png')
plot.show()

# (3) Pearson's correlation

In [None]:
# Pearson correlation between the numeric columns of the training frame.
# `human` also carries the string 'class' column; DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0 (older versions dropped them silently),
# so the numeric columns are selected explicitly.
numeric_features = human.select_dtypes(include='number')

f, ax = plot.subplots(figsize = (5,5))
sns.heatmap(numeric_features.corr(), annot = True, linewidth = 0.5, fmt = ".1f", ax = ax)
plot.xticks(rotation = 90)
plot.title('correlation map')
plot.show()

# (4) Spearman's rank correlation

In [None]:
# Spearman's rank correlation = Pearson correlation computed on the ranks.
# Only the numeric columns are ranked: ranking the string 'class' column as well
# would turn the label into numbers and add it to the heatmap as if it were a
# feature, which is inconsistent with the Pearson map of the same frame.
ranked_data = human.select_dtypes(include='number').rank()

f, ax = plot.subplots(figsize = (5,5))
sns.heatmap(ranked_data.corr(), annot = True, linewidth = 0.5, fmt = ".1f", ax = ax)
plot.xticks(rotation = 90)
plot.yticks(rotation = 1)
plot.title('correlation Map')  # typo in the displayed title fixed ("corrleation")
plot.savefig('heatmap_Spearman.png')
plot.show()

# (5) Missing data

In [None]:
# Missing-value audit of the training frame: number of nulls per column and the
# share of rows they represent, largest first.
# Note: the 'Percent' column holds a fraction (nulls / rows), not a value
# multiplied by 100.
null_mask = human.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)

missing_data = pd.concat({'Total': total, 'Percent': percent}, axis=1)
missing_data.head()

# 3. Train the models

In [None]:
# Standardise the features (z-score).
# The scaler is fitted on the training rows only, and the same mean / standard
# deviation are then applied to the test rows.  Fitting a second scaler on the
# test set would put the two sets on different scales and would use statistics
# of data that is supposed to stay unseen.
scaler = Zscore().fit(X_train)
X_train_d = scaler.transform(X_train)
X_test_d = scaler.transform(X_test)
    
def train_and_test(model):
    """Fit `model` one-vs-rest on the scaled training data and score it on the test data.

    Reads the notebook-level variables X_train_d / y_train (for fitting) and
    X_test_d / y_test (for evaluation).

    Parameters
    ----------
    model : estimator
        Unfitted classifier; it is wrapped in OneVsRestClassifier before fitting.

    Returns
    -------
    tuple
        (accuracy, prediction, model): the accuracy on the test data in percent
        rounded to two decimals, the predictions for X_test_d, and the fitted
        OneVsRestClassifier.
    """
    model = OneVsRestClassifier(model).fit(X_train_d, y_train)
    prediction = model.predict(X_test_d)
    # accuracy_score is documented as (y_true, y_pred); keep that order.
    accuracy = round(accuracy_score(y_test, prediction) * 100, 2)
    # The score is measured on the held-out test rows, so it is labelled as
    # test accuracy rather than training accuracy.
    print("Model:", model, " Test_accuracy: ", accuracy, "%")
    return accuracy, prediction, model


# Fit and score every candidate classifier with the same helper.
# train_and_test returns (accuracy, prediction, fitted_model); the *_pred names
# below keep holding that whole tuple, exactly as before.

# Logistic regression
LR_pred = train_and_test(LR())

# SVM
SVM_acc, SVM_pred, SVM_model = train_and_test(SVM(kernel='rbf'))

# KNN
KNN_pred = train_and_test(KNN(n_neighbors = 10))

# Random forest (seeded so repeated runs grow the same forest)
RF_pred = train_and_test(RF(n_estimators=200, random_state=42))

# Naive Bayes
NB_pred = train_and_test(NB())

# Report the best model from the measured accuracies instead of asserting a
# fixed winner, which may not hold for a given train/test split.
accuracies = {
    "Logistic regression": LR_pred[0],
    "SVM": SVM_acc,
    "KNN": KNN_pred[0],
    "Random forest": RF_pred[0],
    "Naive Bayes": NB_pred[0],
}
best_model = max(accuracies, key=accuracies.get)
print(best_model, "shows the highest accuracy in this example:", accuracies[best_model], "%")

In [None]:
# Collapse the one-hot rows to integer class codes by taking the position of
# the largest entry in each row, for the true labels and the SVM predictions.
# NOTE(review): if a predicted row has no 1 at all, argmax falls back to
# column 0 -- confirm whether that can occur with the one-vs-rest predictions.
y_test_int = y_test.to_numpy().argmax(axis=1)
SVM_pred_int = np.asarray(SVM_pred).argmax(axis=1)

# Number of test rows per one-hot combination.
y_test.value_counts()

In [None]:
# Confusion matrix of the SVM predictions against the true test labels (both as
# integer class codes), drawn as raw counts with a blue colour scale.
cm = confusion_matrix(y_true=y_test_int, y_pred=SVM_pred_int)
disp = ConfusionMatrixDisplay(cm)
disp.plot(cmap='Blues')
plot.title("Confusion matrix (number)")