The data have been organized in two different but related classification tasks.

    column_3C_weka.csv (file with three class labels)
        The first task consists in classifying patients as belonging to one out of three categories: Normal (100 patients), Disk Hernia (60 patients) or Spondylolisthesis (150 patients). 

    column_2C_weka.csv (file with two class labels)
        For the second task, the categories Disk Hernia and Spondylolisthesis were merged into a single category labelled as 'abnormal'. Thus, the second task consists in classifying patients as belonging to one out of two categories: Normal (100 patients) or Abnormal (210 patients).

Content

Field Descriptions:

Each patient is represented in the data set by six biomechanical attributes derived from the shape and orientation of the pelvis and lumbar spine (each one is a column):

    pelvic incidence
    pelvic tilt
    lumbar lordosis angle
    sacral slope
    pelvic radius
    grade of spondylolisthesis


In [None]:
import numpy as np 
import pandas as pd
import sklearn
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt

# grid search k value for SMOTE oversampling for imbalanced classification
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [None]:
# Read the three-class variant of the data set and preview its first ten rows.
maindf1 = pd.read_csv(
    "../input/biomechanical-features-of-orthopedic-patients/column_3C_weka.csv"
)
maindf1.head(10)

In [None]:
# Print the column names, dtypes and non-null counts of the raw frame.
maindf1.info()

In [None]:
# Count how many rows carry each distinct value of the 'class' column.
maindf1['class'].value_counts()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
maindf1.describe()

In [None]:
# Remove the 'class' column from maindf1 in place and keep it as the target y;
# from here on maindf1 holds only the remaining (feature) columns.
y=maindf1.pop('class')

In [None]:
# For every feature column draw a box plot stacked on top of a histogram + KDE,
# sharing the x axis, so outliers and the overall shape can be read together.
import warnings

# Kept from the original cell: silences FutureWarning for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

for column in maindf1.columns:
    fig, (ax_box, ax_dist) = plt.subplots(
        2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)}
    )
    # Pass the values as `x=` explicitly: newer seaborn treats the first
    # positional argument as `data`, which changes the orientation of the box
    # and breaks the shared x axis.
    sns.boxplot(x=maindf1[column], ax=ax_box)
    # `distplot` is deprecated; `histplot(kde=True, stat="density")` is the
    # documented replacement (density-normalised histogram with a KDE curve).
    sns.histplot(x=maindf1[column], color="g", kde=True, stat="density", ax=ax_dist)
    plt.show()


If a feature (column) contains outliers, min-max normalisation squeezes most of the data into a small interval: all features end up on the same scale, but the outliers are not handled well. Standardisation is more robust to outliers and, in many cases, is preferable to min-max normalisation.

In [None]:
# Standardise every feature column with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on the full data set, i.e. before the
# train/test split made later in this notebook, so test rows influence the
# scaling statistics - confirm this is acceptable for the reported scores.
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
X_scaled = scaler.fit_transform(maindf1)


In [None]:
# Inspect one scaled row and the array shape, then wrap the scaled array in a
# DataFrame that reuses the original feature names and a 0..n-1 integer index.
banner = '*' * 80

print(X_scaled[1])
print(X_scaled.shape)

df = pd.DataFrame(
    X_scaled,
    index=list(range(X_scaled.shape[0])),
    columns=list(maindf1.columns),
)

print(banner)
print(df.head(5))
print(banner)
df.describe()


In [None]:
# Same box plot + histogram/KDE view as for the raw data, now on the
# standardised frame `df`, to compare the feature distributions after scaling.
import warnings

# Kept from the original cell: silences FutureWarning for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

for column in df.columns:
    fig, (ax_box, ax_dist) = plt.subplots(
        2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)}
    )
    # Pass the values as `x=` explicitly: newer seaborn treats the first
    # positional argument as `data`, which changes the orientation of the box
    # and breaks the shared x axis.
    sns.boxplot(x=df[column], ax=ax_box)
    # `distplot` is deprecated; `histplot(kde=True, stat="density")` is the
    # documented replacement (density-normalised histogram with a KDE curve).
    sns.histplot(x=df[column], color="g", kde=True, stat="density", ax=ax_dist)
    plt.show()


In [None]:

# Hold out 33% of the rows for testing. The three classes are imbalanced
# (see the class counts printed earlier), so stratify on y to keep the class
# proportions the same in the train and test parts; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df, y, test_size=0.33, random_state=42, stratify=y)


Multiclass from sklearn
https://scikit-learn.org/stable/modules/multiclass.html

In [None]:

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import SGDClassifier

from sklearn.svm import LinearSVC
# One-vs-rest strategy: one binary classifier per class, for three different
# base estimators. Each line fits on the training part and predicts the test part.
# RBF kernel (scaled by a constant) shared by every Gaussian-process model below.
kernel = 1.0 * RBF(1.0)

pred1 = OneVsRestClassifier(
    LinearSVC(random_state=0)
).fit(X_train, y_train).predict(X_test)

pred2 = OneVsRestClassifier(
    GaussianProcessClassifier(kernel=kernel, random_state=0)
).fit(X_train, y_train).predict(X_test)

# SGD shuffles the training data; seed it like the other estimators so the
# reported accuracy is reproducible between runs.
pred3 = OneVsRestClassifier(
    SGDClassifier(loss="perceptron", random_state=0)
).fit(X_train, y_train).predict(X_test)

In [None]:

# Test-set accuracy of the three one-vs-rest models.
# accuracy_score is documented as (y_true, y_pred); accuracy itself is
# symmetric, but keeping the order right avoids wrong numbers if the metric
# is later swapped for an asymmetric one such as f1_score.
print("using SVM accuracy :",accuracy_score(y_test, pred1))
print("using Gaussian Process Classifier",accuracy_score(y_test, pred2))
print("using SGD",accuracy_score(y_test, pred3))

In [None]:
from sklearn.multiclass import OneVsOneClassifier
# One-vs-one strategy: one binary classifier per pair of classes, for the same
# three kinds of base estimator. Each line fits on the training part and
# predicts the test part. `kernel` is the RBF kernel defined in an earlier cell.
o_pred1 = OneVsOneClassifier(
    LinearSVC(random_state=2)
).fit(X_train, y_train).predict(X_test)

o_pred2 = OneVsOneClassifier(
    GaussianProcessClassifier(kernel=kernel, random_state=0)
).fit(X_train, y_train).predict(X_test)

# SGD shuffles the training data; seed it like the other estimators so the
# reported accuracy is reproducible between runs.
o_pred3 = OneVsOneClassifier(
    SGDClassifier(loss="hinge", random_state=0)
).fit(X_train, y_train).predict(X_test)


In [None]:
# Test-set accuracy of the three one-vs-one models.
# accuracy_score is documented as (y_true, y_pred); accuracy itself is
# symmetric, but keeping the order right avoids wrong numbers if the metric
# is later swapped for an asymmetric one such as f1_score.
print("using SVM accuracy :",accuracy_score(y_test, o_pred1))
print("using Gaussian Process Classifier",accuracy_score(y_test, o_pred2))
print("using SGD",accuracy_score(y_test, o_pred3))


In [None]:
from sklearn.multiclass import OutputCodeClassifier
# Error-correcting output-code strategy for the same three kinds of base
# estimator. Each line fits on the training part and predicts the test part.
# `kernel` is the RBF kernel defined in an earlier cell.
# OutputCodeClassifier draws its class codebook at random, so it is seeded
# (random_state=0) to make the reported accuracies reproducible between runs.
c_pred1 = OutputCodeClassifier(
    LinearSVC(random_state=2), random_state=0
).fit(X_train, y_train).predict(X_test)

c_pred2 = OutputCodeClassifier(
    GaussianProcessClassifier(kernel=kernel, random_state=0), random_state=0
).fit(X_train, y_train).predict(X_test)

# SGD also shuffles the training data, so the base estimator is seeded too.
c_pred3 = OutputCodeClassifier(
    SGDClassifier(loss="hinge", random_state=0), random_state=0
).fit(X_train, y_train).predict(X_test)

In [None]:
# Test-set accuracy of the three output-code models.
# accuracy_score is documented as (y_true, y_pred); accuracy itself is
# symmetric, but keeping the order right avoids wrong numbers if the metric
# is later swapped for an asymmetric one such as f1_score.
print("using SVM accuracy :",accuracy_score(y_test, c_pred1))
print("using Gaussian Process Classifier",accuracy_score(y_test, c_pred2))
print("using SGD",accuracy_score(y_test, c_pred3))

In [None]:
# Fit a one-vs-rest linear SVM again, this time keeping the fitted estimator
# in `model`, and show its predictions for the test rows.
model = OneVsRestClassifier(LinearSVC(random_state=0))
model.fit(X_train, y_train)
predictions = model.predict(X_test)
predictions

In [None]:
# Shape of the array returned by model.predict(X_test).
predictions.shape

In [None]:
# Put the predicted labels into a single-column frame named 'prediction',
# indexed 0..n-1.
n_predictions = predictions.shape[0]
prediction_data = pd.DataFrame(
    predictions,
    index=list(range(n_predictions)),
    columns=['prediction'],
)


In [None]:
# Tally how many test rows were assigned to each predicted label.
prediction_data.value_counts()
