In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Explore

In [None]:
# Load the orthopedic-patients CSV into a DataFrame.
# The path is relative to the notebook's working directory; adjust it when the
# data lives elsewhere.
df = pd.read_csv("../input/biomechanical-features-of-orthopedic-patients/column_2C_weka.csv")

In [None]:
# Preview the first rows to check that the file parsed as expected.
df.head()

In [None]:
# Summary statistics (count, mean, std, min/max, quartiles) of the numeric columns.
df.describe()

In [None]:
# Column dtypes and non-null counts; shows which columns are non-numeric.
df.info()

In [None]:
# Scatter plot of sacral slope against pelvic incidence.
plt.figure(figsize=(10, 10))
plt.xlabel("Pelvic Incidence")
plt.ylabel("Sacral Slope")
plt.scatter(df["pelvic_incidence"], df["sacral_slope"])
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression

# scikit-learn wants a 2-D feature matrix, so the single predictor is reshaped
# into one column; the response is reshaped the same way.
x = df["pelvic_incidence"].values.reshape(-1, 1)
y = df["sacral_slope"].values.reshape(-1, 1)

# Fit sacral_slope ~ pelvic_incidence and keep the fitted values for plotting.
linear_reg = LinearRegression()
y_head = linear_reg.fit(x, y).predict(x)

In [None]:
# Raw observations with the fitted regression line drawn on top in red.
plt.figure(figsize=(10, 10))
plt.scatter(x, y)
plt.plot(x, y_head, color="red")
plt.xlabel("Pelvic Incidence")
plt.ylabel("Sacral Slope")
plt.show()

In [None]:
# Grid of pairwise relationships between the columns of df.
sns.pairplot(df)

In [None]:
# Pairwise correlation between the numeric features.
# Non-numeric columns are excluded explicitly (the label column has not been
# encoded yet at this point): pandas >= 2.0 no longer drops them silently in
# DataFrame.corr() and raises ValueError instead.
df.select_dtypes(include="number").corr()

In [None]:
# Heatmap of the feature correlation matrix.
# Only numeric columns are correlated: with the label column still un-encoded,
# a plain df.corr() raises ValueError on pandas >= 2.0.
sns.heatmap(df.select_dtypes(include="number").corr())

# Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Determine categorical features
#
# Every column that is not of a numeric dtype is treated as categorical.
# select_dtypes is used instead of comparing against a hand-written list of
# dtype names, which missed unsigned and nullable numeric dtypes and would have
# sent such columns through string label-encoding. Column order is preserved.
features = df.columns.tolist()
categorical_columns = df.select_dtypes(exclude="number").columns.tolist()
categorical_columns

In [None]:
# Encode categorical features
#
# Each categorical column is cast to str and replaced, in place, by the integer
# codes LabelEncoder assigns to its distinct values.
for col in categorical_columns:
    if col not in df.columns:
        continue
    label = LabelEncoder()
    df[col] = label.fit_transform(df[col].astype(str).values.tolist())

In [None]:
# Separate the label column from the feature matrix.
# NOTE: df is rebound without the label column, so re-running this cell without
# re-running the cells above fails with KeyError on df[target_name].
target_name = 'class'
data_target = df[target_name]
df = df.drop(columns=[target_name])

# Test & Train Sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df,
    data_target,
    random_state=1,
    test_size=0.3,
)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression
# Fit with default settings and report accuracy on the training split, in percent.
logreg = LogisticRegression().fit(x_train, y_train)
acc_log = round(100 * logreg.score(x_train, y_train), 2)
acc_log

In [None]:
# Accuracy of the logistic-regression model on the held-out test split, in percent.
acc_test_log = round(100 * logreg.score(x_test, y_test), 10)
print(f"Accuracy:  {acc_test_log}")

# Support Vector Machines

In [None]:
from sklearn.svm import SVC

# Support Vector Machines
# Fit with default settings and report accuracy on the training split, in percent.
svc = SVC().fit(x_train, y_train)
acc_svc = round(100 * svc.score(x_train, y_train), 2)
acc_svc

In [None]:
# Accuracy of the SVM on the held-out test split, in percent (2 decimals).
acc_test_svc = round(svc.score(x_test, y_test) * 100, 2)
acc_test_svc

# K-Nearest Neighbors with Evaluation

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline k-NN classifier with k = 5, scored on the test split.
knn = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
print(f" {5} nn score: {knn.score(x_test, y_test)} ")

In [None]:
# Sweep k = 1..49 and record the accuracy of each k-NN model.
# NOTE(review): every k is scored on the *test* split, so a k picked from this
# curve has been tuned on test data; a validation split or cross-validation on
# the training data would keep the test score unbiased.
score_list = []
for k in range(1, 50):
    knn2 = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)
    score_list.append(knn2.score(x_test, y_test))
score_list

In [None]:
# Accuracy as a function of k for the sweep stored in score_list.
plt.plot(range(1, 50), score_list)
plt.ylabel("Accuracy")
plt.xlabel("k values")
plt.show()

In [None]:
# k-NN classifier with k = 20, scored on the test split.
knn3 = KNeighborsClassifier(n_neighbors=20).fit(x_train, y_train)
print(f" {20} nn score: {knn3.score(x_test, y_test)} ")

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the knn3 predictions on the test split
# (rows: true labels, columns: predicted labels).
y_pred_knn = knn3.predict(x_test)
cm = confusion_matrix(y_test, y_pred_knn)
print("KNN Confusion Matrix Result: \n \n", cm)

In [None]:
# Confusion matrix
# Drawn as an annotated heatmap: integer counts in each cell, red grid lines.
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(
    cm,
    ax=ax,
    annot=True,
    fmt=".0f",
    linewidths=0.5,
    linecolor="red",
)
plt.xlabel("Prediction")
plt.ylabel("Real")
plt.show()

# Decision Tree Classifier

In [None]:
# Decision Tree Classifier

from sklearn.tree import DecisionTreeClassifier

# A fixed random_state makes the fitted tree, and therefore the reported
# accuracies, repeatable across runs; without it tie-breaking between equally
# good splits is random. The seed value matches the one used for the
# train/test split.
decision_tree = DecisionTreeClassifier(random_state=1)
decision_tree.fit(x_train, y_train)

# Accuracy on the training split, in percent.
acc_decision_tree = round(decision_tree.score(x_train, y_train) * 100, 2)
acc_decision_tree

In [None]:
# Accuracy of the decision tree on the held-out test split, in percent (2 decimals).
acc_test_decision_tree = round(decision_tree.score(x_test, y_test) * 100, 2)
acc_test_decision_tree

# Random Forests

In [None]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validated grid search over the number of trees.
# The forest is seeded so the search result and the reported accuracies are
# repeatable across runs.
random_forest = GridSearchCV(
    estimator=RandomForestClassifier(random_state=1),
    param_grid={'n_estimators': [100, 300]},
    cv=5,
)

# Fit exactly once: a single fit() call already runs the whole search and, with
# the default refit=True, retrains the best configuration on the full training
# split. Calling fit() a second time only repeats all of that work.
random_forest.fit(x_train, y_train)

# Accuracy of the refitted best model on the training split, in percent.
acc_random_forest = round(random_forest.score(x_train, y_train) * 100, 2)
print(acc_random_forest,random_forest.best_params_)

In [None]:
# Accuracy of the grid-searched random forest on the held-out test split, in percent (3 decimals).
acc_test_random_forest = round(random_forest.score(x_test, y_test) * 100, 3)
acc_test_random_forest

# Neural Networks

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import optimizers
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Model
# Fully connected binary classifier: 16 -> 64 -> 32 ReLU units with dropout
# after the first two hidden layers, and a single sigmoid output unit.
model = Sequential([
    Dense(16, input_dim=x_train.shape[1], activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.summary()

In [None]:
# Binary cross-entropy matches the single sigmoid output unit; Adam is the
# optimizer and accuracy is tracked as the training metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train for 500 epochs on the training split only (no validation data is
# passed); `hist` holds the per-epoch loss/metric history.
hist = model.fit(x_train, y_train, batch_size=64, 
               epochs=500, verbose=1)

In [None]:
# Training accuracy per epoch, drawn on a fixed 0-1 y-axis.
plt.plot(hist.history['accuracy'], label='acc')

plt.ylim((0, 1))
plt.legend()

In [None]:
from sklearn import metrics

# Predicting the Train set results
# The sigmoid output is a probability; thresholding at 0.5 turns it into 0/1 labels.
nn_prediction = (model.predict(x_train) > 0.5).astype(int)

# Share of training rows classified correctly, in percent.
acc_ann2 = round(100 * metrics.accuracy_score(y_train, nn_prediction), 3)
acc_ann2

In [None]:
# Predicting the Test set results
# The sigmoid output is a probability; thresholding at 0.5 turns it into 0/1 labels.
nn_prediction_test = (model.predict(x_test) > 0.5).astype(int)

# Share of test rows classified correctly, in percent.
acc_test_ann2 = round(100 * metrics.accuracy_score(y_test, nn_prediction_test), 10)
acc_test_ann2

# Conclusion

Accuracy of each model on the held-out test set, from lowest to highest:

* Decision Tree Classifier: 77.4%
* Random Forests Classifier: 82.8%
* Support Vector Machine: 84.9%

----

* K-Nearest Neighbors: 87.09%
* Neural Networks: 87.09%
* Logistic Regression: 87.09%