In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
plt.style.use("seaborn") 

In [2]:
data = pd.read_csv("data/knee_features.csv")
data = data.dropna().drop_duplicates().reindex()
data["LATERALITY"] = np.where(data["LATERALITY"] == "RIGHT", 0, 1) # Use laterality as binary variables
data_x = data.drop(["KLG", "ID", "TIMEPOINT"], axis = 1)
scaler = StandardScaler()
data_x = scaler.fit_transform(data_x) # Standardizes the data on a scale of 0 to 1
data_y = data.KLG.astype(int) # Use the KLG variable as predictor

In [3]:
X_train_and_cal, X_test, y_train_and_cal, y_test = train_test_split(data_x, data_y, test_size=1/3)
X_train, X_cal, y_train, y_cal = train_test_split(X_train_and_cal, y_train_and_cal, test_size=1/3) 

#### Logistic Regression

In [4]:
LogRes = LogisticRegression(penalty="l1", multi_class="ovr", class_weight="balanced", solver="liblinear").fit(X_train, y_train)

# Training the Algorithm
predictions_LogRes = LogRes.predict(X_test)
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")
print("Training Accuracy: " +str(LogRes.score(X_train,y_train)))
print("Test Accuracy: " +str(LogRes.score(X_test,y_test)))
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")

# Classification Report
classif_report = pd.concat({
    "Original dataset": pd.DataFrame(
        classification_report(y_test, predictions_LogRes, output_dict=True)
    )
}, axis=1).round(3)
print(classif_report)
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")

# Cross Validation
print("Cross Validation Scores:", cross_val_score(LogRes, data_x, data_y, cv=5))
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")

---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
Training Accuracy: 0.48272161408027475
Test Accuracy: 0.47617684933466875
---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
          Original dataset                                                  \
                         0         1         2         3        4 accuracy   
precision            0.554     0.291     0.370     0.385    0.424    0.476   
recall               0.762     0.022     0.322     0.488    0.653    0.476   
f1-score             0.641     0.040     0.344     0.431    0.514    0.476   
support           5623.000  2638.000  3377.000  1801.000  539.000    0.476   

                                   
           macro avg weighted avg  
precision      0.405        0.433  
recall         0.449        0.476  
f1-score       0.394        0.424  
support    13978.000    13978.000  
---------- ---------- ---------- ---------- ---------- ----------

#### Gaussian Naive Bayes

In [5]:
gnb = GaussianNB().fit(X_train, y_train)

# Training the Algorithm
predictions_gnb = gnb.predict(X_test)
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")
print("Training Accuracy: " +str(gnb.score(X_train,y_train)))
print("Test Accuracy: " +str(gnb.score(X_test,y_test)))
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")

# Classification Report
classif_report = pd.concat({
    "Original dataset": pd.DataFrame(
        classification_report(y_test, predictions_gnb, output_dict=True)
    )
}, axis=1).round(3)
print(classif_report)
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")

# Cross Validation
print("Cross Validation Scores:", cross_val_score(gnb, data_x, data_y, cv=5))
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")

---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
Training Accuracy: 0.4287400729770337
Test Accuracy: 0.42409500643868936
---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
          Original dataset                                                  \
                         0         1         2         3        4 accuracy   
precision            0.499     0.205     0.309     0.358    0.505    0.424   
recall               0.745     0.101     0.224     0.289    0.358    0.424   
f1-score             0.598     0.135     0.260     0.320    0.419    0.424   
support           5623.000  2638.000  3377.000  1801.000  539.000    0.424   

                                   
           macro avg weighted avg  
precision      0.375        0.380  
recall         0.344        0.424  
f1-score       0.346        0.386  
support    13978.000    13978.000  
---------- ---------- ---------- ---------- ---------- ---------- 

#### K-Nearest Neighbors

In [6]:
knn = KNeighborsClassifier(n_neighbors = 4).fit(X_train, y_train)

# Training the Algorithm
predictions_knn = knn.predict(X_test)
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")
print("Training Accuracy: " +str(knn.score(X_train,y_train)))
print("Test Accuracy: " +str(knn.score(X_test,y_test)))
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")

# Classification Report
classif_report = pd.concat({
    "Original dataset": pd.DataFrame(
        classification_report(y_test, predictions_knn, output_dict=True)
    )
}, axis=1).round(3)
print(classif_report)
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")

# Cross Validation
print("Cross Validation Scores:", cross_val_score(knn, data_x, data_y, cv=5))
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")

---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
Training Accuracy: 0.7736102167847178
Test Accuracy: 0.6120331950207469
---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
          Original dataset                                                  \
                         0         1         2         3        4 accuracy   
precision            0.630     0.542     0.601     0.626    0.684    0.612   
recall               0.869     0.433     0.443     0.479    0.310    0.612   
f1-score             0.731     0.481     0.510     0.542    0.427    0.612   
support           5623.000  2638.000  3377.000  1801.000  539.000    0.612   

                                   
           macro avg weighted avg  
precision      0.617        0.608  
recall         0.507        0.612  
f1-score       0.538        0.594  
support    13978.000    13978.000  
---------- ---------- ---------- ---------- ---------- ---------- -

#### Decision Trees

In [7]:
DecTree = DecisionTreeClassifier(max_depth = 5).fit(X_train, y_train)

# Training the Algorithm
predictions_DecTree = DecTree.predict(X_test)
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")
print("Training Accuracy: " +str(DecTree.score(X_train,y_train)))
print("Test Accuracy: " +str(DecTree.score(X_test,y_test)))
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")

# Classification Report
classif_report = pd.concat({
    "Original dataset": pd.DataFrame(
        classification_report(y_test, predictions_DecTree, output_dict=True)
    )
}, axis=1).round(3)
print(classif_report)
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")

# Cross Validation
print("Cross Validation Scores:", cross_val_score(DecTree, data_x, data_y, cv=5))
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")

---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
Training Accuracy: 0.4924876582957716
Test Accuracy: 0.48075547288596365
---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
          Original dataset                                                  \
                         0         1         2         3        4 accuracy   
precision            0.510     0.218     0.369     0.441    0.580    0.481   
recall               0.880     0.005     0.212     0.492    0.297    0.481   
f1-score             0.646     0.009     0.269     0.465    0.393    0.481   
support           5623.000  2638.000  3377.000  1801.000  539.000    0.481   

                                   
           macro avg weighted avg  
precision      0.424        0.415  
recall         0.377        0.481  
f1-score       0.356        0.402  
support    13978.000    13978.000  
---------- ---------- ---------- ---------- ---------- ---------- 

#### Support Vector Machines

In [8]:
SVM = SVC(kernel="poly").fit(X_train, y_train)

# Training the Algorithm
predictions_svm = SVM.predict(X_test)
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")
print("Training Accuracy: " +str(SVM.score(X_train,y_train)))
print("Test Accuracy: " +str(SVM.score(X_test,y_test)))
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")

# Classification Report
classif_report = pd.concat({
    "Original dataset": pd.DataFrame(
        classification_report(y_test, predictions_svm, output_dict=True)
    )
}, axis=1).round(3)
print(classif_report)
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")

# Cross Validation
print("Cross Validation Scores:", cross_val_score(SVM, data_x, data_y, cv=5))
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")

---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
Training Accuracy: 0.5102489804679116
Test Accuracy: 0.4896265560165975
---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
          Original dataset                                                  \
                         0         1         2         3        4 accuracy   
precision            0.486     0.568     0.432     0.546    0.706     0.49   
recall               0.960     0.009     0.184     0.353    0.302     0.49   
f1-score             0.646     0.019     0.258     0.429    0.423     0.49   
support           5623.000  2638.000  3377.000  1801.000  539.000     0.49   

                                   
           macro avg weighted avg  
precision      0.548        0.505  
recall         0.362        0.490  
f1-score       0.355        0.397  
support    13978.000    13978.000  
---------- ---------- ---------- ---------- ---------- ---------- -

#### Random Forest Classifier

In [9]:
RFC = RandomForestClassifier().fit(X_train, y_train)

# Training the Algorithm
predictions_rfc = RFC.predict(X_test)
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")
print("Training Accuracy: " +str(RFC.score(X_train,y_train)))
print("Test Accuracy: " +str(RFC.score(X_test,y_test)))
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")

# Classification Report
classif_report = pd.concat({
    "Original dataset": pd.DataFrame(
        classification_report(y_test, predictions_rfc, output_dict=True)
    )
}, axis=1).round(3)
print(classif_report)
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")

# Cross Validation
print("Cross Validation Scores:", cross_val_score(RFC, data_x, data_y, cv=5))
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")

---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
Training Accuracy: 1.0
Test Accuracy: 0.6207611961654028
---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
          Original dataset                                                  \
                         0         1         2         3        4 accuracy   
precision            0.624     0.691     0.588     0.602    0.744    0.621   
recall               0.898     0.246     0.509     0.578    0.410    0.621   
f1-score             0.736     0.362     0.546     0.590    0.529    0.621   
support           5623.000  2638.000  3377.000  1801.000  539.000    0.621   

                                   
           macro avg weighted avg  
precision      0.650        0.630  
recall         0.528        0.621  
f1-score       0.553        0.593  
support    13978.000    13978.000  
---------- ---------- ---------- ---------- ---------- ---------- ---------- -----

#### XGBoost

In [10]:
XGBoost = GradientBoostingClassifier().fit(X_train, y_train)

# Training the Algorithm
predictions_xgb = XGBoost.predict(X_test)
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")
print("Training Accuracy: " +str(XGBoost.score(X_train,y_train)))
print("Test Accuracy: " +str(XGBoost.score(X_test,y_test)))
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")

# Classification Report
classif_report = pd.concat({
    "Original dataset": pd.DataFrame(
        classification_report(y_test, predictions_xgb, output_dict=True)
    )
}, axis=1).round(3)
print(classif_report)
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")

# Cross Validation
print("Cross Validation Scores:", cross_val_score(XGBoost, data_x, data_y, cv=5))
print("---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------")

---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
Training Accuracy: 0.5754453745438936
Test Accuracy: 0.5268994133638575
---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
          Original dataset                                                  \
                         0         1         2         3        4 accuracy   
precision            0.553     0.405     0.436     0.530    0.626    0.527   
recall               0.883     0.029     0.355     0.503    0.404    0.527   
f1-score             0.680     0.054     0.392     0.516    0.492    0.527   
support           5623.000  2638.000  3377.000  1801.000  539.000    0.527   

                                   
           macro avg weighted avg  
precision      0.510        0.497  
recall         0.435        0.527  
f1-score       0.427        0.464  
support    13978.000    13978.000  
---------- ---------- ---------- ---------- ---------- ---------- -