Classification of Vertebral Column Data

In [None]:
# Import libraries
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

In [None]:
# Read in the data
# The file is space-separated and read without a header row, so the columns
# initially carry integer labels.
df = pd.read_csv('vertebral_column_data.txt', sep=" ", header=None)

# Rename columns
df = df.rename(columns={0: 'pelvic_incidence', 1: 'pelvic_tilt', 2: 'lumbar_lordosis', 
                        3: 'sacral_slope', 4: 'pelvic_radius', 5: 'spondylolisthesis',
                        6: 'category'})

# Change category column to 0 and 1
# Series.map is used instead of DataFrame.replace: swapping strings for ints
# through replace() depends on implicit object -> int downcasting, which
# pandas has deprecated. map() yields an integer column directly.
category_codes = {'NO': 0, 'AB': 1}

# Fail loudly on any label that has no code instead of carrying it forward
# (replace() would have silently left such values untouched).
unexpected_labels = set(df['category'].unique()) - set(category_codes)
if unexpected_labels:
    raise ValueError(
        f"Unexpected category labels: {sorted(unexpected_labels, key=str)}"
    )

df = df.assign(category=df['category'].map(category_codes))

In [None]:
# Preview the first rows to check the renamed columns and the 0/1 category encoding
df.head()

In [None]:
# Unsupervised classification: K-Means Clustering

In [None]:
# Names of the six feature columns (every column except the category label)
feature_names = [
    "pelvic_incidence",
    "pelvic_tilt",
    "lumbar_lordosis",
    "sacral_slope",
    "pelvic_radius",
    "spondylolisthesis",
]

# Standard scaler used to centre and rescale each feature
scaler = StandardScaler()

# Fit the scaler on the feature columns and wrap the scaled array in a
# DataFrame again; the new frame gets default integer column labels (0-5).
df_2 = pd.DataFrame(scaler.fit_transform(df[feature_names]))

In [None]:
# Preview the scaled features (columns are now labelled 0-5)
df_2.head()

In [None]:
# Checking data is standardised. Mean = 0 and SD = 1.
# NOTE: describe() reports the sample standard deviation (ddof=1), whereas
# StandardScaler divides by the population standard deviation (ddof=0), so
# the printed std is expected to be slightly above 1 rather than exactly 1.
print(df_2.describe())

In [None]:
# Convert the scaled DataFrame to a NumPy matrix for clustering
array = df_2.to_numpy()
# Independent copy, so later changes to df_2 do not affect xMat
xMat = np.array(array, copy=True)
# Number of rows (samples)
xDim = len(xMat)
# Show the first five rows
xMat[:5]

In [None]:
# Creating elbow plot by calculating sum-of-squares for each value of K
kMin = 1
kMax = 8
repsPerK = 10  # number of random restarts of k-means for each K

# One entry per candidate K (kept as floats, matching the inertia array)
kValues = np.arange(kMin, kMax + 1, dtype=float)
sumOfSquares = np.zeros(kMax - kMin + 1)

for k in range(kMin, kMax + 1):
    # random_state is fixed so the elbow curve is identical on every
    # Restart & Run All; without it each run draws different initial centroids.
    kmeansResult = KMeans(n_clusters=k, n_init=repsPerK, random_state=42).fit(xMat)
    # inertia_ is the within-cluster sum of squared distances for the best restart
    sumOfSquares[k - kMin] = kmeansResult.inertia_

In [None]:
# Draw the elbow curve: within-cluster sum of squares against K
elbowFig, elbowAx = plt.subplots(figsize=[10,6])
elbowAx.plot(kValues, sumOfSquares)
# Overlay markers so each evaluated K is visible on the line
elbowAx.scatter(x=kValues, y=sumOfSquares)
elbowAx.set_xlabel('k: number of clusters', fontsize=14)
elbowAx.set_ylabel('Sum of squares', fontsize=14)
plt.show()

In [None]:
# K-Means Clustering with K = 2
# n_init and random_state are set explicitly so the cluster assignment is
# reproducible and does not depend on the installed scikit-learn default.
clustering_kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)

# Saving each datapoints cluster in new column
# Fit on the six scaled feature columns only (integer labels 0-5). Selecting
# them explicitly keeps this cell idempotent: re-running it will not feed a
# previously stored 'clusters' column back in as a feature.
df_2['clusters'] = clustering_kmeans.fit_predict(df_2[[0, 1, 2, 3, 4, 5]])

In [None]:
# Using PCA to reduce dimensions of data to allow for visualisation
pca_num_components = 2

# Project only the six scaled feature columns (0-5). df_2 also holds cluster
# label columns by this point; including them would let the labels influence
# the projection and mixes int and str column names, which recent
# scikit-learn versions reject.
reduced_data = PCA(n_components=pca_num_components).fit_transform(df_2[[0, 1, 2, 3, 4, 5]])
results = pd.DataFrame(reduced_data,columns=['pca1','pca2'])

# Scatter the projection, one colour per cluster. Drawn with matplotlib
# because seaborn (`sns`) is never imported in this notebook, so the former
# sns.scatterplot call raised NameError on a fresh kernel.
cluster_labels = df_2['clusters'].to_numpy()
fig, ax = plt.subplots()
for cluster_id in np.unique(cluster_labels):
    in_cluster = cluster_labels == cluster_id
    ax.scatter(results.loc[in_cluster, 'pca1'], results.loc[in_cluster, 'pca2'],
               label=str(cluster_id))
ax.set_xlabel('pca1')
ax.set_ylabel('pca2')
ax.legend(title='clusters')
plt.show()

In [None]:
# K-Means Clustering with K = 3
# n_init and random_state are set explicitly so the cluster assignment is
# reproducible and does not depend on the installed scikit-learn default.
clustering_kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

# Fit on the six scaled feature columns only (integer labels 0-5). df_2 may
# already contain the K = 2 'clusters' column; fitting on the whole frame
# would treat those earlier labels as a seventh feature.
df_2['clusters_n3'] = clustering_kmeans.fit_predict(df_2[[0, 1, 2, 3, 4, 5]])

In [None]:
# Using PCA to reduce dimensions of data to allow for visualisation

pca_num_components = 2

# Project only the six scaled feature columns (0-5). df_2 also holds cluster
# label columns by this point; including them would let the labels influence
# the projection and mixes int and str column names, which recent
# scikit-learn versions reject.
reduced_data = PCA(n_components=pca_num_components).fit_transform(df_2[[0, 1, 2, 3, 4, 5]])
results = pd.DataFrame(reduced_data,columns=['pca1','pca2'])

# Scatter the projection, one colour per K = 3 cluster. Drawn with matplotlib
# because seaborn (`sns`) is never imported in this notebook, so the former
# sns.scatterplot call raised NameError on a fresh kernel.
cluster_labels = df_2['clusters_n3'].to_numpy()
fig, ax = plt.subplots()
for cluster_id in np.unique(cluster_labels):
    in_cluster = cluster_labels == cluster_id
    ax.scatter(results.loc[in_cluster, 'pca1'], results.loc[in_cluster, 'pca2'],
               label=str(cluster_id))
ax.set_xlabel('pca1')
ax.set_ylabel('pca2')
ax.legend(title='clusters_n3')
plt.show()

In [None]:
# Supervised classification: K-Nearest Neighbours

In [None]:
# Keep only the six scaled feature columns (labels 0-5), dropping any
# cluster-assignment columns that were added to df_2
df_3 = df_2.loc[:, list(range(6))]

In [None]:
# Show the feature matrix that is passed to the KNN classifier
display(df_3)

In [None]:
# train_test_split is already imported in the imports cell at the top of the
# notebook, so it is not imported again here.

# Features: the six scaled columns. Target: the 0/1 category label.
X = df_3
y = df["category"]

# Split dataset into 70% training set and 30% test set
# random_state makes the split (and every downstream score) reproducible;
# stratify=y keeps the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# For loop to find best value for K
#
# Each candidate K is scored with 5-fold cross-validation on the training set
# only. Scoring candidates on X_test (as was done previously) means the test
# set is used to choose K, so the accuracy reported on it afterwards is
# optimistically biased.

error = []

for k in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=k)
    # cross_val_score returns one accuracy per fold for a classifier
    fold_accuracy = cross_val_score(knn, X_train, y_train, cv=5)
    # Store the mean misclassification rate across the folds
    error.append(1 - fold_accuracy.mean())

In [None]:
# Error rate against K for every candidate value tried above

errorFig, errorAx = plt.subplots(figsize=(10,6))
errorAx.plot(
    range(1,40),
    error,
    color='blue',
    linestyle='dashed',
    marker='o',
    markerfacecolor='red',
    markersize=10,
)
errorAx.set_xlabel('K')
errorAx.set_ylabel('Error Rate')

In [None]:
#Create KNN Classifier with K = 13
# NOTE(review): 13 is presumably the value picked from the error-rate plot;
# confirm it still matches the plot after any change to the split or search.
knn = KNeighborsClassifier(n_neighbors=13)

#Train the model using the training sets
knn.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = knn.predict(X_test)

In [None]:
# Fraction of test samples whose predicted label matches the true label
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)