# Lower Back Pain Symptoms Dataset

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn import svm 

In [None]:
# Read the spine dataset CSV from the relative input directory into a DataFrame.
dataset_path = "../input/Dataset_spine.csv"
data = pd.read_csv(dataset_path)

In [None]:
# Preview the first rows of the loaded table to check how it was parsed.
data.head()

The last column (`Unnamed: 13`) appears to hold only descriptions of the data columns, so it can be deleted.

In [None]:
# Remove the trailing 'Unnamed: 13' column from the table in place.
data.drop(columns=['Unnamed: 13'], inplace=True)

In [None]:
# Print a per-column summary (non-null counts and dtypes) of the table.
data.info()

Apart from the class column, all variables are numeric.

In [None]:
# Show summary statistics (count, mean, std, quartiles, min/max) of the columns.
data.describe()

In [None]:
# Flag, per column, whether at least one value is missing.
data.isna().any()

There are no missing values.

In [None]:
# Report the table dimensions as (number of rows, number of columns).
data.shape

There are 310 records and 13 variables 

In [None]:
# Plot the pairwise correlation of the numeric columns as an annotated heatmap.
# Only numeric columns are correlated: 'Class_att' still holds text labels at
# this point, and DataFrame.corr() raises on non-numeric data in pandas >= 2.0
# (older versions silently dropped such columns).
correlation = data.select_dtypes(include='number').corr()
plt.figure(figsize=(15, 10))
sns.heatmap(correlation, annot=True)
plt.show()

In [None]:
# Count the records labelled 'Abnormal'.
int(data['Class_att'].eq('Abnormal').sum())

In [None]:
# Count the records labelled 'Normal'.
int(data['Class_att'].eq('Normal').sum())

There are 210 abnormal records and 100 normal records, so the classes are imbalanced.

In [None]:
# Draw one box plot per numeric column on a single 15x10 figure to spot outliers.
fig, ax = plt.subplots(figsize=(15, 10))
data.boxplot(ax=ax, patch_artist=True)
plt.show()

There seems to be an outlier in Col6. To prevent distortion, this record will be removed.

In [None]:
# Find the rows whose Col6 value exceeds 400 and remove them in place.
outlier_rows = data.loc[data['Col6'] > 400].index
data.drop(index=outlier_rows, inplace=True)

In [None]:
# Renumber the rows 0..n-1 after the outlier removal.
# drop=True discards the old index. Without it, reset_index() inserts the old
# labels as a new 'index' column, which the later feature selection (every
# column except 'Class_att') would pick up as a model feature.
data.reset_index(drop=True, inplace=True)
data.shape

Only 309 records remain 

Next, change the class labels to 1 and 0 for Abnormal and Normal, respectively.

In [None]:
def _encode_label(label):
    """Return the string '1' for 'Abnormal' and '0' for any other label."""
    if label == 'Abnormal':
        return '1'
    return '0'


# Replace the text class labels with their '1' / '0' string codes.
data['Class_att'] = data['Class_att'].map(_encode_label)

## Modelling 

### 1. Preparation 

Scaling, separating the feature set from the label set, and splitting into training and test sets.

In [None]:
# Standardise every feature column (all columns except the label).
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics influence the scaling; consider fitting on the
# training rows only.
data_feature = data[data.columns.difference(['Class_att'])]
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data_feature)
# Reuse the original row index: the label assignment below aligns on index, so
# each scaled row gets its own label even when `data` does not carry a fresh
# 0..n-1 index.
data_scaled = pd.DataFrame(
    data=scaled_data,
    columns=data_feature.columns,
    index=data_feature.index,
)
data_scaled['Class_att'] = data['Class_att']
data_scaled.describe()

In [None]:
# Separate the scaled table into features (X) and labels (y), then hold out
# 20% of the rows as a test set with a fixed seed for reproducibility.
label_column = 'Class_att'
feature_columns = data_scaled.columns.difference([label_column])
X = data_scaled[feature_columns]
y = data_scaled[label_column]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### 2.1 K Nearest Neighbors 

In [None]:
# Fit a 3-nearest-neighbours classifier on the training split and report its
# mean accuracy on the held-out split.
neighbor = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
neighbor.score(X_test, y_test)

98% accuracy. Not bad.
<br> However, because the data is imbalanced, it is better to look at the confusion matrix as well.

In [None]:
# Predict the class of every test row with the fitted nearest-neighbours model.
y_predict = neighbor.predict(X_test)

In [None]:
# Tabulate true labels (rows) against predicted labels (columns).
confusion_matrix(y_true=y_test, y_pred=y_predict)

The result is good: only one label is mismatched.

### 2.2 Support Vector Machine 

In [None]:
# Fit a support vector classifier with default settings on the training split
# and report its mean accuracy on the held-out split.
Svm = svm.SVC().fit(X_train, y_train)
Svm.score(X_test, y_test)

In [None]:
# Predict the test rows with the fitted SVM and tabulate true labels (rows)
# against predicted labels (columns).
y_predict = Svm.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_predict)

Same result as K Nearest Neighbors 

### 2.3 Logistic Regression

In [None]:
# Fit a logistic regression with default settings on the training split and
# report its mean accuracy on the held-out split.
logreg = LogisticRegression().fit(X_train, y_train)
logreg.score(X_test, y_test)

In [None]:
# Predict the test rows with the fitted logistic regression and tabulate true
# labels (rows) against predicted labels (columns).
y_predict = logreg.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_predict)

Apparently, all three models predict the same results.