In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for fname in names_in_folder:
        print(os.path.join(folder, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Lower Back Pain Classification
**Author:** Khadija Hammawa<br>

**Date:** 19/05/2021<br>

**Table of Contents:**<br>
1. Overview
2. Import Data and Libraries
3. Data Cleaning
4. Data Preprocessing
5. Training
6. Training Results
7. Model Selection
8. Confusion Matrix



## 1. Project Overview 

Chronic lower back pain (CLBP) is a major cause of disability worldwide. CLBP prevalence in adults has increased by more than 100% in the last decade and continues to rise in older populations (Allegri et al. 2016). Given the complexity of lower back pain, the severity of symptoms can differ from person to person. For this reason, CLBP is often difficult to diagnose and requires complex clinical decision-making, which can still result in misdiagnosis (Allegri et al. 2016).
 
Considering the clinical importance of lower back pain, I have chosen the lower back pain dataset, which contains various physical measurements of the spine. The purpose of this analysis is to predict whether a patient will be classified as abnormal (pain) or normal (no pain) given these measurements.


## 2. Import Data and Libraries
When importing the raw data, we can see that the column names are not labeled, and the last column has no values (NaN). Therefore, we can read in the data by specifying column names.

In [None]:
# Python Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Split and Scale functions
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# ML models
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Confusion Matrix
from sklearn.metrics import confusion_matrix

# Resampling
from sklearn.utils import resample

# Metrics
from sklearn.metrics import accuracy_score, f1_score, classification_report

import warnings 
warnings.filterwarnings(action='ignore')

In [None]:
# First pass: load the CSV as-is (no column names supplied) so the raw layout can be inspected.
data = pd.read_csv('../input/lower-back-pain-symptoms-dataset/Dataset_spine.csv')

Let's take a look at the raw data

In [None]:
# Display the frame exactly as it was read from the CSV.
data

In [None]:
# Descriptive names for the twelve measurements plus the class label.
cols = [
    'pelvic_incidence',
    'pelvic_tilt',
    'lumbar_lordosis_angle',
    'sacral_slope',
    'pelvic_radius',
    'degree_spondylolisthesis',
    'pelvic_slope',
    'direct_tilt',
    'thoracic_slope',
    'cervical_tilt',
    'sacrum_angle',
    'scoliosis_slope',
    'normality',
]

# Re-read the file: header=0 replaces the file's own header row with `cols`,
# and only the first len(cols) columns are loaded.
data = pd.read_csv(
    '../input/lower-back-pain-symptoms-dataset/Dataset_spine.csv',
    names=cols,
    header=0,
    usecols=range(len(cols)),
)

In [None]:
# Display the frame again, now with the assigned column names.
data

## 3. Data Cleaning

In this section, we will begin cleaning the data to check for missing values. However, because we are interested in features that will best help our model make predictions, we will first visualize a correlation heatmap. 
 
Using seaborn, we can see that five features are highly correlated: *pelvic incidence, pelvic tilt, lumbar lordosis angle, sacral slope, and degree spondylolisthesis*. For this reason, we drop the remaining (i.e., low-correlation) columns. 


In [None]:
# Pairwise correlation of the numeric measurements only.
# The class label ('normality') is presumably string-valued (it is later scored
# with pos_label='Abnormal'); pandas >= 2.0 raises on non-numeric columns in
# DataFrame.corr() instead of silently skipping them, so select the numeric
# columns explicitly. This works on older pandas versions as well.
corr = data.select_dtypes(include='number').corr()

plt.figure(figsize=(12, 8))
sns.heatmap(corr, annot=True, vmin=-1.0)
plt.title("Lower Back Pain Correlation Heatmap")
plt.show()

In [None]:
# Count the missing (NaN) entries in each column.
data.isna().sum()

In [None]:
# Columns judged to have low correlation in the heatmap above.
low_corr_cols = ['pelvic_slope', 'direct_tilt', 'thoracic_slope', 'cervical_tilt', 'sacrum_angle', 'scoliosis_slope','pelvic_radius']

# Drop them in a single call rather than copying the frame once per column.
# errors='ignore' keeps this cell re-runnable: since `data` is reassigned,
# a second execution would otherwise raise KeyError on the already-removed columns.
data = data.drop(columns=low_corr_cols, errors='ignore')

In [None]:
# Display the frame after the low-correlation columns have been removed.
data

## 4. Preprocessing

Since there were no missing values, we can move on to preprocessing. First, I created a preprocessing function that takes in a DataFrame (df). The function splits the df into X and y: y contains the data from our target column (normality), while X is the entire df without the target column. Next, we split X and y into train and test sets. Before returning the train and test sets, we scale X so that every column has a mean of 0 and a variance of 1 (the scaler is fit on the training set only and then applied to both sets).
 
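Before training our model, we want to look at the class distribution in y_train. As we can see, y_train has a class imbalance, with 68% abnormal and 32% normal. To address this, we will resample the training set so that the abnormal condition represents 58% of values and the normal condition represents 42%. **Note:** the resampling step is not implemented in the cells below, so the models are currently trained on the original, imbalanced training set.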


In [None]:
def preprocessing_inputs(df, target='normality', train_size=0.7, random_state=1):
    """Split a DataFrame into scaled train/test features and their targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing the feature columns and the ``target`` column.
        It is not modified.
    target : str, default 'normality'
        Name of the column to predict.
    train_size : float, default 0.7
        Fraction of rows assigned to the training set.
    random_state : int, default 1
        Seed for the shuffled train/test split.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardised features, keeping the original index and column names.
    y_train, y_test : pandas.Series
        Target values aligned with ``X_train`` and ``X_test``.
    """
    # Separate the target from the features. Both operations return new
    # objects, so the caller's frame is left untouched without copying it first.
    y = df[target]
    X = df.drop(columns=target)

    # Shuffled train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

    # Fit the scaler on the training rows only, then apply the same
    # transformation to both sets so no test-set statistics leak into training.
    scaler = StandardScaler().fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [None]:
# Build the scaled train/test feature sets and their matching targets from the reduced frame.
X_train, X_test, y_train, y_test = preprocessing_inputs(data)

In [None]:
# Display the scaled training features.
X_train

In [None]:
# Tally each class in the training labels once, then show it as text and as a bar chart.
class_counts = y_train.value_counts()
print(class_counts)
print()
class_counts.sort_index().plot.bar()

## 5. Training
In this section, we train several models to find which model will make the best predictions

In [None]:
# Seed for every estimator that uses randomness, so that the scores reported
# below are reproducible across runs (same value as the train/test split seed).
SEED = 1

# Candidate classifiers. The keys are left-padded to a common width so the
# printed results line up.
models = {
    "                 Logistic Regression": LogisticRegression(),
    "Logistic Regression Cross-Validation": LogisticRegressionCV(),
    "                       Decision Tree": DecisionTreeClassifier(random_state=SEED),
    "                      Neural Network": MLPClassifier(random_state=SEED),
    "                       Random Forest": RandomForestClassifier(random_state=SEED),
    "                   Gradient Boosting": GradientBoostingClassifier(random_state=SEED)
}

# Fit each candidate on the scaled training data.
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + ' trained.')

## 6. Training Results
Based on the accuracy and F1 scores below, I chose to move forward with the Random Forest model. 

In [None]:
# Report test-set accuracy (as a percentage) for every fitted model.
for label, clf in models.items():
    accuracy = accuracy_score(y_test, clf.predict(X_test))
    print(f'{label} Accuracy: {accuracy * 100}')

In [None]:
# Report the test-set F1 score for every fitted model, with 'Abnormal' as the positive class.
for label, clf in models.items():
    score = f1_score(y_test, clf.predict(X_test), pos_label='Abnormal')
    print(f'{label} F1-Score: {score}')

## 7. Model Selection

In [None]:
# Refit the selected Random Forest with a fixed seed so that the score below
# and the confusion matrix that follows are reproducible across runs.
model = RandomForestClassifier(random_state=1)
model.fit(X_train, y_train)

# Mean accuracy on the held-out test set.
model.score(X_test, y_test)

## 8. Confusion Matrix

In [None]:
# Confusion Matrix
y_true = np.array(y_test) #expected
y_pred = model.predict(X_test) #predicted

In [None]:
# Fix the class order explicitly so the matrix rows/columns always match the
# tick labels, rather than relying on the implicit sorted order of whichever
# labels happen to occur in y_true / y_pred.
class_labels = ['Abnormal', 'Normal']
cm = confusion_matrix(y_true, y_pred, labels=class_labels)

plt.figure(figsize=(6, 6))

# Rows are the expected classes, columns the predicted classes.
sns.heatmap(cm, annot=True, vmin=0, fmt='g', cbar=False, cmap="YlGnBu", xticklabels=class_labels, yticklabels=class_labels)

plt.xlabel("Predicted")
plt.ylabel("Expected")
plt.title("Confusion Matrix")

plt.show()

In [None]:
# Print the per-class metrics report for the selected model on the test set.
print(classification_report(y_true, y_pred))

## References

1. Allegri, M., Montella, S., Salici, F., Valente, A., Marchesini, M., Compagnone, C., . . . Fanelli, G. (2016). Mechanisms of low back pain: A guide for diagnosis and therapy. F1000Research, 5, 1530. doi:10.12688/f1000research.8105.1