# Back pain analysis using ML models_Spine Dataset_Shareena A

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Dataset_spine
**Biomedical data set built by Dr. Henrique da Mota during a medical residence period in the Group of Applied Research in Orthopaedics (GARO) of the Centre médico-chirurgical de réadaptation des Massues, Lyon, France.**

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
# Read the spine dataset from the Kaggle input directory and preview the first rows.
dataset_path = "../input/lower-back-pain-symptoms-dataset/Dataset_spine.csv"
df = pd.read_csv(dataset_path)
df.head()

# Data preprocessing -------------------------------------------------------------------

**'Unnamed: 13' contains information about this dataset, such as the name of the attribute held in each column. So after modifying the dataset we can delete the column 'Unnamed: 13'.**

In [None]:
# Show the distinct values stored in the 'Unnamed: 13' column.
df["Unnamed: 13"].unique()

### Change the Column names

In [None]:
# Descriptive names for Col1..Col12, listed in column order.
feature_names = [
    "pelvic_incidence", "pelvic_tilt", "lumbar_lordosis_angle",
    "sacral_slope", "pelvic_radius", "degree_spondylolisthesis",
    "pelvic_slope", "direct_tilt", "thoracic_slope",
    "cervical_tilt", "sacrum_angle", "scoliosis_slope",
]
column_names = {f"Col{pos}": label for pos, label in enumerate(feature_names, start=1)}
column_names["Class_att"] = "Spine_Condition"

# Rename in place so `df` keeps referring to the same frame.
df.rename(columns=column_names, inplace=True)

df.head(3)

### Drop the Column named 'Unnamed: 13'

In [None]:
# Remove the free-text notes column.
# Rebinding `df` with `errors='ignore'` (instead of an in-place drop) keeps this cell
# idempotent: re-running it once the column is already gone no longer raises KeyError.
df = df.drop(columns=['Unnamed: 13'], errors='ignore')
df.head(3)

### Target attribute = Spine_Conditions

In [None]:
# Number of rows per class of the target column.
df['Spine_Condition'].value_counts()

### Dataset overview

### Rows,columns and features list

In [None]:
# Report the dataset dimensions.
n_rows, n_cols = df.shape
print ("No of rows in dataset     =  " ,n_rows)
print ("No of Columns in dataset  =  " ,n_cols)

In [None]:
# Print a heading, then display the column names as a plain Python list.
print('Features in dataset', '===================', sep='\n')
list(df.columns)

### Information about a DataFrame

In [None]:
# Heading followed by pandas' own summary (dtypes, non-null counts, memory usage).
print('Information about a DataFrame', '=============================', sep='\n')
df.info()

In [None]:
# describe() keeps only numeric columns by default, so its columns are the continuous variables;
# object-typed columns are treated as categorical.
numeric_summary_cols = df.describe().columns
object_cols = df.select_dtypes(include=['object']).columns
print("The number of continuous Variables are:- ",len(numeric_summary_cols))
print("The number of Categorical Variables are:- ",len(object_cols))

### Missing values

In [None]:
# NaN count per column, followed by the grand total.
missing_per_column = df.isnull().sum()
print("Missing values : \n",missing_per_column)
print('----------------------------')
print ("Total Missing values in the data set =  ", missing_per_column.sum())

### Finding the special characters in the data frame

In [None]:
# Count literal '?' entries per column.
placeholder_counts = df.isin(['?']).sum(axis=0)

print('If there is any special characters such as ?')
print('==================================')
print(placeholder_counts)
print('----------------------------------')
# Fix: the total used to re-print the NaN count (df.isnull()) instead of the '?' count
# reported just above, so it could never reflect special characters.
print ("Total special characters in the data set =  ", placeholder_counts.sum())

### catagorical column list

In [None]:
# Names of the object-typed (categorical) columns.
df.select_dtypes(include=['object']).columns.tolist()

### numeric column list

In [None]:
# Names of the numeric columns.
df.select_dtypes(include=np.number).columns.tolist()

### Scaling- MinMaxScaler
### ---------------------------------

- MinMaxScaler-Scales the data using the max and min values so that it fits between 0 and 1.

In [None]:
# Preview the first two rows before scaling.
df.head(2)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# The twelve numeric predictors; the target column is left out of scaling.
feature_columns = [
    'pelvic_incidence', 'pelvic_tilt', 'lumbar_lordosis_angle',
    'sacral_slope', 'pelvic_radius', 'degree_spondylolisthesis',
    'pelvic_slope', 'direct_tilt', 'thoracic_slope',
    'cervical_tilt', 'sacrum_angle', 'scoliosis_slope',
]
X = df[feature_columns]

# Rescale every feature to the [0, 1] range and wrap the array back into a DataFrame.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(X)
scaled_df = pd.DataFrame(data=scaled_data, columns=X.columns)
scaled_df.head(2)

In [None]:
# Re-attach the target labels to the scaled features; from here on `df` holds scaled values.
y=df['Spine_Condition']
df=scaled_df.join(y)
df.head(2)

# Exploratory Data Analysis --------------------------------------------------------------------

##  univariate analysis of  catagorical column
## -----------------------------------------------------------

#### Target variable = Spine_Condition

In [None]:
# Heading, then the number of rows in each target class.
print('Target Class Distributuon', '-------------------------', sep='\n')
df['Spine_Condition'].value_counts()

In [None]:
# Bar chart of the number of rows per spine-condition class.
plt.title('Frequency Distribution of Spine condition ')
sns.countplot(x='Spine_Condition', data=df,hue='Spine_Condition', palette= 'Set1')

**Target distribution in percentage**

In [None]:
# Share of each class as a percentage of all rows.
df['Spine_Condition'].value_counts()*100.0 /len(df)

In [None]:
# Pie chart of the class counts, labelled with percentages to two decimals.
class_counts = df['Spine_Condition'].value_counts()
plt.title('Frequency Distribution of Spine condition ')
class_counts.plot(kind='pie', autopct='%.2f')
plt.show()

- **Dataset is not imbalanced dataset**

## univariate analysis - Numerical columns
## -------------------------------------------------------------

### All numeric datas are Continuous datas in this data set

#### histogram

In [None]:
# One histogram per numeric column, arranged on a 4x3 grid with independent x-axes.
sns.set_theme(style='darkgrid')
df.plot(
    kind='hist',
    subplots=True,
    layout=(4, 3),
    sharex=False,
    figsize=(15, 12),
    title="Features Distribution",
)
plt.show()

- **The data are numerical.We can see the shape of the data’s distribution, especially when determining whether the output of a process is distributed approximately normally.**
---
- **'pelvic_incidence', 'pelvic_tilt', 'lumbar_lordosis_angle','sacral_slope', 'pelvic_radius' are some what normally distributed**
- **degree_spondylolisthesis  right skewed**
---
- **'pelvic_slope','direct_tilt','thoracic_slope', 'cervical_tilt','sacrum_angle','scoliosis_slope' are randomly distributed.**

In [None]:
sns.set_theme(style='darkgrid')
fig, ax = plt.subplots(2, 3, figsize=(16, 10))

# One class-stacked histogram per feature, filling the 2x3 grid row by row.
stacked_features = [
    'pelvic_incidence', 'pelvic_tilt', 'lumbar_lordosis_angle',
    'sacral_slope', 'pelvic_radius', 'degree_spondylolisthesis',
]
for panel, feature in zip(ax.ravel(), stacked_features):
    sns.histplot(data=df, x=feature, hue='Spine_Condition', multiple="stack", ax=panel)
plt.show()

In [None]:
# Kernel-density curve for every numeric column on a 4x3 grid with independent x-axes.
sns.set_theme(style='white')
df.plot(
    kind='density',
    subplots=True,
    layout=(4, 3),
    sharex=False,
    figsize=(15, 12),
)
plt.show()

- **Density plots are used to observe the distribution of a variable in a dataset.The peaks of a Density Plot help display where values are concentrated over the interval.**
---
- **from this we can see that 'pelvic_tilt','pelvic_radius' are perfectly normal distributed.**
- **'pelvic_incidence','lumbar_lordosis_angle','sacral_slope' are some what normally distributed**
- **The other columns are not at all normally distributed.**
---
- **degree_spondylolisthesis  right skewed or positively skewed. That means major collected datas are greater than 0**
---
- **'direct_tilt','sacrum_angle','scoliosis_slope' show a double-peaked (bimodal) distribution, which looks like the back of a two-humped camel.**

## Bivariate analysis -----------------------------------------------------
### For Quantitative vs Quantitative we can use Scatterplot and Regplot.
---

### Scatter plot and regplot (pelvic_incidence v/s other variables)

In [None]:
sns.set_theme(style='white')
fig, ax = plt.subplots(4, 3, figsize=(20, 25))
sns.set_palette('Set1')

# `scaled_df.columns` lists the twelve features in the same order as the panels,
# so the 4x3 grid is filled row by row: pelvic_incidence against each feature.
for panel, feature in zip(ax.ravel(), scaled_df.columns):
    sns.scatterplot(data=df, ax=panel, y='pelvic_incidence', hue='Spine_Condition', x=feature)

plt.show()

In [None]:
sns.set_theme(style='white')
fig, ax = plt.subplots(4, 3, figsize=(20, 25))

# Regression fit of pelvic_incidence against each of the twelve features
# (`scaled_df.columns`, in panel order), one panel per feature, row by row.
for panel, feature in zip(ax.ravel(), scaled_df.columns):
    sns.regplot(data=df, ax=panel, y='pelvic_incidence', color="g", x=feature)

plt.show()

- **positive relationship between 'pelvic_incidence' and 'pelvic_tilt','lumbar_lordosis_angle','sacral_slope','degree_spondylolisthesis'.**
- **negative relationship between 'pelvic_incidence' and 'pelvic_radius'**
- **No significant relationship between 'pelvic_incidence' and other columns.**

### Scatter plot and regplot (pelvic_tilt  v/s other variables)

In [None]:
fig, ax = plt.subplots(4, 3, figsize=(20, 25))
sns.set_palette('rainbow')

# pelvic_tilt against each of the twelve features (`scaled_df.columns`, in panel order),
# one class-coloured scatter per panel, filled row by row.
for panel, feature in zip(ax.ravel(), scaled_df.columns):
    sns.scatterplot(data=df, ax=panel, y='pelvic_tilt', hue='Spine_Condition', x=feature)

plt.show()

In [None]:
fig, ax = plt.subplots(4, 3, figsize=(20, 25))

# Regression fit of pelvic_tilt against each of the twelve features
# (`scaled_df.columns`, in panel order), one panel per feature, row by row.
for panel, feature in zip(ax.ravel(), scaled_df.columns):
    sns.regplot(data=df, ax=panel, y='pelvic_tilt', color="r", x=feature)

plt.show()

- **positive relationship between 'pelvic_tilt' and 'pelvic_incidence','lumbar_lordosis_angle','degree_spondylolisthesis'.**
- **Very weak positive relationship between 'pelvic_tilt' and 'pelvic_radius','sacral_slope'.**
- **No significant relationship between 'pelvic_tilt' and other columns.**

### Scatter plot and regplot  (lumbar_lordosis_angle v/s other variables)

In [None]:
fig, ax = plt.subplots(4, 3, figsize=(20, 25))
sns.set_palette('Set3')

# lumbar_lordosis_angle against each of the twelve features (`scaled_df.columns`,
# in panel order), one class-coloured scatter per panel, filled row by row.
for panel, feature in zip(ax.ravel(), scaled_df.columns):
    sns.scatterplot(data=df, ax=panel, y='lumbar_lordosis_angle', hue='Spine_Condition', x=feature)

plt.show()

In [None]:
fig, ax = plt.subplots(4, 3, figsize=(20, 25))

# Regression fit of lumbar_lordosis_angle against each of the twelve features
# (`scaled_df.columns`, in panel order), one panel per feature, row by row.
for panel, feature in zip(ax.ravel(), scaled_df.columns):
    sns.regplot(data=df, ax=panel, y='lumbar_lordosis_angle', color="g", x=feature)

plt.show()

- **positive relationship between 'lumbar_lordosis_angle' and 'pelvic_incidence','pelvic_tilt','sacral_slope','degree_spondylolisthesis'.**
- **negative relationship between 'lumbar_lordosis_angle' and 'pelvic_radius'**
- **No significant relationship between 'lumbar_lordosis_angle' and other columns.**

###  regplot (pelvic_radius v/s other variables)

In [None]:
fig, ax = plt.subplots(4, 3, figsize=(20, 25))

# Regression fit of pelvic_radius against each of the twelve features
# (`scaled_df.columns`, in panel order), one panel per feature, row by row.
for panel, feature in zip(ax.ravel(), scaled_df.columns):
    sns.regplot(data=df, ax=panel, y='pelvic_radius', color="m", x=feature)

plt.show()

- **negative  relationship between 'pelvic_radius' and 'pelvic_incidence','lumbar_lordosis_angle','sacral_slope','degree_spondylolisthesis'.**
- **Very less significant positive relationship between 'pelvic_radius' and 'pelvic_tilt'**
- **No significant relationship between  'pelvic_radius' and remaining columns**

### Scatter plot (degree_spondylolisthesis v/s other variables)

In [None]:
fig, ax = plt.subplots(4, 3, figsize=(20, 25))
sns.set_palette('Set2')

# degree_spondylolisthesis against each of the twelve features (`scaled_df.columns`,
# in panel order), one class-coloured scatter per panel, filled row by row.
for panel, feature in zip(ax.ravel(), scaled_df.columns):
    sns.scatterplot(data=df, ax=panel, y='degree_spondylolisthesis', hue='Spine_Condition', x=feature)

plt.show()

In [None]:
fig, ax = plt.subplots(4, 3, figsize=(20, 25))
# Regression fit of degree_spondylolisthesis against each of the twelve features
# (`scaled_df.columns`, in panel order), one panel per feature, row by row.
for panel, feature in zip(ax.ravel(), scaled_df.columns):
    sns.regplot(data=df, ax=panel, y='degree_spondylolisthesis', color="g", x=feature)
plt.show()

- **positive relationship between 'degree_spondylolisthesis' and 'pelvic_incidence','pelvic_tilt','lumbar_lordosis_angle','sacral_slope','degree_spondylolisthesis'.**
- **negative relationship between 'degree_spondylolisthesis' and 'pelvic_radius'**
- **No significant relationship between 'degree_spondylolisthesis' and other columns.**

## Bivariate analysis - Categorical v/s Quantitative numeric we can use violin plot .
## -----------------------------------------------------------------------------------------------------------------

#### violin plot

In [None]:
fig, ax = plt.subplots(4, 3, figsize=(20, 25))
sns.set_palette('Set2')

# One vertical violin per class for each of the twelve features
# (`scaled_df.columns`, in panel order), filling the grid row by row.
for panel, feature in zip(ax.ravel(), scaled_df.columns):
    sns.violinplot(data=df, ax=panel, x='Spine_Condition', orient='v', y=feature)

plt.show()

- **A violin plot is a hybrid of a box plot and a kernel density plot, which shows peaks in the data.**
- **outliers are present**

## Multivariate Analysis - pairplot 
## -------------------------------------------

In [None]:
# Pairwise scatter/density grid over every numeric column, coloured by class.
sns.pairplot(df ,hue='Spine_Condition',palette='Set2')
plt.show()

**From the pairplots above, only the first 6 features help in classifying.
They are 'pelvic_incidence', 'pelvic_tilt', 'lumbar_lordosis_angle','sacral_slope','pelvic_radius','degree_spondylolisthesis'.
The others give mixed relations.**

In [None]:
# Restrict the pairplot to the six most informative features plus the class label.
pairplot_columns = [
    'pelvic_incidence', 'pelvic_tilt', 'lumbar_lordosis_angle',
    'sacral_slope', 'pelvic_radius', 'degree_spondylolisthesis',
    'Spine_Condition',
]
df_selected = df[pairplot_columns]

sns.set_style('white')
sns.pairplot(df_selected, hue='Spine_Condition', palette='husl')
plt.show()

- **Mainly the first 6 parameters can help in classifying.**
- **They are 'pelvic_incidence', 'pelvic_tilt', 'lumbar_lordosis_angle','sacral_slope','pelvic_radius','degree_spondylolisthesis'.**
- **Others variables give mixed less significant relations.**

## Multivariate Analysis - heatmap
## --------------------------------------------

In [None]:
# Correlation heatmap across all numeric features.
# `df` still carries the string-valued 'Spine_Condition' column. pandas >= 2.0 raises when
# DataFrame.corr() meets a non-numeric column, so restrict to numeric dtypes first;
# older pandas dropped such columns silently, so the plotted values are unchanged.
plt.subplots(figsize=(16,16))
sns.heatmap(df.select_dtypes(include=np.number).corr(),cmap="viridis", annot=True , linewidths=.5, fmt= '.1f')
plt.show()

In [None]:
# Correlation heatmap limited to the five mutually correlated features.
df_selected = df[['pelvic_incidence','pelvic_tilt','lumbar_lordosis_angle','sacral_slope',
                 'degree_spondylolisthesis','Spine_Condition',]]

plt.subplots(figsize=(10,10))
# The string-valued target is excluded before corr(): pandas >= 2.0 raises on non-numeric
# columns, whereas older versions dropped them silently (same numbers either way).
sns.heatmap(df_selected.select_dtypes(include=np.number).corr(),cmap="viridis", annot=True , linewidths=.5, fmt= '.1f')
plt.show()

## Statistical Overview --------------------------------------------------------------------------------------------

## 5 number  statistical summary of numeric-typed columns

In [None]:
# Heading, then count/mean/std/min/quartiles/max of the scaled numeric features.
print('5 number  statistical summary of numeric-typed columns',
      '======================================================', sep='\n')
scaled_df.describe()

## summary of object-typed column

In [None]:
# Heading, then count/unique/top/freq for the object-typed column(s).
print('summary of oject-typed columns', '==============================', sep='\n')
df.describe(include=['O'])

### Normal v/s Abnormal ( mean,varience,standard deviation,median )

In [None]:
# Per-class mean of every feature.
SpineCondition_summary = df.groupby('Spine_Condition')
SpineCondition_summary.mean().reset_index()

- The mean is the average or the most common value in a collection of numbers.It is also referred to as an expected value.
- In statistics, it is a measure of central tendency of a probability distribution along median and mode. 
- **for all columns mean of abnormal is high compared to others so abnormal is prominent in dataset.**

In [None]:
# Per-class variance of every feature.
SpineCondition_summary = df.groupby('Spine_Condition')
SpineCondition_summary.var().reset_index()

- The variance measures the average degree to which each point differs from the mean (the average) of all data points.

In [None]:
# Per-class standard deviation of every feature.
SpineCondition_summary = df.groupby('Spine_Condition')
SpineCondition_summary.std().reset_index()

- Standard deviation looks at how spread out a group of numbers is from the mean, by looking at the square root of the variance. 
- A low standard deviation means that the data is very closely related to the average, thus very reliable. 
- A high standard deviation means that there is a large variance between the data and the statistical average, and is not as reliable.

In [None]:
# Per-class median of every feature.
SpineCondition_summary = df.groupby('Spine_Condition')
SpineCondition_summary.median().reset_index()

## Covarience

In [None]:
# Covariance is only defined for numeric columns. Excluding the string-valued target keeps
# this cell working on pandas >= 2.0, which no longer drops non-numeric columns silently.
df.select_dtypes(include=np.number).cov()

- Covariance tells whether both variables vary in the same direction (positive covariance) or in the opposite direction (negative covariance). Lie between -infinity and +infinity
- In the study of covariance only sign matters. 

## Correlation

In [None]:
# Pearson correlation between the numeric features. The string-valued target is excluded
# first because pandas >= 2.0 raises on non-numeric columns instead of dropping them.
df.select_dtypes(include=np.number).corr()

- Here the correlation between pelvic_incidence and pelvic_tilt = 0.629199
- i.e., they are positively correlated. **They tend to vary in the same direction, with a correlation coefficient of 0.629199**

In [None]:
# Generate a mask for the upper triangle (taken from seaborn example gallery)

# Compute the correlations once, over numeric columns only: pandas >= 2.0 raises when
# corr() meets the string-valued 'Spine_Condition' column.
corr_matrix = df.select_dtypes(include=np.number).corr()

# `np.bool` was removed in NumPy 1.24; the builtin `bool` is the supported dtype.
mask = np.zeros(corr_matrix.shape, dtype=bool)
# Hide the diagonal and everything above it so each pair appears only once.
mask[np.triu_indices_from(mask)] = True

sns.set_style('whitegrid')
plt.subplots(figsize = (15,12))
sns.heatmap(corr_matrix, annot=True, mask = mask, cmap = 'Set1',linewidths=.5, fmt= '.1f',center = 0,square=True)

plt.title("Correlations Among Features", y = 1.03,fontsize = 20, pad = 40);

## Feature selection - using correlation matrix
## --------------------------------------------------------------

In [None]:
# Correlation matrix over numeric columns only (pandas >= 2.0 raises on the string target).
correlation = df.select_dtypes(include=np.number).corr()
# Keep coefficients above 0.5; every other cell becomes NaN so strong pairs stand out.
# NOTE(review): strong negative correlations (< -0.5) are masked out as well.
relevant_features_correlation =  correlation[correlation >0.5]
relevant_features_correlation

### ------------------------------------------------------------------------------------------------------------------------
## From this we can see that relevant_features are :-
## - pelvic_incidence
## - pelvic_tilt
## - lumbar_lordosis_angle
## - sacral_slope
## - degree_spondylolisthesis
### -----------------------------------------------------------------------------------------------------------------------

## Outlier - Boxplot for finding outliers
## ----------------------------------------------------

- One method for determining outliers is based on the quartiles of a data set. 
- Quartiles are values that partition the data set into four groups, each containing 25% of the measurements. 
- The lower quartile is designated by **Q1 or QL**; the middle quartile is the **median**; the upper quartile is designated by **Q3 or QU.**
- Interquartile range (IQR) – the distance between the lower and upper quartiles. **IQR = QU – QL = Q3 – Q1**
- The middle 50% , those in the interquartile range, fall inside the box. The vertical lines emanating from the box are called whiskers. 

In [None]:
# Side-by-side boxplots of the columns of `df` to spot outliers.
plt.figure(figsize=(20, 16))
sns.boxplot(palette="Set3", data=df)
plt.show()

## Boxplot of highly correlated features
## ----------------------------------------

In [None]:
# Working frame for outlier handling: the five correlated features plus the class label.
correlated_columns = [
    'pelvic_incidence', 'pelvic_tilt', 'lumbar_lordosis_angle',
    'sacral_slope', 'degree_spondylolisthesis',
    'Spine_Condition',
]
df0 = df[correlated_columns]

# Horizontal boxplots drawn with the default seaborn theme.
plt.figure(figsize=(15, 8))
sns.set()
sns.boxplot(data=df0, palette="Set3", orient="h")
plt.show()

## Outliers treatment
## ---------------------------

- **Values less than QL – 1.5(IQR) or greater than QU + 1.5(IQR)** are identified as **potential outliers** because they are extreme values that occur relatively rarely.
- Values less than QL – 3(IQR) or greater than QU + 3(IQR) are called outliers.

In [None]:
# Per-feature quartiles of the selected columns.
# `df0` still contains the string-valued 'Spine_Condition' column, which
# DataFrame.quantile rejects on pandas >= 2.0, so quartiles are taken over the numeric
# columns only (older pandas skipped the column implicitly, so the values are unchanged).
df0_numeric = df0.select_dtypes(include=np.number)
q1 = df0_numeric.quantile(q=0.25)  # 25th percentile / 1st quartile of each feature
q2 = df0_numeric.quantile(q=0.50)  # 50th percentile / 2nd quartile (median)
q3 = df0_numeric.quantile(q=0.75)  # 75th percentile / 3rd quartile of each feature

In [None]:
# Interquartile range per feature: Q3 minus Q1.
iqr = q3.sub(q1)
print(iqr)

In [None]:
## Outlier fences: Q3 + 1.5*IQR (upper) and Q1 - 1.5*IQR (lower)

whisker = 1.5 * iqr
Upper_limit = q3 + whisker
Lower_limit = q1 - whisker

# Print both sets of fences, one value per feature.
print('Upper_limit')
print('------------ \n', Upper_limit)
print('======================================')
print('Lower_limit')
print('------------ \n', Lower_limit)

### Remove Outlier

In [None]:
# Keep only rows whose pelvic_incidence lies strictly between the two fences.
above_lower = df0["pelvic_incidence"] > Lower_limit["pelvic_incidence"]
below_upper = df0["pelvic_incidence"] < Upper_limit["pelvic_incidence"]
df1 = df0[above_lower & below_upper]

# Number of rows removed by this filter.
len(df0) - len(df1)

In [None]:
# Keep only rows whose pelvic_tilt lies strictly between the two fences.
above_lower = df1["pelvic_tilt"] > Lower_limit["pelvic_tilt"]
below_upper = df1["pelvic_tilt"] < Upper_limit["pelvic_tilt"]
df2 = df1[above_lower & below_upper]

# Number of rows removed by this filter.
len(df1) - len(df2)

In [None]:
# Keep only rows whose lumbar_lordosis_angle lies strictly between the two fences.
above_lower = df2["lumbar_lordosis_angle"] > Lower_limit["lumbar_lordosis_angle"]
below_upper = df2["lumbar_lordosis_angle"] < Upper_limit["lumbar_lordosis_angle"]
df3 = df2[above_lower & below_upper]

# Number of rows removed by this filter.
len(df2) - len(df3)

In [None]:
# Keep only rows whose sacral_slope lies strictly between the two fences.
above_lower = df3["sacral_slope"] > Lower_limit["sacral_slope"]
below_upper = df3["sacral_slope"] < Upper_limit["sacral_slope"]
df4 = df3[above_lower & below_upper]

# Number of rows removed by this filter.
len(df3) - len(df4)

In [None]:
# Keep only rows whose degree_spondylolisthesis lies strictly between the two fences.
above_lower = df4["degree_spondylolisthesis"] > Lower_limit["degree_spondylolisthesis"]
below_upper = df4["degree_spondylolisthesis"] < Upper_limit["degree_spondylolisthesis"]
df_final = df4[above_lower & below_upper]

# Number of rows removed by this filter.
len(df4) - len(df_final)

In [None]:
# Horizontal boxplots of the filtered data to check what the outlier removal left behind.
plt.figure(figsize=(15, 8))
sns.boxplot(orient="h", palette="Set3", data=df_final)
plt.show()

###  A few outliers are still present, but they are close to the IQR fences, so we can neglect them because there is a chance that they are true values.

In [None]:
# Number of rows remaining after outlier removal.
df_final.shape[0]

In [None]:
# Class share (in %) after outlier removal.
# Fix: the denominator must be the filtered frame itself. Dividing by len(df) (the
# unfiltered data) understated both classes and the shares did not sum to 100.
df_final['Spine_Condition'].value_counts()*100.0 /len(df_final)

- **Dataset is not imbalanced.**

In [None]:
# Preview the filtered data.
df_final.head()

### Converting categorical data to numeric before model building
### -----------------------------------------------------------------------------------------

In [None]:
# Encode the target as integers: Abnormal -> 0, Normal -> 1.
# `df_final` was produced by boolean filtering, so take an explicit copy before writing a
# column; assigning into the filtered result is what triggers pandas' SettingWithCopy
# warning (hidden here by the global warnings filter).
df_final = df_final.copy()
# `replace` leaves already-encoded 0/1 values untouched, so re-running this cell is safe;
# a second `map` would turn every value into NaN and make `astype(int)` fail.
df_final['Spine_Condition'] = df_final['Spine_Condition'].replace({'Abnormal': 0, 'Normal': 1}).astype(int)
df_final.head(2)

## Scaling using StandardScaler ------------------------------------------------------------------------------------------

- The idea behind StandardScaler is that it transforms your data so that each feature has a mean of 0 and a standard deviation of 1. This often helps models that are sensitive to feature scale.

In [None]:
# Predictors used for modelling (the five correlated features) and the encoded target.
model_features = [
    'pelvic_incidence', 'pelvic_tilt', 'lumbar_lordosis_angle',
    'sacral_slope', 'degree_spondylolisthesis',
]
X = df_final[model_features]
y = df_final['Spine_Condition']

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every predictor to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so test-set
# statistics leak into the features; consider fitting on the training rows only.
scaler = StandardScaler()
X_scaled_data = scaler.fit_transform(X)
# Keep X's row labels. Without `index=X.index` the new frame gets a fresh 0..n-1 index
# while `y` keeps the labels that survived outlier removal, so any label-based alignment
# of X_scaled with y (join, concat, boolean masks) would pair the wrong rows.
X_scaled = pd.DataFrame(data = X_scaled_data, columns = X.columns, index = X.index)
X_scaled.head()

# Model building ----------------------------------------------------------------------------------------

In [None]:
# Model inputs: standardised predictors and the integer-encoded target.
X=X_scaled
y=df_final['Spine_Condition']

### Train Test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [None]:
# Report the row counts of the full, training and test parts for X and y.
for part_name, part in (('X', X), ('X_train', X_train), ('X_test', X_test)):
    print('No of rows in {} = '.format(part_name), part.shape[0])
print('-------------------------------')
for part_name, part in (('y', y), ('y_train', y_train), ('y_test', y_test)):
    print('No of rows in {} = '.format(part_name), part.shape[0])

## Model 1 : LogisticRegression ---------------------------------------------------------------------------------------------------------

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with regularization parameter C=1, fitted on the training split.
logReg = LogisticRegression(C=1)
logReg_model = logReg.fit(X_train, y_train)

# Mean accuracy on the training and the test split.
logReg_train_score = logReg_model.score(X_train, y_train)
logReg_test_score = logReg_model.score(X_test, y_test)

# Predicted classes for the test split (used by the evaluation cells below).
logReg_y_pred = logReg.predict(X_test)

print("LogisticRegression :- ")
print("Training Score: {}\nTest Score: {}".format(logReg_train_score, logReg_test_score))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test labels against the logistic regression
# predictions, drawn as an annotated heatmap with integer cell labels.
logReg_cm = confusion_matrix(y_test, logReg_y_pred)
sns.heatmap(logReg_cm, annot=True, fmt="d")
plt.title('confusion_matrix of logistic regression')
plt.show()

In [None]:
from sklearn import metrics
from sklearn.metrics import classification_report

# Classification report for the logistic regression test predictions.
logReg_report = classification_report(y_test, logReg_y_pred)
print(logReg_report)

# Overall accuracy on the test split (shown as the cell output).
metrics.accuracy_score(y_test, logReg_y_pred)

## Model 2 : Gaussian Naive Bayes model ---------------------------------------------------------------------------------------------------------

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes is used instead of MultinomialNB because some of the
# feature values are negative.
gnb = GaussianNB()
gnb_model = gnb.fit(X_train, y_train)

# Mean accuracy on the training and the test split.
gnb_train_score = gnb_model.score(X_train, y_train)
gnb_test_score = gnb_model.score(X_test, y_test)

# Predicted classes for the test split (used by the evaluation cells below).
gnb_y_pred = gnb.predict(X_test)

print("Naive Bayes Model:- ")
print("Training Score: {}\nTest Score: {}".format(gnb_train_score, gnb_test_score))

In [None]:
# Confusion matrix of the test labels against the Gaussian Naive Bayes
# predictions, drawn as an annotated heatmap with integer cell labels.
gnb_cm = confusion_matrix(y_test, gnb_y_pred)
sns.heatmap(gnb_cm, annot=True, fmt="d")
plt.title('confusion_matrix of Gaussian Naive Bayes')
plt.show()

In [None]:
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

# Classification report for the Gaussian Naive Bayes test predictions.
gnb_report = classification_report(y_test, gnb_y_pred)
print(gnb_report)

# Overall accuracy on the test split (shown as the cell output).
metrics.accuracy_score(y_test, gnb_y_pred)

## Model 3: DecisionTreeClassifier ---------------------------------------------------------------------------------------------------------

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyper-parameters. random_state is fixed because
# the tree builder is randomized, so without a seed the fitted tree (and the
# scores printed below) can change from one run to the next. 42 matches the
# seed used for the train/test split.
dtree = DecisionTreeClassifier(random_state=42)

dtree_model = dtree.fit(X_train, y_train)

# Predicted classes for the test split (used by the evaluation cells below).
dtree_y_pred = dtree.predict(X_test)

# Mean accuracy on the training and the test split.
dtree_train_score = dtree_model.score(X_train, y_train)
dtree_test_score = dtree_model.score(X_test, y_test)

print("DecisionTreeClassifier :- ")
print("Training Score: {}\nTest Score: {}".format(dtree_train_score, dtree_test_score))


In [None]:
# Confusion matrix of the test labels against the decision tree predictions,
# drawn as an annotated heatmap with integer cell labels.
dtree_cm = confusion_matrix(y_test, dtree_y_pred)
sns.heatmap(dtree_cm, annot=True, fmt="d")
plt.title('confusion_matrix of decision tree')
plt.show()

In [None]:
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

# Classification report for the decision tree test predictions.
dtree_report = classification_report(y_test, dtree_y_pred)
print(dtree_report)

# Overall accuracy on the test split (shown as the cell output).
metrics.accuracy_score(y_test, dtree_y_pred)

## Model 4: SVC --------------------------------------------------------------------------------------------------------

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel and a fixed random_state,
# fitted on the training split.
svcclassifier = SVC(kernel='linear', random_state=1)
svc_model = svcclassifier.fit(X_train, y_train)

# Mean accuracy on the training and the test split.
svc_train_score = svc_model.score(X_train, y_train)
svc_test_score = svc_model.score(X_test, y_test)

# Predicted classes for the test split (used by the evaluation cells below).
svc_y_pred = svc_model.predict(X_test)

print("SVM Model:- ")
print("Training Score: {}\nTest Score: {}".format(svc_train_score, svc_test_score))

In [None]:
# Confusion matrix of the test labels against the SVC predictions, drawn as an
# annotated heatmap with integer cell labels.
# Uses svc_y_pred: the previous version plotted dtree_y_pred under the SVC
# title, so the figure actually showed the decision tree's results.
plt.title('confusion_matrix of SVC')
sns.heatmap(confusion_matrix(y_test, svc_y_pred), annot=True, fmt="d")
plt.show()

In [None]:
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

# Classification report for the linear SVC test predictions.
svc_report = classification_report(y_test, svc_y_pred)
print(svc_report)

# Overall accuracy on the test split (shown as the cell output).
metrics.accuracy_score(y_test, svc_y_pred)

## Model 5 : SVC tuning using GridSearchCV --------------------------------------------------------------------------------------------------------

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Candidate values for the SVC hyper-parameters C and gamma; every
# combination is evaluated by the grid search.
param_grid = {
    'C': [0.1, 1, 10, 50, 100, 500],
    'gamma': [1, 0.5, 0.1, 0.01, 0.001],
}

# Cross-validated search over the grid on the training split, using all
# available cores (n_jobs=-1) and verbose progress output.
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, n_jobs=-1, verbose=3)
grid.fit(X_train, y_train)

# Best parameter combination found (shown as the cell output).
grid.best_params_

#### Best parameters found by the grid search: {'C': 500, 'gamma': 0.001}

In [None]:
# Refit an SVC with the hyper-parameters selected by the grid search above.
# The values are read from grid.best_params_ instead of being typed in by hand,
# so this cell stays correct if the data, the split or the parameter grid changes.
svc_model_tunned = SVC(**grid.best_params_)
svc_model_tunned.fit(X_train, y_train)

# Predicted classes for the test split (used by the evaluation cells below).
GridSearchCV_y_pred = svc_model_tunned.predict(X_test)

# Mean accuracy on the training and the test split.
svc_tunned_train_score = svc_model_tunned.score(X_train, y_train)
svc_tunned_test_score = svc_model_tunned.score(X_test, y_test)

print("SVM Model tunned:- ")
print("Training Score: {}\nTest Score: {}".format(svc_tunned_train_score, svc_tunned_test_score))

In [None]:
# Confusion matrix of the test labels against the tuned SVC predictions,
# drawn as an annotated heatmap with integer cell labels.
svc_tunned_cm = confusion_matrix(y_test, GridSearchCV_y_pred)
sns.heatmap(svc_tunned_cm, annot=True, fmt="d")
plt.title('confusion_matrix of GridSearchCV(SVC)')
plt.show()

In [None]:
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

# Classification report for the tuned SVC test predictions.
svc_tunned_report = classification_report(y_test, GridSearchCV_y_pred)
print(svc_tunned_report)

# Overall accuracy on the test split (shown as the cell output).
metrics.accuracy_score(y_test, GridSearchCV_y_pred)

## Model 6 :K Nearest Neighbor model ---------------------------------------------------------------------------------------------------------

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k=3, fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=3)
knn_model = knn.fit(X_train, y_train)

# Mean accuracy on the training and the test split.
knn_train_score = knn_model.score(X_train, y_train)
knn_test_score = knn_model.score(X_test, y_test)

# Predicted classes for the test split (used by the evaluation cells below).
knn_y_pred = knn_model.predict(X_test)

print("K Nearest Neighbor Model:- ")
print("Training Score: {}\nTest Score: {}".format(knn_train_score, knn_test_score))

In [None]:
# Confusion matrix of the test labels against the k-nearest-neighbours
# predictions, drawn as an annotated heatmap with integer cell labels.
knn_cm = confusion_matrix(y_test, knn_y_pred)
sns.heatmap(knn_cm, annot=True, fmt="d")
plt.title('confusion_matrix of k Nearest Neighbor')
plt.show()

In [None]:
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

# Classification report for the k-nearest-neighbours test predictions.
knn_report = classification_report(y_test, knn_y_pred)
print(knn_report)

# Overall accuracy on the test split (shown as the cell output).
metrics.accuracy_score(y_test, knn_y_pred)

## Selecting the final model --------------------------------------------------------------------------------------------------------

In [None]:
# One (model name, test-split accuracy) pair per model trained above.
model_scores = [
    ('Logistic Regression', logReg_test_score),
    ('Gaussian Naive Bayes', gnb_test_score),
    ('Decision Tree', dtree_test_score),
    ('SVC', svc_test_score),
    ('GridSearchCV(SVC)', svc_tunned_test_score),
    ('knn', knn_test_score),
]
models = pd.DataFrame(model_scores, columns=['Model', 'Accuracy'])

# Show the comparison table with the most accurate model first.
models.sort_values(by='Accuracy', ascending=False)

In [None]:
# Bar chart of test accuracy per model. The y-axis is restricted to
# [0.6, 1] and the model names are rotated so the labels do not overlap.
sns.barplot(data=models, x="Model", y="Accuracy", palette="Set2")
plt.xticks(rotation=-35)
plt.ylim(0.6, 1)

### SVC model has high accuracy.