### Read the data.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Location of the raw CSV, relative to the notebook's working directory.
csv_path = '../input/Dataset_spine.csv'
data = pd.read_csv(csv_path)

In [3]:
# Preview the first rows of the freshly loaded frame.
data.head()

### Drop the unnecessary column and map the 'Class_att' labels to numeric values (Abnormal = 1, Normal = 0).

In [4]:
# Remove the unneeded 'Unnamed: 13' column.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the column is already gone no longer
# raises a KeyError.
data = data.drop(['Unnamed: 13'], axis=1, errors='ignore')

data.head()

In [5]:
# Encode the string labels as integers: Abnormal -> 1, Normal -> 0.
class_codes = {'Abnormal': 1, 'Normal': 0}

# Values that are not keys of the mapping (e.g. labels that were already encoded
# by a previous run of this cell) are passed through unchanged, so re-running the
# cell does not turn the whole column into NaN as a plain .map(dict) would.
data['Class_att'] = data['Class_att'].map(lambda label: class_codes.get(label, label))

# Fail loudly on missing or unknown labels instead of silently carrying them on.
if not data['Class_att'].isin(list(class_codes.values())).all():
    raise ValueError("Unexpected values in 'Class_att' after encoding: "
                     "{}".format(data['Class_att'].unique().tolist()))

data.head()

### Rename the columns.

In [6]:
# Descriptive names for the generic feature columns Col1..Col12, in column order.
feature_names = [
    'pelvic_incidence',
    'pelvic_tilt',
    'lumbar_lordosis_angle',
    'sacral_slope',
    'pelvic_radius',
    'degree_spondylolisthesis',
    'pelvic_slope',
    'direct_tilt',
    'thoracic_slope',
    'cervical_tilt',
    'sacrum_angle',
    'scoliosis_slope',
]

# Build the old-name -> new-name mapping: 'Col<i>' for each feature, plus the label.
column_renames = {'Col{}'.format(position): name
                  for position, name in enumerate(feature_names, start=1)}
column_renames['Class_att'] = 'class'

data = data.rename(columns=column_renames)

In [7]:
# Preview the frame again to check the column names after the rename.
data.head()

In [8]:
# Print a concise summary of the frame's columns.
data.info()

In [9]:
# Descriptive statistics for the columns of the frame.
data.describe()

### Basic exploratory data analysis to find correlation between different features

In [10]:
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's 'whitegrid' style for the plots.
sns.set_style('whitegrid')

In [12]:
# Pairwise correlations between the columns of the frame.
correlations = data.corr()

# Annotated heatmap of the correlation matrix on a 12x9 inch figure.
plt.figure(figsize=(12, 9))
sns.heatmap(correlations, annot=True)

In [13]:
# Pairwise plots of the columns, coloured by the 'class' label.
sns.pairplot(data, hue='class', palette='Set1')

### Data seems to be distributed quite randomly. No noticeable relationship between different features. Additional exploratory analysis might be useful.

In [14]:
# Number of rows per 'class' value, to check how balanced the labels are.
sns.countplot(x='class', data=data, palette='Set2')

### The 'Abnormal' class is almost twice the size of the 'Normal' class. Over-sampling or under-sampling techniques could be used to handle the imbalanced data.

In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

### Standardising the data (zero mean and unit variance per feature).

In [16]:
# Target vector: the 'class' column as a NumPy array.
y = data['class'].values

# Feature matrix: every column except the last one, standardised.
# Note: the scaler is fit on all rows here, i.e. before the train/test split.
feature_columns = data.columns[:-1]
scaler = StandardScaler()
X = scaler.fit_transform(data[feature_columns])

### Using PCA to reduce dimensionality of the data.

In [17]:
# Cumulative explained-variance ratio for 1..11 principal components.
# The ratios of the first k components do not depend on how many further
# components are kept, so a single 11-component fit followed by a cumulative sum
# yields the same curve as refitting PCA once per component count.
max_components = 11
pca = PCA(n_components=max_components)
pca.fit(X)

# Kept as a plain list with one entry per component count (1..11).
var = np.cumsum(pca.explained_variance_ratio_).tolist()

In [18]:
# Plot the cumulative explained variance against the number of components kept.
component_counts = range(1, 12)
line_style = dict(color='red', linestyle='dashed', marker='o',
                  markerfacecolor='black', markersize=10)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(component_counts, var, **line_style)
ax.set_title('Variance vs. Components')
ax.set_xlabel('Components')
ax.set_ylabel('Variance')

### After plotting the retained variance against the number of components, we can see that 85% or more of the variance is retained only if we keep more than 8 principal components.

### So, we won't apply PCA to the data.

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Split the raw (unscaled) features first, then fit the scaler on the training
# rows only. Standardising with statistics computed over the whole dataset lets
# information about the test rows leak into training and makes the test scores
# optimistic.
# The same random_state is used on the same number of rows, so the rows that end
# up in the train and test sets are the same as when splitting the pre-scaled X.
features_raw = data.drop('class', axis=1).values
X_train_raw, X_test_raw, y_train, y_test = train_test_split(features_raw, y, random_state=101)

split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

### Split the data and feed it to the TPOT classifier, which selects the best pipeline for the data.

In [21]:
from tpot import TPOTClassifier

In [22]:
# Search settings for TPOT, collected in one place so they are easy to tune.
tpot_settings = dict(
    generations=20,
    population_size=100,
    cv=5,
    n_jobs=-1,
    random_state=101,
    verbosity=2,
)
pipeline = TPOTClassifier(**tpot_settings)

In [23]:
# Run the TPOT pipeline search on the training split.
pipeline.fit(X_train, y_train)

In [24]:
# Score the fitted TPOT pipeline on the held-out test split.
pipeline.score(X_test, y_test)

### ~87% accuracy using Logistic Regression.

In [25]:
from sklearn.metrics import classification_report, confusion_matrix

In [26]:
# Test-split predictions, used for the confusion matrix and classification report.
y_pred = pipeline.predict(X_test)

In [27]:
# Confusion matrix first, then the per-class classification report,
# both computed from the test labels and the TPOT predictions.
confusion = confusion_matrix(y_test, y_pred)
print(confusion)

report = classification_report(y_test, y_pred)
print(report)

### Training a deep neural network.

In [28]:
import keras
from keras.layers import Dense, Dropout
from keras.models import Sequential

In [29]:
# Take the input width from the training matrix instead of hard-coding 12, so the
# network keeps matching the data if the feature set changes.
n_features = X_train.shape[1]

# One hidden ReLU layer of 64 units, dropout of 0.5, and a single sigmoid output
# unit for the binary 'class' label.
model = Sequential([
    Dense(64, activation='relu', input_shape=(n_features,)),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

### Add the layers to the network, including a dropout layer to reduce over-fitting.

In [30]:
# Print the layer-by-layer summary of the network.
model.summary()

In [31]:
# Train with batches of 32 for 1000 epochs; validation_split=0.2 sets aside 20%
# of the training rows for validation.
# NOTE(review): no early-stopping callback is passed, so all 1000 epochs always
# run regardless of the validation metrics — confirm this is intended.
model.fit(X_train, y_train, batch_size=32, epochs=1000, verbose=2, validation_split=0.2)

In [32]:
# Evaluate on the test split; `score` holds the loss followed by the accuracy
# metric requested at compile time.
score = model.evaluate(X_test, y_test, verbose=0)

test_loss = score[0]
test_accuracy = score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

### ~88% accuracy. Using more units or a different optimizer might give us better accuracy.