#            **Prediction of Kyphosis**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
## Read the kyphosis CSV from the Kaggle input directory and preview the first rows
df = pd.read_csv(filepath_or_buffer='/kaggle/input/kyphosis-dataset/kyphosis.csv')
df.head(n=5)

In [None]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

In [None]:
## Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [None]:
## Check for missing values: number of nulls in each column
df.isnull().sum()

In [None]:
# Number of rows in each class of the target column
df['Kyphosis'].value_counts()

In [None]:
# Bar chart of how many rows fall in each Kyphosis class
sns.countplot(x='Kyphosis', data=df)

In [None]:
# Pairwise plots of the numeric columns, coloured by the Kyphosis class
sns.pairplot(df,hue="Kyphosis")

In [None]:
## Feature Encoding
from sklearn.preprocessing import LabelEncoder

# Replace the target labels with integer codes; the fitted encoder keeps the
# original labels in `le.classes_` so the codes can be mapped back later.
le = LabelEncoder()
le.fit(df['Kyphosis'])
df['Kyphosis'] = le.transform(df['Kyphosis'])
df.head()

In [None]:
# The first column is the target; every remaining column is a feature
y = df.iloc[:, 0]   ## Dependent Variable
x = df.iloc[:, 1:]  ## Independent Variables

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. `stratify=y` keeps the class ratio the
# same in the train and test parts, so a small or imbalanced target cannot leave
# the test set with too few examples of one class. The fixed seed makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=100, stratify=y
)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn's default settings, trained on the training split
lr_model = LogisticRegression()
lr_model.fit(X=x_train, y=y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Predict on the held-out rows, then report each metric in turn
lr_y_pred = lr_model.predict(x_test)

for heading, metric in (
    ('Accuracy Score:', accuracy_score),
    ('Confusion Matrix: \n', confusion_matrix),
    ('Classification Report: \n', classification_report),
):
    print(heading, metric(y_test, lr_y_pred))

# KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 2-nearest-neighbour classifier in which closer neighbours carry a larger vote
knn_clf_model = KNeighborsClassifier(weights='distance', n_neighbors=2)
knn_clf_model.fit(X=x_train, y=y_train)

In [None]:
# Predict on the held-out rows, then report each metric in turn
knn_y_pred = knn_clf_model.predict(x_test)

for heading, metric in (
    ('Accuracy Score: \n', accuracy_score),
    ('Confusion Matrix: \n', confusion_matrix),
    ('Classification Report: \n', classification_report),
):
    print(heading, metric(y_test, knn_y_pred))

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyperparameters.
# A fixed random_state makes the tree (and therefore the reported scores)
# identical on every run; 100 is the seed used for the train/test split.
dt_clf_model = DecisionTreeClassifier(random_state=100)
dt_clf_model.fit(x_train, y_train)

In [None]:
# Predict on the held-out rows, then report each metric in turn
dt_y_pred = dt_clf_model.predict(x_test)

for heading, metric in (
    ('Accuracy Score: \n', accuracy_score),
    ('Confusion Matrix: \n', confusion_matrix),
    ('Classification Report: \n', classification_report),
):
    print(heading, metric(y_test, dt_y_pred))

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyperparameters.
# A fixed random_state makes the bootstrap samples and feature draws repeatable,
# so the reported scores do not change between runs; 100 is the seed used for
# the train/test split.
rf_clf_model = RandomForestClassifier(random_state=100)
rf_clf_model.fit(x_train, y_train)

In [None]:
# Predict on the held-out rows, then report each metric in turn
rf_y_pred = rf_clf_model.predict(x_test)

for heading, metric in (
    ('Accuracy Score: \n', accuracy_score),
    ('Confusion Matrix: \n', confusion_matrix),
    ('Classification Report: \n', classification_report),
):
    print(heading, metric(y_test, rf_y_pred))

# SVM Classifier

In [None]:
from sklearn import svm

# Support vector classifier with a linear kernel, trained on the training split
svm_clf_model = svm.SVC(kernel='linear')
svm_clf_model.fit(X=x_train, y=y_train)

In [None]:
# Predict on the held-out rows, then report each metric in turn
svm_y_pred = svm_clf_model.predict(x_test)

for heading, metric in (
    ('Accuracy Score: \n', accuracy_score),
    ('Confusion Matrix: \n', confusion_matrix),
    ('Classification Report: \n', classification_report),
):
    print(heading, metric(y_test, svm_y_pred))

# Ada Boosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with its default base learner; seeded so the result is repeatable
ada_clf_model = AdaBoostClassifier(random_state=100)
ada_clf_model.fit(X=x_train, y=y_train)

In [None]:
# Predict on the held-out rows, then report each metric in turn
ada_y_pred = ada_clf_model.predict(x_test)

for heading, metric in (
    ('Accuracy Score: \n', accuracy_score),
    ('Confusion Matrix: \n', confusion_matrix),
    ('Classification Report: \n', classification_report),
):
    print(heading, metric(y_test, ada_y_pred))

# Ada Boost With Random Forest

In [None]:
# AdaBoost using the random forest as its base learner.
# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4; the first positional slot works on every version.
# random_state=100 (the seed used for the train/test split) keeps the boosted
# ensemble repeatable.
ada_rf_model = AdaBoostClassifier(rf_clf_model, random_state=100)
ada_rf_model.fit(x_train, y_train)

In [None]:
# Predict on the held-out rows, then report each metric in turn
ada_rf_y_pred = ada_rf_model.predict(x_test)

for heading, metric in (
    ('Accuracy Score: \n', accuracy_score),
    ('Confusion Matrix: \n', confusion_matrix),
    ('Classification Report: \n', classification_report),
):
    print(heading, metric(y_test, ada_rf_y_pred))

# Ada Boost with Logistic Regression

In [None]:
# AdaBoost using logistic regression as its base learner.
# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4; the first positional slot works on every version.
# random_state=100 (the seed used for the train/test split) keeps the boosted
# ensemble repeatable.
ada_lr_model = AdaBoostClassifier(lr_model, random_state=100)
ada_lr_model.fit(x_train, y_train)

In [None]:
# Predict on the held-out rows, then report each metric in turn
ada_lr_y_pred = ada_lr_model.predict(x_test)

for heading, metric in (
    ('Accuracy Score: \n', accuracy_score),
    ('Confusion Matrix: \n', confusion_matrix),
    ('Classification Report: \n', classification_report),
):
    print(heading, metric(y_test, ada_lr_y_pred))

# Hyperparameter Tuning: Random Forest

# Randomized Search CV

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search space sampled by the randomized hyperparameter search.

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(200, 2000, 5)]
# Number of features to consider at every split.
# 'auto' is intentionally not listed: for classifiers it was only an alias of
# 'sqrt', it was deprecated in scikit-learn 1.1 and removed in 1.3, and on
# current versions every candidate that draws it fails to fit.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 100, 10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 7, 10, 14]
# Minimum number samples required at each leaf node
min_samples_leaf = [1, 2, 4, 6, 8]
# Create a random grid
random_grid = {'n_estimators': n_estimators,
                'max_features': max_features,
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf,
                'criterion': ['entropy', 'gini']

}
print(random_grid)

In [None]:
# Randomized search: 100 sampled configurations, each scored with 3-fold CV.
# The forest itself is seeded as well as the sampler; otherwise every candidate
# would be scored on a differently-randomised forest and the selected
# parameters could change from run to run.
rf = RandomForestClassifier(random_state=100)
rf_randomcv = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100, cv=3, verbose=2, random_state=100, n_jobs=-1)
rf_randomcv.fit(x_train, y_train)

In [None]:
# Hyperparameter combination that scored best in the randomized search
rf_randomcv.best_params_

In [None]:
# The search runs with refit=True (the default), so best_estimator_ has already
# been retrained on all of x_train with the winning parameters. Fitting it again
# would only repeat that work and, for an unseeded forest, replace the model the
# search produced with a different random one.
rf_randomcv_model = rf_randomcv.best_estimator_
rf_randomcv_model

In [None]:
# Predict on the held-out rows, then report each metric in turn
rf_randomcv_model_y_pred = rf_randomcv_model.predict(x_test)

for heading, metric in (
    ('Accuracy Score: \n', accuracy_score),
    ('Confusion Matrix: \n', confusion_matrix),
    ('Classification Report: \n', classification_report),
):
    print(heading, metric(y_test, rf_randomcv_model_y_pred))

# Grid Search CV

In [None]:
# Best randomized-search parameters, shown again as the starting point for the grid search
rf_randomcv.best_params_

In [None]:
# Create a param grid: a narrow neighbourhood around the best randomized-search result
best_params = rf_randomcv.best_params_
best_n_estimators = best_params['n_estimators']
best_min_samples_split = best_params['min_samples_split']
best_min_samples_leaf = best_params['min_samples_leaf']

param_grid = {
    # Forest sizes around the best one; non-positive sizes are dropped because
    # a forest needs at least one tree.
    'n_estimators': [best_n_estimators + step for step in (-100, 0, 100, 200)
                     if best_n_estimators + step > 0],
    'max_features': [best_params['max_features']],
    'max_depth': [best_params['max_depth']],
    # An integer min_samples_split must be >= 2. When the best value is 2 or 3,
    # "best - 2" / "best - 1" would produce 0 or 1 and those candidates would
    # fail to fit, so values are clamped to 2 and duplicates removed.
    'min_samples_split': sorted({max(2, best_min_samples_split + step) for step in range(-2, 3)}),
    'min_samples_leaf': [best_min_samples_leaf + step for step in (0, 2, 4)],
    'criterion': [best_params['criterion']],
}
print(param_grid)

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold grid search over the narrowed parameter grid.
# The forest is seeded so every candidate is compared on the same randomisation
# and the selected parameters are repeatable between runs.
rf2 = RandomForestClassifier(random_state=100)
rf_gridcv = GridSearchCV(estimator=rf2, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
rf_gridcv.fit(x_train, y_train)

In [None]:
# Hyperparameter combination that scored best in the grid search
rf_gridcv.best_params_

In [None]:
# The search runs with refit=True (the default), so best_estimator_ has already
# been retrained on all of x_train with the winning parameters. Fitting it again
# would only repeat that work and, for an unseeded forest, replace the model the
# search produced with a different random one.
rf_gridcv_model = rf_gridcv.best_estimator_
rf_gridcv_model

In [None]:
# Predict on the held-out rows, then report each metric in turn
rf_gridcv_y_pred = rf_gridcv_model.predict(x_test)

for heading, metric in (
    ('Accuracy Score: \n', accuracy_score),
    ('Confusion Matrix: \n', confusion_matrix),
    ('Classification Report: \n', classification_report),
):
    print(heading, metric(y_test, rf_gridcv_y_pred))

# SVM Hyperparameter Tuning

In [None]:
# Two sub-grids share the same C values: a linear kernel, and an RBF kernel
# that additionally sweeps gamma from 0.1 to 0.9 in steps of 0.1.
c_values = [1, 10, 100, 1000]
gamma_values = [tenth / 10 for tenth in range(1, 10)]
parameters = [
    {'C': c_values, 'kernel': ['linear']},
    {'C': c_values, 'kernel': ['rbf'], 'gamma': gamma_values},
]

# 10-fold cross-validated grid search scored on accuracy
svc_grid_search = GridSearchCV(
    estimator=svm_clf_model,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
    verbose=2,
)
svc_grid_search.fit(x_train, y_train)

In [None]:
# Hyperparameter combination that scored best in the SVM grid search
svc_grid_search.best_params_

In [None]:
# The search runs with refit=True (the default), so best_estimator_ has already
# been retrained on all of x_train with the winning parameters; fitting it a
# second time would only repeat that work.
svc_grid_search_model = svc_grid_search.best_estimator_
svc_grid_search_model

In [None]:
# Predict on the held-out rows, then report each metric in turn
svc_grid_search_y_pred = svc_grid_search_model.predict(x_test)

for heading, metric in (
    ('Accuracy Score: \n', accuracy_score),
    ('Confusion Matrix: \n', confusion_matrix),
    ('Classification Report: \n', classification_report),
):
    print(heading, metric(y_test, svc_grid_search_y_pred))