In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Data processing and feature selection**

In [None]:
# Read the fetal-health CSV from the Kaggle input directory into a DataFrame,
# then show its first rows as the cell output.
csv_path = '/kaggle/input/fetal-health-classification/fetal_health.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Print a summary of the DataFrame (columns, non-null counts, dtypes) to spot missing data.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

In [None]:
# Show the column labels; 'fetal_health' is the target, the remaining columns are the features.
df.columns

In [None]:
# Split the frame into the target vector y (the 'fetal_health' column)
# and the feature matrix X (every other column), then report both shapes.
y = df['fetal_health']
X = df.drop(columns=['fetal_health'])
print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')

In [None]:
import seaborn as sns

# Plot how many samples fall into each fetal_health class.
# The column is passed by keyword: handing a Series positionally is deprecated in
# seaborn 0.11 and is interpreted as `data` (not `x`) from seaborn 0.12 onwards.
ax = sns.countplot(x='fetal_health', data=df)
# Label the figure so it can be read on its own; the trailing ';' hides the repr output.
ax.set(title='Class distribution of fetal_health',
       xlabel='fetal_health class', ylabel='Number of samples');

**The target classes are imbalanced, so we use stratified cross-validation to keep the class proportions the same in every fold.**

In [None]:
# Count the non-null entries in every column.
df.notna().sum()

In [None]:
# Inspect the first sample: all feature values of row 0.
X.iloc[0]

In [None]:
import numpy as np
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import StratifiedKFold


# 5-fold stratified splitter: shuffled with a fixed seed so the folds are reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=101)

# Baseline classifier. Despite the variable name `lr`, this is a Bernoulli naive
# Bayes model, not logistic regression; the name is kept so later cells still work.
# NOTE(review): BernoulliNB binarizes its inputs (default threshold 0.0), so most of
# the information in continuous features is discarded; treat this as a rough baseline.
lr = BernoulliNB()

# Accuracy on the held-out part of each fold.
lst_accu_stratified = []
print(skf)

for train_index, test_index in skf.split(X, y):
    # split() yields positional indices, so both X and y must be sliced with .iloc.
    # Plain y[train_index] is a label lookup that only works by accident while
    # y still has the default 0..n-1 index.
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
    lr.fit(X_train_fold, y_train_fold)
    lst_accu_stratified.append(lr.score(X_test_fold, y_test_fold))

In [None]:
# Report the fold-by-fold accuracies of the baseline model and their summary statistics.
print('List of possible accuracy:', lst_accu_stratified)

# Best and worst fold, expressed as percentages.
highest_pct = max(lst_accu_stratified)*100
print('\nMaximum Accuracy That can be obtained from this model is:', highest_pct, '%')
lowest_pct = min(lst_accu_stratified)*100
print('\nMinimum Accuracy:', lowest_pct, '%')

# Mean over the folds (percentage) and the spread between folds
# (standard deviation of the raw accuracy fractions, not of the percentages).
average_pct = np.mean(lst_accu_stratified)*100
print('\nOverall Accuracy:', average_pct, '%')
fold_std = np.std(lst_accu_stratified)
print('\nStandard Deviation is:', fold_std)

# We got a baseline accuracy of about 84%

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Candidate models to compare against the naive Bayes baseline.

# NOTE(review): k-NN is distance based and X is used unscaled here, so features with
# large numeric ranges dominate the distance; consider standardising the features.
knn_model = KNeighborsClassifier(n_neighbors=3)

# Seed the two ensemble models so repeated runs (Restart & Run All) give the same
# scores; unseeded, the reported accuracies change from run to run.
rf_model = RandomForestClassifier(random_state=101)
gb_model = GradientBoostingClassifier(random_state=101)

In [None]:
def _report_accuracy(model_label, fold_scores):
    """Print the per-fold accuracies of one model and their summary statistics.

    model_label: text inserted into the 'List of possible accuracy in ...' heading.
    fold_scores: non-empty list of accuracy fractions, one per cross-validation fold.

    Max, min and mean are printed as percentages; the standard deviation is that of
    the raw fractions. Raises ValueError if fold_scores is empty (from max()).
    """
    print(f'List of possible accuracy in {model_label}:', fold_scores)
    print('\nMaximum Accuracy That can be obtained from this model is:',
          max(fold_scores)*100, '%')
    print('\nMinimum Accuracy:',
          min(fold_scores)*100, '%')
    print('\nOverall Accuracy:',
          np.mean(fold_scores)*100, '%')
    print('\nStandard Deviation is:', np.std(fold_scores))


# Per-fold accuracy of each candidate model (reset on every run of this cell).
knn_acc = []
rf_acc = []
gb_acc = []

for train_index, test_index in skf.split(X, y):
    # split() yields positional indices, so both X and y must be sliced with .iloc.
    # Plain y[train_index] is a label lookup that only works by accident while
    # y still has the default 0..n-1 index.
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    # Fit every model on the same training fold ...
    knn_model.fit(X_train_fold, y_train_fold)
    rf_model.fit(X_train_fold, y_train_fold)
    gb_model.fit(X_train_fold, y_train_fold)

    # ... and score each on the same held-out fold so the numbers are comparable.
    knn_acc.append(knn_model.score(X_test_fold, y_test_fold))
    rf_acc.append(rf_model.score(X_test_fold, y_test_fold))
    gb_acc.append(gb_model.score(X_test_fold, y_test_fold))

# One report per model, separated by blank lines.
_report_accuracy('KNN', knn_acc)
print('\n\n')
_report_accuracy('Random Forest', rf_acc)
print('\n\n')
_report_accuracy('Gradient boosting classifier', gb_acc)

# The gradient boosting classifier performs best, with 94.87% accuracy

HYPERPARAMETER TUNING

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the gradient boosting model:
# 3 learning rates x 3 ensemble sizes = 9 candidate settings.
parameters = {'learning_rate': [0.1, 1, 0.01], 'n_estimators': [100, 200, 300]}

# Reuse the stratified, shuffled splitter `skf` so the search is evaluated on the
# same kind of folds as the earlier model comparison; n_jobs=-1 uses all CPU cores.
clf = GridSearchCV(gb_model, parameters, scoring='accuracy', cv=skf, n_jobs=-1)

# Actually run the search. Without this call `clf` stays unfitted and the later
# `clf.cv_results_` / `clf.best_estimator_` cells raise AttributeError.
# The trailing ';' hides the estimator repr in the cell output.
clf.fit(X, y);

In [None]:
# List, in alphabetical order, the names of the result arrays recorded by the grid search.
# `cv_results_` only exists after `clf.fit(...)` has been run.
sorted(clf.cv_results_.keys())

In [None]:
# NOTE: disabled experiment, kept for reference only.
# It runs the grid search `clf` inside every outer stratified fold (nested
# cross-validation): each fold re-fits the whole search on the training part and
# scores the refit model on the held-out part, so it is slow to execute.
# gb_acc=[]

# for train_index, test_index in skf.split(X, y):
#     #print("TRAIN:", train_index, "TEST:", test_index)
#     X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
#     y_train_fold, y_test_fold = y[train_index], y[test_index]
#     clf.fit(X_train_fold, y_train_fold) 
#     gb_acc.append(clf.score(X_test_fold, y_test_fold)) 
    
# # Print the output. 
# print('List of possible accuracy in Gradient boosting classifier:', gb_acc) 
# print('\nMaximum Accuracy That can be obtained from this model is:', 
#       max(gb_acc)*100, '%') 
# print('\nMinimum Accuracy:', 
#       min(gb_acc)*100, '%') 
# print('\nOverall Accuracy:', 
#       np.mean(gb_acc)*100, '%') 
# print('\nStandard Deviation is:', np.std(gb_acc)) 

In [None]:
# Display the estimator that the grid search refit with its best hyperparameter combination.
# Only available once `clf` has been fitted.
clf.best_estimator_

# We obtained 95.2% accuracy using the gradient boosting algorithm