In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score,plot_confusion_matrix,classification_report, matthews_corrcoef
from sklearn.metrics import recall_score,precision_score,f1_score

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))


In [None]:
# Load the fetal health dataset from the Kaggle input directory into a DataFrame.
data = pd.read_csv("/kaggle/input/fetal-health-classification/fetal_health.csv")

#### Data overview and understanding


In [None]:
# List the column names, then preview the first rows of the frame.
print(data.columns)
data.head()

In [None]:
# Number of distinct values per column; low counts hint at categorical features.
for column_name, distinct_count in data.nunique().items():
    print(column_name, distinct_count)

In [None]:
# The three columns with the fewest distinct values -- they can be treated as
# categorical, so list the values each one actually takes.
for categorical_col in ('fetal_health', 'histogram_tendency', 'severe_decelerations'):
    print(data[categorical_col].unique())

In [None]:
# Summary statistics of the columns (rendered as the cell output).
data.describe()

In [None]:
# Check for null values: info() reports the non-null count and dtype of each column.
data.info()

# EDA

In [None]:
# Check for class imbalance by plotting the distribution of the target labels.
sns.histplot(data['fetal_health'])

In [None]:
# Exact number of samples in each target class.
data["fetal_health"].value_counts()

In [None]:
# Class sizes for the pie chart.
# Count the rows per label instead of summing the column: .sum() on
# 'fetal_health' adds up the label codes (1/2/3), which is not a sample count,
# so slice sizes derived from it (and from hard-coded offsets) are wrong.
# reindex fixes the slice order to class 1, 2, 3 (Normal, Suspect,
# Pathological) and yields 0 for a class that is absent from the data.
class_counts = data["fetal_health"].value_counts().reindex([1, 2, 3], fill_value=0)
normal, suspect, pathological = class_counts.tolist()
total = normal + suspect + pathological

pie_fetal_health = plt.pie(
    [normal, suspect, pathological],
    labels=["Normal", "Suspect", "Pathological"],
    colors=["#5F9EA0", "#B0E0E6", "#ADD8E6"],
    explode=[0.01, 0.01, 0.01],
    autopct="%1.0f%%",
)
plt.title('Pie chart of Fetal Heath', fontsize = 15)

In [None]:
# One box plot per feature, grouped by the target class, on a 6x4 grid.
# The target itself is skipped: plotting 'fetal_health' against 'fetal_health'
# carries no information.
feature_cols = [col for col in data.columns if col != 'fetal_health']

plt.figure(figsize=(25,25))
for i, col in enumerate(feature_cols, start=1):
    plt.subplot(6,4,i)
    sns.boxplot(x = 'fetal_health', y = col, data = data)
    plt.title(col,fontsize=18)

# Lay the grid out once, after every subplot and title exists, instead of
# recomputing the layout on each iteration.
plt.tight_layout()

> ### Most of the box plots show that the normal (class 1) and suspect (class 2) classes overlap heavily

In [None]:
# Pairwise correlation of all columns, annotated with the coefficient values.
plt.figure(figsize=(15,15))
# `square` is a boolean flag. The string "True" only worked because any
# non-empty string is truthy ("False" would have behaved the same way).
sns.heatmap(data.corr(), square=True, annot=True, cmap= "coolwarm")
plt.show()

In [None]:
# Separate the target column from the feature matrix.
y = data['fetal_health']
X = data.drop(columns=['fetal_health'])

In [None]:
## Checking for LDA variations to see the separability
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Project the features onto two linear discriminant components.
lda = LinearDiscriminantAnalysis(n_components=2)
lda_data_mod = lda.fit_transform(X, y)

# Put the 2-D projection next to the labels so the scatter can be coloured by class.
lda_data = pd.concat(
    [pd.DataFrame(data=lda_data_mod, columns=['C1', 'C2']), y],
    axis=1,
)
sns.lmplot(x='C1', y='C2', data=lda_data, hue='fetal_health', fit_reg=False)

# Share of the between-class variance captured by each component (cell output).
lda.explained_variance_ratio_

> ### "This requires stronger classifiers"

#### * Case 1: direct classification, letting epochs handle the class imbalance
#### * Case 2: dividing the problem into two sub problems, first classify 1 vs (2,3) and then 2 vs 3 --> handling class imbalance

Comparing models -- SVC, Random Forest Classifier, k-Nearest Neighbors, XGBoost, Stochastic Gradient Boosting

In [None]:
# Hold out 25% of the samples for testing; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

> ### Support Vector Classification

In [None]:
from sklearn.svm import SVC

# Support vector classifier trained directly on the three-class target.
svc = SVC(gamma='scale')
svc.fit(X_train, y_train)

# Test-set predictions feed every metric printed below.
y_pred = svc.predict(X_test)
svc_score = svc.score(X_test, y_test)
print('accuracy is: ', svc_score*100)

for template, metric in (
    ("Accuracy score: {}", accuracy_score),
    ("Classification report:\n {}", classification_report),
    ("Matthew Correlation Coefficient Score: {}", matthews_corrcoef),
):
    print(template.format(metric(y_test, y_pred)))
plot_confusion_matrix(svc, X_test, y_test, cmap='Blues')

> ### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Sweep over forest sizes and report test accuracy plus the per-class F1
# (average=None returns one score per label in `labels`).
trees = [140,150,160,180]
for n_trees in trees:
    clf = RandomForestClassifier(
        n_estimators=n_trees,
        criterion="gini",
        max_depth=5,
        random_state=0,
    )
    preds = clf.fit(X_train, y_train).predict(X_test)
    acc_sc = accuracy_score(y_test, preds, normalize=True)
    f1_sc = f1_score(y_test, preds, labels=[1,2,3], pos_label=1, average=None, zero_division='warn')
    print("accuracy ", acc_sc)
    print("f1_score ", f1_sc)

> ### Grid Search for Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameters to search. n_jobs is deliberately NOT part of the grid: it
# only controls parallelism and cannot change the fitted model, so searching
# over it merely doubled the number of fits. It is set on the estimator instead.
parameters = {
    'n_estimators': [140,150,160,180],
    'max_features': ['auto'],
    'max_depth' : [4,6,8],
    'criterion' :['gini', 'entropy'],
}

# 5-fold cross-validated search on the training set. The forest is seeded so
# the search and the final model are reproducible across runs.
CV_rfc = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42, n_jobs=-1),
    param_grid=parameters,
    cv=5,
)
CV_rfc.fit(X_train, y_train)

print('Best params are: ', CV_rfc.best_params_)

# GridSearchCV refits the best configuration on the whole training set
# (refit=True by default), so reuse that model instead of training a second,
# unseeded forest. The metrics and the confusion matrix below now describe
# the same fitted model.
RF_model = CV_rfc.best_estimator_
#Testing the Model on test set
predictions=RF_model.predict(X_test)

print("Accuracy score: {}".format(accuracy_score(y_test,predictions)))
print("Classification report:\n {}".format(classification_report(y_test,predictions)))
print("Matthew Correlation Coefficient Score: {}".format(matthews_corrcoef(y_test,predictions)))
plot_confusion_matrix(RF_model, X_test, y_test, cmap='Blues')

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Sweep over the number of neighbours and report test accuracy plus the
# per-class F1 (average=None returns one score per label in `labels`).
neighbors = [1,3,5,7,9]

for i in neighbors:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)

    predsk = knn.predict(X_test)

    acc_sc_knn = accuracy_score(y_test, predsk, normalize=True)
    f1_sc = f1_score(y_test, predsk, labels=[1,2,3], pos_label=1, average=None, zero_division='warn')
    # Report this model's own accuracy; the cell previously printed `acc_sc`,
    # a stale value left over from the random forest cell.
    print("accuracy ", acc_sc_knn)
    print("f1_score ", f1_sc)

In [None]:
from xgboost import XGBClassifier

# XGBoost needs class labels that start at 0, so shift 1/2/3 down to 0/1/2
# and redo the split with the same seed and test size as before.
y_ = y - 1
X1_train, X1_test, y1_train, y1_test = train_test_split(X, y_, test_size=0.25, random_state=42)

xgb = XGBClassifier(
    learning_rate=0.1,
    max_depth=5,
    n_estimators=275,
    use_label_encoder=False,
    verbosity=0,
)
xgb.fit(X1_train, y1_train)
y1_pred = xgb.predict(X1_test)

for template, metric in (
    ("Accuracy score: {}", accuracy_score),
    ("Classification report:\n {}", classification_report),
    ("Matthew Correlation Coefficient Score: {}", matthews_corrcoef),
):
    print(template.format(metric(y1_test, y1_pred)))
plot_confusion_matrix(xgb, X1_test, y1_test, cmap='Blues')

> ### Stochastic Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# subsample < 1 makes this *stochastic* gradient boosting: each tree is fit
# on a random 90% of the training rows.
gbc = GradientBoostingClassifier(
    subsample=0.9,
    max_features=0.7,
    n_estimators=150,
    random_state=42,
)
gbc.fit(X_train, y_train)

# Test-set predictions feed every metric printed below.
y_pred = gbc.predict(X_test)
stochastic_boost_score = gbc.score(X_test, y_test)
print('accuracy is: ', stochastic_boost_score*100)

for template, metric in (
    ("Accuracy score: {}", accuracy_score),
    ("Classification report:\n {}", classification_report),
    ("Matthew Correlation Coefficient Score: {}", matthews_corrcoef),
):
    print(template.format(metric(y_test, y_pred)))
plot_confusion_matrix(gbc, X_test, y_test, cmap='Blues')

## Stochastic Gradient Boosting performed the best at separating the classes, so for case 2 only Stochastic Gradient Boosting will be used

In [None]:
# converting to a binary class row
def convertBinary(row):
    """Collapse the three-class target of one record into a binary label.

    Parameters
    ----------
    row : mapping-like (for example a DataFrame row)
        Must support ``row['fetal_health']``.

    Returns
    -------
    int
        1 when ``fetal_health`` equals 1, otherwise 2.
    """
    return 1 if row['fetal_health'] == 1 else 2
# Add a 'binary_y' column by applying convertBinary to every row (axis=1),
# then preview the result.
# NOTE(review): this mutates `data` in place, so any earlier cell that iterates
# over data.columns will also pick up 'binary_y' if it is re-run afterwards.
data['binary_y'] = data.apply(convertBinary, axis=1)
data.head()

In [None]:
# splitting data for the sub cases
# Stage 1: binary target. 'fetal_health' stays in the frame through the split
# so that stage 2 can recover the original labels of the same rows.
Xb = data.drop(columns=['binary_y'])
yb = data['binary_y']
Xb_train, Xb_test, yb_train, yb_test = train_test_split(Xb, yb, test_size=0.25, random_state=42)

# Stage 2: keep only the rows whose binary label is 2, separately for the
# train and test partitions.
# NOTE(review): the stage-2 test rows are selected with the true binary label,
# not with stage-1 predictions.
data2 = pd.concat([Xb_train, yb_train], axis=1).loc[lambda frame: frame['binary_y'] == 2]
data3 = pd.concat([Xb_test, yb_test], axis=1).loc[lambda frame: frame['binary_y'] == 2]

Xbb_train, ybb_train = data2.drop(columns=['fetal_health', 'binary_y']), data2['fetal_health']
Xbb_test, ybb_test = data3.drop(columns=['fetal_health', 'binary_y']), data3['fetal_health']

# The original label must not leak into the stage-1 features.
Xb_train = Xb_train.drop(columns=['fetal_health'])
Xb_test = Xb_test.drop(columns=['fetal_health'])

In [None]:
# Stage 1 of case 2: gradient boosting on the binary target (1 vs. 2).
gbc2 = GradientBoostingClassifier(
    loss='deviance',
    subsample=0.9,
    max_features=0.7,
    n_estimators=150,
    learning_rate=0.4,
    max_depth=4,
    random_state=42,
)
gbc2.fit(Xb_train, yb_train)

# The score is stored but not printed in this cell.
stochastic_boost_score2 = gbc2.score(Xb_test, yb_test)
yb_pred = gbc2.predict(Xb_test)

for template, metric in (
    ("Accuracy score: {}", accuracy_score),
    ("Classification report:\n {}", classification_report),
    ("Matthew Correlation Coefficient Score: {}", matthews_corrcoef),
):
    print(template.format(metric(yb_test, yb_pred)))
plot_confusion_matrix(gbc2, Xb_test, yb_test, cmap='Blues')

In [None]:
# Stage 2 of case 2: gradient boosting on the rows with binary label 2,
# predicting their original 'fetal_health' class.
gbc3 = GradientBoostingClassifier(
    loss='deviance',
    subsample=0.9,
    max_features='auto',
    n_estimators=200,
    learning_rate=0.2,
    max_depth=4,
    random_state=42,
)
gbc3.fit(Xbb_train, ybb_train)

# Test-set predictions feed every metric printed below.
ybb_pred = gbc3.predict(Xbb_test)
stochastic_boost_score3 = gbc3.score(Xbb_test, ybb_test)
print('accuracy is: ', stochastic_boost_score3*100)

for template, metric in (
    ("Accuracy score: {}", accuracy_score),
    ("Classification report:\n {}", classification_report),
    ("Matthew Correlation Coefficient Score: {}", matthews_corrcoef),
):
    print(template.format(metric(ybb_test, ybb_pred)))
plot_confusion_matrix(gbc3, Xbb_test, ybb_test, cmap='Blues')

> ### Final F1 scores for case 2 (in %): 98, 93 x 99 = 92.07, 93 x 97 = 90.21
> ### There is a slight increase in class 2's F1 score and a decrease in class 3's, so overall there is not much of a difference.