In [None]:
!pip install ppscore

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
import ppscore as pps

%matplotlib inline

# Notebook-wide display settings: default figure size of 15x8 inches and
# no cap on the number of DataFrame columns shown.
rcParams.update({'figure.figsize': (15, 8)})
pd.set_option('display.max_columns', None)

In [None]:
# Location of the fetal health CSV inside the Kaggle input directory.
csv_path = "/kaggle/input/fetal-health-classification/fetal_health.csv"
data = pd.read_csv(csv_path)

In [None]:
# Preview the first 10 rows of the loaded dataset.
data.head(10)

In [None]:
# Count missing values per column.
data.isna().sum()

In [None]:
# Print column dtypes, non-null counts and memory usage of the frame.
data.info()

Fetal Health

1 - Normal
2 - Suspect
3 - Pathological

In [None]:
# Number of samples per target label (1 = Normal, 2 = Suspect, 3 = Pathological).
data['fetal_health'].value_counts()

**The problem now is that the target variable is imbalanced: there are more Normal samples than samples of the other two labels**

In [None]:
# Bar chart of the class distribution of the target.
# The column is passed by keyword (x=..., data=...): recent seaborn releases accept
# only `data` positionally, so a positional Series is no longer read as the x variable.
sns.countplot(x='fetal_health', data=data)
plt.show()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the columns.
data.describe()

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap
# with one decimal place per cell.
corr_matrix = data.corr()
sns.heatmap(corr_matrix, annot=True, fmt='.1f')
plt.show()

**Among the features, here are a few that can have a significant impact on fetal_health**

1. prolongued_decelerations
2. abnormal_short_term_variability
3. accelerations
4. percentage_of_time_with_abnormal_long_term_variability

In [None]:
# Features to compare against the target, in panel order (row by row).
panel_features = [
    'abnormal_short_term_variability',
    'prolongued_decelerations',
    'accelerations',
    'percentage_of_time_with_abnormal_long_term_variability',
    'histogram_mode',
    'uterine_contractions',
]

# 2x3 grid: one bar plot per feature, grouped by fetal_health class.
f, axes = plt.subplots(2, 3)
for panel, feature in zip(axes.flat, panel_features):
    sns.barplot(x='fetal_health', y=feature, data=data, ax=panel)
plt.show()

1. We can see that Pathological class has the highest abnormal_short_term_variability
2. prolongued_decelerations has the highest value for Pathological class
3. The Normal class has the highest accelerations, so if accelerations are low there is a high chance of a Suspect/Pathological fetus
4. The other features don't have an obvious impact, as their effects are mixed

**We are going to try building a tree-based model: the data is somewhat imbalanced, and tree-based models can split the features based on conditions**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_curve, roc_auc_score, f1_score

In [None]:
# Separate the feature matrix from the target column.
X = data.drop(columns=['fetal_health'])
y = data['fetal_health']

# 70/30 train/test split with a fixed seed. The target is imbalanced, so the split
# is stratified on y to keep the class proportions the same in both partitions;
# otherwise the minority classes can end up under-represented in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, shuffle=True, stratify=y
)
# Display the training features as a quick sanity check.
X_train

In [None]:
# Baseline random forest with default hyperparameters. The seed is fixed so the
# reported metrics are reproducible across kernel restarts; the grid search further
# down clones this estimator, so the seed carries over to the search as well.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test set with the baseline forest.
y_pred = rfc.predict(X_test)

In [None]:
# Per-class precision, recall and f1-score of the baseline predictions.
print(classification_report(y_test, y_pred))

Well, that is not bad: the recall score is good, but prediction of the Suspect class is still somewhat weaker

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter grid for the random forest search (the name is kept because the
# search cell refers to it).
# 'auto' is intentionally not listed under max_features: for a classifier it was an
# alias of 'sqrt', and newer scikit-learn releases reject it, which would make every
# fit using it fail during the search.
GBM_params = {
    'n_estimators': [200, 500, 800, 1000],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8, 10],
    'criterion': ['gini', 'entropy'],
}

In [None]:
# Exhaustive 10-fold cross-validated search over GBM_params, run on all available
# cores with verbose progress output. fit() returns the fitted search object.
grid_search = GridSearchCV(
    estimator=rfc,
    param_grid=GBM_params,
    cv=10,
    n_jobs=-1,
    verbose=2,
)
GBM_model = grid_search.fit(X_train, y_train)

In [None]:
# Show the hyperparameter combination that scored best in the grid search.
GBM_model.best_params_

In [None]:
# Build the tuned forest directly from the grid-search result instead of copying the
# values by hand, so this cell can never drift from what the search actually found.
# (An earlier run selected criterion='entropy', max_depth=10, max_features='sqrt',
# n_estimators=500.) The seed is fixed so the comparison below is reproducible.
upgraded_rfc = RandomForestClassifier(**GBM_model.best_params_, random_state=42)
upgraded_rfc.fit(X_train, y_train)
up_y_pred = upgraded_rfc.predict(X_test)

In [None]:
# Print the classification report of the baseline model first, then of the tuned model.
for predictions in (y_pred, up_y_pred):
    print(classification_report(y_test, predictions))

**Managed to increase the precision and f1-score, but lost 0.2 on the Suspect recall score**