---
## Fetal Health Classification Problem
---
### Author: Avinash Bagul
##### MSc Artificial Intelligence (University of Aberdeen)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it.
for folder, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the fetal health CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/fetal-health-classification/fetal_health.csv')
# Preview the first five rows as the cell output.
df.head(5)

Checking for number of missing values in each column.....

In [None]:
import missingno as msno

# Visual overview: one bar per column showing how many values are present.
msno.bar(df, color="gray")
plt.show()

# Exact number of missing values in each column (displayed as the cell output).
# Printing the object returned by msno.bar only shows the Axes repr, which
# carries no information about missing data.
df.isnull().sum()

Description of the Data

In [None]:
# Summary statistics of df, shown as the cell output.
df.describe()

In [None]:
# Print the column names, non-null counts and dtypes of df.
df.info()

Distribution of Target class: Highly imbalanced

In [None]:
# Bar chart of the number of records in each fetal_health class.
fig, ax = plt.subplots()
sns.countplot(x="fetal_health", data=df, ax=ax)
plt.show()

Looking for outliers in the data

In [None]:
# One box per column of df on a shared axis, to spot columns with outliers.
fig, ax = plt.subplots(figsize=(20, 10))
sns.boxplot(data=df, palette="Set1", ax=ax)
# Column names are long, so rotate the tick labels to keep them readable.
plt.xticks(rotation=90)
plt.show()

Removing the outliers: by setting upper and lower threshold

In [None]:
# Drop the rows whose value in one column lies more than a given number of
# standard deviations away from that column's mean.

def removeOutlier(att, df, n_std=3):
    """Return a copy of ``df`` without the rows where ``att`` is an outlier.

    Parameters
    ----------
    att : pandas.Series
        The column to test; it is used as a boolean mask on ``df``, so it
        must share ``df``'s index.
    df : pandas.DataFrame
        The frame to filter. It is not modified.
    n_std : float, default 3
        Half-width of the accepted interval, in standard deviations of
        ``att`` around its mean.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows of ``df`` whose ``att`` value lies inside
        ``[mean - n_std * std, mean + n_std * std]``.

    The bounds and the number of removed rows are printed.
    """
    center = att.mean()
    spread = n_std * att.std()
    lowerbound = center - spread
    upperbound = center + spread

    print('lowerbound: ',lowerbound,' -------- upperbound: ', upperbound )

    # Inclusive comparison: with strict inequalities a zero-variance column
    # (lowerbound == upperbound == mean) would discard every single row.
    df1 = df[(att >= lowerbound) & (att <= upperbound)]

    print((df.shape[0] - df1.shape[0]), ' number of outliers from ', df.shape[0] )
    print(' ******************************************************')

    # Return an independent copy so later edits do not touch the input frame.
    return df1.copy()

Removing outliers from the columns that show outliers in the boxplot visualized above

In [None]:
# Filter the columns one after another; each pass computes its bounds on the
# rows that survived the previous pass.
for column in (
    'histogram_variance',
    'histogram_median',
    'histogram_mean',
    'histogram_mode',
    'percentage_of_time_with_abnormal_long_term_variability',
    'mean_value_of_short_term_variability',
):
    df = removeOutlier(df[column], df)

In [None]:
# (rows, columns) of df at this point in the notebook.
df.shape

Correlation HeatMap

In [None]:
# Pairwise correlations between all columns, computed once and reused for the
# plot (re-selecting every column and calling .corr() again gives the same
# matrix at twice the cost).
corrmat = df.corr()

fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(corrmat, annot=True, cmap="RdYlGn", ax=ax)
plt.show()

Balancing Dataset:

In [None]:
# Number of rows per fetal_health class, shown as the cell output.
df.fetal_health.value_counts()

In [None]:
from sklearn.utils import resample

# Separate Target Classes
df_1 = df[df.fetal_health==1]
df_2 = df[df.fetal_health==2]
df_3 = df[df.fetal_health==3]

# Size of the largest class, taken from the data instead of a hard-coded
# number so it stays correct if the outlier filtering above changes.
# Assumes class 1 is the majority class (it is concatenated unchanged below).
n_majority = int(df.fetal_health.value_counts().max())

# Upsample the minority classes by sampling with replacement until each one
# matches the majority class; the fixed seed keeps the result reproducible.
df_2_upsampled = resample(df_2,
                          replace=True,
                          n_samples=n_majority,
                          random_state=123)

df_3_upsampled = resample(df_3,
                          replace=True,
                          n_samples=n_majority,
                          random_state=123)

# NOTE(review): the rows added here are exact duplicates, so any later
# train/test split must keep all copies of a row on the same side.

# Combine majority class with upsampled minority classes
df_upsampled = pd.concat([df_1, df_2_upsampled, df_3_upsampled])

# Display new class counts
df_upsampled.fetal_health.value_counts()

Separating Features and Target Variable

In [None]:
# The target column on its own; every remaining column is a feature.
y = df_upsampled['fetal_health']
x = df_upsampled.drop(columns='fetal_health')

In [None]:
from sklearn.model_selection import GroupShuffleSplit, train_test_split

# df_upsampled was built by sampling rows with replacement, so identical rows
# occur many times. A plain random split would put copies of the same row in
# both the training and the test set and inflate every test metric.
# Rows are therefore grouped by a hash of their feature values and each group
# is assigned as a whole to one side of the split.
row_groups = pd.util.hash_pandas_object(x, index=False)

# test_size applies to groups, so the share of test *rows* is only
# approximately 25%.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=0)
train_rows, test_rows = next(splitter.split(x, y, groups=row_groups))

x_train, x_test = x.iloc[train_rows], x.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

Feature Scaling: Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

In [None]:
# The models used in this notebook are:
# XGBoost
# AdaBoost
# CatBoost
# RandomForest
# LightGBM (LGBM) Classifier
# Voting Classifier

### Evaluator Function: 
Accuracy, Precision, Recall, f1-Score, roc_auc_score and Confusion Matrix

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score
from sklearn.preprocessing import LabelBinarizer
from mlxtend.plotting import plot_confusion_matrix

def evaluator(y_test, y_pred, y_score=None):
    """Print classification metrics and draw the confusion matrix.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels. Column vectors of shape (n, 1) are flattened.
    y_score : array-like of shape (n_samples, n_classes), optional
        Predicted class probabilities (e.g. from ``predict_proba``), with
        columns in sorted class order. When given, the macro one-vs-rest AUC
        is computed from these scores. When omitted, the AUC is computed from
        the binarized hard predictions, which only reflects a single
        operating point per class.

    Prints accuracy, the classification report and the macro AUC, then plots
    the confusion matrix. Switches the global matplotlib style to "ggplot".
    """
    # Some estimators return predictions as a column vector; flatten so every
    # metric below receives a 1-D label array.
    y_pred = np.asarray(y_pred).ravel()

    # Accuracy:
    print('Accuracy is: ', accuracy_score(y_test,y_pred))
    print('')
    # Classification Report:
    print('Classification Report: \n',classification_report(y_test,y_pred))

    # Area Under The Curve Score:
    if y_score is None:
        # One-hot encode true and predicted labels so the multi-class AUC can
        # be averaged per class.
        lb = LabelBinarizer()
        y_test1 = lb.fit_transform(y_test)
        y_pred1 = lb.transform(y_pred)
        auc = roc_auc_score(y_test1, y_pred1, average='macro')
    else:
        auc = roc_auc_score(y_test, y_score, multi_class='ovr', average='macro')
    print('AUC_ROC Score: ',auc,'\n\n')

    print('Confusion Matrix: \n\n')
    plt.style.use("ggplot")
    cm = confusion_matrix(y_test,y_pred)
    plot_confusion_matrix(conf_mat = cm,figsize=(8,6),show_normed=True)

Building Model:

# XGBOOST

In [None]:
from xgboost import XGBClassifier

# XGBoost classifier with the library's default hyper-parameters.
# NOTE(review): recent xgboost releases may require class labels encoded as
# 0..n_classes-1 — confirm the label values in y_train work with the
# installed version.
xgb_classifier = XGBClassifier()
xgb_classifier.fit(x_train,y_train)

In [None]:
# Predict class labels for the test features with the fitted XGBoost model.
pred_xgb = xgb_classifier.predict(x_test)

# Report the metrics and draw the confusion matrix.
evaluator(y_test, pred_xgb)

# AdaBoost:

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
# AdaBoost over 200 depth-1 decision trees (stumps).
# random_state fixes the seeds handed to the stumps so that repeated runs of
# the notebook produce the same model and the same reported metrics.
ada_classifier = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    random_state=0,
)

ada_classifier.fit(x_train,y_train)

In [None]:
# Predict class labels for the test features with the fitted AdaBoost model.
pred_ada = ada_classifier.predict(x_test)

# Report the metrics and draw the confusion matrix.
evaluator(y_test, pred_ada)

# CatBoost:

In [None]:
from catboost import CatBoostClassifier

# CatBoost classifier configured with iterations=1000 and verbose=0.
cat_classifier = CatBoostClassifier(iterations=1000, verbose = 0)

cat_classifier.fit(x_train, y_train)

In [None]:
# Predict class labels for the test features with the fitted CatBoost model.
pred_cat = cat_classifier.predict(x_test)

# Report the metrics and draw the confusion matrix.
evaluator(y_test, pred_cat)

# LGBM Classifier:

In [None]:
from lightgbm import LGBMClassifier

# LightGBM classifier with the library's default hyper-parameters.
lgb_classifier = LGBMClassifier()
lgb_classifier.fit(x_train,y_train)

In [None]:
# Predict class labels for the test features with the fitted LightGBM model.
pred_lgb = lgb_classifier.predict(x_test)

# Report the metrics and draw the confusion matrix.
evaluator(y_test,pred_lgb)

# Random Forest Classifier:

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters. The seed is fixed because the
# forest is built from random bootstrap samples and feature subsets; without
# it the metrics and feature importances change on every run.
rf_classifier = RandomForestClassifier(random_state=0)

rf_classifier.fit(x_train,y_train)

In [None]:
# Predict class labels for the test features with the fitted random forest.
pred_rf = rf_classifier.predict(x_test)

# Report the metrics and draw the confusion matrix.
evaluator(y_test, pred_rf)

Important Features

In [None]:
# Pair every feature name with the importance the fitted random forest
# assigned to it, ordered from most to least important.
important_features = (
    pd.DataFrame({'Features': x.columns,
                  'Importance': rf_classifier.feature_importances_})
    .sort_values('Importance', ascending=False)
)

# Horizontal bar chart of the ranked importances, with title and axis labels
# at font size 15.
ax = sns.barplot(x='Importance', y='Features', data=important_features)
ax.set_title('Feature Importance', fontsize=15)
ax.set_xlabel('Importance', fontsize=15)
ax.set_ylabel('Features', fontsize=15)

# Render the figure.
plt.show()

# Voting Classifier:

In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of the five classifiers defined earlier in the notebook.
# Estimator names are plain identifiers (no spaces) so they can be addressed
# through the usual "<name>__<param>" parameter syntax.
vc = VotingClassifier(
    estimators=[
        ("xgb_classifier", xgb_classifier),
        ("ada_classifier", ada_classifier),
        ("cat_classifier", cat_classifier),
        ("lgb_classifier", lgb_classifier),
        ("rf_classifier", rf_classifier),
    ],
    voting='soft',
)
vc.fit(x_train,y_train)

In [None]:
# Predict class labels for the test features with the fitted voting ensemble.
pred_vc = vc.predict(x_test)

# Report the metrics and draw the confusion matrix.
evaluator(y_test, pred_vc)

## Result and Conclusion:
All models perform well except for AdaBoost (after balancing).

Accuracy increased significantly, by 4 to 5 percent, after balancing the data. The data was balanced by upsampling, i.e., duplicating samples of the minority classes until they matched the value count of the majority class.

Random Forest Classifier performs best according to the evaluation metrics used.

---
### **Thank You**
---

Author: Avinash Vinayak Bagul
(MSc Artificial Intelligence)