In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected = True)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the fetal-health CSV into a DataFrame and preview the first rows.
DATA_PATH = '/kaggle/input/fetal-health-classification/fetal_health.csv'
data = pd.read_csv(DATA_PATH)
data.head()

|feature|description|
|---|---|
|baseline value| FHR baseline (beats per minute)|
|accelerations| Number of accelerations per second|
|fetal_movement| Number of fetal movements per second|
|uterine_contractions| Number of uterine contractions per second|
|light_decelerations| Number of light decelerations per second|
|severe_decelerations| Number of severe decelerations per second|
|prolongued_decelerations| Number of prolonged decelerations per second|
|abnormal_short_term_variability|Percentage of time with abnormal short term variability|

In [None]:
# Replace the numeric target codes with readable class names.
# The dtype guard makes the cell idempotent: re-running it on the already-mapped
# string column would otherwise map every value to NaN.
fetal_health_labels = {1: 'Normal', 2: 'Suspect', 3: 'Pathological'}
if pd.api.types.is_numeric_dtype(data['fetal_health']):
    data['fetal_health'] = data['fetal_health'].map(fetal_health_labels)

In [None]:
# Print column dtypes and non-null counts to check for missing values.
data.info()

It looks like there are no null values in any column.

In [None]:
# Summary statistics, transposed so each feature is a row.
data.describe().T

# EDA

In [None]:
# Class balance of the target: one row per fetal_health label with its row count.
plot_data = data.groupby('fetal_health').size().reset_index(name='count')

fig = px.pie(plot_data, values='count', names='fetal_health')

# One colour per class (three classes), drawn as a donut with labels inside the slices.
fig.update_traces(textposition='inside', textinfo='percent+label', hole=0.5,
                  marker=dict(colors=['#2A3132', '#336B87', '#763626'],
                              line=dict(color='white', width=2)))

# The title is placed in the donut hole (title_x / title_y centre it).
fig.update_layout(title_text='Fetal<br>health', title_x=0.5, title_y=0.55, title_font_size=26,
                  title_font_family='Calibri', title_font_color='black', showlegend=False)

fig.show()

In [None]:
def plot_category(feature, figsize=None):
    """Grouped bar chart of the class share (in %) within each value of `feature`.

    Reads the module-level `data` frame. For every distinct value of `feature`
    it draws three bars showing what percentage of the rows with that value
    are Normal, Suspect and Pathological.

    Parameters
    ----------
    feature : str
        Column of `data` to group by (expected to have few distinct values).
    figsize : tuple, optional
        Figure size passed to `plt.subplots`; matplotlib's default when None.
    """
    class_order = ['Normal', 'Suspect', 'Pathological']

    # Rows per feature value (the denominator) and rows per (value, class).
    totals = data.groupby(feature).size()
    counts = (
        data.groupby([feature, 'fetal_health']).size()
        .unstack(fill_value=0)
        # Align on every feature value and every class so all bar series have
        # the same length as the tick positions, even when a class is absent.
        .reindex(index=totals.index, columns=class_order, fill_value=0)
    )
    percentages = counts.div(totals, axis=0).mul(100).round(2)
    labels = percentages.index

    x = np.arange(len(labels))  # the label locations
    width = 0.7  # total width reserved for one group of bars
    bar_width = width / 5
    offsets = [-width / 4, 0, width / 4]

    fig, ax = plt.subplots(figsize=figsize)
    bar_groups = [
        ax.bar(x + offset, percentages[class_name], bar_width, label=class_name)
        for offset, class_name in zip(offsets, class_order)
    ]

    ax.set_ylabel('Percentage (%)')
    ax.set_title('Based on %s' % feature)
    ax.set_xticks(x)
    ax.set_xticklabels(labels, rotation=80)
    ax.legend(loc=0, bbox_to_anchor=(1, 1))

    # Annotate every bar with its percentage.
    for rects in bar_groups:
        ax.bar_label(rects, padding=1)

    fig.tight_layout()
    plt.show()
    
def plot_numerical(feature, figsize=None):
    """Overlay one KDE curve of `feature` per fetal_health class.

    Reads the module-level `data` frame.

    Parameters
    ----------
    feature : str
        Numerical column of `data` to plot.
    figsize : tuple, optional
        Figure size; falls back to (10, 6) when not given.
    """
    fig, ax = plt.subplots(figsize=figsize or (10, 6))

    # Each curve carries its own label, so the legend stays correct even if
    # one class produces no curve (labels are not assigned by draw order).
    for class_name in ('Normal', 'Suspect', 'Pathological'):
        class_values = data.loc[data['fetal_health'] == class_name, feature]
        sns.kdeplot(class_values, label=class_name, ax=ax)

    ax.legend()
    ax.set_title('fetal_health based on %s' % feature)
    plt.show()
    
def plot_pie(feature):
    """Sunburst chart of `feature` values (inner ring) split by fetal_health (outer ring).

    Reads the module-level `data` frame.

    Parameters
    ----------
    feature : str
        Column of `data` used for the inner ring and for colouring.
    """
    # One row per (feature value, class) with the number of matching rows.
    plot_data = data.groupby([feature, 'fetal_health']).size().reset_index(name='count')

    fig = px.sunburst(plot_data, path=[feature, 'fetal_health'], values='count', color=feature,
                      title='Effect of %s on fetal_health' % feature, width=600, height=600)

    fig.update_layout(plot_bgcolor='white', title_font_family='Calibri Black',
                      title_font_color='#221f1f', title_font_size=22, title_x=0.5)

    # Show each segment's label and its share of the parent segment.
    fig.update_traces(textinfo='label+percent parent')
    fig.show()

In [None]:
# KDE plots per class for contractions, baseline heart rate and fetal movement.
movement_features = ('uterine_contractions', 'baseline value', 'fetal_movement')
for feature in movement_features:
    plot_numerical(feature)

In [None]:
# KDE plots per class for the acceleration / deceleration features.
acceleration_features = (
    'accelerations',
    'light_decelerations',
    'severe_decelerations',
    'prolongued_decelerations',
)
for feature in acceleration_features:
    plot_numerical(feature)

In [None]:
# KDE plots per class for the short- and long-term variability features.
variability_features = (
    'abnormal_short_term_variability',
    'mean_value_of_short_term_variability',
    'percentage_of_time_with_abnormal_long_term_variability',
    'mean_value_of_long_term_variability',
)
for feature in variability_features:
    plot_numerical(feature)

In [None]:
# KDE plots per class for the histogram_* features.
histogram_features = (
    'histogram_width',
    'histogram_min',
    'histogram_max',
    'histogram_number_of_peaks',
    'histogram_number_of_zeroes',
    'histogram_mode',
    'histogram_mean',
    'histogram_median',
    'histogram_variance',
    'histogram_tendency',
)
for feature in histogram_features:
    plot_numerical(feature)

**Observations-**
* uterine_contractions - >0.0025 is normal while < 0.0025 is suspect
* baseline value - 125-135 is Pathological, 135+ is Suspect
* accelerations - 0.0025+ is normal
* abnormal_short_term_variability - <50 is normal
* mean_value_of_short_term_variability - <1 is suspect, 1-2 is normal & >2 is pathological
* percentage_of_time_with_abnormal_long_term_variability - 10-70 is suspect
* mean_value_of_long_term_variability - <3 is Pathological, 3-10 is suspect and 10+ is normal

In [None]:
# Names of the feature columns used by the correlation, VIF and outlier cells below.
numerical_features = [
    'baseline value',
    'accelerations',
    'fetal_movement',
    'uterine_contractions',
    'light_decelerations',
    'severe_decelerations',
    'prolongued_decelerations',
    'abnormal_short_term_variability',
    'mean_value_of_short_term_variability',
    'percentage_of_time_with_abnormal_long_term_variability',
    'mean_value_of_long_term_variability',
    'histogram_width',
    'histogram_min',
    'histogram_max',
    'histogram_number_of_peaks',
    'histogram_number_of_zeroes',
    'histogram_mode',
    'histogram_mean',
    'histogram_median',
    'histogram_variance',
    'histogram_tendency',
]

# CORRELATION

In [None]:
# Work on a copy of `data` whose target is encoded as integers (0/1/2),
# leaving the string-labelled `data` frame untouched.
target_codes = {'Normal': 0, 'Suspect': 1, 'Pathological': 2}
df = data.assign(fetal_health=data['fetal_health'].map(target_codes))

In [None]:
# Full correlation matrix of df, kept for the correlated-pairs cell below.
corr_mat = df.corr()

# Annotated heatmap of the pairwise correlations (features plus target), rounded to 2 decimals.
heatmap_columns = numerical_features + ['fetal_health']
heatmap_values = df[heatmap_columns].corr().round(2)

plt.figure(figsize=(20, 10))
sns.heatmap(heatmap_values, annot=True, cmap='GnBu')
plt.show()

**Observations-**
* baseline value is strongly correlated with histogram_mode, histogram_mean and histogram_median
* mean_value_of_short_term_variability is correlated with light_decelerations, histogram_variance and histogram_width

In [None]:
# Correlated Features
# Keep only the strict upper triangle of the correlation matrix so that every
# feature pair appears exactly once and self-correlations (always 1.0) are
# excluded. De-duplicating on the correlation value instead would also drop
# distinct pairs that happen to share the same coefficient.
corr_threshold = 0.5
upper_triangle = np.triu(np.ones(corr_mat.shape, dtype=bool), k=1)
s = corr_mat.where(upper_triangle).stack().dropna()
so = s.sort_values(kind="quicksort")
# Strong negative correlations matter as much as positive ones, so filter on magnitude.
res1 = so[so.abs() >= corr_threshold]
print(res1)

# Analyzing features using VIF

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Calculating VIF
# Histogram location statistics are left out of the VIF computation.
vif_excluded = ['histogram_median', 'histogram_min', 'histogram_max',
                'histogram_mean', 'histogram_mode']
vif_features = [name for name in numerical_features if name not in vif_excluded]

# variance_inflation_factor regresses one column on all the others and does not
# add an intercept itself, so a constant column is prepended to the design
# matrix; without it the auxiliary regressions are forced through the origin.
design_matrix = np.column_stack([np.ones(len(df)), df[vif_features].to_numpy(dtype=float)])

vif = pd.DataFrame({
    "variables": vif_features,
    # Column 0 is the constant, so the features start at index 1.
    "VIF": [variance_inflation_factor(design_matrix, column_idx)
            for column_idx in range(1, design_matrix.shape[1])],
})
print(vif)

# Looking at Outliers

In [None]:
# Box + strip plots for the numerical features that are not plotted in the
# separate cells below (the features listed here get their own figures).
plotted_separately = ['accelerations', 'fetal_movement',
                      'uterine_contractions', 'light_decelerations',
                      'severe_decelerations', 'prolongued_decelerations',
                      'mean_value_of_short_term_variability']
boxplot_columns = [name for name in numerical_features if name not in plotted_separately]
NumericData = data[boxplot_columns]
# Long format: one row per (variable, value) pair.
NumericMelt = pd.melt(NumericData)

plt.figure(figsize=(15,10))
plt.title("Boxplots for Numerical variables")
sns.boxplot(data=NumericMelt, x='variable', y='value')
bp = sns.stripplot(data=NumericMelt, x='variable', y='value', jitter=True, edgecolor='gray')
bp.set_xticklabels(bp.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# Box + strip plots for accelerations, uterine contractions and light decelerations.
boxplot_columns = ['accelerations', 'uterine_contractions', 'light_decelerations']
NumericData = data[boxplot_columns]
# Long format: one row per (variable, value) pair.
NumericMelt = pd.melt(NumericData)

plt.figure(figsize=(15,10))
plt.title("Boxplots for Numerical variables")
sns.boxplot(data=NumericMelt, x='variable', y='value')
bp = sns.stripplot(data=NumericMelt, x='variable', y='value', jitter=True, edgecolor='gray')
bp.set_xticklabels(bp.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# Box + strip plots for prolonged decelerations, short-term variability and fetal movement.
boxplot_columns = ['prolongued_decelerations', 'mean_value_of_short_term_variability', 'fetal_movement']
NumericData = data[boxplot_columns]
# Long format: one row per (variable, value) pair.
NumericMelt = pd.melt(NumericData)

plt.figure(figsize=(15,10))
plt.title("Boxplots for Numerical variables")
sns.boxplot(data=NumericMelt, x='variable', y='value')
bp = sns.stripplot(data=NumericMelt, x='variable', y='value', jitter=True, edgecolor='gray')
bp.set_xticklabels(bp.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# Box + strip plot for severe decelerations on its own figure.
boxplot_columns = ['severe_decelerations']
NumericData = data[boxplot_columns]
# Long format: one row per (variable, value) pair.
NumericMelt = pd.melt(NumericData)

plt.figure(figsize=(15,10))
plt.title("Boxplots for Numerical variables")
sns.boxplot(data=NumericMelt, x='variable', y='value')
bp = sns.stripplot(data=NumericMelt, x='variable', y='value', jitter=True, edgecolor='gray')
bp.set_xticklabels(bp.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# Percentage of outliers present in each variable
# A value is an outlier when it lies more than 1.5 * IQR below the first
# quartile or above the third quartile (Tukey's rule).
outlier_percentage = {}
for feature in numerical_features:
    tempData = data[feature]  # quantiles do not need the values sorted
    Q1, Q3 = tempData.quantile([0.25, 0.75])
    IQR = Q3 - Q1
    Lower_range = Q1 - (1.5 * IQR)
    Upper_range = Q3 + (1.5 * IQR)
    is_outlier = (tempData < Lower_range) | (tempData > Upper_range)
    outlier_percentage[feature] = round((is_outlier.sum() / tempData.shape[0]) * 100, 2)
outlier_percentage

# Training Model

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics, preprocessing
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from xgboost import XGBClassifier

In [None]:
# Modelling frame: every column except the target is used as a feature.
train_data = df.copy()
feature_cols = [column for column in train_data.columns if column != 'fetal_health']
print('features used- ', feature_cols)

# Standardise the features with StandardScaler (zero mean, unit variance per
# column); this is not a rescaling to the [0, 1] range.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split in the next cell, so test-set statistics leak into the scaling. Fitting
# on the training rows only would need this cell and the split cell to be
# restructured together.
scaler = StandardScaler()
train_data[feature_cols] = scaler.fit_transform(train_data[feature_cols])

In [None]:
# Stratified 70/30 split so both parts keep the class proportions of the target.
validation_size = 0.3
X, y = train_data[feature_cols], train_data['fetal_health']
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=validation_size,
    random_state=0,
    stratify=y,
)

# Model 1: Logistic Regression

In [None]:
# Logistic regression with heavier weights on classes 1 and 2 than on class 0.
lr_class_weight = {0: 1, 1: 5, 2: 5}
model = LogisticRegression(class_weight=lr_class_weight, max_iter=200)
model.fit(X_train, y_train)

In [None]:
# Confusion matrix and classification report on the training split, then on the held-out split.
evaluation_sets = (
    ('Train metrics...', X_train, y_train),
    ('Validation metrics...', X_test, y_test),
)
for header, X_part, y_part in evaluation_sets:
    y_pred = model.predict(X_part)

    print(header)
    print(confusion_matrix(y_part, y_pred))
    print(classification_report(y_part, y_pred))

In [None]:
# Predict with the most recently fitted `model` on every row of train_data
# (the rows of the train and test splits together).
# NOTE(review): train_data holds the scaled features, so "original data" below
# presumably means the full dataset rather than unscaled values — confirm.
''' metrics on original data '''
y_pred = model.predict(train_data[feature_cols])

def make_cm(matrix, columns):
    """Wrap a confusion matrix in a DataFrame with two-level row/column labels.

    Parameters
    ----------
    matrix : array-like
        Square confusion matrix with one row/column per entry of `columns`.
    columns : list
        Class names, in the order used by `matrix`.

    Returns
    -------
    pandas.DataFrame
        Rows are labelled ('actual fetal_health', class) and columns
        ('predicted fetal_health', class).
    """
    class_count = len(columns)
    row_labels = pd.MultiIndex.from_arrays([['actual fetal_health'] * class_count, columns])
    column_labels = pd.MultiIndex.from_arrays([['predicted fetal_health'] * class_count, columns])
    return pd.DataFrame(matrix, index=row_labels, columns=column_labels)

# Labelled confusion matrix and classification report for the predictions on all rows.
class_names = ['Normal', 'Suspect', 'Pathological']
y_true_all = train_data['fetal_health']
df_matrix = make_cm(confusion_matrix(y_true_all, y_pred), class_names)

display(df_matrix)
print(classification_report(y_true_all, y_pred))

# Model 2: XGB

In [None]:
# Gradient-boosted tree classifier trained on the same split as the logistic regression.
# eval_metric is set on the estimator: passing it to fit() is deprecated and is
# rejected by recent XGBoost releases. use_label_encoder is dropped for the
# same reason (the labels are already encoded as 0/1/2).
model = XGBClassifier(
    learning_rate=0.02,
    max_depth=5,
    n_estimators=300,
    random_state=0,
    eval_metric='merror',
)

model.fit(X_train, y_train, verbose=False)

In [None]:
# Confusion matrix and classification report on the training split, then on the held-out split.
evaluation_sets = (
    ('Train metrics...', X_train, y_train),
    ('Test metrics...', X_test, y_test),
)
for header, X_part, y_part in evaluation_sets:
    y_pred = model.predict(X_part)

    print(header)
    print(confusion_matrix(y_part, y_pred))
    print(classification_report(y_part, y_pred))

In [None]:
# Metrics of the XGB model on every row of train_data (train and test rows together).
# make_cm is reused from the Logistic Regression section instead of being
# defined a second time here.
y_pred = model.predict(train_data[feature_cols])

df_matrix = make_cm(
    confusion_matrix(train_data['fetal_health'], y_pred), ['Normal', 'Suspect', 'Pathological'])

display(df_matrix)
print(classification_report(train_data['fetal_health'], y_pred))