In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Contents
* EDA
* Dimension Reduction
* Modelling

In [None]:
!pip install seaborn==0.11.0

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score,accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import math

# Let pandas display up to 500 rows and 500 columns before truncating output.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

In [None]:
# Location of the dataset inside the Kaggle input directory.
FETAL_HEALTH_CSV = '/kaggle/input/fetal-health-classification/fetal_health.csv'
fetal_health_df = pd.read_csv(FETAL_HEALTH_CSV)

In [None]:
# Preview the first five rows of the loaded frame.
fetal_health_df.head(n=5)

In [None]:
# Show the column labels of the loaded frame.
fetal_health_df.columns

## EDA

In [None]:
# Count missing values per column (isna is the canonical name for isnull).
fetal_health_df.isna().sum()

In [None]:
# Box plot of every feature, one panel per feature, to eyeball spread and outliers.
boxplot_columns = ['baseline value', 'accelerations', 'fetal_movement',
       'uterine_contractions', 'light_decelerations', 'severe_decelerations',
       'prolongued_decelerations', 'abnormal_short_term_variability',
       'mean_value_of_short_term_variability',
       'percentage_of_time_with_abnormal_long_term_variability',
       'mean_value_of_long_term_variability', 'histogram_width',
       'histogram_min', 'histogram_max', 'histogram_number_of_peaks',
       'histogram_number_of_zeroes', 'histogram_mode', 'histogram_mean',
       'histogram_median', 'histogram_variance', 'histogram_tendency']

# Set the style once, BEFORE any axes exist. Setting it inside the loop (after the
# first subplot was created) left the first panel with a different style.
sns.set_style("darkgrid", {"axes.facecolor": ".9"})

n_grid_cols = 5
n_grid_rows = math.ceil(len(boxplot_columns) / n_grid_cols)
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(21, 21))
panels = axes.ravel()

for panel, col in zip(panels, boxplot_columns):
    sns.boxplot(data=fetal_health_df, x=col, ax=panel)

# Hide the grid cells that have no feature to show.
for unused_panel in panels[len(boxplot_columns):]:
    unused_panel.set_visible(False)

# One layout pass at the end is enough; it does not need to run per panel.
fig.tight_layout()

**Insights**
* No null values, that's great.
* I don't think there are any significant outliers. The *histogram_variance* column has a few values that look like outliers, but they are not very extreme. Let's find out more about it.

In [None]:
# Stacked per-class density of two features, one panel each.
fig, ax = plt.subplots(1, 2, figsize=(25, 5))
for panel, feature in zip(ax, ['fetal_movement', 'accelerations']):
    sns.kdeplot(fetal_health_df[feature], ax=panel,
                hue=fetal_health_df['fetal_health'], multiple="stack")

In [None]:
# Stacked per-class density of two features, one panel each.
fig, ax = plt.subplots(1, 2, figsize=(25, 5))
for panel, feature in zip(ax, ['baseline value', 'uterine_contractions']):
    sns.kdeplot(fetal_health_df[feature], ax=panel,
                hue=fetal_health_df['fetal_health'], multiple="stack")

In [None]:
# Stacked per-class density of two features, one panel each.
fig, ax = plt.subplots(1, 2, figsize=(25, 5))
for panel, feature in zip(ax, ['light_decelerations', 'abnormal_short_term_variability']):
    sns.kdeplot(fetal_health_df[feature], ax=panel,
                hue=fetal_health_df['fetal_health'], multiple="stack")

In [None]:
# Stacked per-class density of two features, one panel each.
fig, ax = plt.subplots(1, 2, figsize=(25, 5))
for panel, feature in zip(ax, ['mean_value_of_short_term_variability', 'histogram_tendency']):
    sns.kdeplot(fetal_health_df[feature], ax=panel,
                hue=fetal_health_df['fetal_health'], multiple="stack")

**Insights** <br>
* The features uterine_contractions, abnormal_short_term_variability and mean_value_of_short_term_variability can be useful for classification, because I think they can distinguish between the classes.
* For these features, we see that the fetal health classes occupy regions that can be clustered — not completely, but to an extent.

In [None]:
# prolongued_decelerations against two other features, coloured by class.
fig, ax = plt.subplots(1, 2, figsize=(25, 5))
for panel, y_feature in zip(ax, ['uterine_contractions', 'abnormal_short_term_variability']):
    sns.scatterplot(data=fetal_health_df, x='prolongued_decelerations', y=y_feature,
                    hue='fetal_health', palette="deep", s=100, ax=panel)

In [None]:
# prolongued_decelerations against two other features, coloured by class.
fig, ax = plt.subplots(1, 2, figsize=(25, 5))
for panel, y_feature in zip(ax, ['uterine_contractions', 'fetal_movement']):
    sns.scatterplot(data=fetal_health_df, x='prolongued_decelerations', y=y_feature,
                    hue='fetal_health', palette="deep", s=100, ax=panel)

In [None]:
# prolongued_decelerations against the two short-term-variability features, coloured by class.
fig, ax = plt.subplots(1, 2, figsize=(25, 5))
for panel, y_feature in zip(ax, ['abnormal_short_term_variability', 'mean_value_of_short_term_variability']):
    sns.scatterplot(data=fetal_health_df, x='prolongued_decelerations', y=y_feature,
                    hue='fetal_health', palette="deep", s=100, ax=panel)

**Insights**<br>
* From the feature plots above we see that the *Pathological* class can be distinguished (with some error, of course), but the *Suspect* class is not easily separated.

In [None]:
# abnormal_short_term_variability against two other features, coloured by class.
fig, ax = plt.subplots(1, 2, figsize=(25, 5))
for panel, y_feature in zip(ax, ['baseline value', 'accelerations']):
    sns.scatterplot(data=fetal_health_df, x='abnormal_short_term_variability', y=y_feature,
                    hue='fetal_health', palette="deep", s=100, ax=panel)

In [None]:
# abnormal_short_term_variability against two histogram features, coloured by class.
fig, ax = plt.subplots(1, 2, figsize=(25, 5))
for panel, y_feature in zip(ax, ['histogram_width', 'histogram_min']):
    sns.scatterplot(data=fetal_health_df, x='abnormal_short_term_variability', y=y_feature,
                    hue='fetal_health', palette="deep", s=100, ax=panel)

In [None]:
# abnormal_short_term_variability against two histogram features, coloured by class.
fig, ax = plt.subplots(1, 2, figsize=(25, 5))
for panel, y_feature in zip(ax, ['histogram_number_of_zeroes', 'histogram_mode']):
    sns.scatterplot(data=fetal_health_df, x='abnormal_short_term_variability', y=y_feature,
                    hue='fetal_health', palette="deep", s=100, ax=panel)

**Insights**<br>
* In the plot of *abnormal_short_term_variability* vs *baseline value*, a small cluster forms where abnormal_short_term_variability is above 50 and baseline value is greater than 130. This is clearly a *Suspect* cluster.
* Similarly, a comparable cluster forms in the plot of *abnormal_short_term_variability* vs *histogram_min*.
* The features abnormal_short_term_variability, histogram_min, histogram_mode, prolongued_decelerations and uterine_contractions can be useful for classification.

### Let's look into the distribution of these features

In [None]:
# Per-class distribution of two features, one panel each.
fig, ax = plt.subplots(1, 2, figsize=(25, 5))
for panel, y_feature in zip(ax, ['abnormal_short_term_variability', 'histogram_mode']):
    sns.violinplot(data=fetal_health_df, x='fetal_health', y=y_feature,
                   palette="deep", ax=panel)

In [None]:
# Per-class distribution of two features, one panel each.
fig, ax = plt.subplots(1, 2, figsize=(25, 5))
for panel, y_feature in zip(ax, ['prolongued_decelerations', 'histogram_min']):
    sns.violinplot(data=fetal_health_df, x='fetal_health', y=y_feature,
                   palette="deep", ax=panel)

In [None]:
# Per-class distribution of uterine_contractions, drawn on the axes created here.
fig, ax = plt.subplots(figsize=(25, 5))
sns.violinplot(x='fetal_health', y='uterine_contractions', data=fetal_health_df,
               palette="deep", ax=ax);

**Insights**
* For *abnormal_short_term_variability*, the majority of the Suspect and Pathological points are at around 60 or above.
* For *histogram_mode*, it is a little difficult to separate Normal and Suspect, as they have almost the same distribution.
* For *prolongued_decelerations*, Normal and Suspect values are around 0, but Pathological values are spread out.
* Again, I don't see any specific pattern in *histogram_min*.
* For *uterine_contractions*, most of the Suspect and Pathological points lie around 0. Most importantly, there is little clear separation for the Suspect class.

## Dimension reduction

**t-SNE**

In [None]:
from sklearn.manifold import TSNE

In [None]:
# Feature matrix (as a DataFrame) and target array used for the t-SNE embedding.
tsne_feature_columns = [
    'baseline value', 'accelerations', 'fetal_movement',
    'uterine_contractions', 'light_decelerations', 'severe_decelerations',
    'prolongued_decelerations', 'abnormal_short_term_variability',
    'mean_value_of_short_term_variability',
    'percentage_of_time_with_abnormal_long_term_variability',
    'mean_value_of_long_term_variability', 'histogram_width',
    'histogram_min', 'histogram_max', 'histogram_number_of_peaks',
    'histogram_number_of_zeroes', 'histogram_mode', 'histogram_mean',
    'histogram_median', 'histogram_variance', 'histogram_tendency',
]
y = fetal_health_df['fetal_health'].values
x = fetal_health_df[tsne_feature_columns]

In [None]:
# Embed the features in two dimensions with t-SNE and colour the points by class.
# t-SNE is stochastic: random_state pins the embedding so the figure (and the
# conclusion drawn from it below) is the same on every Restart & Run All.
# NOTE(review): x is passed in unscaled here; t-SNE is distance based, so
# standardising the features first may be worth trying.
tsne = TSNE(n_components=2, perplexity=90, learning_rate=200, random_state=101)
X_embedding = tsne.fit_transform(x)

# Append the class label as a third column so seaborn can use it as hue.
for_tsne = np.hstack((X_embedding, y.reshape(-1,1)))
for_tsne_df = pd.DataFrame(data=for_tsne, columns=['Dimension_x','Dimension_y','Score'])
fig, ax = plt.subplots(figsize=(25,5))
sns.scatterplot(data=for_tsne_df, x='Dimension_x', y='Dimension_y', hue='Score', palette="deep", ax=ax);

Well, well, well: t-SNE is giving good results, and we can see that the clusters are distinguishable.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Standardised feature array and target array used for PCA.
pca_feature_columns = [
    'baseline value', 'accelerations', 'fetal_movement',
    'uterine_contractions', 'light_decelerations', 'severe_decelerations',
    'prolongued_decelerations', 'abnormal_short_term_variability',
    'mean_value_of_short_term_variability',
    'percentage_of_time_with_abnormal_long_term_variability',
    'mean_value_of_long_term_variability', 'histogram_width',
    'histogram_min', 'histogram_max', 'histogram_number_of_peaks',
    'histogram_number_of_zeroes', 'histogram_mode', 'histogram_mean',
    'histogram_median', 'histogram_variance', 'histogram_tendency',
]
y = fetal_health_df['fetal_health'].values
# Scale every feature before PCA.
x = StandardScaler().fit_transform(fetal_health_df[pca_feature_columns].values)

In [None]:
# Cumulative explained variance as a function of the number of principal components.
pca = PCA().fit(x)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Plot against 1..n rather than the default 0-based index, so the x value really
# is the number of components kept (index 0 corresponds to ONE component).
component_counts = np.arange(1, len(cumulative_variance) + 1)

fig, ax = plt.subplots(figsize=(15,5))
ax.plot(component_counts, cumulative_variance)
ax.set_xlabel('number of components')
ax.set_ylabel('cumulative explained variance');

I think 15 components would be fine for training, since almost 98% of the variance is preserved. That would reduce the data from 21 features to 15.

## Modeling

In [None]:
from sklearn.metrics import plot_confusion_matrix

In [None]:
def evaluation_stats(model,X_train, X_test, y_train, y_test,algo,is_feature=False):
    """Print train/test classification reports and confusion matrices for a fitted model.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``; it must also expose
        ``feature_importances_`` when ``is_feature`` is True.
    X_train, X_test : feature matrices handed to ``model.predict``.
    y_train, y_test : true labels matching ``X_train`` / ``X_test``.
    algo : str
        Label used in the title of the feature-importance chart.
    is_feature : bool, default False
        When True, also plot the feature importances of ``model``.
    """
    print('Train classification report')
    y_pred_train = model.predict(X_train)
    print(classification_report(y_train, y_pred_train))
    print('Test classification report')
    y_pred_test = model.predict(X_test)
    print(classification_report(y_test, y_pred_test))
    print("\n")
    print("Train confusion matrix")
    print(confusion_matrix(y_train, y_pred_train))
    print("Test confusion_matrix")
    print(confusion_matrix(y_test, y_pred_test))

    if is_feature:
        # Use the importances of the model being evaluated, not of whichever
        # model happens to be stored in a global variable.
        importances = model.feature_importances_
        # Take the feature names from the training data when it carries column
        # labels (a DataFrame); otherwise fall back to positional names.
        feature_names = getattr(X_train, 'columns', None)
        if feature_names is None:
            feature_names = [f'feature_{i}' for i in range(len(importances))]
        plot_feature_importance(importances, feature_names, algo)

def training(model,X_train, y_train):
    """Fit ``model`` on the training data and return the result of ``model.fit``."""
    fitted = model.fit(X_train, y_train)
    return fitted

def plot_feature_importance(importance,names,model_type):
    """Draw a bar chart of feature importances, most important first.

    importance : importance scores, one per feature.
    names : feature labels, aligned with ``importance``.
    model_type : text placed in front of the chart title.
    """
    # Pair each label with its score and rank by descending importance.
    ranked = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    }).sort_values(by=['feature_importance'], ascending=False)

    plt.figure(figsize=(10,8))
    sns.barplot(x=ranked['feature_importance'], y=ranked['feature_names'])

    # Title and axis labels
    plt.title(model_type + ' FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')

In [None]:
# Show the column labels again before choosing the features and the target.
fetal_health_df.columns

In [None]:
# Separate the target column from the predictors.
y = fetal_health_df["fetal_health"]
X = fetal_health_df.drop(columns=["fetal_health"])

**Splitting train and test**

In [None]:
# Hold out 30% of the rows for testing.
# stratify=y keeps the class proportions the same in both splits; without it the
# share of the minority classes in the test set depends on the luck of the draw.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=101, stratify=y
)

**Oversampling**

In [None]:
# Oversample with SMOTE on the training split only; the test split is left untouched.
sm = SMOTE(random_state=101)
X_res, y_res = sm.fit_resample(X=X_train, y=y_train)

**LogisticRegression**

In [None]:
# Standardise the features: fit the scaler on the resampled training data,
# then apply the same transformation to the test data.
standardscaler = StandardScaler()
X_res_scale = standardscaler.fit_transform(X_res)
X_test_scale = standardscaler.transform(X_test)

# Candidate values for C: 10^-5 up to 10^4.
C = [10**exponent for exponent in range(-5, 5)]
tuned_parameters = [{'C': C}]

# 3-fold cross-validated grid search over C, scored by accuracy.
model = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=tuned_parameters,
    scoring='accuracy',
    cv=3,
    return_train_score=True,
)
model.fit(X_res_scale, y_res)

print(model.best_estimator_)

In [None]:
# Plot mean train and cross-validation accuracy from the grid search against log10(C).
results = pd.DataFrame.from_dict(model.cv_results_)
results = results.sort_values(['param_C'])

train_acc= results['mean_train_score']
train_acc_std= results['std_train_score']
cv_acc = results['mean_test_score'] 
cv_acc_std= results['std_test_score']
C =  results['param_C']

# math.log10 is usually more accurate than math.log(x, 10), which matters here
# because the C values are exact powers of ten.
C = [math.log10(value) for value in C]
plt.plot(C, train_acc, label='Train Acc')

plt.plot(C, cv_acc, label='CV Acc')

plt.scatter(C, train_acc, label='Train points')
plt.scatter(C, cv_acc, label='CV points')

plt.legend()
# The x values are log10(C) and the y values are accuracies, so label them as such.
plt.xlabel("log10(C)")
plt.ylabel("ACC")
plt.title("Train vs. CV accuracy by C")
plt.grid()
plt.show()

In [None]:
# Fit on the SCALED resampled data. The model is evaluated on X_res_scale and
# X_test_scale, so fitting it on the unscaled X_res would make it predict on
# features with a different scale from the one it was trained on.
lr_model = training(LogisticRegression(C=10), X_res_scale, y_res)
evaluation_stats(lr_model, X_res_scale, X_test_scale, y_res, y_test, 'LogisticRegression', is_feature=False)

**RandomForestClassifier**

In [None]:
# 3-fold cross-validated grid search over forest size and tree depth, scored by accuracy.
params = {
        'n_estimators': [200, 500, 1000],
        'max_depth': [2,4, 5, 8]
        }
# random_state makes the forests (and therefore the selected hyperparameters)
# reproducible across runs; 101 matches the seed used for the split and SMOTE.
model = GridSearchCV(RandomForestClassifier(random_state=101), params, scoring = 'accuracy', cv=3,return_train_score = True)
model.fit(X_res_scale, y_res)

print(model.best_estimator_)

In [None]:
# Train the final random forest on the resampled training data and report its metrics.
# random_state pins the forest so the reported scores and the feature-importance
# chart are the same on every run.
rf_model = training(RandomForestClassifier(max_depth=8, n_estimators=1000, random_state=101), X_res, y_res)
evaluation_stats(rf_model, X_res, X_test, y_res, y_test, 'RandomForestClassifier', is_feature=True)

**XGBClassifier**

In [None]:
# 3-fold cross-validated grid search over the number of trees and tree depth, scored by accuracy.
params = dict(
    n_estimators=[200, 500, 1000],
    max_depth=[2, 4, 5, 8, 10],
)
model = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=params,
    scoring='accuracy',
    cv=3,
    return_train_score=True,
)
model.fit(X_res_scale, y_res)

print(model.best_estimator_)

In [None]:
# Train the final XGBoost model on the SMOTE-resampled training data, as the
# random forest is. The hyperparameters were tuned on the resampled data, and
# fitting on the raw X_train / y_train would make the comparison inconsistent.
xbg_model = training(XGBClassifier(max_depth=8, n_estimators=200), X_res, y_res)
evaluation_stats(xbg_model, X_res, X_test, y_res, y_test, 'XGB', is_feature=True)

**Insights** <br>
We tried LR, RF and XGB.
* LR gave train and test accuracy of around 63% and 66%; the F1 score was very low for the Suspect and Pathological classes on the test set.
* RF gave train and test accuracy of around 98% and 93%. The F1 score is also good, and a few of the features we identified as important show up in the feature importance graph. My only concern here is that the model may be overfitting.
* XGB is overfitting, I believe, so let's not consider it further.