### Models used are,
* RF
* KNN
* SVM 
* XGBoost
* Stacking classifier having KNN,SVM and XGBoost <br>
### Preprocessing techniques used are,
* Outlier capping using the IQR rule (values beyond Q1 - 1.5·IQR / Q3 + 1.5·IQR are clipped to those limits)
* SMOTE over-sampling to handle class imbalance
* PCA to reduce dimensionality

All models are hyper-parameter tuned to get the best out of them <br>
SVM -> 99% accuracy
RF,KNN,XGBoost, Stacking classifier -> Above 93% accuracy

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Lazily build the full path of every file below the Kaggle input directory,
# then echo each path so the available datasets are visible in the cell output.
input_paths = (
    os.path.join(root, fname)
    for root, _, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
data = pd.read_csv('/kaggle/input/fetal-health-classification/fetal_health.csv')
data.head()

In [None]:
data.describe()

In [None]:
data['fetal_health'] = data['fetal_health'].map({1:'Normal', 2:'Suspect', 3:'Pathological'})
data.info()

In [None]:

# Number of records per fetal_health class, as a two-column frame
# (fetal_health, count) that plotly can consume directly.
plot_data = data.groupby('fetal_health')['fetal_health'].agg(['count']).reset_index()

# Donut chart of the class shares.
fig = px.pie(plot_data, values='count', names='fetal_health')

# Labels inside the slices, a hole in the middle and thin white separators.
slice_marker = dict(colors=['#2A3132', '#336B87'],
                    line=dict(color='white', width=1.6))
fig.update_traces(textposition='inside',
                  textinfo='percent + label',
                  hole=0.4,
                  marker=slice_marker)

# The title is positioned in the middle of the figure so it sits in the hole.
fig.update_layout(title_text='Fetal<br>Health',
                  title_x=0.5,
                  title_y=0.55,
                  title_font_size=26,
                  title_font_family='Calibri',
                  title_font_color='black',
                  showlegend=False)

fig.show()

In [None]:
data.isnull().any()

In [None]:
data.isna().any()

In [None]:
# Work on a copy so the string-labelled target in `data` is left untouched,
# and re-encode the target numerically so it can take part in the correlation.
data_tmp = data.copy()
data_tmp['fetal_health'] = data_tmp['fetal_health'].map({'Normal':1, 'Suspect':2, 'Pathological':3})

# Correlate only the numeric columns (previously `numeric_data` was computed
# but the correlation was taken on the unfiltered frame).
numeric_data = data_tmp.select_dtypes(exclude="object")
numeric_corr = numeric_data.corr()

# One-row heatmap: after sorting by the fetal_health column the first row is
# fetal_health itself, i.e. its correlation with every column.
f, ax = plt.subplots(figsize=(25,1))
sns.heatmap(numeric_corr.sort_values(by=["fetal_health"], ascending=False).head(1), cmap="GnBu", ax=ax)
plt.title("Numerical features correlation with the fetal_health", weight="bold", fontsize=18)
plt.yticks(weight="bold", color="darkgreen", rotation=0)

plt.show()

In [None]:

from scipy.stats import probplot,skew
import warnings
warnings.filterwarnings("ignore")

# Names of the 21 predictor columns (everything except the target).
features = ['baseline value', 'accelerations', 'fetal_movement',
       'uterine_contractions', 'light_decelerations', 'severe_decelerations',
       'prolongued_decelerations', 'abnormal_short_term_variability',
       'mean_value_of_short_term_variability',
       'percentage_of_time_with_abnormal_long_term_variability',
       'mean_value_of_long_term_variability', 'histogram_width',
       'histogram_min', 'histogram_max', 'histogram_number_of_peaks',
       'histogram_number_of_zeroes', 'histogram_mode', 'histogram_mean',
       'histogram_median', 'histogram_variance', 'histogram_tendency']

# For every feature draw three diagnostics side by side:
# histogram, boxplot and normal probability plot (titled with the skewness).
for i in features:
    fig, axes = plt.subplots(1, 3, figsize=(20,4))
    # NOTE: distplot is deprecated in recent seaborn releases (histplot is its
    # replacement); it is kept here so the binning of the figure does not change.
    sns.distplot(data[i],kde=False, ax=axes[0])
    # Pass the series as `x=` explicitly: a positional argument is interpreted
    # differently across seaborn versions, which changes the box orientation.
    sns.boxplot(x=data[i], ax=axes[1])
    probplot(data[i], plot=axes[2])
    skew_val=round(data[i].skew(), 1)
    # The boxplot has no meaningful y axis, so hide its ticks.
    axes[1].set_yticklabels([])
    axes[1].set_yticks([])
    axes[0].set_title(i + " | Distplot")
    axes[1].set_title(i + " | Boxplot")
    axes[2].set_title(i + " | Probability Plot - Skew: "+str(skew_val))
    plt.show()

In [None]:
#Importing mutual information library from sklearn feature selection
from sklearn.feature_selection import mutual_info_classif,mutual_info_regression

In [None]:

#Evaluating Mutual information score for each feature
def make_mi_scores(X,Y):
    """Return the mutual information between each column of X and the target Y.

    X must expose ``.columns`` (it is used as the index of the result).
    The result is a pandas Series named "MI_scores", sorted from the highest
    score to the lowest.
    """
    raw_scores = mutual_info_classif(X, Y)
    ranked = pd.Series(raw_scores, name="MI_scores", index=X.columns)
    return ranked.sort_values(ascending=False)

mi_scores=make_mi_scores(data[features],data["fetal_health"])
mi_scores

In [None]:
#Plotting mutual information bar graph
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of the given scores on the current figure.

    `scores` is a pandas Series; bars are ordered ascending so the largest
    score ends up at the top, and the Series index supplies the tick labels.
    """
    ordered = scores.sort_values(ascending=True)
    bar_positions = np.arange(len(ordered))
    plt.barh(bar_positions, ordered)
    plt.yticks(bar_positions, ordered.index, fontweight='bold', fontsize=20)
    plt.xticks(fontweight='bold', fontsize=20)
plt.figure(figsize=(12,10))
plot_mi_scores(mi_scores)

****

In [None]:
#Handling the outliers
def outlier_treatment(col, df=None, factor=1.5):
    """Cap the values of one column at its IQR fences, in place.

    Values above ``Q3 + factor * IQR`` are set to that upper fence and values
    below ``Q1 - factor * IQR`` are set to the lower fence. No rows are removed.

    Parameters
    ----------
    col : column label to treat.
    df : DataFrame to modify. Defaults to the notebook-level ``data`` frame,
        which is what the original single-argument call operated on.
    factor : multiple of the IQR used for the fences (1.5 by default).

    Returns
    -------
    None. The frame is modified in place and Q1/Q3 are printed.

    Note: when Q1 == Q3 (IQR of 0) both fences equal Q1, so every value in the
    column is forced to that single value and the column becomes constant.
    """
    frame = data if df is None else df
    Q1, Q3 = np.percentile(frame[col], [25, 75])
    print(Q1,Q3)
    IQR = Q3 - Q1
    lower_range = Q1 - (factor * IQR)
    upper_range = Q3 + (factor * IQR)
    frame[col] = frame[col].clip(lower=lower_range, upper=upper_range)
    
cols_with_outliers = ['accelerations', 'fetal_movement',
       'uterine_contractions', 'light_decelerations', 'severe_decelerations',
       'prolongued_decelerations',
       'mean_value_of_short_term_variability',
       'percentage_of_time_with_abnormal_long_term_variability',
       'mean_value_of_long_term_variability', 'histogram_max', 'histogram_number_of_peaks',
       'histogram_number_of_zeroes', 'histogram_mode', 'histogram_mean',
       'histogram_median', 'histogram_variance']
for col in cols_with_outliers:
    outlier_treatment(col)

In [None]:
# Re-draw the per-feature diagnostics (histogram, boxplot, normal probability
# plot with skewness) to inspect the distributions after the outlier treatment.
for i in features:
    fig, axes = plt.subplots(1, 3, figsize=(20,4))
    # NOTE: distplot is deprecated in recent seaborn releases (histplot is its
    # replacement); it is kept here so the binning of the figure does not change.
    sns.distplot(data[i],kde=False, ax=axes[0])
    # Pass the series as `x=` explicitly: a positional argument is interpreted
    # differently across seaborn versions, which changes the box orientation.
    sns.boxplot(x=data[i], ax=axes[1])
    probplot(data[i], plot=axes[2])
    skew_val=round(data[i].skew(), 1)
    # The boxplot has no meaningful y axis, so hide its ticks.
    axes[1].set_yticklabels([])
    axes[1].set_yticks([])
    axes[0].set_title(i + " | Distplot")
    axes[1].set_title(i + " | Boxplot")
    axes[2].set_title(i + " | Probability Plot - Skew: "+str(skew_val))
    plt.show()

**Feature engineering**

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler

# Select predictors and target by column NAME rather than by hard-coded
# positions, so the cell keeps working if the column order/count changes.
feature_cols = data.columns.drop('fetal_health')
target = ["fetal_health"]

# Standardising the values. X, Y and scale are reused by later cells.
# NOTE: the scaler is fitted on the whole dataset, i.e. before any
# train/test split performed further down.
X = data[feature_cols].values
Y = data[target[0]].values
scale = StandardScaler()
X = scale.fit_transform(X)

# Feature ranking with an extra-trees ensemble; the seed makes the ranking
# reproducible between runs.
model = ExtraTreesClassifier(n_estimators=10, random_state=0)
model.fit(X, Y)

# Express every importance relative to the most important feature (= 100).
feature_importance = model.feature_importances_
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5

fig, ax = plt.subplots(1, 1, figsize=(7, 6))
plt.barh(pos, feature_importance[sorted_idx], align='center')
# Label the bars from the same column list the model was trained on.
plt.yticks(pos, feature_cols[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()

In [None]:
# compare pca number of components with logistic regression algorithm for classification
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot
from xgboost import XGBClassifier
 
# # get a list of models to evaluate
# def get_models():
# 	models = dict()
# 	for i in range(1,22):
# 		steps = [('pca', PCA(n_components=i)), ('m', LogisticRegression())]
# 		models[str(i)] = Pipeline(steps=steps)
# 	return models
 
# # evaluate a given model using cross-validation
# def evaluate_model(model, X, y):
# 	cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# 	scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
# 	return scores
 
# # define dataset

# # get the models to evaluate
# models = get_models()
# # evaluate the models and store results
# results, names = list(), list()
# for name, model in models.items():
# 	scores = evaluate_model(model, X, Y)
# 	results.append(scores)
# 	names.append(name)
# #	print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# # plot model performance for comparison
# pyplot.boxplot(results, labels=names, showmeans=True)
# pyplot.xticks(rotation=45)
# pyplot.show()

## Principal Component Analysis

In [None]:
from sklearn.decomposition import KernelPCA,PCA

# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Return the accuracy scores of `model` under repeated stratified CV.

    Uses 10 folds repeated 3 times (fixed seed), runs on all cores and
    re-raises any error that occurs while fitting or scoring.
    """
    splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(
        model, X, y,
        scoring='accuracy',
        cv=splitter,
        n_jobs=-1,
        error_score='raise',
    )
# get a list of models to evaluate
def get_models(alg, pca, max_components=21):
    """Build one pipeline per number of retained components.

    Parameters
    ----------
    alg : estimator placed as the final step ('m') of every pipeline.
    pca : 'pca' for linear PCA or 'kpca' for KernelPCA with an RBF kernel.
    max_components : largest number of components to try; pipelines are built
        for 1..max_components (21 by default, as before).

    Returns
    -------
    dict mapping str(n_components) -> Pipeline.

    Raises
    ------
    ValueError if `pca` is not one of the supported kinds (previously this
    silently produced an empty dict).
    """
    if pca not in ('pca', 'kpca'):
        raise ValueError("pca must be 'pca' or 'kpca', got %r" % (pca,))

    models = dict()
    for n_components in range(1, max_components + 1):
        if pca == 'pca':
            reducer = PCA(n_components=n_components)
        else:
            reducer = KernelPCA(n_components=n_components, kernel='rbf')
        models[str(n_components)] = Pipeline(steps=[('pca', reducer), ('m', alg)])
    return models

def do_pca(alg,pca):
    """Cross-validate `alg` behind every PCA size and compare the results.

    Builds the candidate pipelines with get_models, scores each one on the
    notebook-level X / Y with evaluate_model, prints mean and standard
    deviation per component count and finally shows a boxplot of all scores.
    """
    candidates = get_models(alg, pca)

    # Score every pipeline, keeping names and scores aligned for the plot.
    names = []
    results = []
    for n_components, pipeline in candidates.items():
        fold_scores = evaluate_model(pipeline, X, Y)
        results.append(fold_scores)
        names.append(n_components)
        print('>%s %.3f (%.3f)' % (n_components, mean(fold_scores), std(fold_scores)))

    # One box per component count.
    pyplot.boxplot(results, labels=names, showmeans=True)
    pyplot.xticks(rotation=45)
    pyplot.show()
    
do_pca(LogisticRegression(),'pca')

In [None]:
do_pca(LogisticRegression(),'kpca')

With PCA, the dimensionality can be reduced from 21 to 18 components while retaining nearly all of the information

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import learning_curve
## Train Test Split without upsampling
from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.25, random_state=10)

## Train Test Split, then upsampling of the training part only
# Splitting BEFORE over-sampling keeps the test set made of real records only.
# Running SMOTE on the full data first lets synthetic points interpolated from
# (future) test rows end up in the training set, which inflates test scores.
# stratify=Y keeps the class proportions identical in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size = 0.25, random_state=10, stratify=Y)

# Balance the classes of the training part; seeded for reproducibility.
sm=SMOTE(random_state=10)
X_train_smote,y_train_smote=sm.fit_resample(X_train,y_train)
X_train, y_train = X_train_smote, y_train_smote


In [None]:
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV #Paramterizers
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix #Accuracy metrics
import itertools #Used for iterations
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Plot learning curve
def plot_learning_curve(estimator, title, x, y, ylim=None, cv=None,
                        n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training and cross-validation score against training-set size.

    Opens a new figure, computes the curve with sklearn's learning_curve and
    draws the mean score of both curves with a +/- one standard deviation
    band. `ylim`, when given, is unpacked into plt.ylim. Returns the
    matplotlib.pyplot module so the caller can keep customising or show it.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, x, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # Aggregate over the CV folds (axis 1) for every training size.
    fit_mean, fit_std = np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1)
    cv_mean, cv_std = np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1)

    plt.grid()

    # Variability bands first, then the mean curves on top of them.
    for centre, spread, band_color in ((fit_mean, fit_std, "r"),
                                       (cv_mean, cv_std, "g")):
        plt.fill_between(sizes, centre - spread, centre + spread,
                         alpha=0.1, color=band_color)
    for centre, line_color, legend_label in (
            (fit_mean, "#80CBC4", "Training score"),
            (cv_mean, "#00897B", "Cross-validation score")):
        plt.plot(sizes, centre, 'o-', color=line_color, label=legend_label)

    plt.legend(loc="best")
    return plt
def plot_cm(y_test, y_pred, labels=None):
    """Plot the confusion matrix and print the classification report.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels.
    labels : optional explicit class order for rows/columns. When omitted the
        sorted set of labels found in y_test / y_pred is used, which is the
        order confusion_matrix applies by default.

    The tick labels are taken from that same label order. They used to be
    hard-coded as Normal / Suspect / Pathological, which does not match the
    alphabetical order (Normal, Pathological, Suspect) used for string labels
    and therefore mislabelled two of the three classes.
    """
    # Local import: only needed here and keeps the cell self-contained.
    from sklearn.utils.multiclass import unique_labels

    if labels is None:
        labels = unique_labels(y_test, y_pred)
    labels = list(labels)
    tick_names = [str(label) for label in labels]

    ax = plt.subplot()
    # fmt="d" prints the integer counts instead of scientific notation.
    sns.heatmap(confusion_matrix(y_test, y_pred, labels=labels),
                annot=True, fmt="d", ax=ax, cmap="BuGn")
    # labels, title and ticks
    ax.set_xlabel("Predicted labels")
    ax.set_ylabel("True labels")
    ax.set_title("Confusion Matrix")
    ax.xaxis.set_ticklabels(tick_names)
    ax.yaxis.set_ticklabels(tick_names)
    print("Classification Report")
    print(classification_report(y_test, y_pred))

#Using Random Forest
def model_RF(X,x_test,y,y_test):
    """Tune, cross-validate and evaluate a random forest.

    Parameters
    ----------
    X, y : training features / labels used for the grid search, the
        cross-validation and the final fit.
    x_test, y_test : held-out data used for the confusion matrix and report.

    Returns
    -------
    (best_params, rf_tuned): the parameters selected by the grid search and a
    forest re-fitted with them on (X, y).
    """
    seed = 12345
    # max_features may not exceed the number of columns, so cap the original
    # value of 18 for inputs that have fewer features.
    n_features = np.shape(X)[1]
    rf_params = {"n_estimators" :[100],
                 "max_features": [min(18, n_features)],
                 "min_samples_split": [18],
                "max_depth": [5]}
    rf_model = RandomForestClassifier(random_state = seed)
    gs_cv = GridSearchCV(rf_model,
                        rf_params,
                        cv = 5,
                        n_jobs = -1,
                        verbose = 2).fit(X, y)
    best_params = gs_cv.best_params_

    # Keep the seed on the re-fitted model: best_params_ only holds the searched
    # parameters, so without it the "tuned" forest was unseeded and differed
    # from the estimator the search had actually evaluated.
    rf_tuned = RandomForestClassifier(random_state = seed, **best_params)
    scores_rf = cross_val_score(rf_tuned, X, y, cv = 5)
    print(f"CrossValMeans: {round(scores_rf.mean(), 3)}")
    print(f"CrossValStandard Deviation: {round(scores_rf.std(), 3)}")
    rf_tuned.fit(X, y)
    plot_cm(y_test,rf_tuned.predict(x_test))
    print("Best parameters")
    print(best_params)
    plot_learning_curve(gs_cv.best_estimator_,title = "Learning curve", x = X, y = y, cv = 5);

    return best_params, rf_tuned

In [None]:
#rf_params, rf_tuned = model_RF(X_train, X_test, y_train, y_test)

In [None]:

#Using KNN
def model_KNN(X,x_test,y,y_test):
    """Tune, cross-validate and evaluate a k-nearest-neighbours classifier.

    Runs a 5-fold grid search on (X, y), re-fits a fresh classifier with the
    selected parameters, prints its 5-fold CV mean / standard deviation,
    plots the confusion matrix for (x_test, y_test) and the learning curve of
    the search's best estimator.

    Returns (best_params, tuned_classifier).
    """
    search_space = {'n_neighbors':[5,6,7,8,9,10],
                    'leaf_size':[1,2,3,5],
                    'weights':['uniform', 'distance'],
                    'algorithm':['auto', 'ball_tree','kd_tree','brute'],
                    'n_jobs':[-1]}
    search = GridSearchCV(KNeighborsClassifier(n_jobs=-1),
                          search_space,
                          cv = 5,
                          n_jobs = -1,
                          verbose = 2)
    search.fit(X, y)

    # Fresh estimator configured with the winning parameter combination.
    tuned = KNeighborsClassifier(**search.best_params_)
    cv_scores = cross_val_score(tuned, X, y, cv = 5)
    print(f"CrossValMeans: {round(cv_scores.mean(), 3)}")
    print(f"CrossValStandard Deviation: {round(cv_scores.std(), 3)}")

    tuned.fit(X, y)
    predictions = tuned.predict(x_test)
    plot_cm(y_test, predictions)
    print("Best parameters")
    print(search.best_params_)
    plot_learning_curve(search.best_estimator_, title = "Learning curve", x = X, y = y, cv = 5)
    return search.best_params_, tuned

In [None]:
#knn_params, knn_tuned = model_KNN(X_train, X_test, y_train, y_test)

In [None]:
#Using XGBoost
def model_XGB(X,x_test,y,y_test):
    """Tune, cross-validate and evaluate an XGBoost classifier.

    Runs a 3-fold randomized search on (X, y), re-fits a fresh classifier with
    the selected parameters, prints its 3-fold CV mean / standard deviation,
    plots the confusion matrix for (x_test, y_test) and the learning curve of
    the search's best estimator.

    Returns (best_params, tuned_classifier).
    """
    search_space = {
        'n_estimators': [100, 200],
        'learning_rate': [0.01,0.1],
        'booster': ['gbtree'],
        'gamma': [0.5, 1],
        'reg_alpha': [0.5, 1],
        'reg_lambda': [0.5, 1],
        'base_score': [0.2, 0.5]
    }
    search = RandomizedSearchCV(XGBClassifier(),
                                search_space,
                                cv = 3,
                                n_jobs = -1,
                                verbose = 2)
    search.fit(X, y)

    # Fresh estimator configured with the winning parameter combination.
    tuned = XGBClassifier(**search.best_params_)
    cv_scores = cross_val_score(tuned, X, y, cv = 3)
    print(f"CrossValMeans: {round(cv_scores.mean(), 3)}")
    print(f"CrossValStandard Deviation: {round(cv_scores.std(), 3)}")

    tuned.fit(X, y)
    predictions = tuned.predict(x_test)
    plot_cm(y_test, predictions)
    print("Best parameters")
    print(search.best_params_)
    plot_learning_curve(search.best_estimator_, title = "Learning curve", x = X, y = y, cv = 3)
    return search.best_params_, tuned

In [None]:
#xgb_params, xgb_tuned = model_XGB(X_train, X_test, y_train, y_test)

In [None]:
from sklearn.svm import SVC
def model_SVC(X,x_test,y,y_test):
    """Tune, cross-validate and evaluate an RBF support vector classifier.

    Runs a 5-fold grid search over C and gamma on (X, y), re-fits a fresh
    classifier with the selected parameters, prints its 5-fold CV mean /
    standard deviation, plots the confusion matrix for (x_test, y_test) and
    the learning curve of the search's best estimator.

    Returns (best_params, tuned_classifier).
    """
    search_space = {'C': [0.1, 1, 10, 100, 1000],
                    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                    'kernel': ['rbf']}
    search = GridSearchCV(SVC(),
                          search_space,
                          cv = 5,
                          n_jobs = -1,
                          verbose = 2)
    search.fit(X, y)

    # Fresh estimator configured with the winning parameter combination.
    tuned = SVC(**search.best_params_)
    cv_scores = cross_val_score(tuned, X, y, cv = 5)
    print(f"CrossValMeans: {round(cv_scores.mean(), 3)}")
    print(f"CrossValStandard Deviation: {round(cv_scores.std(), 3)}")

    tuned.fit(X, y)
    predictions = tuned.predict(x_test)
    plot_cm(y_test, predictions)
    print("Best parameters")
    print(search.best_params_)
    plot_learning_curve(search.best_estimator_, title = "Learning curve", x = X, y = y, cv = 5)
    return search.best_params_, tuned

In [None]:
#svc_params, svc_tuned = model_SVC(X_train, X_test, y_train, y_test)

In [None]:
# from sklearn.ensemble import StackingClassifier
# level0 = list()
# level0.append(('svc', SVC(**svc_params)))
# level0.append(('knn', KNeighborsClassifier(**knn_params)))
# level0.append(('xgb', XGBClassifier(**xgb_params)))
# # define meta learner model
# level1 = XGBClassifier(**xgb_params)
# # define the stacking ensemble
# model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
# # fit the model on all available data
# scores_knn= cross_val_score(model, X_train, y_train, cv = 5)
# print(f"CrossValMeans: {round(scores_knn.mean(), 3)}")
# print(f"CrossValStandard Deviation: {round(scores_knn.std(), 3)}")
# model.fit(X_train, y_train)
# plot_cm(y_test,model.predict(X_test))
# plot_learning_curve(model,title = "Learning curve", x = X_train, y = y_train, cv = 5);

## Model training after PCA

Here, RF, KNN, SVM, XGBoost and stacking classifier will be trained on the SMOTE over-sampled data with 18 components after PCA

In [None]:

import plotly.express as px
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

# Fit a full PCA (all components kept) on the standardised features and
# accumulate the share of variance explained by the first k components.
pca = PCA().fit(X)
exp_var_cumul = pca.explained_variance_ratio_.cumsum()

# Area chart: number of components on x, cumulative explained variance on y.
component_counts = range(1, len(exp_var_cumul) + 1)
px.area(
    x=component_counts,
    y=exp_var_cumul,
    labels={"x": "# Components", "y": "Explained Variance"}
)

In [None]:
## Train Test Split, then upsampling of the training part only
# Splitting BEFORE over-sampling keeps the test set made of real records only.
# Running SMOTE on the full data first lets synthetic points interpolated from
# (future) test rows end up in the training set, which inflates test scores.
# stratify=Y keeps the class proportions identical in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size = 0.25, random_state=10, stratify=Y)

# Balance the classes of the training part; seeded for reproducibility.
sm=SMOTE(random_state=10)
X_train_smote,y_train_smote=sm.fit_resample(X_train,y_train)
X_train, y_train = X_train_smote, y_train_smote

# Fit the projection on the training data only and apply the same projection
# to the test data.
pca = PCA(n_components=18)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [None]:
rf_params, rf_tuned = model_RF(X_train, X_test, y_train, y_test)

In [None]:
knn_params, knn_tuned = model_KNN(X_train, X_test, y_train, y_test)

In [None]:
xgb_params, xgb_tuned = model_XGB(X_train, X_test, y_train, y_test)

In [None]:
svc_params, svc_tuned = model_SVC(X_train, X_test, y_train, y_test)

In [None]:
from sklearn.ensemble import StackingClassifier

# Level-0 (base) learners, each configured with its previously tuned parameters.
level0 = [
    ('svc', SVC(**svc_params)),
    ('knn', KNeighborsClassifier(**knn_params)),
    ('xgb', XGBClassifier(**xgb_params)),
]
# Level-1 (meta) learner that combines the base learners' predictions.
level1 = XGBClassifier(**xgb_params)
# Stacking ensemble; the meta learner is trained on 5-fold out-of-fold predictions.
model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)

# 5-fold cross-validation of the whole ensemble on the training data.
scores_knn = cross_val_score(model, X_train, y_train, cv = 5)
print(f"CrossValMeans: {round(scores_knn.mean(), 3)}")
print(f"CrossValStandard Deviation: {round(scores_knn.std(), 3)}")

# Final fit on the training data, then evaluation on the held-out test data.
model.fit(X_train, y_train)
test_predictions = model.predict(X_test)
plot_cm(y_test, test_predictions)
plot_learning_curve(model, title = "Learning curve", x = X_train, y = y_train, cv = 5)