In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
import seaborn as sns 
from scipy.stats import skew, norm 
from warnings import filterwarnings as filt
import plotly.express as px 

plt.style.use('_classic_test_patch')
plt.rcParams['figure.figsize'] = (12,6)
filt('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pd.set_option('display.max_columns', None)
df = pd.read_csv('/kaggle/input/fetal-health-classification/fetal_health.csv')
df.head()

In [None]:
df.shape

In [None]:
df.isnull().values.sum()

In [None]:
# Class balance of the target. The series is passed as `x=` because newer
# seaborn releases treat the first positional argument as `data`, not `x`.
sns.countplot(x = df.fetal_health)

In [None]:
sns.heatmap(df.corr(), cmap = 'icefire')

In [None]:
df.corrwith(df.fetal_health).sort_values(ascending = False)

In [None]:
import eli5
from eli5.sklearn import PermutationImportance
import shap
from pdpbox import pdp
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.feature_selection import mutual_info_classif

def permImp(x, y):
    """Show permutation importances of a random forest fitted on (x, y).

    A default RandomForestClassifier is fitted on the given data, wrapped in
    eli5's PermutationImportance (fitted on the same x and y), and the
    resulting weights are rendered with the column names of x.

    Parameters
    ----------
    x : feature table; must expose `.columns`.
    y : target values aligned with x.

    Returns
    -------
    The object produced by `eli5.show_weights`.
    """
    forest = rfc()
    forest.fit(x, y)
    importance = PermutationImportance(forest).fit(x, y)
    column_names = x.columns.tolist()
    return eli5.show_weights(importance, feature_names = column_names)

def isolate(x, y, col):
    """Draw a partial-dependence plot for a single feature.

    Fits a default RandomForestClassifier on (x, y), asks pdpbox for the
    isolated partial dependence of `col`, and plots it.

    Parameters
    ----------
    x : feature table; its columns are passed as the model features.
    y : target values aligned with x.
    col : name of the feature to isolate.

    Returns
    -------
    The object produced by `pdp.pdp_plot`.
    """
    forest = rfc()
    forest.fit(x, y)
    isolated = pdp.pdp_isolate(forest, dataset = x, model_features = x.columns, feature = col)
    return pdp.pdp_plot(isolated, feature_name = col)

def forceplot(x, y, n_class = 0):
    """Fit a random forest on (x, y) and draw a SHAP force plot for one class.

    Parameters
    ----------
    x : feature table; its columns label the plot.
    y : target values aligned with x.
    n_class : positional index of the class to explain (default 0).

    Returns
    -------
    The object produced by `shap.force_plot`.
    """
    model = rfc().fit(x, y)
    explainer = shap.TreeExplainer(model)
    # shap_values is a method: it has to be evaluated on the data before the
    # per-class slice can be taken (subscripting the method raises TypeError).
    shap_values = explainer.shap_values(x)
    if isinstance(shap_values, list):
        # One (n_samples, n_features) array per class.
        class_shap = shap_values[n_class]
    else:
        # NOTE(review): newer shap releases appear to return a single array
        # with the class on the last axis - confirm against the installed version.
        class_shap = np.asarray(shap_values)[..., n_class]
    expected_value = explainer.expected_value[n_class]
    return shap.force_plot(expected_value, class_shap, feature_names = x.columns)

def plot_mi(score):
    """Draw a horizontal bar chart of mutual-information scores.

    Parameters
    ----------
    score : DataFrame with an 'mi_score' column, indexed by feature name.

    Returns
    -------
    The bar container returned by `plt.barh`. Rows are sorted ascending
    before plotting.
    """
    ordered = score.sort_values(by = 'mi_score')
    return plt.barh(ordered.index, ordered['mi_score'])

def mi_score(x, y):
    """Compute, plot and return mutual-information scores per feature.

    Every column of x is treated as continuous (discrete_features = False).
    The scores are plotted through `plot_mi` as a side effect.

    Parameters
    ----------
    x : feature table; its columns become the index of the result.
    y : target values aligned with x.

    Returns
    -------
    DataFrame with one 'mi_score' column, sorted from highest to lowest.
    """
    raw_scores = mutual_info_classif(x, y, discrete_features = False)
    table = pd.DataFrame(raw_scores, index = x.columns, columns = ['mi_score'])
    plot_mi(table)
    return table.sort_values('mi_score', ascending = False)

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold

In [None]:
# Separate the features from the target and hold out a stratified 20% test set.
x = df.drop(['fetal_health'], axis = 1)
y = df.fetal_health
# A fixed seed keeps the split - and every score derived from it - reproducible
# across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = 123)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
permImp(x_train, y_train)

In [None]:
mscore = mi_score(x_train, y_train)

In [None]:
sns.scatterplot(data = df, x = 'abnormal_short_term_variability', y = 'mean_value_of_short_term_variability', hue = 'fetal_health')

In [None]:
sns.scatterplot(data = df, y = 'histogram_number_of_zeroes', x = 'mean_value_of_short_term_variability', hue = 'fetal_health')

In [None]:
df.describe()

In [None]:
df.nunique()

In [None]:
skewness = pd.DataFrame(np.abs(skew(df)), columns = ['skew_score'], index = df.columns).sort_values('skew_score', ascending = True)
plt.barh(skewness.index, skewness.skew_score)
plt.title('skewness score')

In [None]:
# Columns whose absolute skew exceeds 2. The target is dropped if present:
# it is a class label, so it must never enter the outlier handling below.
high_skewness = skewness[skewness.skew_score > 2].index.drop('fetal_health', errors = 'ignore')
# squeeze = False keeps `ax` two-dimensional even when a single column qualifies,
# so ax[ind, 0] / ax[ind, 1] always work.
fig, ax = plt.subplots(len(high_skewness), 2, figsize = (16, 10), squeeze = False)
for ind, col in enumerate(high_skewness):
    # Left: distribution, right: box plot of the same column.
    sns.distplot(df[col], ax = ax[ind, 0])
    # Passed as `x=` because newer seaborn treats the first positional argument as `data`.
    sns.boxplot(x = df[col], ax = ax[ind, 1])
# Lay out after drawing so the spacing accounts for the actual labels.
fig.tight_layout()

In [None]:
def outliers(df, col):
    """Return the index labels of rows whose `col` value is an IQR outlier.

    A value is an outlier when it lies more than 1.5 * IQR below the first
    quartile or above the third quartile of the column.

    Parameters
    ----------
    df : DataFrame containing `col`.
    col : name of the column to inspect.

    Returns
    -------
    Index of the offending rows (empty when there are none).
    """
    values = df[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    whisker = 1.5 * (q3 - q1)
    too_low = values < (q1 - whisker)
    too_high = values > (q3 + whisker)
    return df[too_low | too_high].index

In [None]:
# Work on a copy so the original frame stays untouched.
testing = df.copy()
# squeeze = False keeps `ax` two-dimensional even when a single column qualifies.
fig, ax = plt.subplots(len(high_skewness), 2, figsize = (16, 10), squeeze = False)
for ind, col in enumerate(high_skewness):
    # Replace every IQR outlier of the column with the column mean
    # (the mean is taken before the replacement, outliers included).
    idx = outliers(testing, col)
    testing.loc[idx, col] = testing[col].mean()
    sns.distplot(testing[col], ax = ax[ind, 0])
    # Passed as `x=` because newer seaborn treats the first positional argument as `data`.
    sns.boxplot(x = testing[col], ax = ax[ind, 1])
# Lay out after drawing so the spacing accounts for the actual labels.
fig.tight_layout()

In [None]:
# Count of each severe_decelerations value, split by fetal_health class.
# Column names with `data=` work across seaborn versions; a positional Series
# is interpreted as `data` by newer releases.
sns.countplot(data = df, x = 'severe_decelerations', hue = 'fetal_health')

In [None]:
isolate(x_train, y_train, 'severe_decelerations')

In [None]:
def interact(x, y, cols):
    """Draw a partial-dependence interaction plot for a pair of features.

    Fits a default RandomForestClassifier on (x, y), asks pdpbox for the
    interaction of the features listed in `cols`, and plots it.

    Parameters
    ----------
    x : feature table; its columns are passed as the model features.
    y : target values aligned with x.
    cols : names of the features whose interaction is plotted.

    Returns
    -------
    The object produced by `pdp.pdp_interact_plot`.
    """
    forest = rfc()
    forest.fit(x, y)
    interaction = pdp.pdp_interact(forest, dataset = x, model_features = x.columns , features = cols)
    return pdp.pdp_interact_plot(interaction, feature_names = cols)

In [None]:
interact(x_train, y_train, ['severe_decelerations','fetal_movement'])

##### 0 - Normal
##### 1 - Suspect
##### 2 - Pathological

* If both fetal movement and severe decelerations are low, there is a 78% chance that the fetus is normal.
* If severe_decelerations increases, then regardless of how much the fetus moves, there is a low chance that the fetus is not normal.

In [None]:
# Features / target taken from the frame whose skewed-column outliers were replaced.
t_x = testing.drop(['fetal_health'], axis = 1)
t_y = testing.fetal_health
# Split the outlier-treated data (t_x, t_y); splitting x and y here would just
# produce another split of the untreated frame. Fixed seed for reproducibility.
new_x_train, new_x_test, new_y_train, new_y_test = train_test_split(t_x, t_y, test_size = 0.2, stratify = t_y, random_state = 123)
new_x_train.shape, new_x_test.shape, new_y_train.shape, new_y_test.shape

In [None]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBRFClassifier
import xgboost as xgb
from lightgbm import LGBMClassifier

from sklearn.model_selection import cross_validate, KFold

from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler , RobustScaler, MinMaxScaler

In [None]:
def best_model(x, y):
    """Cross-validate six classifiers under four scaling options.

    Each classifier is scored with 5-fold shuffled cross-validation
    (micro-averaged F1) once without scaling and once behind each of
    StandardScaler, RobustScaler and MinMaxScaler.

    Parameters
    ----------
    x : feature table.
    y : target values aligned with x.

    Returns
    -------
    DataFrame with one row per classifier and the columns
    'None', 'std', 'robust', 'minmax' holding the mean test score.
    """
    xgb.set_config(verbosity=0)
    models = [SVC(), KNeighborsClassifier(), GaussianNB(), rfc(), XGBRFClassifier(), LGBMClassifier()]
    names = ['svm', 'knn', 'naive bayes', 'random forest', 'xgboost', 'lightgb']
    scaler_labels = ['None', 'std', 'robust', 'minmax']
    scores = [[] for _ in scaler_labels]
    # Same folds for every model/scaler combination so the scores are comparable.
    cv = KFold(5, shuffle = True, random_state = 123)
    for model in models:
        for idx, scaler in enumerate([None, StandardScaler(), RobustScaler(), MinMaxScaler()]):
            # Build the estimator under a separate name: rebinding `model` here
            # would make every later scaler wrap the previous pipeline, so the
            # 'robust' and 'minmax' columns would measure stacked scalers.
            if scaler is None:
                estimator = model
            else:
                estimator = Pipeline(steps = [('scaler', scaler), ('model', model)])
            score = cross_validate(estimator, X = x, y = y, cv = cv, scoring = 'f1_micro')['test_score'].mean()
            scores[idx].append(score)
    return pd.DataFrame(scores, columns = names, index = scaler_labels).T

def clf_report(yt, pred):
    """Print sklearn's classification report framed by one blank line on each side.

    Parameters
    ----------
    yt : true labels.
    pred : predicted labels.
    """
    report = classification_report(yt,  pred)
    # Leading "\n" gives the blank line above; the trailing "\n" plus print's
    # own newline gives the blank line below.
    print(f"\n{report}\n")
    
def get_score(xt, yt, xtest, ytest, model, scaler = None):
    """Fit a model, print train/test scores and a report, and draw the confusion matrix.

    Parameters
    ----------
    xt, yt : training features and labels.
    xtest, ytest : evaluation features and labels.
    model : estimator to fit.
    scaler : optional transformer; when given, the model is fitted behind it
        inside a two-step Pipeline ('scaler', 'model').

    Returns
    -------
    None. Output is printed and a heatmap is drawn on the current axes.
    """
    estimator = Pipeline(steps = [('scaler', scaler), ('model', model)]) if scaler else model
    estimator.fit(xt, yt)
    pred = estimator.predict(xtest)

    print(' Report '.center(60,'='))
    print()
    train_score = estimator.score(xt, yt)
    print(f"training score  :===>  {train_score}")
    test_score = estimator.score(xtest, ytest)
    print(f"testing score   :===>  {test_score}")
    clf_report(ytest, pred)

    matrix = confusion_matrix(ytest, pred)
    sns.heatmap(matrix, fmt = '.1f', annot = True)
    
    
def gridcv(xt, yt, model, params, scaler = None):
    """Run a grid search with 5-fold shuffled CV scored by micro-averaged F1.

    Parameters
    ----------
    xt, yt : training features and labels.
    model : estimator to tune.
    params : parameter grid handed to GridSearchCV. When a scaler is given the
        estimator is the 'model' step of a Pipeline, so keys are expected in
        the 'model__<name>' form.
    scaler : optional transformer placed in front of the model.

    Returns
    -------
    (best_estimator, best_params, results) where results holds the mean train
    score, mean test score and params of every candidate, best test score first.
    """
    estimator = model
    if scaler:
        estimator = Pipeline(steps = [('scaler', scaler), ('model', model)])
    folds = KFold(5, shuffle = True, random_state = 123)
    search = GridSearchCV(estimator, param_grid = params, cv = folds, scoring = 'f1_micro', return_train_score = True, verbose = 1)
    search.fit(xt, yt)
    ranked = pd.DataFrame(search.cv_results_).sort_values('mean_test_score', ascending = False)
    summary = ranked[['mean_train_score','mean_test_score','params']]
    return search.best_estimator_, search.best_params_, summary
    

In [None]:
# normal x and y train
best_model(x_train, y_train)

In [None]:
# replaced all the outliers for high skewed features with their mean x and y train 
best_model(new_x_train, new_y_train)

In [None]:
from imblearn.over_sampling import SMOTE 

In [None]:
# Over-sample the minority classes of the training split only.
# Fixed seed so the synthetic samples are the same on every run.
smot = SMOTE(random_state = 123)
smot_x_train, smot_y_train = smot.fit_resample(x_train, y_train)
# Class balance after resampling; passed as `x=` because newer seaborn treats
# the first positional argument as `data`.
sns.countplot(x = smot_y_train)

In [None]:
# over-sampled x and y train 
best_model(smot_x_train, smot_y_train)

In [None]:
smot_x_train.shape, smot_y_train.shape

In [None]:
params = {
    'model__max_depth' : [8, 13, 16, 18, -1],
    'model__boosting_type' : ['gbdt', 'dart'],
    'model__n_estimators' : [100, 200, 300],
    'model__reg_lambda' : [0,1],
    'model__reg_alpha'  : [0.1,0.3,0.5]
}
gridcv(smot_x_train, smot_y_train, LGBMClassifier(), params, StandardScaler())

In [None]:
get_score(smot_x_train, smot_y_train, x_test, y_test, LGBMClassifier(max_depth = 8, n_estimators = 200, reg_alpha = 0.4), StandardScaler())