In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print them one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**IMPORT STATEMENTS**

In [None]:
import os
import warnings
warnings.filterwarnings("ignore")

from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing
from sklearn import model_selection

**Split into Training and Testing dataset**

In [None]:
#create 3 stratified folds so that each fold contains roughly 33% data
#since it's stratified, we will have uniform distribution of target categories across folds
#keep one fold aside as testing dataset
def create_test(input_path="../input/fetal-health-classification/fetal_health.csv",
                output_dir="../dataset",
                n_splits=3,
                random_state=42):
    """Split the raw dataset into a training file and a held-out test file.

    The rows are shuffled, assigned to ``n_splits`` stratified folds on the
    ``fetal_health`` column, and fold 0 is written out as the test set while
    all other folds together form the training set.

    Args:
        input_path: CSV file containing the full dataset, including the
            ``fetal_health`` target column.
        output_dir: Directory that receives ``train.csv`` and ``test.csv``.
            It is created if it does not exist.
        n_splits: Number of stratified folds; one fold becomes the test set.
        random_state: Seed for the row shuffle so the split is the same on
            every run. Pass ``None`` for a different shuffle each time.
    """
    df = pd.read_csv(input_path)

    # Seeded shuffle: without a seed the train/test membership changes on
    # every run, which makes downstream scores impossible to compare.
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Tag every row with the index of the fold in which it is a validation row.
    df["kfold"] = -1
    kf = model_selection.StratifiedKFold(n_splits=n_splits)
    for fold, (_, val_idx) in enumerate(kf.split(X=df, y=df.fetal_health.values)):
        df.loc[val_idx, "kfold"] = fold

    os.makedirs(output_dir, exist_ok=True)

    # Fold 0 is the test set; the helper "kfold" column is not persisted.
    is_test = df.kfold == 0
    df_train = df[~is_test].drop(columns="kfold").reset_index(drop=True)
    df_test = df[is_test].drop(columns="kfold").reset_index(drop=True)

    df_train.to_csv(os.path.join(output_dir, "train.csv"), index=False)
    df_test.to_csv(os.path.join(output_dir, "test.csv"), index=False)
    print("Test Dataset created")
    

**Split the Training data into 5 folds for kfold validation**

In [None]:
#now take the train dataset that you have created 
#and split it into 5 stratified folds (k=5)
#while training we will use kfold validation, i.e, train on 4 folds and validate on the remaining 1

def create_folds(input_path="../dataset/train.csv",
                 output_path="../dataset/train_folds.csv",
                 n_splits=5,
                 random_state=42):
    """Add a stratified ``kfold`` column to the training data and save it.

    The rows of ``input_path`` are shuffled and assigned to ``n_splits``
    stratified folds on the ``fetal_health`` column. The fold index of each
    row is stored in a new ``kfold`` column and the result is written to
    ``output_path``.

    Args:
        input_path: CSV file with the training rows, including the
            ``fetal_health`` target column.
        output_path: Destination CSV; its parent directory is created if
            it does not exist.
        n_splits: Number of stratified folds to create.
        random_state: Seed for the row shuffle so fold membership is the
            same on every run. Pass ``None`` for a different shuffle each time.
    """
    df = pd.read_csv(input_path)

    # Seeded shuffle keeps fold assignment stable across kernel restarts.
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Each row gets the index of the fold in which it is a validation row.
    df["kfold"] = -1
    kf = model_selection.StratifiedKFold(n_splits=n_splits)
    for fold, (_, val_idx) in enumerate(kf.split(X=df, y=df.fetal_health.values)):
        df.loc[val_idx, "kfold"] = fold

    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    df.to_csv(output_path, index=False)
    print("Training Dataset with 5 folds created")
    

In [None]:
#call the methods in succession
if not os.path.isdir('../dataset/'):
    print("Creating dircetory dataset")
    os.makedirs('../dataset/')
else:
    print("Dircetory dataset already exists")
!ls ..
!pwd
create_test()
create_folds()

**Create Baseline Score With Logistic Regression**

In [None]:
def run(fold, random_state=42):
    """Train and validate the baseline logistic regression on one fold.

    Reads ``../dataset/train_folds.csv``, uses the rows whose ``kfold`` value
    equals ``fold`` as the validation set and all other rows as the training
    set. Features are standardised with a scaler fitted on the training rows
    only, so no validation statistics leak into training.

    Args:
        fold: Value of the ``kfold`` column to hold out for validation.
        random_state: Seed passed to ``LogisticRegression``. The ``saga``
            solver is stochastic, so an unseeded model gives slightly
            different scores on every run.

    Returns:
        The validation ROC AUC for this fold (it is also printed).
    """
    df = pd.read_csv("../dataset/train_folds.csv")

    # Every column except the target and the fold id is a feature.
    features = [f for f in df.columns if f not in ("fetal_health", "kfold")]

    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # Fit the scaler on the training rows only, then apply it to both parts.
    scaler = preprocessing.StandardScaler()
    x_train_scaled = scaler.fit_transform(df_train[features])
    x_valid_scaled = scaler.transform(df_valid[features])

    model = linear_model.LogisticRegression(
        multi_class='multinomial',
        solver='saga',
        random_state=random_state,
    )
    model.fit(x_train_scaled, df_train.fetal_health.values)

    valid_preds = model.predict_proba(x_valid_scaled)

    # One-hot encode the true labels so they line up column-wise with the
    # per-class probabilities returned by predict_proba.
    auc = metrics.roc_auc_score(pd.get_dummies(df_valid.fetal_health.values), valid_preds)

    print("Fold = {}, AUC = {}".format(fold, auc))
    return auc


# Score every fold and summarise with the mean across folds.
fold_aucs = []
for fold_ in range(5):
    fold_aucs.append(run(fold_))
print("Mean AUC across folds = {}".format(np.mean(fold_aucs)))

In [None]:
#This gives a good enough model
#Let's train on the entire training set and create the model

df_train = pd.read_csv("../dataset/train_folds.csv")

# Every column except the target and the fold id is a feature.
features = [f for f in df_train.columns if f not in ("fetal_health", "kfold")]

x_train = df_train[features]

# Fit the scaler on the full training set; `scaler` and `model` are kept as
# notebook-level names because the testing cell uses them.
scaler = preprocessing.StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)

# saga is a stochastic solver: seed it so the final model (and therefore the
# reported test metrics) is identical on every Restart & Run All.
model = linear_model.LogisticRegression(
    multi_class='multinomial',
    solver='saga',
    random_state=42,
)

model.fit(x_train_scaled, df_train.fetal_health.values)


**Testing Time**

In [None]:
# Load the held-out split that create_test() wrote to disk.
df_test = pd.read_csv("../dataset/test.csv")

# Feature columns are everything except the target (and the fold id, if present).
features = [col for col in df_test.columns if col not in ("fetal_health", "kfold")]
x_test = df_test[features]
y_test = df_test.fetal_health.values

# Reuse the scaler fitted on the training data; it is not re-fitted here.
x_test_scaled = scaler.transform(x_test)

# Class probabilities feed the AUC, hard class predictions feed the F1 score.
test_proba = model.predict_proba(x_test_scaled)
test_preds = model.predict(x_test_scaled)

auc = metrics.roc_auc_score(pd.get_dummies(y_test), test_proba)
f1_score = metrics.f1_score(y_test, test_preds, average='weighted')

print("Testing AUC = {}".format(auc))
print("Testing f1-Score = {}".format(f1_score))