# Gestational diabetes predictor

In [1]:
# Importing basic library

import numpy as np
import pandas as pd

# Importing the require modules from sklearn library for implementing our predictive models
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# Importing joblib for saving the pipeline
import joblib

In [2]:
# Importing the gestational diabetes dataset

gdm_data = pd.read_csv('/opt/static/GDM.csv')

In [3]:
from sqlalchemy import create_engine 
engine = create_engine('postgresql://local:password@postgres:5433/postgres')
connection = engine.connect()

OperationalError: (psycopg2.OperationalError) could not connect to server: Connection refused
	Is the server running on host "postgres" (172.18.0.5) and accepting
	TCP/IP connections on port 5433?

(Background on this error at: https://sqlalche.me/e/14/e3q8)

## Exploratory data analysis (EDA)

In [3]:
# Dimensions of the dataset

gdm_data.shape

(3525, 17)

In [4]:
# Checking the newly imported dataset

gdm_data.head()

Unnamed: 0,Case Number,Age,No of Pregnancy,Gestation in previous Pregnancy,BMI,HDL,Family History,unexplained prenetal loss,Large Child or Birth Default,PCOS,Sys BP,Dia BP,OGTT,Hemoglobin,Sedentary Lifestyle,Prediabetes,Class Label(GDM /Non GDM)
0,1,22,2,1,,55.0,0,0,0,0,102.0,69,,12.0,0,0,0
1,2,26,2,1,,53.0,0,0,0,0,101.0,63,,12.4,0,0,0
2,3,29,1,0,,50.0,0,0,0,0,118.0,79,,14.3,0,0,0
3,4,28,2,1,,51.0,0,0,0,0,99.0,70,,15.0,0,0,0
4,5,21,2,1,,52.0,0,0,0,0,116.0,65,,15.0,0,0,0


In [5]:
# The dataset contains Case Number which isnt of any value in the prediction, so we can go ahead and drop it

gdm_data.drop(['Case Number'], axis = 1, inplace = True)

In [6]:
# Also lowering and removing any spaces in the columns name

gdm_data.columns = [x.lower().replace(" ", "_") for x in gdm_data.columns]
gdm_data.rename(columns={"class_label(gdm_/non_gdm)":"gdm_status"}, inplace = True)
gdm_data.head()

Unnamed: 0,age,no_of_pregnancy,gestation_in_previous_pregnancy,bmi,hdl,family_history,unexplained_prenetal_loss,large_child_or_birth_default,pcos,sys_bp,dia_bp,ogtt,hemoglobin,sedentary_lifestyle,prediabetes,gdm_status
0,22,2,1,,55.0,0,0,0,0,102.0,69,,12.0,0,0,0
1,26,2,1,,53.0,0,0,0,0,101.0,63,,12.4,0,0,0
2,29,1,0,,50.0,0,0,0,0,118.0,79,,14.3,0,0,0
3,28,2,1,,51.0,0,0,0,0,99.0,70,,15.0,0,0,0
4,21,2,1,,52.0,0,0,0,0,116.0,65,,15.0,0,0,0


In [7]:
# Printing concise summary of the dataframe

gdm_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3525 entries, 0 to 3524
Data columns (total 16 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   age                              3525 non-null   int64  
 1   no_of_pregnancy                  3525 non-null   int64  
 2   gestation_in_previous_pregnancy  3525 non-null   int64  
 3   bmi                              2444 non-null   float64
 4   hdl                              2524 non-null   float64
 5   family_history                   3525 non-null   int64  
 6   unexplained_prenetal_loss        3525 non-null   int64  
 7   large_child_or_birth_default     3525 non-null   int64  
 8   pcos                             3525 non-null   int64  
 9   sys_bp                           1820 non-null   float64
 10  dia_bp                           3525 non-null   int64  
 11  ogtt                             3012 non-null   float64
 12  hemoglobin          

In [8]:
# Summary of missing values in each column

gdm_data.isna().sum()

age                                   0
no_of_pregnancy                       0
gestation_in_previous_pregnancy       0
bmi                                1081
hdl                                1001
family_history                        0
unexplained_prenetal_loss             0
large_child_or_birth_default          0
pcos                                  0
sys_bp                             1705
dia_bp                                0
ogtt                                513
hemoglobin                            0
sedentary_lifestyle                   0
prediabetes                           0
gdm_status                            0
dtype: int64

### Initial observation from the exploratory data analysis (EDA)

 * From the initial EDA, we can see that all the columns have numerical values (int or float). 
 * There are some features (columns) having missing values (NaNs). 
 
Let's explore further about which of the columns (features) are numeric features and which ones are categorical features. Furthermore, we need to check if any of the numeric features contain outliers that need to be dealt with.

In [9]:
gdm_data.describe()

Unnamed: 0,age,no_of_pregnancy,gestation_in_previous_pregnancy,bmi,hdl,family_history,unexplained_prenetal_loss,large_child_or_birth_default,pcos,sys_bp,dia_bp,ogtt,hemoglobin,sedentary_lifestyle,prediabetes,gdm_status
count,3525.0,3525.0,3525.0,2444.0,2524.0,3525.0,3525.0,3525.0,3525.0,1820.0,3525.0,3012.0,3525.0,3525.0,3525.0,3525.0
mean,32.581277,2.076312,0.960851,27.854092,46.47187,0.498156,0.364823,0.35773,0.264681,135.762088,81.538156,170.707503,13.959801,0.449645,0.29305,0.38922
std,6.169107,0.838637,0.722313,5.714737,10.800813,0.500068,0.481449,0.4794,0.441226,22.742844,11.379758,48.160549,1.863969,0.497529,0.455226,0.487643
min,20.0,1.0,0.0,13.3,15.0,0.0,0.0,0.0,0.0,90.0,60.0,80.0,8.8,0.0,0.0,0.0
25%,28.0,1.0,0.0,24.4,42.0,0.0,0.0,0.0,0.0,122.0,74.0,142.0,12.7,0.0,0.0,0.0
50%,32.0,2.0,1.0,27.5,49.0,0.0,0.0,0.0,0.0,132.0,81.0,156.0,14.0,0.0,0.0,0.0
75%,37.0,3.0,1.0,31.0,55.0,1.0,1.0,1.0,1.0,153.0,86.0,195.0,15.0,1.0,1.0,1.0
max,45.0,4.0,2.0,45.0,70.0,1.0,1.0,1.0,1.0,185.0,124.0,403.0,18.0,1.0,1.0,1.0


### a. Numerical and categorical features

From the above summary table, we can identify which features are numerical and which ones are categorical variables (features/columns) in our dataset. The numerical and categorical features have been listed below:

<u><b>Numerical features (8 features)</b></u>:
   1. age
   2. no_of_pregnancy
   3. bmi --- *body mass index*
   4. hdl --- *high density lipoprotein*
   5. sys_bp --- *systolic blood pressure* 
   6. dia_bp --- *diastolic blood presure*
   7. ogtt --- *oral glucsose tolerance test*
   8. hemoglobin

<u><b>Categorical features (7 features)</b></u>:
   1. gestation_in_previous_pregnancy
   2. family_history
   3. unexplained_prenetal_loss
   4. large_child_or_birth_default
   5. pcos --- *polycystic ovary syndrome**
   6. sedentary_lifestyle
   7. prediabetes
   
All of the categorical features are binary nominal features.

### b. Missing values
There are four numerical features having missing values: *bmi*, *hdl*, *dia_bp* and *ogtt*. These rows having these missing values cannot be removed as our data is quite limited, instead, we will try to impute these values.

### c. Outliers
The numerical features do not seem to have any outliers.

## Separating the imported data into Input and Target variables

For ML training, we have to separate the dataset into **Input** (aka features/independent variables) and **Target** (dependent variable).

The Target variable in our dataset is the *Class Label (GDM /Non GDM)* column.

- The Input variables will be stored in **X**
- The Target variable will be stored in **y**

In [10]:
# Input and TARGET
X = gdm_data.drop(['gdm_status'], axis = 1)
y = gdm_data['gdm_status']

## Splitting the Input and Target variables into Training sets and Test sets

Splitting is 80% training set and 20% test set (This is very common split ratio)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

## Pre-processing the data

At this stage, we would want to think about pre-processing our data so that our data is in proper format to be input to our ML models.

At this stage, we would want to do the following pre-processing:
- if we have categorical variables, we need to encode them into numerical values using say OneHotEncoding
- for numerical variables, we need to perform scaling to standardize the values.
- if we have missing values, we need to deal with them e.g. remove them altogether or impute the missing values
- if we have duplicate values, these do not offer any benefit in terms of prediction, rather they have an overhead
- etc.

If our feature variables have both categorical and numerical features, we need to create a separate lists of numeric and categorical features in our dataset. This is done so that we can apply separate preprocessing steps on numerical and categorical features.

In [12]:
# Numeric features
numeric_features = ['age', 'no_of_pregnancy','bmi','hdl','sys_bp','dia_bp','ogtt','hemoglobin']

# Categorical features
categorical_features = ['gestation_in_previous_pregnancy', 'family_history','unexplained_prenetal_loss',
                       'large_child_or_birth_default','pcos','sedentary_lifestyle','prediabetes']

## Setting up the pipeline

In [13]:
#-- Numerical feature transformer

numeric_transformer = Pipeline(steps = [("imputer", KNNImputer()),
                                       ("scaler", StandardScaler())])

categorical_transformer = Pipeline(steps = [("ohe", OneHotEncoder(handle_unknown = "ignore"))])

#-- Creating pre-processing pipeline

preprocessing_pipeline = ColumnTransformer(transformers = [('numeric', numeric_transformer, numeric_features),
                                                          ('categorical', categorical_transformer, categorical_features)])

Now that we have setup our preprocessing pipeline, we need to create a final pipeline. This pipeline will be responsible for receiving the data, passing it through the preprocessing pipeline, and then into our prediction models.

### Applying the pipeline

We will be applying the pipeline using two classifiers:
1. Logistic regression classifier
2. Random forest classifier

In [14]:
# Applying the pipeline to logistic regression model

clf_lg = Pipeline(steps = [('preprocessing_pipeline', preprocessing_pipeline),
                        ('classifier',LogisticRegression(random_state = 42))])

# -->Fitting the pipeline to our data
clf_lg.fit(X_train,y_train)

# --> Predicting
y_pred_class = clf_lg.predict(X_test)

# --> Finding the accuracy of our trained model
accuracy_score(y_test, y_pred_class)

0.9602836879432625

In [15]:
# Applying the pipeline to Random Forest model

clf_rf = Pipeline(steps = [('preprocessing_pipeline', preprocessing_pipeline),
                        ('classifier',RandomForestClassifier(random_state = 42))])

# -->Fitting the pipeline to our data
clf_rf.fit(X_train,y_train)

# --> Predicting
y_pred_class = clf_rf.predict(X_test)

# --> Finding the accuracy of our trained model
accuracy_score(y_test, y_pred_class)

0.9673758865248226

## Saving the model

Final step is to save the model so that we can load it from the API

In [16]:
joblib.dump(clf_rf, '/opt/static/model_rf.joblib')

['/opt/static/model_rf.joblib']