In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler,LabelEncoder

In [None]:
# Import Dataset: read the maternal-health CSV directly from GitHub
# (a fresh kernel run therefore needs network access)
url = "https://raw.githubusercontent.com/sanjida-akhtar/maternal-health-analysis/main/data/maternal_health.csv"
df = pd.read_csv(url)
# Preview the first rows of the raw frame
df.head()

In [None]:
# Copy the dataset so the raw frame `df` is left untouched by the cleaning below
data = df.copy()
data.head()

In [None]:
# Rename columns
data.rename(columns = {"District " : "District"}, inplace = True)

In [None]:
# Drop unncessary columns
data = data.drop(columns = ["Division","Total", "ANC1(%)", "ANC2(%)", "ANC3(%)", "ANC4(%)", "NVD(%)", "Ceasarean(%)","Total Death", "Maternal Death(%)",
                           "Maternal Death Review", "Total PNC", "PNC1(%)", "PNC2(%)"], axis = 1)

In [None]:
# Display the shape of the dataset as (rows, columns)
data.shape

In [None]:
# Display basic information of the dataset (columns, non-null counts, dtypes)
data.info()

In [None]:
# Display a statistical summary of every column, numeric and non-numeric alike
data.describe(include="all")

In [None]:
# Check the data type of each variable
data.dtypes

In [None]:
# Label Encoding on District variable: the column is replaced by integer codes
le = LabelEncoder()
data["District"] = le.fit_transform(data["District"])

In [None]:
# Display the number of rows per encoded District value
data["District"].value_counts()

In [None]:
# Label encoding on Month variable
# NOTE(review): LabelEncoder numbers its classes in sorted order, so if Month
# holds month names the codes will be alphabetical, not calendar order — confirm
# that this is acceptable for the models below.
la = LabelEncoder()
data["Month"] = la.fit_transform(data["Month"])

In [None]:
# Display the number of rows per encoded Month value
data["Month"].value_counts()

In [None]:
# Take an independent copy of the raw death counts to discretize.
# `.values` can hand back a view onto the column's own buffer, so any in-place
# edit made while discretizing would also overwrite data["Maternal Death"];
# copying keeps the original counts intact.
arr = data["Maternal Death"].to_numpy(copy = True)
# Plain-list snapshot of the same counts
li = list(arr)
def death_discretization(data):
    """
    Discretize death counts into four ordinal categories.

    0 represents No Death (value == 0),
    1 represents Low number of Death (value < 25),
    2 represents Medium number of Death (value < 50) and
    3 represents High number of Death (everything else).

    data : list or NumPy array of death counts. It is NOT modified.

    Returns a new container with one category code per input value: a NumPy
    array with the input's dtype when a NumPy array is passed, otherwise a list.

    NOTE(review): a NaN compares False against every threshold and therefore
    lands in category 3, and a negative value lands in category 1 — confirm the
    input column cannot contain either.
    """
    def _category(value):
        # Thresholds are checked in ascending order; the first match wins.
        if value == 0:
            return 0        # No death
        if value < 25:
            return 1        # Low Death
        if value < 50:
            return 2        # Medium Death
        return 3            # High Death

    # Build a fresh result instead of writing into the caller's data.
    categories = [_category(value) for value in data]
    if isinstance(data, np.ndarray):
        return np.asarray(categories, dtype = data.dtype)
    return categories

# Create the "Maternal Death Category" column to hold the discretized values
data["Maternal Death Category"] = death_discretization(arr)

In [None]:
# Set data type to integer
data["Maternal Death Category"] = data["Maternal Death Category"].astype(int)

In [None]:
# Display the number of rows in each category
data["Maternal Death Category"].value_counts()

In [None]:
# Plot a countplot of the number of records per Year
sns.countplot(data = data, x = "Year")
plt.xlabel("Year")
# The title previously read "Histogram of ANC1 variable", copied from another cell
plt.title("Countplot of Year variable")
plt.show()

In [None]:
# Plot a boxplot of the District variable
sns.boxplot(data = data, x = "District")
# The x-axis shows District; the label previously read "Year"
plt.xlabel("District")
plt.title("boxplot of District variable")
plt.tight_layout()
plt.show()

In [None]:
# Plot a boxplot of the Month variable
sns.boxplot(data = data, x = "Month")
# The x-axis shows Month; the label previously read "Year"
plt.xlabel("Month")
plt.title("boxplot of Month variable")
plt.tight_layout()
plt.show()

In [None]:
# Univariate EDA: one histogram per care/delivery variable.
# A single helper replaces nine copy-pasted cells so every figure is built the
# same way and a styling change only has to be made once.
def plot_histogram(frame, column, xlabel):
    """
    Draw and show a histogram of one column.

    frame  : DataFrame that holds the variable.
    column : Name of the column to plot; also used in the figure title.
    xlabel : Text for the x-axis label.
    """
    sns.histplot(data = frame, x = column)
    plt.xlabel(xlabel)
    plt.title(f"Histogram of {column} variable")
    plt.show()

# (column name, x-axis label) pairs, in display order
histogram_specs = [
    ("ANC1", "ANC1"),
    ("ANC2", "ANC2"),
    ("ANC3", "ANC3"),
    ("ANC4", "ANC4"),
    ("Delivery", "Delivery"),
    ("Ceasarean", "Ceasarean"),
    ("NVD", "NVD"),
    ("PNC1", "Post natal care-1"),
    ("PNC2", "Post natal care-2"),
]
for hist_column, hist_xlabel in histogram_specs:
    plot_histogram(data, hist_column, hist_xlabel)

In [None]:
# Bivariate EDA

# One barplot per variable against the Maternal Death Category.
# A single helper replaces nine copy-pasted cells; this also removes two
# copy-paste slips (the ANC1 title read "ANC" and the PNC2 y-axis was labelled
# "Postnatal Care-1").
def plot_category_barplot(frame, column, ylabel):
    """
    Draw and show a barplot of one column grouped by Maternal Death Category.

    frame  : DataFrame holding both `column` and "Maternal Death Category".
    column : Name of the column plotted on the y-axis; also used in the title.
    ylabel : Text for the y-axis label.
    """
    sns.barplot(data = frame, x = "Maternal Death Category", y = column)
    plt.ylabel(ylabel)
    plt.xlabel("Maternal Death Category")
    plt.title(f"Barplot of {column} and Maternal Death Category")
    plt.show()

# (column name, y-axis label) pairs, in display order
barplot_specs = [
    ("ANC1", "Antenatal Care-1"),
    ("ANC2", "Antenatal Care-2"),
    ("ANC3", "Antenatal Care-3"),
    ("ANC4", "Antenatal Care-4"),
    ("Delivery", "Delivery"),
    ("NVD", "Normal Vaginal Delivery"),
    ("Ceasarean", "Ceasarean"),
    ("PNC1", "Postnatal Care-1"),
    ("PNC2", "Postnatal Care-2"),
]
for bar_column, bar_ylabel in barplot_specs:
    plot_category_barplot(data, bar_column, bar_ylabel)

In [None]:
# Display crosstab
dis = pd.crosstab(data["Maternal Death Category"], data["District"])
dis

In [None]:
# Display heatmap
sns.heatmap(dis, annot = True, fmt = "d", cmap = "Blues")
plt.title("Heatmap")
plt.show()

In [None]:
# Display crosstab
month = pd.crosstab(data["Maternal Death Category"], data["Month"])
month

In [None]:
# Display heatmap
sns.heatmap(month, annot = True, fmt = "d", cmap = "Blues")
plt.title("Heatmap")
plt.show()

In [None]:
# Separate features from the target. The raw death count is dropped along with
# the target because the category is derived from it.
target_column = "Maternal Death Category"
X = data.drop(columns = ["Maternal Death", target_column])
y = data[target_column]

In [None]:
# Find correlation among variables
cor = data.corr()

In [None]:
# Display a heatmap
plt.figure(figsize = (10, 8))
sns.heatmap(cor, cmap = "Blues", annot = True, square = True, center = 0, fmt = ".2f")
plt.title("Correlation Heatmap", fontsize = 16)
plt.xticks(rotation = 45)
plt.yticks(rotation = 0)
plt.show()

In [None]:
# Hold out 20% of the rows as the test set ...
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size = 0.20, random_state = 42
)
# ... then carve 10% of the remainder off as the validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size = 0.1, random_state = 42
)

# Standardise: the scaler is fitted on the training rows only and then applied
# to all three splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Create RandomForestClassifier object. The seed is fixed (same value as the
# train/test split) so the forest and the metrics below are reproducible after
# a kernel restart.
rfc = RandomForestClassifier(random_state = 42)
# Train the model
rfc.fit(X_train_scaled, y_train)
# Predict validation data
rfc_yhat_val = rfc.predict(X_val_scaled)

# Calculate metrics for validation set; precision/recall/F-1 are macro-averaged
# over the death categories
rfc_accuracy = accuracy_score(y_val, rfc_yhat_val)

rfc_pre = precision_score(y_val, rfc_yhat_val, average = "macro")
rfc_rec = recall_score(y_val, rfc_yhat_val, average = "macro")
rfc_f1 = f1_score(y_val, rfc_yhat_val, average = "macro")

print("Evaluation metrics in validation set: ")
print(f"Accuracy in RandomForest Classifier: {rfc_accuracy: .2f}")
print(f"Precision Score in RandomForest Classifier: {rfc_pre: .2f}")
print(f"Recall Score in RandomForest Classifier: {rfc_rec: .2f}")
print(f"F-1 Score in RandomForest Classifier: {rfc_f1: .2f}")

In [None]:
# Tune the random forest with a 5-fold cross-validated grid search over the
# number of trees and the maximum tree depth
params = {"n_estimators" : [100, 200, 300],
          "max_depth" : [1, 2, 5],
          }
# Fixed seed so the search picks the same winner on every run
grid = GridSearchCV(RandomForestClassifier(random_state = 42), params , cv = 5)
grid.fit(X_train_scaled, y_train)

print("Best params: ", grid.best_params_)
best_model = grid.best_estimator_
y_pred = best_model.predict(X_val_scaled)

# The target has more than two categories, so the scikit-learn default
# average="binary" raises a ValueError here; use macro averaging like the other
# evaluation cells
accuracy = accuracy_score(y_val, y_pred)
pre = precision_score(y_val, y_pred, average = "macro")
rec = recall_score(y_val, y_pred, average = "macro")
f1 = f1_score(y_val, y_pred, average = "macro")
print("Evaluation metrics in validation set: ")
print(f"Accuracy in RandomForest Classifier: {accuracy: .2f}")
print(f"Precision Score in RandomForest Classifier: {pre: .2f}")
print(f"Recall Score in RandomForest Classifier: {rec: .2f}")
print(f"F-1 Score in RandomForest Classifier: {f1: .2f}")

In [None]:
# Score the tuned model on the held-out test set
yhat_test = best_model.predict(X_test_scaled)

# Accuracy plus macro-averaged precision, recall and F-1
accuracy_test = accuracy_score(y_test, yhat_test)
pre_test, rec_test, f1_test = (
    metric(y_test, yhat_test, average = "macro")
    for metric in (precision_score, recall_score, f1_score)
)

# Report the four metrics
report_lines = [
    "Evaluation metrics in test set:",
    f"Accuracy in RandomForest Classifier: {accuracy_test: .2f}",
    f"Precision Score in RandomForest Classifier: {pre_test: .2f}",
    f"Recall Score in  RandomForest Classifier: {rec_test: .2f}",
    f"F-1 Score in RandomForest Classifier: {f1_test: .2f}",
]
for report_line in report_lines:
    print(report_line)

In [None]:
# Predict on test set by default model
rfc_yhat_test = rfc.predict(X_test_scaled)

# Calculate metrics on test set. They get their own *_test names so the
# validation metrics computed earlier are not overwritten.
rfc_accuracy_test = accuracy_score(y_test, rfc_yhat_test)
rfc_pre_test = precision_score(y_test, rfc_yhat_test, average = "macro")
rfc_rec_test = recall_score(y_test, rfc_yhat_test, average = "macro")
rfc_f1_test = f1_score(y_test, rfc_yhat_test, average = "macro")

# Display the metrics. This previously printed the validation-set
# precision/recall/F-1 (rfc_pre, rfc_rec, rfc_f1) under a test-set heading.
print("Evaluation metrics in test set:")
print(f"Accuracy in RandomForest Classifier: {rfc_accuracy_test: .2f}")
print(f"Precision Score in RandomForest Classifier: {rfc_pre_test: .2f}")
print(f"Recall Score in RandomForest Classifier: {rfc_rec_test: .2f}")
print(f"F-1 Score in RandomForest Classifier: {rfc_f1_test: .2f}")

In [None]:
# Confusion matrix of the default random forest on the test set, drawn as a heatmap
cma = confusion_matrix(y_true = y_test, y_pred = rfc_yhat_test)
sns.heatmap(data = cma, cmap = "Blues", fmt = "d", annot = True)
plt.title("Confusion Matrix of RandomForestClassifier")
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()

In [None]:
# Create KNeighborsClassifier. k-NN classifies by distance, so features on
# larger numeric scales would dominate the neighbour search if left raw.
# The scaler and the classifier are bundled in one pipeline: `knn` is fitted
# on, and later fed, the unscaled frames and standardises them internally.
knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors = 7))
# Train the model (the scaler is fitted on the training rows only)
knn.fit(X_train, y_train)

# Predict on validation set
knn_yhat_val = knn.predict(X_val)
# Calculate metrics on validation set (macro-averaged over the categories)
accuracy_val = accuracy_score(y_val, knn_yhat_val)
pre_val = precision_score(y_val, knn_yhat_val, average = "macro")
rec_val = recall_score(y_val, knn_yhat_val, average = "macro")
f1_val = f1_score(y_val, knn_yhat_val, average = "macro")

# Display metrics on validation set
print(f"Accuracy in K Nearest Neighbors: {accuracy_val: .2f}")
print(f"Precision Score in K nearest neighbor: {pre_val: .2f}")
print(f"Recall Score in K nearest neighbor: {rec_val: .2f}")
print(f"F-1 Score in K nearest neighbor: {f1_val: .2f}")

In [None]:
# Score the k-NN model on the held-out test set
knn_yhat_test = knn.predict(X_test)

# Accuracy plus macro-averaged precision, recall and F-1
accuracy_test = accuracy_score(y_test, knn_yhat_test)
pre_test, rec_test, f1_test = (
    metric(y_test, knn_yhat_test, average = "macro")
    for metric in (precision_score, recall_score, f1_score)
)

# Report the four metrics
knn_report_lines = [
    f"Accuracy in K Nearest Neighbors: {accuracy_test: .2f}",
    f"Precision Score in K nearest neighbor: {pre_test: .2f}",
    f"Recall Score in K nearest neighbor: {rec_test: .2f}",
    f"F-1 Score in K nearest neighbor: {f1_test: .2f}",
]
for knn_report_line in knn_report_lines:
    print(knn_report_line)

In [None]:
# Confusion matrix of the k-NN model on the test set, drawn as a heatmap
cm = confusion_matrix(y_true = y_test, y_pred = knn_yhat_test)
sns.heatmap(data = cm, cmap = "Blues", fmt = "d", annot = True)
plt.title("Confusion Matrix of KNeighborsClassifier")
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()

In [None]:
# Create DMatrix
xgb_train = xgb.DMatrix(X_train, y_train)
xgb_test = xgb.DMatrix(X_test, y_test)

In [None]:
# Create and train xgb model
params = {'objective': 'multi:softmax',
          "max_depth" : 3,
          "learning_rate" : 0.01,
          "num_class" : 4}
n = 50
model = xgb.train(params = params, dtrain = xgb_train, num_boost_round = n)
xgb_yhat = model.predict(xgb_test)



In [None]:
# Get label of test set
xgb_test = xgb_test.get_label()
# Calculate metrics on test set
xgb_accuracy = accuracy_score(xgb_test, xgb_yhat)
xgb_pre = precision_score(xgb_test, xgb_yhat)
xgb_rec = recall_score(xgb_test, xgb_yhat)
xgb_f1 = f1_score(xgb_test, xgb_yhat)

# Display the metrics on test set
print(f"Accuracy in XGBoost: {xgb_accuracy: .2f}")
print(f"Precision in XGBoost: {xgb_pre: .2f}")
print(f"Recall in XGBoost: {xgb_rec: .2f}")
print(f"F-1 score in XGBoost: {xgb_f1: .2f}")
print("Confusion Matrix: ")
print(confusion_matrix(xgb_test, xgb_yhat))

In [None]:
# Create a confusion matrix and display a heatmap
cmat = confusion_matrix(xgb_test, xgb_yhat)
sns.heatmap(cmat, annot = True, fmt = "d", cmap = "Blues")
plt.xlabel("Predicted Values")
plt.ylabel("Actual Values")
plt.title("Confusion Matrix of XGBoost")
plt.show()