### Importing Required Libraries 

In [None]:
import numpy as np #For linear algebraic functions
import pandas as pd #for data handling

#Data Visualization libraries 
import matplotlib.pyplot as plt
import seaborn as sns

#For data preprocessing 
from scipy.stats import zscore #Detecting outliers
from sklearn.preprocessing import StandardScaler #Normalizing data

#For clustering and evaluation
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


import warnings
warnings.filterwarnings(action = 'ignore')

In [None]:
# Load the AHS_districtwise_2012-13 CSV into a DataFrame.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this exact
# file location exists; consider a configurable data directory.
data = pd.read_csv("/home/rachit/Desktop/internship/input/AHS_districtwise_2012-13.csv")

# Data Exploration 

In [None]:
# Preview the first rows of the raw table
data.head()

In [None]:
# Preview the last rows of the raw table
data.tail()

In [None]:
# (rows, columns) of the raw table
data.shape

In [None]:
# Summary statistics, transposed so that each raw column becomes one row
data.describe().T

In [None]:
# All raw column names as a plain Python list (filtered by keyword in the next section)
columns = data.columns.tolist()
columns

In [None]:
# Column dtypes and non-null counts of the raw table
data.info()

### Reducing dimensionality of the dataset by selecting a subset of features. 

In [None]:
# Topic keywords: a raw column is kept when its name contains at least one of them.
# NOTE(review): 'MARRAIGE' is spelled as written — presumably it mirrors the dataset header; verify.
reduced_features = ['State', 'SAMPLE PARTICULARS', 'HOUSEHOLD CHARACTERISTICS', 'EFFECTIVE LITERACY RATE', 'MARRAIGE',
                    'WORK STATUS', 'ACUTE ILLNESS', 'CHRONIC ILLNESS', 'FERTILITY', 'ANTE NATAL CARE', 'POST NATAL CARE',
                    'DELIVERY CARE', 'IMMUNIZATION', 'CHILDHOOD DISEASES', 'BREASTFEEDING', 'MORTALITY']

# Test every column once against all keywords. The previous nested loop appended a
# column once per matching keyword, which could select the same column twice.
new_column = [col for col in columns
              if any(feature in col for feature in reduced_features)]
len(new_column)

Further removing Male, Female, Rural and Urban subdivisions in the data. 

In [None]:
# Keep only the aggregate columns: discard every column whose name mentions a
# Rural/Urban or Male/Female breakdown.
breakdown_tags = ('Rural', 'Urban', 'Male', 'Female')
reduced_column = [
    name for name in new_column
    if not any(tag in name for tag in breakdown_tags)
]

In [None]:
# Number of columns left after removing the Rural/Urban/Male/Female breakdowns
len(reduced_column)

In [None]:
# Restrict the raw table to the selected columns
reduced_data = data[reduced_column]

Exploring / visualizing the reduced dataset.

In [None]:
# Correlation heatmap of the *reduced* dataset (this section explores reduced_data;
# the previous version correlated the full raw table instead).
# Only numeric columns are correlated: the State name columns are text and make
# DataFrame.corr() raise on recent pandas versions.
corr = reduced_data.select_dtypes(include='number').corr()

# Hide the upper triangle (diagonal included) since the matrix is symmetric.
# Built-in bool is used because the np.bool alias was removed from NumPy.
mask = np.triu(np.ones(corr.shape, dtype=bool))

f, ax = plt.subplots(figsize=(25, 23))

sns.heatmap(corr, mask=mask, ax=ax)

In [None]:
# Pairwise scatter plots for the columns at positions 10..20 of the reduced dataset
sns.pairplot(reduced_data.iloc[:, 10:21])

# Data Cleaning

In [None]:
#removing categorical columns State and State / District Name as they do not prove to be useful while 
#forming clusters
# NOTE(review): positional and self-referential — this assumes the two columns to drop are
# the first two, and re-running the cell drops two more columns each time.
reduced_data = reduced_data.iloc[:, 2:]

In [None]:
#Checking for null values
# Null count per column, largest first
reduced_data.isnull().sum().sort_values(ascending=False)

### Detecting Outliers

In [None]:
#detecting outliers in each column
# Per-column quartiles and inter-quartile range (Q1, Q3 and IQR are reused by later cells)
Q1 = reduced_data.quantile(0.25)
Q3 = reduced_data.quantile(0.75)
IQR = Q3 - Q1

# A value counts as an outlier when it lies more than 2.4 * IQR outside the quartiles
lower_fence = Q1 - 2.4 * IQR
upper_fence = Q3 + 2.4 * IQR
is_outlier = (reduced_data < lower_fence) | (reduced_data > upper_fence)

# Number of outlying values per column, worst column first
outliers = is_outlier.sum().sort_values(ascending=False)
print(outliers)

### Visualizing the outliers

In [None]:
# One vertical box plot per column, side by side
boxplot_columns = [
    'BREASTFEEDING AND SUPPLEMENTATION - AVERAGE MONTH BY WHICH CHILDREN RECEIVED FOODS OTHER THAN BREAST MILK - Vegetables/Fruits - Total',
    'CHRONIC ILLNESS - Having diagnosed for Chronic Illness (Per 100,000 Population) - Hypertension - Person - Total',
    'ACUTE ILLNESS - Persons suffering from Acute Illness (Per 100,000 Population) - Diarrhoea/Dysentery - Person - Total',
]

fig, axes = plt.subplots(1, 3, figsize=(18, 7))
for axis, column in zip(axes, boxplot_columns):
    sns.boxplot(reduced_data[column], orient='v', ax=axis)
plt.show()

In [None]:
# Every panel plots one indicator (x) against the same chronic-illness treatment column (y),
# read from the raw table `data`.
chronic_treatment_col = 'CHRONIC ILLNESS - Having diagnosed for any kind of Chronic Illness and getting Regular Treatment from Government Source (%) - Person - Total'
scatter_panels = [
    ('BREASTFEEDING AND SUPPLEMENTATION - AVERAGE MONTH BY WHICH CHILDREN RECEIVED FOODS OTHER THAN BREAST MILK - Vegetables/Fruits - Total',
     "Breastfeeding and Supplementation VS Chronic Disease"),
    ('CHRONIC ILLNESS - Having diagnosed for Chronic Illness (Per 100,000 Population) - Hypertension - Person - Total',
     "Hypertension VS Any Chronic Disease"),
    ('ACUTE ILLNESS - Persons suffering from Acute Illness (Per 100,000 Population) - Diarrhoea/Dysentery - Person - Total',
     "Acute Illness VS Chronic Disease"),
]

fig, axes = plt.subplots(3, 1, figsize=(16, 10))
for axis, (x_column, panel_title) in zip(axes, scatter_panels):
    axis.scatter(data[x_column], data[chronic_treatment_col])
    axis.set_title(panel_title)
plt.show()

### Removing outliers

In [None]:
# Drop every row that has at least one value outside the 2.4 * IQR fences
# (Q1, Q3 and IQR come from the outlier-detection cell).
low_cut = Q1 - 2.4 * IQR
high_cut = Q3 + 2.4 * IQR
row_has_outlier = ((reduced_data < low_cut) | (reduced_data > high_cut)).any(axis=1)

reduced_data = reduced_data[~row_has_outlier].reset_index(drop=True)
reduced_data.shape


In [None]:
# Re-count values outside the same fences after the row filter
still_outside = (reduced_data < (Q1 - 2.4 * IQR)) | (reduced_data > (Q3 + 2.4 * IQR))
outliers = still_outside.sum().sort_values(ascending=False)
print(outliers)

### Scaling the data

In [None]:
# Standardise every column, then wrap the resulting array back into a
# DataFrame that carries the original column names.
scale = StandardScaler().fit(reduced_data)
scaled_data = scale.transform(reduced_data)
scaled_df = pd.DataFrame(scaled_data, columns=reduced_data.columns)

In [None]:
# Distribution of the hypertension column before (left) and after (right) scaling
hypertension_col = 'CHRONIC ILLNESS - Having diagnosed for Chronic Illness (Per 100,000 Population) - Hypertension - Person - Total'
panels = [(reduced_data, "Original Data"), (scaled_df, "Scaled data")]

fig, ax = plt.subplots(1, 2, figsize=(15, 5))
for axis, (frame, panel_title) in zip(ax, panels):
    sns.distplot(frame[hypertension_col], ax=axis, color='#D341CD')
    axis.set_title(panel_title)
plt.show()

# Clustering 

### Choosing number of clusters from the Elbow Method

In [None]:
# Inertia (within-cluster sum of squares) for k = 1..n_clusters, used for the elbow plot.
cost = []
n_clusters = 20  # largest k tried; previously assigned but never used
for k in range(1, n_clusters + 1):
    # Fixed seed so the curve is reproducible across kernel restarts
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(scaled_df)
    cost.append(kmeans.inertia_)

In [None]:
# Elbow curve: inertia against the number of clusters
fig, elbow_ax = plt.subplots(figsize=(8, 5))
elbow_ax.plot(range(1, 21), cost, marker='o')
elbow_ax.set_xlabel('Clusters')
elbow_ax.set_ylabel('Inertia')
elbow_ax.set_title('Elbow Curve Method')
plt.show()

### Applying K-means with 7 clusters

In [None]:
#choosing 7 clusters
kmeans = KMeans(n_clusters = 7)
kmeans.fit(scaled_df)
labels = kmeans.labels_


In [None]:
# Silhouette score of the current labels, measured on the scaled features
print(silhouette_score(scaled_df, labels))

In [None]:
# Plotting subset: the feature columns from position 90 onwards plus the cluster labels.
# The labels are attached to a new frame rather than written into scaled_df: a 'cluster'
# column there would be picked up as a feature if a KMeans cell were re-run.
# .assign also avoids writing into a slice (SettingWithCopyWarning).
temp = scaled_df.iloc[:, 90:].assign(cluster=labels)

In [None]:
# Pairwise scatter plots of the subset, coloured by cluster label
sns.pairplot(data = temp, hue='cluster')

The pair plots of the clusters are hard to interpret and tell us little about the clusters, because the dataset is still high-dimensional. This is known as the <b>Curse of Dimensionality</b>, so we need to reduce the number of features further.

We therefore choose the following features, which are relevant to the healthcare domain:

In [None]:
# Hand-picked healthcare indicators ("Total" aggregates only), in the order that the
# short-name list in the renaming cell expects.
relevant_columns = [
    # Fertility
    'FERTILITY - Crude Birth Rate (CBR) - Total',
    'FERTILITY - Natural Growth Rate - Total',
    'FERTILITY - Total Fertility Rate - Total',
    # Ante-natal care
    'ANTE NATAL CARE - Mothers who received any Antenatal Check-up (%) - Total',
    'ANTE NATAL CARE - Mothers who received at least one Tetanus Toxoid (TT) injection (%) - Total',
    'ANTE NATAL CARE - Mothers who consumed IFA for 100 days or more (%) - Total',
    # Delivery care
    'DELIVERY CARE - Institutional Delivery (%) - Total',
    'DELIVERY CARE - Delivery at Home (%) - Total',
    'DELIVERY CARE - Delivery at Private Institution (%) - Total',
    # Post-natal care
    'POST NATAL CARE - Mothers who received Post-natal Check-up within 48 hrs. of delivery (%) - Total',
    'POST NATAL CARE - Mothers who received Post-natal Check-up within 1 week of delivery (%) - Total',
    'POST NATAL CARE - Mothers who did not receive any Post-natal Check-up (%) - Total',
    'POST NATAL CARE - New borns who were checked up within 24 hrs. of birth (%) - Total',
    # Immunization
    'IMMUNIZATION, VITAMIN A & IRON SUPPLEMENT AND BIRTH WEIGHT - Children aged 12-23 months Fully Immunized (%) - Total',
    'IMMUNIZATION, VITAMIN A & IRON SUPPLEMENT AND BIRTH WEIGHT - Children who did not receive any vaccination (%) - Total',
    # Childhood diseases
    'CHILDHOOD DISEASES - Children suffering from Diarrhoea (%) - Total',
    'CHILDHOOD DISEASES - Children suffering from Diarrhoea who received HAF/ORS/ORT (%) - Total',
    'CHILDHOOD DISEASES - Children suffering from Acute Respiratory Infection (%) - Total',
    'CHILDHOOD DISEASES - Children suffering from Acute Respiratory Infection who sought treatment (%) - Total',
    'CHILDHOOD DISEASES - Children suffering from Fever (%) - Total',
    'CHILDHOOD DISEASES - Children suffering from Fever who sought treatment (%) - Total',
    # Breastfeeding and supplementation: shares (%)
    'BREASTFEEDING AND SUPPLEMENTATION - Children breastfed within one hour of birth (%) - Total',
    'BREASTFEEDING AND SUPPLEMENTATION - Children (aged 6-35 months) exclusively breastfed for at least six months (%) - Total',
    'BREASTFEEDING AND SUPPLEMENTATION - CHILDREN WHO RECEIVED FOODS OTHER THAN BREAST MILK DURING FIRST 6 MONTHS - Water (%) - Total',
    'BREASTFEEDING AND SUPPLEMENTATION - CHILDREN WHO RECEIVED FOODS OTHER THAN BREAST MILK DURING FIRST 6 MONTHS - Animal/Formula Milk (%) - Total',
    'BREASTFEEDING AND SUPPLEMENTATION - CHILDREN WHO RECEIVED FOODS OTHER THAN BREAST MILK DURING FIRST 6 MONTHS - Semi-Solid mashed food (%) - Total',
    'BREASTFEEDING AND SUPPLEMENTATION - CHILDREN WHO RECEIVED FOODS OTHER THAN BREAST MILK DURING FIRST 6 MONTHS - Solid (Adult) Food (%) - Total',
    'BREASTFEEDING AND SUPPLEMENTATION - CHILDREN WHO RECEIVED FOODS OTHER THAN BREAST MILK DURING FIRST 6 MONTHS - Vegetables/Fruits (%) - Total',
    # Breastfeeding and supplementation: average month of introduction
    'BREASTFEEDING AND SUPPLEMENTATION - AVERAGE MONTH BY WHICH CHILDREN RECEIVED FOODS OTHER THAN BREAST MILK - Water - Total',
    'BREASTFEEDING AND SUPPLEMENTATION - AVERAGE MONTH BY WHICH CHILDREN RECEIVED FOODS OTHER THAN BREAST MILK - Animal/Formula Milk - Total',
    'BREASTFEEDING AND SUPPLEMENTATION - AVERAGE MONTH BY WHICH CHILDREN RECEIVED FOODS OTHER THAN BREAST MILK - Semi-Solid mashed food - Total',
    'BREASTFEEDING AND SUPPLEMENTATION - AVERAGE MONTH BY WHICH CHILDREN RECEIVED FOODS OTHER THAN BREAST MILK - Solid (Adult) Food - Total',
    # Fixed: this slot used to repeat the "FIRST 6 MONTHS - Vegetables/Fruits (%)" column,
    # so the feature later renamed 'BFS(Month)-Vegetable/Fruits' was a duplicate of
    # 'BFS-Vegetable/Fruits' instead of the average-month column.
    'BREASTFEEDING AND SUPPLEMENTATION - AVERAGE MONTH BY WHICH CHILDREN RECEIVED FOODS OTHER THAN BREAST MILK - Vegetables/Fruits - Total',
    # Mortality
    'MORTALITY - Crude Death Rate (CDR) - Total - Person',
    'MORTALITY - Infant Mortality Rate (IMR) - Total - Person',
    'MORTALITY - Neo-natal Mortality Rate - Total',
    'MORTALITY - Post Neo-natal Mortality Rate - Total',
    'MORTALITY - Under Five Mortality Rate (U5MR) - Total - Person',
]

In [None]:
# Add every chronic-illness column to the selection.
# Columns already present are skipped, so re-running this cell does not append duplicates
# (and the cell no longer echoes a list of None values).
chronic_columns = [col for col in scaled_df.columns
                   if 'CHRONIC' in col and col not in relevant_columns]
relevant_columns.extend(chronic_columns)

In [None]:
# Scaled and unscaled views of the selected features.
# Explicit copies: both frames are renamed and get a 'cluster' column assigned in later
# cells, which would otherwise write into a slice of the parent frame.
relevant_df = scaled_df[relevant_columns].copy()
original_df = reduced_data[relevant_columns].copy()

In [None]:
#renaming the columns
relevant_df.columns = ['CBR','NGR','TFR','ANC-Any checkup','ANC-Received TT Injection', 'ANC-Consumed IFA for 100 days',
              'Delivery - Institutional', 'Delivery - Home','Delivery - Private', 'PNC-Checkup in 48 hrs','PNC-Checkup in 1 week',
              'PNC-No Checkup','PNC-Checked in 24 hrs (newborns)','Children - Immunized','Children - No vaccination',
              'Childhood Disease-Diarrhoea','Childhood Disease-Diarrhoea(HAF/ORS/ORT)','Childhood Disease-Acute Resp. Infection',
               'Childhood Disease-Acute Resp. Infection(Treated)','Childhood Disease-Fever','Childhood Disease-Fever(Treated)',
               'BFS-Breastfed in 1 hour','BFS-Breastfed for 6 months','BFS-Water','BFS-Animal/Formula Milk','BFS-Semi-solid Food',
               'BFS-Solid Food','BFS-Vegetable/Fruits','BFS(Month)-Water','BFS(Month)-Milk','BFS(Month)-Semi-solid food','BFS(Month)-Adult Food',
               'BFS(Month)-Vegetable/Fruits','Mortality-CDR','Mortality-IMR','Mortality-NNMR','Mortality-PNNMR',
               'Mortality-U5MR','Chronic Illness-Symptoms','Chronic Illness-Symptoms(Medical care)','Chronic Illness-Diabetes',
               'Chronic Illness-Hypertension','Chronic Illness-TB','Chronic Illness-Asthma','Chronic Illness-Arthritis','Chronic Illness-Any',
               'Chronic Illness-Receiving Treatment','Chronic Illness-Receiving Treatment(govt)']
original_df.columns = ['CBR','NGR','TFR','ANC-Any checkup','ANC-Received TT Injection', 'ANC-Consumed IFA for 100 days',
              'Delivery - Institutional', 'Delivery - Home','Delivery - Private', 'PNC-Checkup in 48 hrs','PNC-Checkup in 1 week',
              'PNC-No Checkup','PNC-Checked in 24 hrs (newborns)','Children - Immunized','Children - No vaccination',
              'Childhood Disease-Diarrhoea','Childhood Disease-Diarrhoea(HAF/ORS/ORT)','Childhood Disease-Acute Resp. Infection',
               'Childhood Disease-Acute Resp. Infection(Treated)','Childhood Disease-Fever','Childhood Disease-Fever(Treated)',
               'BFS-Breastfed in 1 hour','BFS-Breastfed for 6 months','BFS-Water','BFS-Animal/Formula Milk','BFS-Semi-solid Food',
               'BFS-Solid Food','BFS-Vegetable/Fruits','BFS(Month)-Water','BFS(Month)-Milk','BFS(Month)-Semi-solid food','BFS(Month)-Adult Food',
               'BFS(Month)-Vegetable/Fruits','Mortality-CDR','Mortality-IMR','Mortality-NNMR','Mortality-PNNMR',
               'Mortality-U5MR','Chronic Illness-Symptoms','Chronic Illness-Symptoms(Medical care)','Chronic Illness-Diabetes',
               'Chronic Illness-Hypertension','Chronic Illness-TB','Chronic Illness-Asthma','Chronic Illness-Arthritis','Chronic Illness-Any',
               'Chronic Illness-Receiving Treatment','Chronic Illness-Receiving Treatment(govt)']



In [None]:
# (rows, selected features) of the scaled subset
relevant_df.shape

#### Again applying K-Means

In [None]:
# Inertia for k = 1..20 on the selected features, used for the elbow plot.
cost = []
for k in range(1, 21):
    # Fixed seed so the curve is reproducible across kernel restarts
    model = KMeans(n_clusters=k, random_state=42)
    model.fit(relevant_df)
    cost.append(model.inertia_)

In [None]:
# Elbow curve: inertia against the number of clusters
fig, elbow_ax = plt.subplots(figsize=(8, 5))
elbow_ax.plot(range(1, 21), cost, marker='o')
elbow_ax.set_title("Elbow Curve Method")
elbow_ax.set_xlabel("No. of Clusters")
elbow_ax.set_ylabel("Inertia")
plt.show()

#### Choosing 5 clusters

In [None]:
# Fit K-means with 5 clusters on the selected features.
# Fixed seed: without it the cluster ids change on every run.
model = KMeans(n_clusters=5, random_state=42)
model.fit(relevant_df)
labels = model.labels_


In [None]:
# Attach the labels as a temporary column for the pair plots; a later cell drops it again
# before relevant_df is scored.
relevant_df['cluster'] = labels

In [None]:
# Pair-plot the clustered features in batches of columns, coloured by cluster.
# Compared with the copy-pasted version this:
#   * no longer rebinds `data`, which holds the raw survey table loaded at the top;
#   * builds each batch with .assign instead of assigning into a slice;
#   * uses `height`, the current name of seaborn's deprecated `size` argument.
# Each entry is (start, stop, extra pairplot kwargs); the first batch keeps the default facet size.
column_batches = [
    (0, 11, {}),
    (11, 21, {'height': 3}),
    (21, 31, {'height': 3}),
    (31, 41, {'height': 3}),
    (41, None, {'height': 3}),
]

for start, stop, plot_kwargs in column_batches:
    batch = relevant_df.iloc[:, start:stop].assign(cluster=relevant_df['cluster'])
    sns.pairplot(data=batch, hue='cluster', **plot_kwargs)

In [None]:
# Remove the temporary label column so it is not treated as a feature below.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
relevant_df = relevant_df.drop(columns='cluster', errors='ignore')

In [None]:
# Silhouette score of the current labels, measured on the selected features
print(silhouette_score(relevant_df, labels))

In [None]:
# Final hand-picked feature subset, applied identically to the scaled and unscaled frames.
selected_features = ['NGR', 'TFR', 'Delivery - Home', 'Delivery - Institutional', 'Children - No vaccination',
                     'BFS-Breastfed in 1 hour', 'BFS-Vegetable/Fruits', 'Mortality-CDR', 'Mortality-IMR',
                     'Chronic Illness-Symptoms', 'Chronic Illness-Any']

# Explicit copies: original_df gets a 'cluster' column assigned later, which would
# otherwise be a write into a slice of the wider frame.
reduced_df = relevant_df[selected_features].copy()
original_df = original_df[selected_features].copy()

In [None]:
# (rows, features) of the final subset
reduced_df.shape

In [None]:
# Inertia for k = 1..20 on the final feature subset, followed by the elbow plot.
cost = []
for k in range(1, 21):
    # Fixed seed so the curve is reproducible across kernel restarts
    model = KMeans(n_clusters=k, random_state=42)
    model.fit(reduced_df)
    cost.append(model.inertia_)

plt.figure(figsize=(8,5))
plt.plot(range(1,21), cost, marker='o')
plt.title("Elbow Method")
plt.xlabel("No. of Clusters")
plt.ylabel("Inertia")
plt.show()

In [None]:
# 6 clusters
kmeans = KMeans(n_clusters=6, init='k-means++', n_init=10, max_iter=600)
labels = kmeans.fit_predict(reduced_df)
print(silhouette_score(relevant_df, labels))

# Applying PCA

In [None]:
# NOTE(review): import placed mid-notebook; it would normally sit in the import cell at the top.
from sklearn.decomposition import PCA
# Project the final feature subset onto its first two principal components
pca = PCA(n_components=2)
df_pca = pca.fit_transform(reduced_df)
# Fraction of the variance explained by each of the two components
pca.explained_variance_ratio_

In [None]:
# Inertia for k = 1..20 on the two PCA components, followed by the elbow plot.
cost = []
for k in range(1, 21):
    # Fixed seed so the curve is reproducible across kernel restarts
    model = KMeans(n_clusters=k, random_state=42)
    model.fit(df_pca)
    cost.append(model.inertia_)

plt.figure(figsize=(8,5))
plt.plot(range(1,21), cost, marker='o')
plt.title("Elbow Method")
plt.xlabel("No. of Clusters")
plt.ylabel("Inertia")
plt.show()

In [None]:
# Final model: 4 clusters in the 2-D PCA space.
# Fixed seed: the cluster analysis below refers to clusters by id, and unseeded
# K-means can assign different ids to the same groups on every run.
model2 = KMeans(n_clusters=4, init='k-means++', n_init=10, max_iter=600, random_state=42)
labels = model2.fit_predict(df_pca)

In [None]:
# Silhouette score of the 4-cluster labels, measured in the PCA space they were fitted in
print(silhouette_score(df_pca, labels))

# Cluster Visualization

In [None]:
# Scatter of the two principal components, one colour per cluster, with centroids on top
cluster_colors = ['red', 'blue', 'green', 'yellow']

plt.figure(figsize=(13, 10))
for cluster_id, shade in enumerate(cluster_colors):
    members = labels == cluster_id
    plt.scatter(df_pca[members, 0], df_pca[members, 1], color=shade, label=f'Cluster {cluster_id}')

centroids = model2.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], color='black', s=150, label='Centroids')

plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend()
plt.show()

In [None]:
# Attach the PCA-space cluster labels to the unscaled features.
# `labels` is assigned by position — presumably original_df still has the same row order
# as the frame that was clustered; verify if rows are ever filtered in between.
original_df['cluster'] = labels

# Cluster Analysis

In [None]:
from collections import Counter
clusterCount = dict(Counter(labels))
clusterCount

In [None]:
# One DataFrame per cluster, in ascending label order, each with a fresh 0..n-1 index.
# Iterating over the labels that actually occur replaces the hard-coded range(5), which
# produced an empty fifth frame for the 4-cluster model. reset_index returns a new frame
# here instead of modifying a slice in place.
clustersdf = [
    original_df[labels == cluster_id].reset_index(drop=True)
    for cluster_id in np.unique(labels)
]

In [None]:
# Overall min/max of every column, plus one column of per-cluster means for clusters 0-3
statsdf = original_df.describe().T[['min', 'max']]

for idx in range(4):
    statsdf[f'Cluster {idx}'] = clustersdf[idx].mean().to_frame()

In [None]:
# Display the per-cluster summary table
statsdf

In [None]:
# Same min/max and per-cluster-mean summary, restricted to a smaller set of indicators
final_features = ['Delivery - Home', 'Delivery - Institutional', 'BFS-Breastfed in 1 hour', 'BFS-Vegetable/Fruits',
                  'Mortality-IMR', 'Chronic Illness-Symptoms', 'Chronic Illness-Any', 'cluster']
df_final = original_df[final_features]

statsFinal = df_final.describe().T[['min', 'max']]
for cluster_id in range(4):
    in_cluster = df_final['cluster'] == cluster_id
    statsFinal[f'Cluster {cluster_id}'] = df_final[in_cluster].mean()
statsFinal