## Step 1: Reading and Understanding the Data

In [None]:
#Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the country-level indicators from disk and preview the first rows
DATA_FILE = "Country-data.csv"
df = pd.read_csv(DATA_FILE)
df.head()

In [None]:
# List the column labels of the loaded dataframe
df.columns

In [None]:
# exports, health and imports are given as a percentage of GDP per capita;
# multiply by gdpp and divide by 100 to turn them into absolute values.
features_std = ['exports', 'health', 'imports']
df[features_std] = df[features_std].multiply(df['gdpp'], axis=0) / 100


In [None]:
# Preview the frame after exports/health/imports were converted in the previous cell
df.head()

In [None]:
# Checking the shape of the dataframe as a (rows, columns) tuple
df.shape

In [None]:
# Checking the dtype and non-null count of every column in the dataframe
df.info()

- ##### As there are no null values, no missing-value handling is needed

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the continuous variables
df.describe()

## Step 2: Visualising the Data

### 2.1 Univariate Analysis

In [None]:
# Distribution of each continuous variable (every column after the first one)
# NOTE: sns.distplot is deprecated in newer seaborn releases; kept here to preserve the original output.
features = df.columns[1:]
plt.figure(figsize=(20, 20))
for position, column in enumerate(features, start=1):
    plt.subplot(3, 3, position)
    sns.distplot(df[column])
    plt.xticks(rotation=90)
plt.show()

- ##### All factors are distributed normally

### 2.2 Checking Outliers

In [None]:
# Univariate analysis & outlier recognition for continuous variables.
# One box plot per column (every column after the first one) on a 3x3 grid.
features = df.columns[1:]
plt.figure(figsize=(20, 12))
for position, column in enumerate(features, start=1):
    plt.subplot(3, 3, position)
    # Pass the series as x= explicitly: the meaning of the first positional
    # argument of sns.boxplot differs between seaborn releases, which would
    # otherwise change the orientation of the plot.
    sns.boxplot(x=df[column])
plt.show()
    

#### Conclusion:
- ##### All continuous variables have outliers
- ##### Except life_expec, all other variables have outliers on the higher side
- ##### There are fewer outliers for the life_expec and total_fer variables.
- ##### Most of the data for child_mort, total_fer, health, exports, imports, income and gdpp lies between the 50th and 75th percentile
- ##### Most of the data for life_expec lies between the 25th and 50th percentile

### 2.3 Bivariate Analysis

##### 2.3.1 With respect to income

In [None]:
# Scatter plot of every other indicator against income, one panel per indicator
features = ['child_mort','exports','health','imports','inflation','life_expec','total_fer','gdpp']
plt.figure(figsize=(20, 12))
for position, column in enumerate(features, start=1):
    plt.subplot(3, 3, position)
    sns.scatterplot(x='income', y=column, data=df)
    

- ##### income is inversely related to child_mort. If income increases then child_mort decreases
- ##### income and gdpp are directly related. If income increases then gdpp also increases
- ##### income and total_fer are inversely related. If income increases then total_fer decreases
- ##### income and imports, exports are directly related. If income increases then imports and exports also increase

##### 2.3.2 With respect to gdpp

In [None]:
# Scatter plot of every other indicator against gdpp, one panel per indicator
features = ['child_mort','exports','health','imports','income','inflation','life_expec','total_fer']
plt.figure(figsize=(20, 12))
for position, column in enumerate(features, start=1):
    plt.subplot(3, 3, position)
    sns.scatterplot(x='gdpp', y=column, data=df)

- ##### gdpp is inversely related to child_mort. If gdpp increases then child_mort decreases
- ##### gdpp and health are directly related. If gdpp increases then health also increases
- ##### gdpp and total_fer are inversely related. If gdpp increases then total_fer decreases
- ##### gdpp and imports, exports are directly related. If gdpp increases then imports and exports also increase

##### 2.3.3 With respect to child_mort

In [None]:
# Scatter plot of every other indicator against child_mort, one panel per indicator
features = ['exports','health','imports','income','inflation','life_expec','total_fer','gdpp']
plt.figure(figsize=(20, 12))
for position, column in enumerate(features, start=1):
    plt.subplot(3, 3, position)
    sns.scatterplot(x='child_mort', y=column, data=df)

- ##### child_mort is inversely related to health. If health increases then child_mort decreases
- ##### child_mort is inversely related to life_expec. If life_expec increases then child_mort decreases
- ##### child_mort and total_fer are directly related. If total_fer increases then child_mort also increases

### 2.4 Correlation

In [None]:
# Pairwise correlation of the numeric indicators.
# The string column `country` is dropped first: recent pandas versions raise
# on non-numeric columns in corr() instead of silently skipping them.
df.drop('country', axis=1).corr()

In [None]:
# Annotated heat map of the pairwise correlations between the numeric indicators.
# `country` is dropped before corr() because recent pandas versions raise on
# non-numeric columns instead of silently skipping them.
plt.figure(figsize=(20, 10))
sns.heatmap(df.drop('country', axis=1).corr(), cmap='YlGnBu', annot=True)

- ##### As per the heat map, exports and imports are highly and directly correlated with each other
- ##### child_mort and life_expec are highly inversely correlated
- ##### Below are the inversely correlated variables
  - ##### child_mort and gdpp
  - ##### child_mort and life_expec,
  - ##### child_mort and income
  - ##### child_mort and imports
  - ##### child_mort and health
  - ##### child_mort and exports
  - ##### exports and total_fer
  - ##### exports and inflation
  - ##### health and inflation
  - ##### imports and inflation
  - ##### imports and total_fer
  - ##### income and inflation
  - ##### inflation and gdpp
  - ##### inflation and life_expec
  - ##### life_expec and total_fer

## Step 3: Outlier treatment

In [None]:
# Clustering was not proper because of outliers, hence capping them.
# Columns are reassigned through clip() instead of chained indexing
# (df[col][mask] = value), which triggers SettingWithCopyWarning and does not
# write back to df when pandas Copy-on-Write is active.

# child_mort: cap only the low end (1st percentile). High values are kept on
# purpose, since high child mortality marks the countries that need help.
df['child_mort'] = df['child_mort'].clip(lower=df['child_mort'].quantile(0.01))

# All other variables: cap the high end at each column's own 99th percentile.
upper_capped = ['exports', 'imports', 'health', 'gdpp',
                'life_expec', 'income', 'inflation', 'total_fer']
for column in upper_capped:
    df[column] = df[column].clip(upper=df[column].quantile(0.99))

In [None]:
# Checking summary statistics again after capping, to compare min/max with the earlier describe()
df.describe()

In [None]:
# Visualising the univariate box plots again after outlier capping.
# One box plot per column (every column after the first one) on a 3x3 grid.
features = df.columns[1:]
plt.figure(figsize=(20, 12))
for position, column in enumerate(features, start=1):
    plt.subplot(3, 3, position)
    # Pass the series as x= explicitly: the meaning of the first positional
    # argument of sns.boxplot differs between seaborn releases, which would
    # otherwise change the orientation of the plot.
    sns.boxplot(x=df[column])
plt.show()

## Step 4: Clustering

In [None]:
#Calculating the Hopkins statistic
from sklearn.neighbors import NearestNeighbors
from random import sample
from numpy.random import uniform
import numpy as np
from math import isnan
def hopkins(X, random_state=None):
    """Estimate the Hopkins statistic of a numeric dataset.

    A sample of m = 10% of the rows (at least one) is compared with m points
    drawn uniformly inside the per-column [min, max] box of the data:

        H = sum(u) / (sum(u) + sum(w))

    where u are the distances from the uniform points to their nearest data
    point and w are the distances from the sampled rows to their nearest
    *other* data point.

    Parameters
    ----------
    X : DataFrame or 2-D array-like of numbers, shape (n_rows, n_columns)
    random_state : int or None, optional
        Seed for the row sample and the uniform points. None (the default)
        gives a different draw on every call, as before.

    Returns
    -------
    float
        The statistic; 0 is returned (after printing the distances) when the
        ratio is undefined because every distance is zero.

    Raises
    ------
    ValueError
        If X has fewer than 2 rows (a second neighbour cannot be queried).
    """
    values = np.asarray(X, dtype=float)
    n, d = values.shape
    if n < 2:
        raise ValueError("hopkins() needs at least 2 observations")
    # At least one probe, otherwise the ratio below would be 0 / 0 for n < 10.
    m = max(1, int(0.1 * n))

    rng = np.random.RandomState(random_state)
    nbrs = NearestNeighbors(n_neighbors=1).fit(values)

    sampled_rows = rng.choice(n, size=m, replace=False)
    synthetic = rng.uniform(values.min(axis=0), values.max(axis=0), size=(m, d))

    # Uniform points are not part of the fitted data, so their nearest
    # neighbour is the first hit.
    u_dist, _ = nbrs.kneighbors(synthetic, 1, return_distance=True)
    # Sampled rows are part of the fitted data: the first hit is the row itself
    # (distance 0), so the second hit is the nearest other point.
    w_dist, _ = nbrs.kneighbors(values[sampled_rows], 2, return_distance=True)
    ujd = u_dist[:, 0]
    wjd = w_dist[:, 1]

    denominator = ujd.sum() + wjd.sum()
    if denominator == 0 or isnan(denominator):
        print(ujd, wjd)
        return 0

    return float(ujd.sum() / denominator)

In [None]:
# Hopkins statistic of the numeric columns (country name excluded).
# NOTE(review): this runs before the scaling cell, i.e. on unscaled values — confirm that is intended.
hopkins(df.drop('country', axis=1))

- ##### As the Hopkins score is good (greater than 0.80), we are good to go for clustering

### Scaling

In [None]:
# Min-max scale every numeric column; `country` is left out of the scaler input
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df1 = scaler.fit_transform(df.drop('country', axis=1))

In [None]:
# Wrap the scaled array in a DataFrame that carries the original feature names
df1 = pd.DataFrame(df1, columns=df.columns[1:])
df1.head()

### 4.1 K-Mean clustering

##### 4.1.1 Getting number of clusters based on silhouette and elbow curve

In [None]:
# Let's find out the value of K
# Silhouette score for K = 2..10 on the scaled features.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
ss = []
for k in range(2, 11):
    # Fixed seed (the same one used for the final model) so the scores are
    # reproducible from run to run.
    kmeans = KMeans(n_clusters=k, random_state=50).fit(df1)
    # Compute the score once and reuse it for both the list and the message.
    silhouette_avg = silhouette_score(df1, kmeans.labels_)
    ss.append([k, silhouette_avg])
    print("For n_clusters={0}, the silhouette score is {1}".format(k, silhouette_avg))

In [None]:
# Elbow curve: within-cluster sum of squared distances (inertia) for K = 2..10
ssd = []
for k in range(2, 11):
    # Fixed seed so the curve is reproducible; a separate name is used so this
    # scan does not overwrite the final `kmean` model when the cell is re-run.
    elbow_model = KMeans(n_clusters=k, random_state=50).fit(df1)
    ssd.append([k, elbow_model.inertia_])

elbow = pd.DataFrame(ssd, columns=['k', 'inertia'])
plt.plot(elbow['k'], elbow['inertia'], marker='o')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.show()

- ###### Considering both the silhouette score and the elbow curve, 3 clusters seem to be a good choice, from a business point of view as well

In [None]:
# Fit the final K-Means model with K=3 on the scaled features;
# random_state is fixed so the cluster assignment is the same on every run
kmean = KMeans(n_clusters = 3, random_state = 50)
kmean.fit(df1)

In [None]:
# Cluster index assigned by the fitted model to each row of df1
kmean.labels_

In [None]:
# One-column frame holding the K-Means cluster index of every row
label = pd.DataFrame({'label': kmean.labels_})

In [None]:
# Keep a separate copy of the data for the K-Means results.
# NOTE(review): `df.kmean` stores the copy as an attribute on the `df` object,
# not as a variable of its own; later cells read it back through `df.kmean`.
df.kmean = df.copy()

In [None]:
# Attach the cluster label column to the K-Means copy of the data.
# Any label column left over from a previous run of this cell is dropped first,
# so re-running the cell does not create a duplicate `label` column.
df.kmean = pd.concat([df.kmean.drop(columns='label', errors='ignore'), label], axis=1)

In [None]:
# Preview the data together with the new label column
df.kmean.head()

In [None]:
# How many data points (rows) fall into each K-Means cluster
df.kmean.label.value_counts()

In [None]:
# (rows, columns) of the labelled frame; one more column than df because of `label`
df.kmean.shape

#### 4.1.2 Cluster Visualisation

In [None]:
# Visualise the K-Means clusters against child_mort, since child mortality
# drives the decision about which countries to help
features = ['exports','health','imports','income','inflation','life_expec','total_fer','gdpp']
plt.figure(figsize=(20, 12))
for position, column in enumerate(features, start=1):
    plt.subplot(3, 3, position)
    sns.scatterplot(x='child_mort', y=column, hue='label', data=df.kmean, palette='Set1')

- ##### There are three clusters based on below criteria for all variables:
 - ##### High
 - ##### Average
 - ##### Low

In [None]:
# Bar plot of each indicator per K-Means cluster, one panel per indicator
features = ['exports','health','imports','income','inflation','life_expec','total_fer','gdpp','child_mort']
plt.figure(figsize=(20, 12))
for position, column in enumerate(features, start=1):
    plt.subplot(3, 3, position)
    sns.barplot(x='label', y=column, data=df.kmean)
 

-  ##### Cluster can be explained as below:
 - ##### Cluster 0 : high export, high health, high imports, high income, low inflation, high life_expec, low total_fer,                             high gdpp,low child_mort
 - ##### Cluster 1:  low export, low health, low imports, low income, high inflation, low life_expec, high total_fer,                               low gdpp,high child_mort
 - ##### Cluster 2:  avg export, avg health, avg imports, avg income, avg inflation, avg life_expec, avg total_fer,                                 avg gdpp,avg child_mort

In [None]:
# All indicators in one grouped bar chart (log y-axis) to compare the clusters
kmean_means = df.kmean.drop(columns=['country']).groupby('label').mean()
kmean_means.plot(kind='bar', figsize=(20, 12))
plt.yscale('log')
plt.show()

- ##### Countries belonging to Cluster 1 are in dire need of aid, as their child mortality is higher

#### 4.1.3 Cluster Profiling

In [None]:
# Cluster profiling: mean of every indicator per K-Means cluster
# (gdpp, child_mort and income are the ones used for the decision)
df.kmean.drop(columns='country').groupby('label').mean()

In [None]:
# Per-cluster means of the columns that remain after dropping the ones listed below
not_profiled = ['country', 'exports', 'health', 'imports',
                'inflation', 'life_expec', 'total_fer']
kmean_profile = df.kmean.drop(not_profiled, axis=1).groupby('label').mean()
kmean_profile.plot(kind='bar', figsize=(20, 12))
plt.show()

In [None]:
# In the bar graph above child mortality was not visible because of its small
# scale, hence the same chart is drawn with a log y-axis
not_profiled = ['country', 'exports', 'health', 'imports',
                'inflation', 'life_expec', 'total_fer']
kmean_profile = df.kmean.drop(not_profiled, axis=1).groupby('label').mean()
kmean_profile.plot(kind='bar', figsize=(20, 12))
plt.yscale('log')
plt.show()

- ##### Countries belonging to Cluster 1 (high child_mort, low income, low gdpp) are in dire need of aid

In [None]:
# Top 5 rows of cluster 1, ranked by highest child_mort, then lowest income, then lowest gdpp
kmean_needy = df.kmean[df.kmean['label'] == 1]
kmean_needy.sort_values(by=['child_mort', 'income', 'gdpp'],
                        ascending=[False, True, True]).head(5)

### 4.2 Hierarchical clustering

##### 4.2.1 Single linkage

In [None]:
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree
# NOTE(review): this rebinds df1 to the numeric columns of df, replacing the
# MinMax-scaled frame built in the scaling section, so the linkage below (and
# any later cell that reads df1) works on unscaled values. Confirm whether the
# scaled features were intended here.
df1=df.drop('country',axis=1)
# Single-linkage agglomerative clustering on Euclidean distances, shown as a dendrogram
mergings = linkage(df1, method="single", metric='euclidean')
dendrogram(mergings)
plt.show()

- ##### The above dendrogram is messy: single linkage tends to produce dendrograms that are not structured properly, because
         the distance between 2 clusters is defined as the shortest distance between points in the two clusters


##### 4.2.2 Complete linkage

In [None]:
# Complete-linkage agglomerative clustering on Euclidean distances;
# `mergings` is reused by the next cell to cut the tree
mergings = linkage(df1, method="complete", metric='euclidean')

plt.figure(figsize=(18, 7))
dendrogram(mergings)
plt.show()

In [None]:
# Cut the complete-linkage tree into 3 clusters and flatten the result to a 1-D label array
# (author's note: on the dendrogram this corresponds to a horizontal cut at height ~7000)
cluster_labels = cut_tree(mergings, n_clusters=3).reshape(-1, )
cluster_labels

In [None]:
# Keep a separate copy of the data for the hierarchical-clustering results.
# NOTE(review): `df.hier` stores the copy as an attribute on the `df` object,
# not as a variable of its own; later cells read it back through `df.hier`.
df.hier = df.copy()

In [None]:
# Attach the hierarchical cluster index of every row as the `label` column and preview
df.hier['label'] = cluster_labels
df.hier.head()

In [None]:
# How many data points (rows) fall into each hierarchical cluster
df.hier.label.value_counts()

In [None]:
# Checking the (rows, columns) shape of df.hier after the label column was added
df.hier.shape

#### 4.2.3 Cluster Visualisation

In [None]:
# Visualise the hierarchical clusters against child_mort, since child mortality
# drives the decision about which countries to help
features = ['exports','health','imports','income','inflation','life_expec','total_fer','gdpp']
plt.figure(figsize=(20, 12))
for position, column in enumerate(features, start=1):
    plt.subplot(3, 3, position)
    sns.scatterplot(x='child_mort', y=column, hue='label', data=df.hier, palette='Set1')

- ##### There are three clusters based on below criteria for all variables:
 - ##### High
 - ##### Average
 - ##### Low

In [None]:
# Bar plot of each indicator per hierarchical cluster, one panel per indicator
features = ['exports','health','imports','income','inflation','life_expec','total_fer','gdpp','child_mort']
plt.figure(figsize=(20, 12))
for position, column in enumerate(features, start=1):
    plt.subplot(3, 3, position)
    sns.barplot(x='label', y=column, data=df.hier)

-  ##### Cluster can be explained as below:
 - ##### Cluster 0:  low export, low health, low imports, low income, high inflation, low life_expec, high total_fer,                               low gdpp,high child_mort
 
 - ##### Cluster 1:  avg export, avg health, avg imports, avg income, avg inflation, avg life_expec, avg total_fer,                                 avg gdpp,avg child_mort
 
 - ##### Cluster 2 : high export, high health, high imports, high income, low inflation, high life_expec, low total_fer,                             high gdpp,low child_mort

In [None]:
# All indicators in one grouped bar chart (log y-axis) to compare the clusters
hier_means = df.hier.drop(columns=['country']).groupby('label').mean()
hier_means.plot(kind='bar', figsize=(20, 12))
plt.yscale('log')
plt.show()

- ##### Countries belonging to Cluster 0 are in dire need of aid, as their child mortality is higher

#### 4.2.4 Cluster Profiling

In [None]:
# Cluster profiling: mean of every indicator per hierarchical cluster
df.hier.drop(columns='country').groupby('label').mean()

In [None]:
# Per-cluster means of the columns that remain after dropping the ones listed below
not_profiled = ['country', 'exports', 'health', 'imports',
                'inflation', 'life_expec', 'total_fer']
hier_profile = df.hier.drop(not_profiled, axis=1).groupby('label').mean()
hier_profile.plot(kind='bar', figsize=(20, 12))
plt.show()

In [None]:
# In the bar graph above child mortality was not visible because of its small
# scale, hence the same chart is drawn with a log y-axis.
# DataFrame.plot opens its own figure (sized through figsize), so no separate
# plt.figure() call is made here; one would only leave an empty extra figure.
df.hier.drop(['country','exports', 
               'health','imports','inflation',
               'life_expec','total_fer'], axis = 1).groupby('label').mean().plot(figsize=(20,12),kind = 'bar')
plt.yscale('log')
plt.show()

- ##### Countries belonging to Cluster 0 (high child_mort, low income, low gdpp) are in dire need of aid

#### 4.2.5 Getting the top 5 countries that are in dire need of aid

In [None]:
# Top 5 rows of cluster 0, ranked by highest child_mort, then lowest income, then lowest gdpp
hier_needy = df.hier[df.hier['label'] == 0]
hier_needy.sort_values(by=['child_mort', 'income', 'gdpp'],
                       ascending=[False, True, True]).head(5)

# Summary: 

- ##### After comparing both algorithms, K-means gives a relatively balanced number of countries across all clusters. Hence we can consider K-means as the final approach.

- ##### Haiti, Sierra Leone, Chad, Central African Republic and Mali are the top 5 countries in dire need of aid, as for these countries child mortality is high and income and gdpp are low.

- ##### Countries assigned to cluster 1 by K-means clustering and to cluster 0 by hierarchical clustering are the countries that need aid.