In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the dataset directory and print the full path of every file found in it.
for folder, _subfolders, files in os.walk('../input/help-international-data'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**PROBLEM STATEMENT**

HELP International is an international humanitarian NGO that is committed to fighting poverty and providing the people of backward countries with basic amenities and relief during the time of disasters and natural calamities. It runs a lot of operational projects from time to time along with advocacy drives to raise awareness as well as for funding purposes.
After the recent funding programmes, they have been able to raise around $ 10 million. Now the CEO of the NGO needs to decide how to use this money strategically and effectively. The significant issues that come while making this decision are mostly related to choosing the countries that are in the direst need of aid. 
Our job is to categorize the countries using some socio-economic and health factors that determine the overall development of the country. Then we need to suggest the countries which the CEO needs to focus on the most. 


# Reading and Understanding data

In [None]:
#import warnings
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("always")

In [None]:
#importing all the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree

In [None]:
# Load the country-level dataset from the Kaggle input directory into `country`
country=pd.read_csv("../input/help-international-data/Country-data.csv")
# Preview the first rows of the loaded frame
country.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns)
country.shape

In [None]:
# Column dtypes and non-null counts
country.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
# describe is a method and has to be called; without the parentheses the cell only
# displays the bound-method object instead of the statistics.
country.describe()

In [None]:
# Column names available in the dataset
country.columns

In [None]:
# Number of missing values per column
country.isnull().sum()

# Data Preparation

In [None]:
# 'exports' is treated as a percentage of 'gdpp'; turn it into an absolute value
# (rounded to 2 decimals). Running this cell twice would apply the conversion twice.
country['exports'] = (country['exports'] * country['gdpp'] / 100).round(2)

In [None]:
# 'health' is treated as a percentage of 'gdpp'; turn it into an absolute value
# (rounded to 2 decimals). Running this cell twice would apply the conversion twice.
country['health'] = (country['health'] * country['gdpp'] / 100).round(2)

In [None]:
# 'imports' is treated as a percentage of 'gdpp'; turn it into an absolute value
# (rounded to 2 decimals). Running this cell twice would apply the conversion twice.
country['imports'] = (country['imports'] * country['gdpp'] / 100).round(2)

In [None]:
# Preview the first rows after the percentage columns were converted
country.head()

# EDA

#### Barplot

In [None]:
# Bar chart of the ten countries with the highest child mortality.
plt.figure(figsize=(15,10))
# Sort from highest to lowest and keep the first ten rows
child_mort=country[['country','child_mort']].sort_values('child_mort',ascending=False).head(10)
ax=sns.barplot(x='country',y='child_mort',data=child_mort)
# Spelling of the y-axis label corrected ("mortality")
ax.set(xlabel='',ylabel='Child mortality rate')
# Country names are long, so the tick labels are drawn vertically
plt.xticks(rotation=90)
plt.show()

The ten countries with the highest child mortality rate according to the EDA are:
    Haiti, Sierra Leone, Chad, Central African Republic, Mali, Nigeria, Niger, Angola, Burkina Faso, Congo, Dem. Rep.

In [None]:
# Bar chart of the ten countries with the lowest income.
plt.figure(figsize=(15, 10))
# Sorted from highest to lowest income, so the last ten rows are the poorest countries
income = (
    country[['country', 'income']]
    .sort_values(by='income', ascending=False)
    .tail(10)
)
ax = sns.barplot(data=income, x='country', y='income')
ax.set(ylabel='income', xlabel='')
plt.xticks(rotation=90)
plt.show()

The ten countries with the lowest income according to the EDA are:
    Sierra Leone, Togo, Guinea, Malawi, Mozambique, Central African Republic, Niger, Burundi, Liberia, Congo, Dem. Rep.

In [None]:
# Bar chart of the ten countries with the lowest GDP per capita ('gdpp').
plt.figure(figsize=(15, 10))
# Sorted from highest to lowest gdpp, so the last ten rows are the lowest values
gdpp = (
    country[['country', 'gdpp']]
    .sort_values(by='gdpp', ascending=False)
    .tail(10)
)
ax = sns.barplot(data=gdpp, x='country', y='gdpp')
ax.set(ylabel='GDPP', xlabel='')
plt.xticks(rotation=90)
plt.show()

The ten countries with the lowest gdpp according to the EDA are:
Eritrea, Malawi, Central African Republic, Mozambique, Madagascar, Sierra Leone, Niger, Congo, Dem. Rep., Liberia, Burundi

#### Pairplot

In [None]:
# Pairwise scatter plots of the columns of `country` to inspect relationships between indicators
sns.set(style="ticks", color_codes=True)
sns.pairplot(country)
plt.show()

#### Distplot

In [None]:
# Distribution plot of every column after the first one, laid out on a 3x3 grid.
# NOTE(review): the grid has nine slots, so this assumes `country` has exactly nine
# columns after 'country' when the cell runs.
plt.figure(figsize=(15, 10))
features = country.columns[1:]
for position, column in enumerate(features, start=1):
    plt.subplot(3, 3, position)
    sns.distplot(country[column])

#### Boxplot

In [None]:
# Box plot of every column after the first one, laid out on a 3x3 grid, to expose outliers.
# The series is passed as x= explicitly: a bare positional argument is interpreted
# differently across seaborn releases (as `x` in older versions, as `data` in newer
# ones), which changes the orientation of the plot or raises a warning.
# NOTE(review): the grid has nine slots, so this assumes `country` has exactly nine
# columns after 'country' when the cell runs.
plt.figure(figsize = (15,10))
features = country.columns[1:]
for position, column in enumerate(features, start=1):
    plt.subplot(3,3,position)
    sns.boxplot(x=country[column])

# Outlier treatment

#### 1. child_mort

In [None]:
# Cap the lower-end outliers of 'child_mort' at the 1st percentile.
# Series.clip replaces the chained assignment country[col][mask] = value, which pandas
# may apply to a temporary copy (SettingWithCopyWarning) and which has no effect
# under Copy-on-Write.
q1=country['child_mort'].quantile(0.01)
country['child_mort']=country['child_mort'].clip(lower=q1)

#### 2. exports

In [None]:
# Cap the upper-end outliers of 'exports' at the 99th percentile.
# Series.clip replaces the chained assignment country[col][mask] = value, which pandas
# may apply to a temporary copy (SettingWithCopyWarning) and which has no effect
# under Copy-on-Write.
q2=country['exports'].quantile(0.99)
country['exports']=country['exports'].clip(upper=q2)

#### 3. health

In [None]:
# Cap the upper-end outliers of 'health' at the 99th percentile.
# Series.clip replaces the chained assignment country[col][mask] = value, which pandas
# may apply to a temporary copy (SettingWithCopyWarning) and which has no effect
# under Copy-on-Write.
q3=country['health'].quantile(0.99)
country['health']=country['health'].clip(upper=q3)

#### 4. imports

In [None]:
# Cap the upper-end outliers of 'imports' at the 99th percentile.
# Series.clip replaces the chained assignment country[col][mask] = value, which pandas
# may apply to a temporary copy (SettingWithCopyWarning) and which has no effect
# under Copy-on-Write.
q4=country['imports'].quantile(0.99)
country['imports']=country['imports'].clip(upper=q4)

#### 5. income

In [None]:
# Cap the upper-end outliers of 'income' at the 99th percentile.
# Series.clip replaces the chained assignment country[col][mask] = value, which pandas
# may apply to a temporary copy (SettingWithCopyWarning) and which has no effect
# under Copy-on-Write. Assigning the clipped series also avoids writing a float
# quantile into an existing column in place.
q5=country['income'].quantile(0.99)
country['income']=country['income'].clip(upper=q5)

#### 6.Inflation

In [None]:
# Cap the upper-end outliers of 'inflation' at the 99th percentile.
# Series.clip replaces the chained assignment country[col][mask] = value, which pandas
# may apply to a temporary copy (SettingWithCopyWarning) and which has no effect
# under Copy-on-Write.
q6=country['inflation'].quantile(0.99)
country['inflation']=country['inflation'].clip(upper=q6)

#### 7. life_expec

In [None]:
# Cap the upper-end outliers of 'life_expec' at the 99th percentile.
# Series.clip replaces the chained assignment country[col][mask] = value, which pandas
# may apply to a temporary copy (SettingWithCopyWarning) and which has no effect
# under Copy-on-Write.
q7=country['life_expec'].quantile(0.99)
country['life_expec']=country['life_expec'].clip(upper=q7)

#### 8. gdpp

In [None]:
# Cap the upper-end outliers of 'gdpp' at the 99th percentile.
# Series.clip replaces the chained assignment country[col][mask] = value, which pandas
# may apply to a temporary copy (SettingWithCopyWarning) and which has no effect
# under Copy-on-Write. Assigning the clipped series also avoids writing a float
# quantile into an existing column in place.
q9=country['gdpp'].quantile(0.99)
country['gdpp']=country['gdpp'].clip(upper=q9)

# Clustering

### Hopkins Score

In [None]:
#Calculating the Hopkins statistic
from sklearn.neighbors import NearestNeighbors
from random import sample
from numpy.random import uniform
import numpy as np
from math import isnan
 
def hopkins(X, random_state=None):
    """Estimate the Hopkins statistic (clustering tendency) of a dataset.

    ``m`` points are drawn uniformly from the bounding box of ``X`` and ``m``
    rows are sampled from ``X`` without replacement, where ``m`` is 10% of the
    rows (at least 1).  With ``u`` the distance from each uniform point to its
    nearest row of ``X`` and ``w`` the distance from each sampled row to its
    nearest *other* row, the result is ``sum(u) / (sum(u) + sum(w))``.
    In this formulation values close to 1 suggest clustered data and values
    around 0.5 suggest uniformly scattered data.

    Parameters
    ----------
    X : pandas.DataFrame or array-like of shape (n_rows, n_features)
        Numeric data.  Distances use the default metric of
        ``NearestNeighbors``, so features should be on comparable scales.
    random_state : int, numpy.random.Generator or None, default None
        Seed (or generator) for the random draws.  ``None`` keeps the result
        non-deterministic, as before.

    Returns
    -------
    float
        The statistic in ``[0, 1]``; 0 when every measured distance is zero.

    Raises
    ------
    ValueError
        If ``X`` has fewer than two rows.
    """
    values = np.asarray(X, dtype=float)
    n, d = values.shape
    if n < 2:
        raise ValueError("hopkins() needs at least two rows to measure neighbour distances")

    # At least one probe, otherwise small datasets would divide zero by zero.
    m = max(1, int(0.1 * n))
    rng = np.random.default_rng(random_state)
    nbrs = NearestNeighbors(n_neighbors=1).fit(values)

    # Uniform probes are not part of the fitted data, so the closest real
    # observation is neighbour 0.
    uniform_points = rng.uniform(values.min(axis=0), values.max(axis=0), size=(m, d))
    u_dist, _ = nbrs.kneighbors(uniform_points, n_neighbors=1, return_distance=True)
    ujd = u_dist[:, 0]

    # Sampled rows are part of the fitted data: neighbour 0 is the row itself
    # (distance 0), so the nearest other row is neighbour 1.
    sample_idx = rng.choice(n, size=m, replace=False)
    w_dist, _ = nbrs.kneighbors(values[sample_idx], n_neighbors=2, return_distance=True)
    wjd = w_dist[:, 1]

    total = ujd.sum() + wjd.sum()
    if not total > 0:
        # Degenerate input (e.g. all rows identical): report the distances and fall back to 0.
        print(ujd, wjd)
        return 0

    return float(ujd.sum() / total)

In [None]:
# Hopkins statistic of the indicator columns (the 'country' name column is dropped first).
# NOTE(review): the features are not scaled yet at this point, so the distances are
# presumably dominated by the large-valued columns — confirm whether the scaled data
# should be used here instead.
hopkins(country.drop('country',axis = 1))

In [None]:
# Keep only the indicator columns: the 'country' name column is removed before scaling.
country_new = country.drop(columns=['country'])

# Rescaling

In [None]:
# Standardise every indicator with StandardScaler; fit_transform returns an array
scaler=StandardScaler()
country_scaled=scaler.fit_transform(country_new)
# Shape of the scaled array
country_scaled.shape

In [None]:
# Wrap the scaled values in a DataFrame that reuses the original indicator names.
country_scaled = pd.DataFrame(country_scaled, columns=country_new.columns)
country_scaled.head()

#### silhouette score

In [None]:
# Silhouette score of K-means for k = 2..10.
# random_state is fixed (same seed as the final model fitted below) so the curve,
# and therefore the choice of k, is reproducible from run to run.
ssd=[]  # NOTE: despite its name this list holds [k, silhouette score] pairs
for k in range(2,11):
    kmeans=KMeans(n_clusters=k,random_state=100)
    kmeans.fit(country_scaled)      # fit the scaled data
    ssd.append([k,silhouette_score(country_scaled,kmeans.labels_)])
# Build the frame once instead of once per plotted axis
silhouette_df=pd.DataFrame(ssd,columns=['k','silhouette_score'])
plt.plot(silhouette_df['k'],silhouette_df['silhouette_score'])
plt.xlabel('number of clusters (k)')
plt.ylabel('silhouette score')
plt.show()

#### Elbow curve

In [None]:
# Elbow curve: K-means inertia (sum of squared distances to the closest centroid) for k = 2..10.
# random_state is fixed (same seed as the final model fitted below) so the curve,
# and therefore the choice of k, is reproducible from run to run.
ssd = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters = k, random_state = 100)
    kmeans.fit(country_scaled)
    ssd.append([k, kmeans.inertia_])

# Build the frame once instead of once per plotted axis
elbow_df = pd.DataFrame(ssd, columns=['k', 'inertia'])
plt.plot(elbow_df['k'], elbow_df['inertia'])
plt.xlabel('number of clusters (k)')
plt.ylabel('inertia')
plt.show()

Both the silhouette curve and the elbow curve point to k=3, so we move ahead with k=3.

# Modelling

In [None]:
# Fit the final K-means model with k=3 on the scaled data (random_state fixed to 100)
kmeans=KMeans(n_clusters=3,random_state=100)
kmeans.fit(country_scaled)

In [None]:
# Cluster label assigned to each row of the scaled data
kmeans.labels_

# Cluster Analysis

In [None]:
# Store the K-means labels on the main dataset as 'cluster_label' and preview the result
country['cluster_label']=kmeans.labels_
country.head()

In [None]:
# Number of countries in each K-means cluster
country.cluster_label.value_counts()

In [None]:
# K-means clusters in the child_mort vs gdpp plane (one colour per cluster)
sns.scatterplot(x = 'child_mort', y = 'gdpp', hue = 'cluster_label', data = country, palette = 'Set1')

In [None]:
# K-means clusters in the income vs gdpp plane (one colour per cluster)
sns.scatterplot(x = 'income', y = 'gdpp', hue = 'cluster_label', data = country, palette = 'Set1')

In [None]:
# K-means clusters in the child_mort vs income plane (one colour per cluster)
sns.scatterplot(x = 'child_mort', y = 'income', hue = 'cluster_label', data = country, palette = 'Set1')

In [None]:
# Mean of every remaining column per K-means cluster (the 'country' name column is dropped first)
country.drop('country',axis = 1).groupby('cluster_label').mean()

In [None]:
# Column names of the main dataset, now including 'cluster_label'
country.columns

In [None]:
# Mean child_mort, income and gdpp per K-means cluster, drawn as grouped bars.
# The three indicators are selected explicitly instead of dropping every other column
# by name, so any additional column present in `country` cannot leak into the chart.
country.groupby('cluster_label')[['child_mort','income','gdpp']].mean().plot(kind='bar')

# Cluster Profiling

In [None]:
# Countries in the K-means cluster with the highest mean child mortality.
# K-means numbers its clusters arbitrarily, so the cluster is selected by its profile
# rather than assumed to carry label 0.
needy_label = country.groupby('cluster_label')['child_mort'].mean().idxmax()
country[country['cluster_label']==needy_label]['country']

In [None]:
# Cluster profiling: the five countries most in need of aid, giving priority to
# 'child_mort' (highest first) over 'income' and 'gdpp' (lowest first) while sorting.
# K-means numbers its clusters arbitrarily, so the cluster with the highest mean child
# mortality is selected by its profile rather than assumed to carry label 0.
needy_label = country.groupby('cluster_label')['child_mort'].mean().idxmax()
country[country['cluster_label']==needy_label].sort_values(by=['child_mort','income','gdpp'],ascending=[False,True,True]).head()

In [None]:
# Cluster profiling: the five countries most in need of aid, giving priority to
# 'gdpp' and 'income' (lowest first) over 'child_mort' (highest first) while sorting.
# K-means numbers its clusters arbitrarily, so the cluster with the highest mean child
# mortality is selected by its profile rather than assumed to carry label 0.
needy_label = country.groupby('cluster_label')['child_mort'].mean().idxmax()
country[country['cluster_label']==needy_label].sort_values(by=['gdpp','income','child_mort'],ascending=[True,True,False]).head()

In [None]:
# Top countries of the best-off cluster: lowest child mortality first, then highest
# income and gdpp.
# K-means numbers its clusters arbitrarily, so the cluster with the lowest mean child
# mortality is selected by its profile rather than assumed to carry label 1.
developed_label = country.groupby('cluster_label')['child_mort'].mean().idxmin()
country[country['cluster_label']==developed_label].sort_values(by=['child_mort','income','gdpp'],ascending=[True,False,False]).head()

# Hierarchical Clustering

In [None]:
# Preview the scaled data that the hierarchical clustering below is run on
country_scaled.head()

In [None]:
# Hierarchical clustering with single linkage (Euclidean distance) and its dendrogram
plt.figure(figsize=(18,15))
country_mergings=linkage(country_scaled,method="single",metric="euclidean")
dendrogram(country_mergings)
plt.show()

In [None]:
# Hierarchical clustering with complete linkage (Euclidean distance) and its dendrogram.
# This overwrites `country_mergings` from the single-linkage cell.
plt.figure(figsize=(18,15))
country_mergings=linkage(country_scaled,method="complete",metric="euclidean")
dendrogram(country_mergings)
plt.show()

In [None]:
# Shape of the label array returned when the tree is cut into 3 clusters
cut_tree(country_mergings,n_clusters=3).shape

In [None]:
# Cut the current `country_mergings` tree into three clusters, flatten the labels to
# one dimension and store them on the main dataset as 'cluster_h_label'.
cluster_h_label = cut_tree(country_mergings, n_clusters=3).ravel()
country['cluster_h_label'] = cluster_h_label
country.head()

In [None]:
# Hierarchical clusters in the child_mort vs gdpp plane (one colour per cluster)
sns.scatterplot(x = 'child_mort', y = 'gdpp', hue = 'cluster_h_label', data = country, palette = 'Set1')

In [None]:
# Hierarchical clusters in the income vs gdpp plane (one colour per cluster)
sns.scatterplot(x = 'income', y = 'gdpp', hue = 'cluster_h_label', data = country, palette = 'Set1')

In [None]:
# Hierarchical clusters in the child_mort vs income plane (one colour per cluster)
sns.scatterplot(x = 'child_mort', y = 'income', hue = 'cluster_h_label', data = country, palette = 'Set1')

In [None]:
# Mean of every indicator per hierarchical cluster ('country' and the K-means label are dropped first)
country.drop(['country','cluster_label'],axis = 1).groupby('cluster_h_label').mean()

In [None]:
# Cluster profiling: first five countries of hierarchical cluster 0, sorted by highest
# child_mort, then lowest income and gdpp.
# NOTE(review): presumably cluster 0 is the group most in need of aid — confirm against the cluster means.
country[country['cluster_h_label']==0].sort_values(by=['child_mort','income','gdpp'],ascending=[False,True,True]).head()

In [None]:
# Cluster profiling: first five countries of hierarchical cluster 0, sorted by lowest
# gdpp and income, then highest child_mort.
# NOTE(review): presumably cluster 0 is the group most in need of aid — confirm against the cluster means.
country[country['cluster_h_label']==0].sort_values(by=['gdpp','income','child_mort'],ascending=[True,True,False]).head()

In [None]:
# Cluster profiling: first five countries of hierarchical cluster 1, sorted by lowest
# child_mort, then highest income and gdpp.
# NOTE(review): presumably cluster 1 is the best-off group — confirm against the cluster means.
country[country['cluster_h_label']==1].sort_values(by=['child_mort','income','gdpp'],ascending=[True,False,False]).head()

In [None]:
# Number of countries in each hierarchical cluster
country.cluster_h_label.value_counts()

#### So we find that the countries in direst need of aid are:

##### 1. Haiti
##### 2. Sierra Leone
##### 3. Chad
##### 4. Central African Republic
##### 5. Mali
##### 6. Burundi

**Thank you for reading!**

**Comments and Suggestions are welcome!**