# Intro

HELP International is an international humanitarian NGO that is committed to fighting poverty and providing the people of backward countries with basic amenities and relief during the time of disasters and natural calamities. It runs a lot of operational projects from time to time along with advocacy drives to raise awareness as well as for funding purposes.

After the recent project that included a lot of awareness drives and funding programmes, they have been able to raise around $ 10 million. Now the CEO of the NGO needs to decide how to use this money strategically and effectively. The significant issues that come while making this decision are mostly related to choosing the countries that are in the direst need of aid.

And this is where you come in as a data analyst. Your job is to categorise the countries using some socio-economic and health factors that determine the overall development of the country. Then you need to suggest the countries which the CEO needs to focus on the most.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Reading & Understanding Data

In [None]:
# Load the country data set and preview its first rows.
DATA_PATH = '/kaggle/input/country-socioeconomic-data/Country-data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

First, we look at the summary information of the dataset: `country` is the only column with an object dtype, and none of the feature columns contains null/NaN values.

In [None]:
# Print the column names, dtypes and non-null counts of df.
df.info()

In [None]:
# Show summary statistics of df as the cell output.
df.describe()

In [None]:
# Names of all columns except the first one
# (assumes the first column is `country` -- TODO confirm against the CSV).
column = df.columns[1:].to_list()

# EDA

In [None]:
def univariate(data, column):
    """Draw a histogram, a KDE curve and a box plot of one column side by side.

    Args:
        data: DataFrame holding the values to plot.
        column: Name of the column in ``data`` to visualise.
    """
    _, (hist_ax, kde_ax, box_ax) = plt.subplots(1, 3, figsize=(18, 6))
    sns.histplot(data=data, x=column, ax=hist_ax)
    sns.kdeplot(data=data, x=column, ax=kde_ax)
    sns.boxplot(data=data, y=column, ax=box_ax)
    plt.show()

In [None]:
# Plot the distribution of every feature column in turn.
for col_name in column:
    univariate(df, col_name)

To find out how the features relate to each other, we can build a correlation matrix.

In [None]:
# Correlation matrix of all non-country columns, drawn as an annotated heatmap
# with the colour scale fixed to [-1, 1].
df_heat = df.drop(columns="country")
corr_matrix = df_heat.corr()

fig, ax = plt.subplots(figsize=(16, 12))
ax = sns.heatmap(
    corr_matrix,
    annot=True,
    ax=ax,
    square=True,
    vmin=-1,
    vmax=1,
)

As we can see from the heatmap above, we can conclude that:
* The correlation between income and GDP per capita is very strong: the higher the income, the higher the GDP per capita.
* The higher the total fertility, the higher the child mortality rate.
* The higher the total fertility, the lower the life expectancy.
* The higher the life expectancy, the lower the child mortality rate.
* The higher the imports, the higher the exports.
* The higher the income, the higher the exports.
* The higher the income, the higher the life expectancy.

In [None]:
# One bar chart per indicator, showing the first ten countries after sorting
# by that indicator. Each entry is (indicator, ascending sort?) and the panels
# are filled row by row in the 3x2 grid.
fig, ax = plt.subplots(3, 2, figsize=(24,24))
panels = [
    ("child_mort", False),
    ("income", True),
    ("inflation", False),
    ("life_expec", True),
    ("total_fer", False),
    ("gdpp", True),
]

for axis, (indicator, sort_ascending) in zip(ax.flat, panels):
    data = df.sort_values(by=[indicator], ascending=sort_ascending)
    sns.barplot(data=data[:10], x=indicator, y="country", ax=axis)

plt.show()

# Handling Outliers

To handle outliers, we cap each feature at its own lower and upper bounds, which are computed from the quartiles (Q1 − 1.5·IQR and Q3 + 1.5·IQR).

If a value lies above the upper bound of its feature, we replace it with that upper bound.

If a value lies below the lower bound of its feature, we replace it with that lower bound.

In [None]:
def outliers(df, column):
    """Cap the outliers of one column at its IQR fences, in place.

    Values above ``Q3 + 1.5 * IQR`` are replaced by that upper fence and
    values below ``Q1 - 1.5 * IQR`` by the lower fence; all other values are
    left unchanged.

    Args:
        df: DataFrame to modify; the column is overwritten on this object.
        column: Name of the numeric column to cap.

    Returns:
        None. ``df`` is modified in place.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Assign the whole column back instead of writing through
    # ``df[column].iloc[i]``: that chained assignment is not guaranteed to
    # reach ``df`` (it does nothing under pandas copy-on-write), and the
    # vectorised clip replaces the row-by-row Python loop.
    df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)

            
# Cap the outliers of every feature column of df.
for col_name in column:
    outliers(df, col_name)

In [None]:
# Box plot of every feature after outlier capping, laid out row by row in a
# 3x3 grid in the order listed below.
boxplot_features = [
    'child_mort', 'exports', 'imports',
    'health', 'income', 'inflation',
    'life_expec', 'total_fer', 'gdpp',
]

fig, ax = plt.subplots(3, 3, figsize=(20,14))
for axis, feature_name in zip(ax.flat, boxplot_features):
    sns.boxplot(y=feature_name, data=df, ax=axis)

plt.suptitle("Data after Outliers Handle", x=0.5, y=0.92, size='large', weight='bold')
plt.show()

# Scaling Data

We will use MinMaxScaler to scale the data. Scaling puts every feature in the same range, [0, 1], so that features with large magnitudes (such as income or gdpp) do not dominate the distance computations in K-Means; it also helps speed up the K-Means computation.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column to the [0, 1] range.
scaler = MinMaxScaler()

# Build a brand-new DataFrame from the scaled values instead of assigning into
# the slice `df[column]`: writing into a slice raises SettingWithCopyWarning
# and leaves it unclear whether `df` itself is modified. `df` stays unscaled.
data_train = pd.DataFrame(
    scaler.fit_transform(df[column]),
    columns=column,
    index=df.index,
)

In [None]:
data_train.head()

# KMeans Clustering

In [None]:
from sklearn.cluster import KMeans

# Fit a 3-cluster K-Means model on the scaled features.
kmeans = KMeans(n_clusters=3, random_state=100)
kmeans.fit(data_train)
labels = kmeans.labels_

# Scaled features + country name + cluster id (used for the scatter plots).
data_clustering = data_train.copy()
data_clustering['country'] = df["country"]
data_clustering["cluster"] = labels

# Copy of df (unscaled values) + cluster id (used for the final tables).
country_data_with_label = df.copy()
country_data_with_label["cluster"] = labels

In [None]:
# K-Means numbers its clusters arbitrarily, so the meaning of id 0/1/2 must not
# be hard-coded: it can change with the library version or random seed.
# Derive it from the data instead by ranking the clusters on their mean child
# mortality (assumes 3 clusters and that higher child mortality means a
# greater need for help, as in the recommendations at the end of the notebook).
need_levels = [
    "not worthy of receiving help",   # lowest mean child mortality
    "less worthy of receiving help",
    "deserves help",                  # highest mean child mortality
]
clusters_by_child_mort = (
    data_clustering.groupby("cluster")["child_mort"].mean().sort_values().index
)
cluster_map = dict(zip(clusters_by_child_mort, need_levels))

data_clustering = data_clustering.replace({"cluster" : cluster_map})

In [None]:
# Scaled gdpp vs. child mortality, coloured by cluster, with the K-Means
# centroids overlaid in red.
plt.figure(figsize=(16,10))
scat = sns.scatterplot(data=data_clustering, x="gdpp", y="child_mort", hue="cluster")
plt.suptitle('Clustering', y=0.95, weight="bold", fontsize=20)
plt.title("the lower the gdpp, the higher rate of child mortality rate", weight='light')

# Look the centroid coordinates up by feature name rather than by a magic
# column number, so the red markers stay aligned with the axes even if the
# column order of the data changes. `column` is the feature order the model
# was fitted on.
centers = kmeans.cluster_centers_
x_idx, y_idx = column.index("gdpp"), column.index("child_mort")
plt.scatter(centers[:, x_idx], centers[:, y_idx], c='Red', s = 200)

scat.legend(fontsize=14)
plt.show()

In [None]:
# Scaled gdpp vs. life expectancy, coloured by cluster, with the K-Means
# centroids overlaid in red.
plt.figure(figsize=(16,10))
scat = sns.scatterplot(data=data_clustering, x="gdpp", y="life_expec", hue="cluster")
plt.suptitle('Clustering', y=0.95, weight="bold", fontsize=20)
plt.title("the lower the gdpp, the lower rate of life expectations", weight='light')

# Look the centroid coordinates up by feature name rather than by a magic
# column number, so the red markers stay aligned with the axes even if the
# column order of the data changes. `column` is the feature order the model
# was fitted on.
centers = kmeans.cluster_centers_
x_idx, y_idx = column.index("gdpp"), column.index("life_expec")
plt.scatter(centers[:, x_idx], centers[:, y_idx], c='Red', s = 200)

scat.legend(fontsize=14)
plt.show()

In [None]:
# Scaled child mortality vs. life expectancy, coloured by cluster, with the
# K-Means centroids overlaid in red.
plt.figure(figsize=(16,10))
scat = sns.scatterplot(data=data_clustering, x="child_mort", y="life_expec", hue="cluster")
plt.suptitle('Clustering', y=0.95, weight="bold", fontsize=20)
plt.title("the higher child mortality rate makes lower the life expectations ", weight='light')

# Look the centroid coordinates up by feature name rather than by a magic
# column number, so the red markers stay aligned with the axes even if the
# column order of the data changes. `column` is the feature order the model
# was fitted on.
centers = kmeans.cluster_centers_
x_idx, y_idx = column.index("child_mort"), column.index("life_expec")
plt.scatter(centers[:, x_idx], centers[:, y_idx], c='Red', s = 200)

scat.legend(fontsize=14)
plt.show()

# Elbow Method

We need to evaluate the model. To find the most effective number of clusters K for K-Means, we can use the elbow method.

In [None]:
# Fit K-Means for k = 1..10 and record the within-cluster sum of squares
# (inertia) of each fit, then plot it against k to look for the "elbow".
k_values = range(1, 11)
wcss = []
for k in k_values:
    # Use a dedicated name here: rebinding `kmeans` would overwrite the
    # 3-cluster model fitted earlier, whose centroids the scatter-plot cells
    # read when they are re-run.
    elbow_model = KMeans(n_clusters=k, init='k-means++', random_state=40)
    elbow_model.fit(data_train)
    wcss.append(elbow_model.inertia_)

plt.figure(figsize=(12,8))
plt.plot(k_values, wcss)
plt.scatter(k_values, wcss)

plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# K-Means numbers its clusters arbitrarily, so the meaning of id 0/1/2 must not
# be hard-coded: it can change with the library version or random seed.
# Derive it from the data instead by ranking the clusters on their mean child
# mortality (assumes 3 clusters and that higher child mortality means a
# greater need for help, as in the recommendations below).
need_levels = [
    "not worthy of receiving help",   # lowest mean child mortality
    "less worthy of receiving help",
    "deserves help",                  # highest mean child mortality
]
clusters_by_child_mort = (
    country_data_with_label.groupby("cluster")["child_mort"].mean().sort_values().index
)
cluster_map = dict(zip(clusters_by_child_mort, need_levels))

country_data_with_label = country_data_with_label.replace({"cluster" : cluster_map})

In [None]:
# Keep only the countries whose cluster is labelled "deserves help".
in_need = country_data_with_label["cluster"] == "deserves help"
country_data_with_label = country_data_with_label[in_need]

# Recommendations of Countries to help

1. Based on gdpp, we can help these 10 countries, because a weak economy can lead to poor health.

In [None]:
# Order the remaining countries from lowest to highest gdpp and show the first ten.
country_data_with_label = country_data_with_label.sort_values(by='gdpp')
country_data_with_label.head(10)

2. Based on child mortality, we can help these 10 countries. A high child mortality rate indicates that something is wrong in the country: a poor economy, poor health care, or both.

In [None]:
# Order the remaining countries from highest to lowest child mortality and show the first ten.
country_data_with_label = country_data_with_label.sort_values(by='child_mort', ascending=False)
country_data_with_label.head(10)