
# Clustering the Countries by using Unsupervised Learning for HELP International

**Objective:**

To categorise the countries using socio-economic and health factors that determine the overall development of the country.

**About organization:**

HELP International is an international humanitarian NGO that is committed to fighting poverty and providing the people of backward countries with basic amenities and relief during the time of disasters and natural calamities.

**Problem Statement:**

HELP International has been able to raise around $10 million. Now the CEO of the NGO needs to decide how to use this money strategically and effectively. So, the CEO has to decide which countries are in the direst need of aid. Hence, your job as a data scientist is to categorise the countries using socio-economic and health factors that determine the overall development of each country. Then you need to suggest the countries on which the CEO needs to focus the most.



In [None]:
#imports
import pandas as pd

In [None]:
# Load the two Kaggle input files: the per-country indicator table (df) and
# the data dictionary (info; presumably one row per column description - confirm).
df=pd.read_csv("../input/unsupervised-learning-on-country-data/Country-data.csv")
info=pd.read_csv("../input/unsupervised-learning-on-country-data/data-dictionary.csv")

In [None]:
# Snapshot of the frame as loaded. `df` itself later receives the PC columns,
# while this copy receives the cluster labels ("preds").
df_copy_2=df.copy()

# EDA

In [None]:
info

In [None]:
df

In [None]:
df.describe()

In [None]:
# Two-dimensional graphical analysis: pairwise scatter plots of the columns of df
from pandas.plotting import scatter_matrix 
scatter_matrix(df,figsize=(15,15))

In [None]:
# Silence all warnings from here on (the "ignore" action is installed with no
# category or module restriction, so nothing is reported afterwards).
import warnings
warnings.filterwarnings("ignore")

In [None]:
df.info()

In [None]:
# Quantitative variables: columns 1 to 9 (by position) as a plain NumPy array
x = df.iloc[:, 1:10].to_numpy()
# Qualitative variable: the first column (by position), kept as a pandas Series
y = df.iloc[:, 0]

In [None]:
x.shape

In [None]:
y

In [None]:
# Correlation matrix of the numeric indicators.
# Only the numeric columns are passed on: df still carries the text `country`
# column, and pandas >= 2.0 raises on non-numeric data in corr() instead of
# silently dropping it as older versions did.
corr = df.select_dtypes(include="number").corr()
print(corr)

In [None]:
# Heat map of the correlation matrix, labelled with its column names on both axes
import seaborn as sns 
axis_labels = corr.columns.values
sns.heatmap(corr, xticklabels=axis_labels, yticklabels=axis_labels)

**Interpretation**


1. **child_mort :**
    + positive corr : total_fer
    + negative corr : gdpp , life_expec


2. **exports:**
    + positive corr :imports
    + negative corr :
3. **income**
    + positive corr : gdpp
    + negative corr : total_fer,child_mort
4. **life expec**
    + positive corr : gdpp, income
    + negative corr:total_fer,child_mort

In [None]:
# Covariance matrix.
# Any covariance matrix is symmetric and positive semi-definite, and its main
# diagonal contains the variances (the covariance of each variable with itself).
# Only the numeric columns are passed on: df still carries the text `country`
# column, and pandas >= 2.0 raises on non-numeric data in cov().

cov_mat = df.select_dtypes(include="number").cov()
print(cov_mat)

In [None]:
# Standard scaling: preprocessing.scale standardises the array column by column
# (its defaults centre each feature to zero mean and scale it to unit variance).
from sklearn import preprocessing 
df_stsc=preprocessing.scale(x)
print(df_stsc)

In [None]:
# Eigenvalues and eigenvectors of the correlation matrix.
# np.linalg.eig returns the eigenvectors as the COLUMNS of `vecs`
# (vecs[:, i] belongs to vals[i]); the eigenvalues are not necessarily ordered.
# NOTE(review): corr is symmetric, so np.linalg.eigh would guarantee real output - confirm before switching.
import numpy as np 
vals,vecs=np.linalg.eig(corr)
print(vals)
print(vecs)

In [None]:
# Same eigenvectors and eigenvalues again, this time printed with a caption
print('vectors \n%s'%vecs)
print ('vals \n%s '%vals)

In [None]:
# (eigenvalue magnitude, matching eigenvector) tuples.
# The eigenvectors are the columns of vecs, i.e. the rows of its transpose.
pairs = list(zip(np.abs(vals), vecs.T))
print(pairs[0])

In [None]:
# Order the pairs by eigenvalue magnitude, largest first (descending order)
pairs = sorted(pairs, key=lambda pair: pair[0], reverse=True)
print(pairs)

In [None]:
len(pairs)

In [None]:
x

In [None]:
# Percentage of the total variance carried by each eigenvalue (largest first)
# and the running total.
tot = sum(vals)
var_exp = [(i / tot) * 100 for i in sorted(vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)

# Projection matrix W: the eigenvectors of the two leading pairs, one per column.
# column_stack takes the row count from the vectors themselves, so the number
# of features is no longer hard-coded.
matrix_w = np.column_stack((pairs[0][1], pairs[1][1]))

print('Matrix W:\n', matrix_w)
# The eigenvectors were computed from the correlation matrix, so they are the
# principal axes of the STANDARDISED data: project df_stsc, not the raw x
# (raw units would let the largest-scale columns dominate the result).
Y = df_stsc.dot(matrix_w)
print(Y)

# PCA

https://www.kaggle.com/chandrimad31/different-clustering-techniques-country-profiles/notebook 

In [None]:
from sklearn.decomposition import PCA 
import numpy as np

# Fit a PCA that keeps nine components on the standardised data
country_pca = PCA(n_components=9)
country_pca.fit(df_stsc)

# Cumulative variance explained by the PCs, in percent
# (each ratio is rounded to 4 decimals before summing)
pct_per_pc = np.round(country_pca.explained_variance_ratio_, decimals=4) * 100
cum_var = np.cumsum(pct_per_pc)
print(cum_var)

In [None]:
# The fraction of the total variance that each individual PC explains
# (one entry per fitted component of country_pca)
var = country_pca.explained_variance_ratio_
print(var)

In [None]:
import matplotlib.pyplot as plt


# Plot the explained variance ratio of each PC.
# The bars are placed at 1..N so the x axis reads as the component number;
# enumerate() positions were 0-based and put the first PC at x = 0.
component_numbers = range(1, len(var) + 1)
plt.bar(component_numbers, var, color='green')
plt.title('PCs and their Explained Variance Ratio', fontsize=15)
plt.xlabel('Number of components',fontsize=12)
plt.ylabel('Explained Variance Ratio',fontsize=12)


Using these cumulative variance ratios for all PCs, we will now draw a scree plot. It is used to determine the number of principal components to keep in this principal component analysis.

In [None]:
# Scree Plot
# cum_var[i] is the variance explained by the first i + 1 components, so it is
# plotted against 1..N; with y values only, matplotlib would use 0..N-1 and the
# "Number of components" axis would be off by one.
plt.plot(range(1, len(cum_var) + 1), cum_var, marker='o')
plt.title('Scree Plot: PCs and their Cumulative Explained Variance Ratio',fontsize=15)
plt.xlabel('Number of components',fontsize=12)
plt.ylabel('Cumulative Explained Variance Ratio',fontsize=12)

In [None]:
# Fit a 5-component PCA on the standardised data, then project that same data
# onto the fitted components (one row of scores per row of df_stsc).
final_pca=PCA(n_components=5).fit(df_stsc).transform(df_stsc)


In [None]:
final_pca

In [None]:
from sklearn.decomposition import PCA
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Number of principal components to keep
n_components = 5

# Fit the PCA on the standardised data and project it in one step
pca = PCA(n_components=n_components)
reduced = pca.fit_transform(df_stsc)

# Append each principal component's scores to the dataframe as PC1..PCn
for i in range(n_components):
    pc_column = 'PC' + str(i + 1)
    df[pc_column] = reduced[:, i]

display(df.head())


In [None]:
# Working copy without the text `country` column
df_copy = df.drop(columns='country')

In [None]:
df_copy

In [None]:

def plot_circle_correlation(PC1, PC2):
    """Draw a variable factor map for two principal components.

    One arrow per feature is drawn from the origin to the feature's weights
    on the two chosen components, labelled with the matching column name of
    ``df_copy``, with a unit circle added for scale.

    Reads the module-level ``pca`` (fitted PCA) and ``df_copy``. The arrows
    use the raw ``pca.components_`` weights; they are not rescaled by the
    explained variance.

    Args:
        PC1: Zero-based row index into ``pca.components_`` for the x axis.
        PC2: Zero-based row index into ``pca.components_`` for the y axis.
    """
    fig, ax = plt.subplots(figsize=(8, 10))
    feature_names = df_copy.columns.values
    n_features = pca.components_.shape[1]

    for idx in range(n_features):
        dx = pca.components_[PC1, idx]
        dy = pca.components_[PC2, idx]
        # Arrow starts at the origin and ends at the feature's weights
        ax.arrow(0, 0, dx, dy, head_width=0.1, head_length=0.1)
        # Label is offset slightly so it does not sit on the arrow head
        plt.text(dx + 0.05, dy + 0.05, feature_names[idx])

    # Unit circle for scale
    theta = np.linspace(0, 2 * np.pi, 100)
    plt.plot(np.cos(theta), np.sin(theta))
    plt.axis('equal')
    ax.set_title('Variable factor map')
    plt.show()

### Correlation: PC 0 and PC 1

In [None]:
plot_circle_correlation(0,1)

In [None]:
plot_circle_correlation(1,2)

In [None]:
plot_circle_correlation(2,3)

In [None]:
plot_circle_correlation(3,4)

In [None]:
# Keep only the five principal-component score columns
pc_columns = ["PC1", "PC2", "PC3", "PC4", "PC5"]
df_pca = df_copy.reindex(columns=pc_columns)

In [None]:
df_pca

# Clustering

https://www.kaggle.com/shadanwar/clustering-kmeans-agc-pca

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Plot the elbow curve with yellowbrick on the standardised data
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
from sklearn import metrics

model = KMeans()
visualizer = KElbowVisualizer(model, k=(1,10))
visualizer.fit(df_stsc)
# show() is the current name of the rendering call; poof() is its deprecated alias
visualizer.show()

In [None]:
# Inertia of each fitted model, in the order the k values are tried
results = []

# Exclusive upper bound of the k values to test
num_of_clusters = 10

# Fit one KMeans per k on the principal-component scores
for k in range(2, num_of_clusters):

    print("-" * 100)

    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(df_pca)

    inertia = kmeans.inertia_
    results.append(inertia)

    # Report k together with its inertia
    print("Number of Clusters: {}".format(k), inertia)


In [None]:
# Elbow curve: each recorded inertia against the k that produced it
tested_k = range(2, num_of_clusters)

plt.figure(figsize=(15, 8))
plt.plot(tested_k, results, 'bx-')

plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()

In [None]:
from sklearn.metrics import davies_bouldin_score, silhouette_score, silhouette_samples

# Quality metrics per k: inertia, Davies-Bouldin score and silhouette score
sse, db, slc = {}, {}, {}
for k in range(2, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000, random_state=12345)
    kmeans.fit(df_stsc)
    clusters = kmeans.labels_
    if k == 4:
        # Keep the 4-cluster assignment under its own name
        labels = clusters
    # Inertia: sum of squared distances of samples to their closest cluster centre
    sse[k] = kmeans.inertia_
    db[k] = davies_bouldin_score(df_stsc, clusters)
    slc[k] = silhouette_score(df_stsc, clusters)

In [None]:
# Davies-Bouldin score for every tested number of clusters
cluster_counts = list(db)
db_scores = list(db.values())

plt.figure(figsize=(12, 6))
plt.plot(cluster_counts, db_scores)
plt.xlabel("Number of cluster", fontsize=12)
plt.ylabel("Davies-Bouldin values", fontsize=12)
plt.title("Davies-Bouldin Scores vs No. of Clusters", fontsize=15)
plt.show()

In [None]:
# Final model: 3 clusters on the standardised data, seeded for reproducibility.
# This rebinds `model`, which previously held the KMeans given to the elbow visualizer.
model=KMeans(n_clusters=3,random_state=20)
model.fit(df_stsc)


In [None]:
print(model.labels_)

In [None]:
pd.Series(model.labels_).value_counts()

Calculate the silhouette coefficient

In [None]:
# Mean silhouette coefficient of the final 3-cluster assignment on the standardised data
from sklearn import metrics 
metrics.silhouette_score(df_stsc,model.labels_)

In [None]:
# Attach the cluster label of each row to the copy of the originally loaded frame
df_copy_2["preds"]=model.labels_

In [None]:
df_copy_2

In [None]:
# Visualise the clusters on one feature pair of the standardised data.
# The face colour is set BEFORE the figure is built: matplotlib reads
# rcParams['axes.facecolor'] when the axes are created, so setting it after
# plt.scatter had no effect on this plot (only on later ones).
plt.rcParams['axes.facecolor'] = 'lightblue'
plt.figure(figsize=(12,6))
# Columns 4 and 8 of the scaled matrix, coloured by cluster label
plt.scatter(df_stsc[:,4],df_stsc[:,8],c=df_copy_2.preds) # income vs gdpp
plt.title("Income vs GDPP (Visualize KMeans Clusters)", fontsize=15)
plt.xlabel("Income", fontsize=12)
plt.ylabel("GDPP", fontsize=12)
plt.show()

In [None]:
# Bring the country names and cluster labels next to the PC scores
df_copy['country']=df_copy_2["country"]
df_copy['preds']=df_copy_2["preds"]
# Positional row number; sized from the frame itself instead of a hard-coded
# 167 so the cell still works if the number of rows changes
df_copy['index']=list(range(len(df_copy)))

In [None]:
df_copy

In [None]:
# Plain lists used by the annotated scatter plots: country names, row numbers,
# and the scores of each principal component.
# NOTE: this rebinds x and y, which earlier held the feature matrix and the
# country column; cells that expect those (e.g. the scaling cell) must not be
# re-run after this one without re-running the x / y extraction first.
labs1=list(df["country"])
labs2=list(df_copy["index"])
x=list(df_copy["PC1"])
y=list(df_copy["PC2"])
z=list(df_copy["PC3"])
t=list(df_copy["PC4"])
w=list(df_copy["PC5"])



In [None]:

def scatterpolt_pca(df,PC1,PC2,target,list_PC1,list_PC2,labels=None):
    """Scatter two columns of ``df`` and annotate every point with a label.

    Args:
        df: DataFrame holding the plotted columns.
        PC1: Name of the column plotted on the x axis.
        PC2: Name of the column plotted on the y axis.
        target: Name of the column used for both the hue and the marker style.
        list_PC1: x coordinates at which the labels are written.
        list_PC2: y coordinates at which the labels are written.
        labels: Text for each point. Defaults to the module-level ``labs1``
            list, which is what this function always used before.
    """
    if labels is None:
        # Backward-compatible default: the notebook-level list of country names
        labels = labs1
    sns.scatterplot(x=PC1, y=PC2, hue=target, style=target, data=df)
    # zip pairs each label with its coordinates and stops at the shortest
    # input instead of indexing past the end of a shorter coordinate list
    for txt, px, py in zip(labels, list_PC1, list_PC2):
        plt.annotate(txt, (px, py))


In [None]:
scatterpolt_pca(df_copy,"PC1","PC2","preds",x,y)

In [None]:
scatterpolt_pca(df_copy,"PC2","PC3","preds",y,z)

In [None]:
scatterpolt_pca(df_copy,"PC3","PC4","preds",z,t)

In [None]:
scatterpolt_pca(df_copy,"PC4","PC5","preds",t,w)

In [None]:
# Count the developed, developing and under-developed countries.
# NOTE(review): KMeans label numbers are arbitrary; the 2 / 0 / 1 mapping below
# is assumed to match the model fitted above - confirm against the cluster
# profiles whenever the model or its seed changes.
under_developing=df_copy[df_copy['preds']==2]['country']
developing=df_copy[df_copy['preds']==0]['country']
developed=df_copy[df_copy['preds']==1]['country']

# Each caption is printed with the size of its own group (the developed and
# under-developing counts were swapped before).
print("Number of developed countries",len(developed))
print("Number of developing countries",len(developing))
print("Number of under-developing countries",len(under_developing))

In [None]:
len(developing)

In [None]:
developed