# Topics
1. Importing Libraries
2. Data reading and cleaning
3. Analysis and Visualization
    - Do the variables have a dependency on one another?
    - Which are the countries which have high GDP and Internet usage?
4. Clustering

# Importing Libraries

In [None]:
# Libraries

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

import warnings
warnings.filterwarnings('ignore')

# Data reading and cleaning

In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, name) for name in filenames):
        print(path)

In [None]:
# Load the Gapminder internet-usage CSV and preview the first rows

DATA_PATH = '/kaggle/input/gapminder-internet/internet_gapminder.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Info: column dtypes and non-null counts of the raw frame

df.info()

- "internetuserate" is stored as an object dtype, so we need to convert it to a numeric type
- We will also round all the numeric columns to 2 decimal places

In [None]:
# Changing the dtype of internetuserate from object to numeric

# errors='coerce' is used because some rows are blank and are read as strings;
# anything that cannot be parsed as a number becomes NaN.
df = df.assign(internetuserate=pd.to_numeric(df['internetuserate'], errors='coerce'))

In [None]:
# Round the three numeric columns to 2 decimal places

rounded_cols = ['incomeperperson', 'internetuserate', 'urbanrate']
df[rounded_cols] = df[rounded_cols].round(2)

In [None]:
print(f'Shape of data {df.shape}:')

In [None]:
# Statistical summary of the numeric columns

df.describe()

In [None]:
# Percentage of missing values per column, rounded to 2 decimals

missing_pct = df.isnull().sum() / len(df) * 100
missing_pct.round(2)

___Note___:
We will use the iterative imputer to fill in the missing values, since the percentage of missing values is small

In [None]:
# Imputing Afghanistan's incomeperperson with 499.44 (looked up manually)
# https://www.statista.com/statistics/262052/gross-domestic-product-gdp-per-capita-in-afghanistan/

# Done by hand because, per the author's note, leaving this first missing value to the
# iterative imputer makes the imputation vary a lot.

# Address the cell by country name and column label instead of by position (0, 1), so the
# value still lands in the right place if rows or columns are ever reordered.
is_afghanistan = df['country'] == 'Afghanistan'
assert is_afghanistan.sum() == 1, "expected exactly one 'Afghanistan' row"
df.loc[is_afghanistan, 'incomeperperson'] = 499.44

In [None]:
# Split the frame into the country labels and the float-valued columns

df_country = df.loc[:, 'country']
df_numeric = df.select_dtypes(include='float64')

In [None]:
# Keep the numeric column labels so they can be re-applied to the imputed frame

df_numeric_columns = df_numeric.columns

In [None]:
# Since few values are missing, use the iterative imputer to fill them in.

# Per-column bounds for imputed values. A scalar min_value=50 is applied to *every* column,
# which forces imputed internetuserate / urbanrate values up to at least 50; only
# incomeperperson should keep that floor.
# NOTE(review): internetuserate and urbanrate are assumed to be percentages in [0, 100] - confirm.
impute_bounds = {
    'incomeperperson': (50, np.inf),
    'internetuserate': (0, 100),
    'urbanrate': (0, 100),
}
unbounded = (-np.inf, np.inf)  # any other float column is left unconstrained
min_values = [impute_bounds.get(col, unbounded)[0] for col in df_numeric.columns]
max_values = [impute_bounds.get(col, unbounded)[1] for col in df_numeric.columns]

ii = IterativeImputer(
    initial_strategy='mean',
    min_value=min_values,
    max_value=max_values,
    random_state=42,
)

# Carry over the index and column labels so the imputed frame lines up with the country column.
iu_df = pd.DataFrame(
    ii.fit_transform(df_numeric),
    index=df_numeric.index,
    columns=df_numeric.columns,
)
iu_df = iu_df.round(2)

In [None]:
# Stitch the country labels back onto the imputed numeric columns

geo_df = pd.concat((df_country, iu_df), axis='columns')
geo_df.head()

# Analysis and Visualization

### Do the variables have a dependency on one another?

In [None]:
# Correlate only the numeric columns: geo_df also holds the string column 'country', and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
sns.heatmap(geo_df.select_dtypes(include='number').corr(), annot = True, cmap = 'YlGnBu')
plt.show()

In [None]:
# Compare incomeperperson against the other two variables, one scatter panel each

figure = plt.figure(figsize=(15, 5))
cat = ['internetuserate', 'urbanrate']

for panel, y_col in enumerate(cat, start=1):
    plt.subplot(1, 2, panel)
    plt.title(f'incomeperperson vs {y_col}', fontsize=15)
    sns.scatterplot(data=geo_df, x='incomeperperson', y=y_col, color='green')

figure.tight_layout(pad=2)
plt.show()

In [None]:
# Scatter of internet use rate against urban rate
sns.scatterplot(x = 'internetuserate', y = 'urbanrate', data = geo_df, color = 'green')
plt.show()

In [None]:
# Distribution of each numeric variable, one panel per column
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept here to preserve output.

figure = plt.figure(figsize=(15, 5))
dist_cat = ['incomeperperson', 'internetuserate', 'urbanrate']

for panel, column in enumerate(dist_cat, start=1):
    plt.subplot(1, 3, panel)
    sns.distplot(geo_df[column])

plt.show()

**Inference**:
- Income per person is positively correlated with internet use rate (0.79): countries with high income tend to have high internet usage. This can also be seen in the scatterplot
- Income per person and internet use rate are right-skewed
- Urban rate seems to be roughly normally distributed
- People living in urban areas have high internet usage, as can be seen from the correlation of 0.61

### Which are the countries which have high GDP and Internet usage?

In [None]:
# Ten highest-income and ten lowest-income countries

income_by_country = geo_df[['country', 'incomeperperson']]
income_high = income_by_country.sort_values(by='incomeperperson', ascending=False).iloc[:10]
income_low = income_by_country.sort_values(by='incomeperperson', ascending=True).iloc[:10]

In [None]:
# Side-by-side bar charts of the ten highest- and ten lowest-income countries

plt.figure(figsize=(15, 8))
plt.suptitle('Top 10', fontsize=25)

# (subplot position, data, panel title) for the left and right panels
income_panels = [
    (121, income_high, 'High Income Countries'),
    (122, income_low, 'Low Income Countries'),
]
for position, frame, panel_title in income_panels:
    plt.subplot(position)
    sns.barplot(x='country', y='incomeperperson', data=frame, palette='Set1')
    plt.title(panel_title, fontsize=20)
    plt.xlabel('Country', fontsize=15)
    plt.ylabel('GDP', fontsize=15)
    plt.xticks(rotation='vertical')

plt.show()

- Monaco has the highest GDP
- Most of the lowest-GDP countries are on the African continent; these are countries in war zones or with poor sanitation systems, diseases, or very poor infrastructure

In [None]:
# Ten countries with the highest and ten with the lowest internet use rate

internet_by_country = geo_df[['country', 'internetuserate']]
internet_high = internet_by_country.sort_values(by='internetuserate', ascending=False).iloc[:10]
internet_low = internet_by_country.sort_values(by='internetuserate', ascending=True).iloc[:10]

In [None]:
# Side-by-side bar charts of the ten highest and ten lowest internet-usage countries

plt.figure(figsize = (15,8))
plt.suptitle('Top 10', fontsize = 25)

# Left panel: highest internet usage
plt.subplot(121)
sns.barplot(x = 'country', y = 'internetuserate', data = internet_high, palette = 'Set1')
plt.title('High Internet Usage Countries', fontsize = 20)
plt.xlabel('Country', fontsize = 15)
plt.ylabel('Internet Usage',fontsize = 15)
plt.xticks(rotation = 'vertical')

# Right panel: lowest internet usage.
# The title previously read 'Low Income Countries' (copied from the income chart) although
# this panel plots internetuserate.
plt.subplot(122)
sns.barplot(x = 'country', y = 'internetuserate', data = internet_low, palette = 'Set1')
plt.title('Low Internet Usage Countries', fontsize = 20)
plt.xlabel('Country', fontsize = 15)
plt.ylabel('Internet Usage',fontsize = 15)
plt.xticks(rotation = 'vertical')

plt.show()

- Most European and other developed nations have high internet usage
- Similarly, as seen in the GDP section, most of the African nations have low internet usage

# Clustering

## Let's ask ourselves this:
## How do we identify the group of countries that need help in terms of GDP?

In [None]:
# Preview the imputed frame that the clustering section starts from
geo_df.head()

In [None]:
# Separate the country labels from the three numeric features used for clustering

feature_cols = ['incomeperperson', 'internetuserate', 'urbanrate']
country = geo_df.loc[:, 'country']
num_df = geo_df.loc[:, feature_cols]

In [None]:
# Standardise the features with StandardScaler (fit, then transform the same frame)

scaler = StandardScaler().fit(num_df)
scaled_df = scaler.transform(num_df)

In [None]:
# Getting it back to a DataFrame

# Take the labels and index from num_df (the frame that was actually scaled) rather than from
# a positional slice of geo_df.columns, so names cannot drift out of sync with the data.
scaled_df = pd.DataFrame(scaled_df, index=num_df.index, columns=num_df.columns)

In [None]:
#Calculating the Hopkins statistic

from sklearn.neighbors import NearestNeighbors
from random import sample
from numpy.random import uniform
import numpy as np
from math import isnan
 
def hopkins(X, sample_frac=0.1, random_state=None):
    """Estimate the Hopkins statistic of ``X`` as a measure of cluster tendency.

    ``m`` rows of ``X`` are sampled without replacement and ``m`` probe points are drawn
    uniformly inside the bounding box of ``X``. With ``u_j`` the distance from probe ``j`` to
    its nearest row of ``X`` and ``w_j`` the distance from sampled row ``j`` to its nearest
    *other* row, the statistic is ``H = sum(u) / (sum(u) + sum(w))``.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array-like of numbers
        Observations in rows, features in columns.
    sample_frac : float, default 0.1
        Fraction of rows to sample; at least one row is always used.
    random_state : int, numpy.random.Generator or None, default None
        Seed for reproducible sampling. ``None`` gives a different draw on every call.

    Returns
    -------
    float
        The Hopkins statistic, or 0 when the ratio is undefined (all distances are zero).

    Raises
    ------
    ValueError
        If ``X`` is not 2-D or has fewer than two rows.
    """
    points = np.asarray(X, dtype=float)
    if points.ndim != 2 or points.shape[0] < 2:
        raise ValueError("hopkins() needs a 2-D input with at least two rows")

    n, d = points.shape
    # Clamp so tiny inputs (n < 10) still sample one row instead of dividing 0 by 0.
    m = min(n, max(1, int(sample_frac * n)))
    rng = np.random.default_rng(random_state)

    nbrs = NearestNeighbors(n_neighbors=2).fit(points)

    sampled_rows = rng.choice(n, size=m, replace=False)
    uniform_points = rng.uniform(points.min(axis=0), points.max(axis=0), size=(m, d))

    # Probe points are not part of the data, so their nearest row is the FIRST neighbour.
    u_dist, _ = nbrs.kneighbors(uniform_points, n_neighbors=1, return_distance=True)
    # Sampled rows are part of the data: neighbour 0 is the row itself (distance 0), so the
    # nearest other row is the SECOND neighbour.
    w_dist, _ = nbrs.kneighbors(points[sampled_rows], n_neighbors=2, return_distance=True)

    u_total = u_dist[:, 0].sum()
    w_total = w_dist[:, 1].sum()
    denominator = u_total + w_total
    if denominator == 0 or np.isnan(denominator):
        # Degenerate data (e.g. all rows identical): the ratio is undefined.
        return 0

    return float(u_total / denominator)

In [None]:
# Hopkins statistic of the scaled features, rounded to 2 decimals
round(hopkins(scaled_df),2)

 - Across repeated runs, the Hopkins statistic mostly falls between approximately 0.64 and 0.75.
 - Since these values are well above 0.5, the data shows a meaningful tendency to cluster

In [None]:
# Changing name: cluster_df refers to the same object as scaled_df (no copy is made)

cluster_df = scaled_df

In [None]:
# Elbow curve method: K-means inertia for k = 2..9

ssd = []
for k in range(2,10):
    # Fixed seed (same value as the final model) so the curve is reproducible between runs
    kmeans = KMeans(n_clusters = k, random_state = 50).fit(cluster_df)
    ssd.append([k, kmeans.inertia_])

# Build the frame once and give the columns names instead of indexing by 0 / 1
ssd_df = pd.DataFrame(ssd, columns = ['k', 'inertia'])
plt.plot(ssd_df['k'], ssd_df['inertia'], marker = 'o')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.grid()
plt.show()

We can see the elbow point at 3 and we will go with 3 clusters

In [None]:
# Let's perform K-means using K=3; random_state is fixed so the fit is repeatable

model_kmean = KMeans(n_clusters = 3, random_state = 50)
model_kmean.fit(cluster_df)

In [None]:
# Attach each country's cluster id to the unscaled data

cluster_ids = pd.Series(model_kmean.labels_, name='Cluster_ID')
data = pd.concat([country, num_df, cluster_ids], axis=1)
data.head()

In [None]:
# How many countries fall into each cluster

data['Cluster_ID'].value_counts()

In [None]:
# Horizontal bar chart of the cluster sizes

cluster_sizes = data['Cluster_ID'].value_counts()
cluster_sizes.plot.barh(color='purple')
plt.show()

In [None]:
# Filtering the outliers: the five highest-income countries of the highest-income cluster

# K-means cluster ids are arbitrary labels that can change with the seed or library version,
# so locate the cluster by its median income instead of hard-coding id 2.
top_income_cluster = data.groupby('Cluster_ID')['incomeperperson'].median().idxmax()

cluster_2 = (
    data.loc[data['Cluster_ID'] == top_income_cluster]
    .sort_values(by=['incomeperperson', 'internetuserate'], ascending=False)
    .head(5)
    .reset_index(drop=True)
)

In [None]:
# Scatter of income vs internet use coloured by cluster, with the outlier countries labelled

plt.figure(figsize=(15, 8))
sns.scatterplot(data=data, x='incomeperperson', y='internetuserate',
                hue='Cluster_ID', s=200, palette='Set1')

# Place each outlier's name at an offset from its marker (+1 on x, +5 on y)
for row in cluster_2.itertuples(index=False):
    plt.annotate(row.country, (row.incomeperperson + 1, row.internetuserate + 5))

plt.show()

- We can note that 3 distinct clusters are formed
- We can see the outlier countries in cluster group 2
- The outlier countries in cluster group 2 are 'Monaco', 'Liechtenstein', 'Bermuda', 'Norway', and 'Luxembourg'

### Analysis on Cluster's formed

 Since we are focusing on GDP, let's look at GDP across the different cluster groups

In [None]:
# Bar chart of the median income per person in each cluster

median_income = data.groupby('Cluster_ID')['incomeperperson'].median()
median_income.plot(kind='bar', figsize=(15, 8), color='orange')

plt.title('GDP per Cluster Group', fontsize=20)
plt.xlabel('Cluster Group', fontsize=15)
plt.ylabel('GDP', fontsize=15)
plt.xticks(rotation='horizontal', fontsize=15)
plt.yticks(fontsize=15)
plt.show()

In [None]:
# Box plot of each variable per cluster, one panel per variable

plt.figure(figsize=(18, 8))
var = ['incomeperperson', 'internetuserate', 'urbanrate']
for panel, column in enumerate(var, start=1):
    plt.subplot(1, 3, panel)
    sns.boxplot(x='Cluster_ID', y=column, data=data)

**Conclusion**:
To conclude the cluster analysis, we need to focus on Cluster Group 0 when deciding where to help in terms of GDP

<h1>Upvote if you like my work❤️<br>
If you have any queries, doubts, or suggestions, feel free to drop them in the comment section</h1>