In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the raw CSV into `data`; later cells also pull the 'Country' column from it.
data = pd.read_csv('/kaggle/input/economic-data-for-top-50-world-economies-by-gdp/World_Economic_Data_Q1_2020_v1.csv')

# Indicator columns kept for the exploratory analysis and clustering.
feature_columns = [
    'Stock_Market_Recovery',
    'Lt_Yc',
    'Mt_Yc',
    'St_Yc',
    'GDP_2019',
    'Real_GDP_Growth',
    'Unemployment',
    'Population',
    'Inflation',
    'Food_Inflation',
    'Interest',
    'Current_Acct_To_GDP',
    'Gold_Tons',
    'Govt_Debt_To_GDP',
    'Household_Debt_To_GDP',
    'Income_Tax',
    'Adjusted_Interest',
    'Real_GDP',
    'Household_Debt_To_Real_GDP',
    'Govt_Debt_To_Real_GDP',
    'Current_Acct_To_Real_GDP',
]
main_data = data[feature_columns]

# Quick look at the first rows of the selected columns.
print(main_data.head())

In [None]:
# (rows, columns) of the selected indicator frame.
main_data.shape

In [None]:
# Summary statistics for the selected indicator columns.
main_data.describe()

In [None]:
# Column dtypes of the full raw frame (`data`, not `main_data`), so columns
# outside the selected indicator subset are listed as well.
data.dtypes

In [None]:
# Number of missing values in each selected column.
main_data.isna().sum()

In [None]:
# Pearson correlation heatmap of the selected indicators.
fig, corrmap = plt.subplots(figsize=(15,7))

pearsoncorr = main_data.corr(method='pearson')

# Draw onto the axes created above (passed explicitly instead of relying on
# the "current axes" and overwriting the variable), and use seaborn's
# documented `linewidths` parameter for the cell separators.
sns.heatmap(
    pearsoncorr,
    ax=corrmap,
    xticklabels=pearsoncorr.columns,
    yticklabels=pearsoncorr.columns,
    cmap='RdBu_r',
    annot=True,
    linewidths=2.0
)
corrmap.set_title('Pearson Correlation of Economic Indicators')

plt.show()

In [None]:
# `output_data` keeps the country name next to the three clustering features
# so cluster labels can be reported per country later on;
# `cluster_data` holds only the three features that are fed to the clustering.
output_data = data[['Country','Real_GDP_Growth','Inflation','Adjusted_Interest']]
cluster_data = main_data[['Real_GDP_Growth','Inflation','Adjusted_Interest']]

# The style must be set before the axes are created, otherwise it does not
# apply to this figure.
sns.set_style('whitegrid')

# Each panel plots a different variable on x, so the x-axis is NOT shared:
# a shared axis would force all three panels onto one range.
fig, axes = plt.subplots(1, 3, figsize=(16,6))
fig.suptitle('Relationship Analysis for Cluster Features')

# (x column, y column, panel title) for each pairwise regression plot.
feature_pairs = [
    ('Real_GDP_Growth', 'Inflation', 'Real GDP Growth vs. Inflation'),
    ('Inflation', 'Adjusted_Interest', 'Inflation vs. Interest'),
    ('Adjusted_Interest', 'Real_GDP_Growth', 'Interest vs. Real GDP Growth'),
]
for panel, (x_col, y_col, panel_title) in zip(axes, feature_pairs):
    sns.regplot(ax=panel, x=cluster_data[x_col], y=cluster_data[y_col])
    panel.set_title(panel_title)

plt.show()

In [None]:
# Outlier search: rows with both very high adjusted interest and very high inflation.
outlier_mask = (cluster_data['Adjusted_Interest'] > 30) & (cluster_data['Inflation'] > 40)
cdata = cluster_data[outlier_mask]
print(cdata)

# Drop the rows found by the search instead of a hard-coded label. Per the
# original analysis the search matches index 24 only, which is the same label
# removed from output_data further down. Dropping by the search result also
# keeps this cell re-runnable: once the row is gone the mask is empty and the
# drop is a no-op (a hard-coded drop would raise KeyError on a second run).
cluster_data = cluster_data.drop(index=cdata.index)
print(cluster_data)

In [None]:
# Same pairwise regression plots as before, now on cluster_data with the
# outlier row removed.

# The style must be set before the axes are created to take effect on them.
sns.set_style('whitegrid')

# Each panel plots a different variable on x, so the x-axis is NOT shared:
# a shared axis would force all three panels onto one range.
fig, axes = plt.subplots(1, 3, figsize=(16,6))
fig.suptitle('Relationship Analysis for Cluster Features')

# (x column, y column, panel title) for each pairwise regression plot.
feature_pairs = [
    ('Real_GDP_Growth', 'Inflation', 'Real GDP Growth vs. Inflation'),
    ('Inflation', 'Adjusted_Interest', 'Inflation vs. Interest'),
    ('Adjusted_Interest', 'Real_GDP_Growth', 'Interest vs. Real GDP Growth'),
]
for panel, (x_col, y_col, panel_title) in zip(axes, feature_pairs):
    sns.regplot(ax=panel, x=cluster_data[x_col], y=cluster_data[y_col])
    panel.set_title(panel_title)

plt.show()

In [None]:
# Kernel density estimates of the three clustering features on one axes.
fig, ax = plt.subplots()

# (column, legend label) for each feature.
kde_features = [
    ('Real_GDP_Growth', 'Real GDP Growth'),
    ('Inflation', 'Inflation'),
    ('Adjusted_Interest', 'Interest'),
]
for column, legend_label in kde_features:
    # `fill=True` replaces the deprecated `shade=True`.
    sns.kdeplot(data=cluster_data, x=column, label=legend_label, fill=True, ax=ax)

# All three features share the x-axis, so use a neutral label and rely on the
# legend (which is not drawn unless requested) to tell the curves apart.
ax.set_xlabel('Value')
ax.set_title('Distribution of Cluster Features')
ax.legend()

plt.show()

In [None]:
# Standardise the three features so that no single indicator dominates the
# distances K-Means works with.
scaler = StandardScaler()

# NOTE: `cluster_data` is rebound to the scaled NumPy array here because the
# following cell reads the scaled values under this name.
cluster_data = scaler.fit_transform(cluster_data)

# K-Means clustering.
# - random_state fixes the centroid initialisation so the cluster assignment
#   is reproducible across runs.
# - n_init is set explicitly because its default differs between
#   scikit-learn versions.
kmeans = KMeans(n_clusters=4, init='k-means++', n_init=10, max_iter=300, random_state=42)
kmeans.fit(cluster_data)

# Cluster centers, expressed in the standardised feature space.
clusters = kmeans.cluster_centers_
print(clusters)

In [None]:
# Reuse the labels of the model fitted in the previous cell. Calling
# fit_predict() here would run K-Means again from a new initialisation, so the
# labels could disagree with the cluster centers already printed above.
y_km = kmeans.labels_
print(y_km)

# Remove the same outlier row (index 24) that was dropped from cluster_data so
# the remaining rows line up with y_km. errors='ignore' keeps this cell
# re-runnable once the row is already gone.
output_data = output_data.drop(index=24, errors='ignore')
assert len(output_data) == len(y_km), 'output_data rows and cluster labels are misaligned'

# New dataframe with cluster results, ordered by cluster id.
output = output_data.assign(Cluster=y_km).sort_values(by=['Cluster'], ascending=True)
print(output)

In [None]:
# 3D scatter of the clustered countries: one point per row of `output`,
# coloured by its cluster id.
fig2 = plt.figure(figsize=(14,6))
ax = fig2.add_subplot(111, projection='3d')

x = output['Inflation']
y = output['Adjusted_Interest']
z = output['Real_GDP_Growth']
c = output['Cluster']

img = ax.scatter(x, y, z, c=c, cmap='viridis')

# Label every axis: without labels the three dimensions cannot be told apart.
ax.set_xlabel('Inflation')
ax.set_ylabel('Adjusted Interest')
ax.set_zlabel('Real GDP Growth')
ax.set_title('K-Means Clusters')

# Cluster ids are discrete, so put colorbar ticks on the ids themselves.
fig2.colorbar(img, ax=ax, ticks=sorted(c.unique()), label='Cluster')
plt.show()