In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt

from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the dataset; the first CSV column becomes the row index.
csv_path = '/kaggle/input/world-happiness-report-2021/world-happiness-report-2021.csv'
df = pd.read_csv(csv_path, index_col=0)
df.head()

In [None]:
# Feature columns are chosen by position: columns 5 through 10 of the loaded frame.
# NOTE(review): positional selection assumes the CSV column order is stable - confirm.
useful_cols = df.columns[5:11].tolist()
useful_cols

In [None]:
# Keep only the selected feature columns. The explicit copy makes the subset an
# independent frame, so adding a column to it later is never treated as a write
# into a slice of the originally loaded table.
df = df[useful_cols].copy()
df.head()

In [None]:
# Standardize the features (zero mean, unit variance per column).
# Columns are selected explicitly so that re-running this cell after a
# 'cluster' column has been added to df does not leak labels into the features.
X = StandardScaler().fit_transform(df[useful_cols])

In [None]:
# Apply hierarchical clustering (Ward linkage), building the full merge tree.
merging = linkage(X, method='ward')

# Create the figure at its final size first, so the dendrogram is drawn on an
# explicit axes instead of resizing whatever figure pyplot considers current.
fig, ax = plt.subplots(figsize=(20, 20))

# Draw the dendrogram. Labels are passed as a plain list rather than a pandas
# Index, whose truth value is ambiguous if the callee tests it directly.
dendrogram(merging, labels=df.index.to_list(), leaf_font_size=10, ax=ax)
plt.show()

In [None]:
# Cut the linkage tree at a distance threshold of 12 to obtain flat cluster ids.
cut_height = 12
clusters = fcluster(merging, t=cut_height, criterion='distance')
print(clusters)

In [None]:
# Print the members of each hierarchical cluster, one cluster per paragraph.
# The cluster ids are taken from the fcluster result itself instead of a
# hard-coded (1, 2, 3), so no group is silently skipped if the distance cut
# produces a different number of clusters.
classes = dict(zip(list(df.index), clusters))
for k in np.unique(clusters):
    members = [name for name, cluster_id in classes.items() if cluster_id == k]
    print(*members, end=' ')
    print('\n')

In [None]:
# Elbow method: fit KMeans for k = 2..10 and record the inertia of each fit.
# The range is defined once so the fitted k values and the x-axis cannot drift apart.
k_values = range(2, 11)
crit = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=15)
    kmeans.fit(X)
    crit.append(kmeans.inertia_)

# Labelled plot so the figure can be read without the surrounding code.
fig, ax = plt.subplots()
ax.plot(k_values, crit, marker='o')
ax.set_xlabel('Number of clusters k')
ax.set_ylabel('Inertia')
ax.set_title('KMeans elbow curve')
plt.show()

In [None]:
# Final KMeans model with four clusters.
# random_state is fixed (same seed as the elbow cell) so that cluster ids and
# memberships are reproducible across kernel restarts; without it the cluster
# numbering used by the printing and plotting cells changes from run to run.
kmeans = KMeans(n_clusters=4, random_state=15)
# fit_predict fits the model and returns the label of every training row.
labels = kmeans.fit_predict(X)

In [None]:
# Print the members of each KMeans cluster, one cluster per paragraph.
# Cluster ids are read from `labels` rather than hard-coded as (0, 1, 2, 3),
# so this cell stays correct if the number of clusters is changed.
classes = dict(zip(list(df.index), labels))
for k in np.unique(labels):
    members = [name for name, cluster_id in classes.items() if cluster_id == k]
    print(*members, end=' ')
    print('\n')

In [None]:
# Attach the fitted model's cluster id for each row as a new 'cluster' column.
cluster_ids = kmeans.labels_
df['cluster'] = cluster_ids
df.head()

In [None]:
from sklearn.decomposition import PCA

# Fit a two-component PCA on the standardized features, then project them.
pca = PCA(n_components=2).fit(X)
Z = pca.transform(X)

In [None]:
# Show the shape and the first rows only instead of dumping the whole array.
print(Z.shape)
print(Z[:5])

In [None]:
# Cumulative share of variance explained by the fitted components.
np.cumsum(pca.explained_variance_ratio_)

In [None]:
# Scatter the 2-D projection in Z, one colour per KMeans cluster.
# One loop replaces four copy-pasted plot calls; legend names and marker
# colours are unchanged (cluster id k is shown as "Cluster k+1").
fig, ax = plt.subplots()
for k, fmt in enumerate(('bo', 'go', 'ro', 'yo')):
    in_cluster = labels == k
    ax.plot(Z[in_cluster, 0], Z[in_cluster, 1], fmt, label='Cluster {}'.format(k + 1))
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_title('KMeans clusters in PCA space')
ax.legend(loc=0);

In [None]:
# Loadings: each component vector scaled by the square root of its explained variance.
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

# Row labels come from useful_cols (the columns that were scaled into X), not
# from df.columns[:-1], which is only right when a 'cluster' column has been
# appended exactly once and is last. Column names follow the number of fitted
# components, so the cell also works if `pca` is refit with a different size.
pc_names = ['PC{}'.format(i + 1) for i in range(loadings.shape[1])]
loading_matrix = pd.DataFrame(loadings, columns=pc_names, index=useful_cols)
loading_matrix

In [None]:
# Index label of the row with the smallest value on the second projected axis.
df.index[Z[:, 1].argmin()]

In [None]:
# Index label of the row with the largest value on the first projected axis.
df.index[Z[:, 0].argmax()]

In [None]:
# Index label of the row with the largest value on the second projected axis.
df.index[Z[:, 1].argmax()]

In [None]:
# Index label of the row with the smallest value on the first projected axis.
df.index[Z[:, 0].argmin()]

In [None]:
# Refit PCA with three components (this rebinds `pca`) and print the
# cumulative share of variance the components explain.
pca = PCA(n_components=3).fit(X)
print(np.cumsum(pca.explained_variance_ratio_))

In [None]:
# Loadings: each component vector scaled by the square root of its explained variance.
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

# Row labels come from useful_cols (the columns that were scaled into X), not
# from df.columns[:-1], which is only right when a 'cluster' column has been
# appended exactly once and is last. Column names follow the number of fitted
# components instead of a hard-coded list.
pc_names = ['PC{}'.format(i + 1) for i in range(loadings.shape[1])]
loading_matrix = pd.DataFrame(loadings, columns=pc_names, index=useful_cols)
loading_matrix

In [None]:
from sklearn.manifold import TSNE
# Embed the standardized features with t-SNE (seeded for reproducibility).
# Note: this rebinds Z, replacing the projection computed earlier.
tsne = TSNE(random_state=1)
Z = tsne.fit_transform(X)
# Show the shape and the first rows only instead of dumping the whole embedding.
print(Z.shape)
print(Z[:5])

In [None]:
# Scatter the 2-D embedding in Z, one colour per KMeans cluster.
# One loop replaces four copy-pasted plot calls; legend names and marker
# colours are unchanged (cluster id k is shown as "Cluster k+1").
fig, ax = plt.subplots()
for k, fmt in enumerate(('bo', 'go', 'ro', 'yo')):
    in_cluster = labels == k
    ax.plot(Z[in_cluster, 0], Z[in_cluster, 1], fmt, label='Cluster {}'.format(k + 1))
ax.set_xlabel('t-SNE dimension 1')
ax.set_ylabel('t-SNE dimension 2')
ax.set_title('KMeans clusters in t-SNE space')
ax.legend(loc=0);

In [None]:
# Index label of the row with the smallest value on the second embedding axis.
df.index[Z[:, 1].argmin()]