In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import plotly.express as ex
import seaborn as sns
import pandas as pd 
import numpy as np


from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

from IPython.display import display
import random
import os

# Seed Python's built-in RNG. `random.seed` must be *called*: assigning to it
# (`random.seed = 44`) only overwrites the function and leaves the generator unseeded.
random.seed(44)

# Use fully-qualified option names. pandas resolves short names by pattern
# matching and raises OptionError when a short name matches more than one option.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 50)


# Print the path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -q factor-analyzer minisom 

from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
from factor_analyzer import FactorAnalyzer
import minisom

In [None]:
# Read the CSV (first column becomes the row index), strip whitespace from the
# column names, then remove the "Year" column and every row labelled "India".
raw_path = "/kaggle/input/life-expectancy-who/Life Expectancy Data.csv"
data = (
    pd.read_csv(raw_path, index_col=0)
    .rename(columns=str.strip)
    .drop(columns="Year")
    .drop(index="India")
)

# Move "Status" out of the feature table and keep a single entry per index
# label (the last occurrence).
status = data.pop("Status")
is_earlier_duplicate = status.index.duplicated(keep='last')
status = status.loc[~is_earlier_duplicate]

data.head()

In [None]:
# Scaling
# Standardise every column with StandardScaler and re-attach the original
# row/column labels to the resulting array.
scaler = StandardScaler()
scaled_data = pd.DataFrame(
    scaler.fit_transform(data),
    index=data.index,
    columns=data.columns,
)

# Per-column flag: True where at least one value is missing.
scaled_data.isna().any()

In [None]:
# NaN imputation
# Fill missing values from the 5 nearest neighbours, weighted by distance,
# then wrap the result back into a labelled DataFrame.
imputer = KNNImputer(weights="distance", n_neighbors=5)
scaled_filled_data = pd.DataFrame(
    imputer.fit_transform(scaled_data),
    index=data.index,
    columns=scaled_data.columns,
)

# Per-column check for remaining missing values.
scaled_filled_data.isna().any()

In [None]:
# Grouping
# Collapse all rows that share an index label into their column-wise mean.
scaled_filled_data = scaled_filled_data.groupby(level=0).mean()

# After grouping there must be exactly one row per entry in `status`.
assert len(scaled_filled_data) == len(status)

In [None]:
# Correlation triangle-heatmap
# Pearson correlations between all columns, drawn as a lower-triangle heatmap.

corr = scaled_filled_data.corr(method="pearson")
# 1.0 on and above the diagonal marks the cells seaborn should hide.
mask = np.triu(np.ones(corr.shape))

with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(16, 16))
    ax = sns.heatmap(
        corr,
        mask=mask,
        annot=True,
        cmap="YlGnBu",
        linewidths=.5,
    )

In [None]:
# Remove three columns from the scaled, imputed table.
columns_to_drop = ["under-five deaths", "thinness 5-9 years", "Polio"]
scaled_filled_data = scaled_filled_data.drop(columns=columns_to_drop)

In [None]:
# Keep every factor whose absolute Pearson correlation with at least one
# *other* factor reaches the threshold.
CORR_THRESHOLD = 0.75
corr = scaled_filled_data.corr(method='pearson')

# Blank out the diagonal so a factor's correlation with itself never counts
# (NaN compares False against the threshold).
off_diagonal = corr.abs().where(~np.eye(len(corr), dtype=bool))
has_strong_partner = (off_diagonal >= CORR_THRESHOLD).any(axis=0)

# Boolean indexing preserves the column order of `corr`, so the selection is
# identical on every run (going through `set` made the order depend on string
# hash randomisation, which changes between kernel sessions).
selected_factors = has_strong_partner[has_strong_partner].index.tolist()
print(f"Selected {len(selected_factors)} from {scaled_filled_data.shape[1]} factors")

In [None]:
# Restrict the table to the selected factors and redraw the lower-triangle
# Pearson correlation heatmap for that subset.
data_after_cleaning = scaled_filled_data.loc[:, selected_factors]
corr = data_after_cleaning.corr(method="pearson")
# 1.0 on and above the diagonal marks the cells seaborn should hide.
mask = np.triu(np.ones(corr.shape))

with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(16, 16))
    ax = sns.heatmap(
        corr,
        mask=mask,
        annot=True,
        cmap="YlGnBu",
        linewidths=.5,
    )

In [None]:
# Bartlett test of sphericity
# Unpack the (chi-square statistic, p-value) pair returned by factor_analyzer.
bartlett_result = calculate_bartlett_sphericity(data_after_cleaning)
chi_square_value, p_value = bartlett_result
print(chi_square_value, p_value)

In [None]:
# Fit a PCA on the selected factors and store the component scores in a
# labelled frame ("PC 0", "PC 1", ...), with `status` joined as the first column.
pca = PCA(random_state=101)
scores = pca.fit_transform(data_after_cleaning)
pc_names = [f"PC {k}" for k in range(scores.shape[1])]

deco_data = pd.DataFrame(scores, columns=pc_names, index=data_after_cleaning.index)
deco_data = pd.concat([status, deco_data], axis=1)

In [None]:
# Display the explained-variance ratio of each fitted principal component.
pca.explained_variance_ratio_

In [None]:
# Area chart of the cumulative explained-variance ratio against the number of
# components (x axis is 1-based).
exp_var_cumul = pca.explained_variance_ratio_.cumsum()
component_counts = range(1, len(exp_var_cumul) + 1)

ex.area(
    x=component_counts,
    y=exp_var_cumul,
    labels={"x": "Components", "y": "Explained Variance"},
)

In [None]:
# Loadings: component vectors scaled by the square root of their explained
# variance; rows are the input factors, columns the principal components.
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
factors = data_after_cleaning.columns.tolist()

# Name one column per *component*. Sizing the labels by `len(loadings)` (the
# number of rows, i.e. factors) only works while PCA keeps as many components
# as there are factors.
component_names = [f"PC {i}" for i in range(loadings.shape[1])]
loadings = pd.DataFrame(loadings, columns=component_names, index=factors)
loadings

In [None]:
# Display the PCA score table (Status column plus one column per component).
deco_data

In [None]:
# PCA biplot: scores on the first two components coloured by Status, with one
# line + label per factor showing its loadings on those components.
ARROW_SCALE = 8  # stretches the loading vectors so they are visible on the score axes

# Shares of variance explained by the two plotted components. They are reported
# as-is: no offset is added, so the axis labels agree with the title total.
pc0_share, pc1_share = pca.explained_variance_ratio_[:2]

fig = ex.scatter(deco_data, x="PC 0", y="PC 1", color="Status", hover_name=deco_data.index,
                 title=f'Total Explained Variance: {round(sum(pca.explained_variance_ratio_[:2] * 100), 0)}%',)


for i, feature in enumerate(factors):
    # End point of this factor's loading vector in (PC 0, PC 1) space.
    tip_x = loadings.iloc[i, 0] * ARROW_SCALE
    tip_y = loadings.iloc[i, 1] * ARROW_SCALE

    fig.add_shape(
        type='line',
        x0=0, y0=0,
        x1=tip_x,
        y1=tip_y
        )

    fig.add_annotation(
        x=tip_x,
        y=tip_y,
        ax=0, ay=0,
        xanchor="center",
        yanchor="bottom",
        text=feature,
        )

fig.update_layout(legend_title_text="Country status", height=600, width=1200)
# `:.0%` formats the ratio as a whole percentage and avoids float artefacts
# such as 28.999999999999996 that `round(x, 2) * 100` can produce.
fig.update_xaxes(title_text=f"PC 0: {pc0_share:.0%}")
fig.update_yaxes(title_text=f"PC 1: {pc1_share:.0%}")
fig.show()

In [None]:
# 3-D scatter of the first three principal components, coloured by Status.
pc_share = pca.explained_variance_ratio_

fig = ex.scatter_3d(
    # Colour by the "Status" column of `deco_data` itself so the colours are
    # guaranteed to line up with the plotted rows.
    deco_data, x="PC 0", y="PC 1", z="PC 2", color="Status", hover_name=deco_data.index,
    title=f'Total Explained Variance: {round(pca.explained_variance_ratio_[:3].sum() * 100, 0)}%',
    # Axis labels use the same 0-based component names as the data columns and
    # report the true explained-variance share of each component (no offset).
    labels={'PC 0': f'PC 0: {pc_share[0]:.0%}',
            'PC 1': f'PC 1: {pc_share[1]:.0%}',
            'PC 2': f'PC 2: {pc_share[2]:.0%}'
           }
)
fig.update_layout(legend_title_text="Country status")
fig.show()

In [None]:
# SOM

In [None]:
# Display the PCA score table that is fed to the SOM in the next cell.
deco_data

In [None]:
# Train a square self-organising map on the PCA scores (Status column removed).
df = deco_data.drop(columns="Status").to_numpy()
m_n_neurons = 9  # the map is m_n_neurons x m_n_neurons

som = minisom.MiniSom(
    m_n_neurons,
    m_n_neurons,
    df.shape[1],
    sigma=2.5,
    learning_rate=.15,
    neighborhood_function='triangle',
    random_seed=101,
)

# 5000 training iterations with progress output.
som.train(df, 5000, verbose=True)

In [None]:
# SOM distance map with one marker per row of `deco_data`, placed on its
# winning neuron and styled by the row's Status.

# Marker and colour per status. These are the same pairs the previous
# `markers[target - 1]` lookup produced (0 - 1 wrapped around to the last
# element), now spelled out explicitly.
STATUS_STYLE = {"Developing": ('+', 'C1'), "Developed": ('o', 'C0')}

# Take features and status from the same frame so they are aligned row by row,
# instead of indexing the separate `status` Series by integer position.
som_input = deco_data.drop("Status", axis=1).values
row_status = deco_data["Status"].values

# Dedicated seeded generator so the jitter is reproducible between runs.
jitter = random.Random(44)

plt.figure(figsize=(12, 12))

plt.pcolor(som.distance_map().T, cmap='bone_r')
plt.colorbar()

for xx, country_status in zip(som_input, row_status):
    w = som.winner(xx)
    marker, color = STATUS_STYLE[country_status]

    # Random offset inside the cell so markers sharing a neuron do not overlap exactly.
    plt.plot(w[0] + 0.5 * jitter.uniform(0.1, 1), w[1] + .5 * jitter.uniform(0.1, 1), marker,
             markerfacecolor='None', markeredgecolor=color, markersize=12, markeredgewidth=2)

# Empty proxy artists give the legend one entry per status.
for status_name, (marker, color) in STATUS_STYLE.items():
    plt.plot([], [], marker, markerfacecolor='None', markeredgecolor=color,
             markersize=12, markeredgewidth=2, label=status_name)
plt.legend(title="Country status")

plt.show()

In [None]:
# TSNE

In [None]:
# Embed the selected factors in two dimensions with exact t-SNE and join the
# `status` Series as the first column.
tsne = TSNE(n_components=2, method='exact', random_state=101)
embedding = tsne.fit_transform(data_after_cleaning)

tsne_data = pd.DataFrame(
    embedding,
    columns=["Component 1", "Component 2"],
    index=data_after_cleaning.index,
)
tsne_data = pd.concat([status, tsne_data], axis=1)

tsne_data

In [None]:
# Scatter plot of the two t-SNE components, coloured by Status.
fig = ex.scatter(
    tsne_data,
    x="Component 1",
    y="Component 2",
    color="Status",
    hover_name=tsne_data.index,
)

fig.update_layout(width=1200, height=600, legend_title_text="Country status")
fig.show()

In [None]:
# Unsupervised clustering

In [None]:
# Complete
# Rescale the selected factors with MinMaxScaler, then draw a row-clustered
# heatmap using complete linkage. Note that `data` is rebound here and is the
# input of the clustermap cells that follow.
min_max = MinMaxScaler()
data = pd.DataFrame(
    min_max.fit_transform(data_after_cleaning),
    index=data_after_cleaning.index,
    columns=data_after_cleaning.columns,
)

sns.clustermap(data, method="complete", metric="euclidean", figsize=(12, 12), col_cluster=False)

In [None]:
# Ward
# Row-clustered heatmap of `data` using Ward linkage on Euclidean distances.
ward_options = dict(method="ward", metric="euclidean", figsize=(12, 12), col_cluster=False)
sns.clustermap(data, **ward_options)

In [None]:
# Weighted
# Row-clustered heatmap of `data` using weighted linkage on Euclidean distances.
weighted_options = dict(method="weighted", metric="euclidean", figsize=(12, 12), col_cluster=False)
sns.clustermap(data, **weighted_options)

In [None]:
# K-means sweep: fit one model per cluster count, record inertia, silhouette
# and Davies-Bouldin scores, and plot each clustering on a shared t-SNE embedding.
inertia_param = []
c_number = 11  # exclusive upper bound: cluster counts 2 .. c_number - 1 are tried
cluster_counts = range(2, c_number)

# One subplot column per cluster count (no empty trailing panels).
subplots = make_subplots(rows=1, cols=len(cluster_counts),
                         subplot_titles=[f"{i} clusters" for i in cluster_counts])

# The embedding depends only on the data and the fixed random_state, not on
# the clustering, so it is computed once instead of once per cluster count.
tsne = TSNE(n_components=2, random_state=101, method='exact')
embedding = tsne.fit_transform(data_after_cleaning)

for c in cluster_counts:
    # `n_jobs` is no longer accepted by KMeans and raises TypeError on current
    # scikit-learn. `n_init=10` pins the historical default explicitly.
    kmeans = KMeans(n_clusters=c, random_state=101, n_init=10)
    kmeans.fit(data_after_cleaning)

    labels = kmeans.labels_
    sh = silhouette_score(data_after_cleaning, labels, metric='euclidean')
    dbs = davies_bouldin_score(data_after_cleaning, labels)

    inertia_param.append({"Number of clusters": c, "Inertia": kmeans.inertia_, "Silhouette score": sh, "DB score": dbs})

    # One trace per cluster so each cluster gets its own colour in the panel.
    for type_ in np.unique(labels):
        in_cluster = labels == type_
        subplots.add_trace(go.Scatter(
                                     x=embedding[in_cluster, 0],
                                     y=embedding[in_cluster, 1],
                                     name=f"Cluster {type_}",
                                     mode="markers"
                                     ),
                           row=1, col=c - 1)

subplots.update_layout(height=600, width=500 * len(cluster_counts), showlegend=False)
# The axes are t-SNE coordinates; PCA explained-variance percentages do not
# apply to them, so the titles carry no percentage.
subplots.update_xaxes(title_text="t-SNE component 1")
subplots.update_yaxes(title_text="t-SNE component 2")

subplots.show(renderer='notebook')

In [None]:
# Line charts of the three clustering metrics against the number of clusters.
df = pd.DataFrame(inertia_param).set_index("Number of clusters")

# (column in `df`, subplot title) for each panel, left to right.
panels = [
    ("Inertia", "Inertia"),
    ("Silhouette score", "Silhouette score"),
    ("DB score", "Davies-Bouldin score"),
]

fig = make_subplots(rows=1, cols=3, subplot_titles=[title for _, title in panels])

for position, (column, _) in enumerate(panels, start=1):
    metric_trace = go.Scatter(x=df.index, y=df[column], mode='lines')
    fig.add_trace(metric_trace, row=1, col=position)

fig.update_layout(showlegend=False)
fig.update_xaxes(title_text="Number of clusters")
fig.update_yaxes(title_text="Metric value")

fig.show()