# Лабораторная работа 5: Кластерный анализ. Визуализация многомерных данных. Методы понижения размерности.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap.umap_ as umap

## Часть 1

In [None]:
# Offset constant mixed into every sample coordinate below.
V = 83

# First coordinate of each of the ten sample points.
X1 = [
    40 + V, 100 - V, 30 + V, 25 + V, V,
    100 + V, 230 - V, 110 + V, 120 + V, 180 - V,
]
# Second coordinate of each of the ten sample points.
X2 = [
    10 + V, 110 - V, 20 + V, 15 + V, 105 - V,
    120 + V, 220 - V, 90 + V, 200 - V, 160 - V,
]

# One row per point, columns are (X1, X2).
X = np.column_stack((X1, X2))

In [None]:
def plot1(model, name, points=None):
    """Scatter-plot clustered 2-D points, coloured by cluster membership.

    Points labelled 0 are drawn in red and all remaining points in green.
    Each group is drawn with a single labelled ``scatter`` call, so the
    legend has entries to show.

    Args:
        model: Fitted clustering estimator exposing ``labels_``.
        name: Text used as the plot title.
        points: Optional array-like of shape (n_points, 2) holding the
            coordinates that were clustered. When omitted, the module-level
            ``X1`` / ``X2`` coordinate lists are used.
    """
    if points is None:
        # Built from X1/X2 rather than the global X, which later cells rebind.
        points = np.column_stack((X1, X2))
    points = np.asarray(points)
    labels = np.asarray(model.labels_)

    groups = (
        (labels == 0, 'red', 'Кластер 0'),
        (labels != 0, 'green', 'Остальные кластеры'),
    )
    for mask, color, label in groups:
        # Skip empty groups so the legend lists only clusters actually drawn.
        if mask.any():
            plt.scatter(points[mask, 0], points[mask, 1],
                        alpha=0.8, s=10, color=color, label=label)
    plt.title(name, size=20)
    plt.legend()

# https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html
def plot2(model, name, **kwargs):
    """Draw a dendrogram for a fitted hierarchical clustering model.

    A SciPy-style linkage matrix is assembled from the model's merge list
    (``children_``), merge distances (``distances_``) and the number of
    original samples under each merged node, then handed to ``dendrogram``.

    Args:
        model: Fitted estimator exposing ``children_``, ``distances_`` and
            ``labels_``.
        name: Text used as the plot title.
        **kwargs: Forwarded unchanged to ``dendrogram``.
    """
    merges = model.children_
    n_leaves = len(model.labels_)

    # subtree_sizes[k] = number of original samples below the k-th merge.
    subtree_sizes = np.zeros(merges.shape[0])
    for step, pair in enumerate(merges):
        # An index below n_leaves is an original sample (size 1); anything
        # else refers to an earlier merge whose size is already known.
        subtree_sizes[step] = sum(
            1 if node < n_leaves else subtree_sizes[node - n_leaves]
            for node in pair
        )

    linkage_columns = [merges, model.distances_, subtree_sizes]
    linkage_matrix = np.column_stack(linkage_columns).astype(float)

    dendrogram(linkage_matrix, **kwargs)
    plt.title(name, size=20)

def plot3(model, name):
    """Draw a 10x10-inch scatter plot of a two-column embedding.

    Args:
        model: Object indexable as ``model[:, 0]`` and ``model[:, 1]``.
            Despite the parameter name, the callers in this file pass the
            projected coordinates, not an estimator.
        name: Text used as the plot title.
    """
    plt.figure(figsize=(10, 10))
    plt.scatter(model[:, 0], model[:, 1], alpha=0.8, s=10)
    plt.title(name, size=20)
    # No plt.legend() here: the single scatter has no label, so a legend call
    # would only emit a "no artists with labels" warning and an empty box.

In [None]:
# Two-cluster agglomerative fits (n_clusters keeps its default) with
# complete and single linkage. The distance metric is left at its default
# (Euclidean): the former `affinity=` keyword was deprecated in scikit-learn
# 1.2 and removed in 1.4, so passing it breaks on current releases.
clustering_complete = AgglomerativeClustering(linkage='complete').fit(X)
clustering_single = AgglomerativeClustering(linkage='single').fit(X)

In [None]:
# Cluster scatter plot for the complete-linkage ("farthest neighbour") model.
plot1(clustering_complete, "Дальний сосед")

In [None]:
# Cluster scatter plot for the single-linkage ("nearest neighbour") model.
plot1(clustering_single, "Ближний сосед")

In [None]:
# Refit both linkages with distance_threshold=0 and n_clusters=None so that
# the whole merge tree is built and `distances_` is populated, which the
# dendrogram helper reads. The metric is left at its default (Euclidean):
# the former `affinity=` keyword was deprecated in scikit-learn 1.2 and
# removed in 1.4.
clustering_complete = AgglomerativeClustering(
    linkage='complete', distance_threshold=0., n_clusters=None
).fit(X)
clustering_single = AgglomerativeClustering(
    linkage='single', distance_threshold=0., n_clusters=None
).fit(X)

In [None]:
# Dendrogram for the complete-linkage model; truncate_mode / p are forwarded
# by plot2 to scipy's dendrogram and limit how much of the tree is drawn.
plot2(clustering_complete, "Дальний сосед", truncate_mode="level", p=3)

In [None]:
# Dendrogram for the single-linkage model; truncate_mode / p are forwarded
# by plot2 to scipy's dendrogram and limit how much of the tree is drawn.
plot2(clustering_single, "Ближний сосед", truncate_mode="level", p=3)

## Часть 2

In [None]:
# Load the data set. "Date" is parsed explicitly (parse_dates=True only
# affects the index, which is not read from the file here) so that its
# integer codes below follow date order.
# NOTE(review): assumes the "Date" column is in a format pandas can parse.
df = pd.read_csv("avocado.csv", parse_dates=["Date"])

# Shuffle the rows with a fixed seed so reruns give the same frame, and
# discard the pre-shuffle row labels instead of keeping them as a column.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Give the numerically named columns readable names.
df = df.rename(columns={"4046": "Small Hass",
               "4225": "Large Hass", "4770": "XLarge Hass"})
df = df.drop(columns=["Unnamed: 0"])

# Encode the non-numeric columns as integer codes. sort=True makes the codes
# depend on the sorted unique values rather than on (shuffled) row order.
stringValues = ['Date', 'type', 'region']
df[stringValues] = df[stringValues].apply(
    lambda col: pd.factorize(col, sort=True)[0]
)

df

In [None]:
# Print the frame summary (column dtypes and non-null counts).
df.info()

In [None]:
# Feature matrix: complete rows only, without the 'AveragePrice' column.
# drop=True discards the old row labels; a plain reset_index() would insert
# them into X as an extra "index" feature column.
X = df.dropna().reset_index(drop=True).drop(columns='AveragePrice')

In [None]:
# Standardise the features before projecting them.
X = StandardScaler().fit_transform(X)

# Two-component PCA projection.
X_pca = PCA(n_components=2).fit_transform(X)

# Two-component t-SNE embedding. random_state pins the random initialisation
# (init='random') so reruns reproduce the same picture, matching the seeded
# UMAP call below. The variable name is kept because a later cell reads it.
X_tsme = TSNE(n_components=2, learning_rate='auto', init='random',
              perplexity=3, random_state=42).fit_transform(X)

# UMAP embedding with a fixed seed.
X_umap = umap.UMAP(n_neighbors=5, random_state=42).fit(X).embedding_

In [None]:
# Scatter plot of the PCA projection.
plot3(X_pca, "pca")

In [None]:
# Scatter plot of the t-SNE embedding (title previously misspelled "tsme").
plot3(X_tsme, "tsne")

In [None]:
# Scatter plot of the UMAP embedding.
plot3(X_umap, "umap")