## Добрицкий Артем Юрьевич
### 8 группа, ТВиМС
### Кластерный анализ данных

<div align="center">
  <h2>Содержание</h2>
</div>

# 1. Загрузка и предобработка данных

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide presentation settings: seaborn-flavoured matplotlib style and
# no limit on the number of DataFrame columns shown.
plt.style.use('seaborn-v0_8')
pd.set_option('display.max_columns', None)

# Load the CSV and preview its first ten rows.
file = pd.read_csv("../data/Annual 2005-2011.csv")
display(file.iloc[:10])

Unnamed: 0,empl_num,k1,k2,k3,k4,k5,k6,k7,k8,k9,k10,k11,k12,k13,k14,k15,k16,k17,k18,k19,k20,year
0,6095.0,0.94238,0.060563,0.678302,-0.161531,0.202055,0.165019,0.399033,0.799019,5.426569,0.209235,1.115922,1.082798,0.655937,4.454819,3.975687,0.892446,1007.536232,0.076738,0.055049,0.034904,5.0
1,255.0,1.980494,0.274382,0.916775,0.624425,0.089377,0.220648,0.0,0.933519,14.041958,0.215083,1.259382,1.123828,0.705951,10.618881,12.295547,1.157895,357.294118,0.116068,0.05974,0.025647,5.0
2,114.0,0.37416,0.001494,0.085138,-1.50499,0.235739,0.508929,0.888889,0.779049,5.017007,0.096737,0.774586,1.185374,0.123415,0.794785,6.258929,7.875,36.894737,-0.584879,0.010563,0.0,5.0
3,365.0,7.859079,0.831978,2.449864,0.875862,0.059439,0.03003,0.011111,0.94201,16.244444,0.876663,1.223284,1.309449,2.804607,48.363889,26.142643,0.540541,33.676983,0.171731,0.496295,0.312415,5.0
4,168.0,1.779376,0.005596,0.883293,0.527853,0.135491,0.886686,0.489796,0.887341,10.558673,0.313389,0.874381,0.994832,0.473041,5.628827,3.125354,0.555241,19.103896,0.064809,0.025726,0.011839,5.0
5,6969.0,4.200293,1.310167,2.355742,0.767058,0.067142,0.043515,0.001988,0.934306,14.222452,0.39279,1.18941,1.081892,0.958338,14.58827,13.651105,0.935759,84.654633,0.164268,0.140701,0.098621,5.0
6,1754.0,0.835976,0.008003,0.18223,-0.200464,0.488237,0.427889,0.090158,0.516995,2.298864,0.673216,1.185475,1.058383,0.989701,4.400794,9.688993,2.201647,6.725695,0.041624,0.035402,0.043877,5.0
7,146.0,4.096916,1.193833,1.823789,0.683871,0.036512,0.321678,0.271889,0.964689,37.013825,0.125744,1.558559,1.015167,0.519457,19.930876,30.244755,1.517483,22.763158,0.084162,0.037833,0.017057,5.0
8,1674.0,2.194307,0.112153,0.740924,0.554451,0.19373,0.023384,0.078174,0.810724,5.873536,0.738569,0.989558,1.001019,1.829401,13.253646,14.566474,1.099054,19.493671,0.060137,0.117208,0.091945,5.0
9,2455.0,1.2158,0.063003,0.413992,0.089936,0.341383,0.057987,0.04119,0.660346,4.521441,0.595456,0.894957,1.017691,1.188855,8.140185,10.115851,1.242705,27.631543,0.102509,0.088303,0.062603,5.0


# 2. Подготовка данных

In [2]:
# Candidate features are the numeric columns of the loaded table.
numeric_cols = file.select_dtypes(include=[np.number]).columns
print(f"Найдено числовых колонок: {len(numeric_cols)}")

# Keep only columns with more than ten distinct values; the rest are
# excluded from the feature set.
distinct_counts = file[numeric_cols].nunique()
feature_columns = distinct_counts[distinct_counts > 10].index.tolist()
print(f"Выбрано колонок для анализа: {len(feature_columns)}")

# Feature matrix: an independent copy with gaps filled by the column mean.
X = file[feature_columns].copy()
column_means = X.mean()
X = X.fillna(column_means)
print(f"Размерность матрицы признаков: {X.shape}")

Найдено числовых колонок: 22
Выбрано колонок для анализа: 21
Размерность матрицы признаков: (2695, 21)


# 3. Кластеризация методом k-средних
Стандартизируем данные и приступаем к кластеризации

In [3]:
# Standardise the feature matrix; the fitted scaler is kept so that results
# can later be mapped back to the original scale.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Fit k-means with four clusters; the fixed random_state seeds the run.
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(X_scaled)
cluster_labels = kmeans.labels_
cluster_centers = kmeans.cluster_centers_

# Store the raw (not yet renumbered) labels next to the source rows.
file['cluster_original'] = cluster_labels

print("Кластеризация завершена")
print(f"Метки кластеров: {np.unique(cluster_labels)}")

Кластеризация завершена
Метки кластеров: [0 1 2 3]


# 4. Перенумерация кластеров
Расчет интегрального показателя

In [4]:
# Z-score every feature column using pandas' column mean and std.
X_normalized = X.sub(X.mean(), axis='columns').div(X.std(), axis='columns')

# Equal weights: each feature contributes 1/n to the integral indicator.
n_features = len(feature_columns)
weights = np.full(n_features, 1.0 / n_features)
file['credit_score'] = X_normalized.dot(weights)
print("Интегральный показатель кредитоспособности рассчитан")
file['credit_score']

Интегральный показатель кредитоспособности рассчитан


0      -0.154192
1      -0.062389
2      -0.538522
3       1.022190
4      -0.050420
          ...   
2690    0.153366
2691    0.012153
2692    0.306671
2693    0.041131
2694    0.315757
Name: credit_score, Length: 2695, dtype: float64

Рассчитываем средние значения кредитоспособности для каждого кластера

In [5]:
# (label, mean credit score) pair for each of the four original clusters.
cluster_means = [
    (label, file.loc[file['cluster_original'] == label, 'credit_score'].mean())
    for label in range(4)
]

print("Средние значения кредитоспособности по исходным кластерам:")
for label, score in cluster_means:
    print(f"Кластер {label}: {score:.4f}")

Средние значения кредитоспособности по исходным кластерам:
Кластер 0: 0.5666
Кластер 1: -0.2409
Кластер 2: -0.0721
Кластер 3: 0.1737


Далее сортируем кластеры по убыванию среднего значения кредитоспособности

In [6]:
# Order the (label, mean) pairs in place, highest mean score first.
cluster_means.sort(key=lambda pair: pair[1], reverse=True)

# Map every original label to its rank in that ordering (0 = highest mean).
cluster_mapping = {}
for rank, pair in enumerate(cluster_means):
    cluster_mapping[pair[0]] = rank

Рассчитываем средние значения кредитоспособности и число наблюдений для каждого исходного кластера

In [7]:
# Collect the mean credit score and the number of observations for each of
# the four original k-means clusters.
cluster_stats = []
for i in range(4):
    mask = file['cluster_original'] == i
    mean_score = file.loc[mask, 'credit_score'].mean()
    count = mask.sum()
    cluster_stats.append({
        'original_cluster': i,
        'mean_credit_score': mean_score,
        'count': count
    })

# Sort once, after all statistics are collected. Doing this inside the loop
# re-sorted a partial list on every iteration and would leave the name
# undefined if the loop body never ran.
cluster_stats_sorted = sorted(cluster_stats, key=lambda x: x['mean_credit_score'], reverse=True)

print("Средние значения кредитоспособности по исходным кластерам (до перенумерации):")
for stat in cluster_stats:
    print(f"Исходный кластер {stat['original_cluster']}: среднее = {stat['mean_credit_score']:.4f}, наблюдений = {stat['count']}")

Средние значения кредитоспособности по исходным кластерам (до перенумерации):
Исходный кластер 0: среднее = 0.5666, наблюдений = 229
Исходный кластер 1: среднее = -0.2409, наблюдений = 752
Исходный кластер 2: среднее = -0.0721, наблюдений = 1002
Исходный кластер 3: среднее = 0.1737, наблюдений = 712


Обновляем центры кластеров в соответствии с новой нумерацией

In [8]:
print("\nПосле сортировки по убыванию кредитоспособности:")

# Map each original label to its new number: 0 is the cluster with the
# highest mean credit score, following the order of cluster_stats_sorted.
cluster_mapping = {}
for new_num, stat in enumerate(cluster_stats_sorted):
    old_num = stat['original_cluster']
    cluster_mapping[old_num] = new_num
    print(f"Новый кластер {new_num} (бывший {old_num}): среднее = {stat['mean_credit_score']:.4f}")
file['cluster'] = file['cluster_original'].map(cluster_mapping)

# Reorder the centres to match the new numbering. Always start from the
# fitted model's centres rather than from `cluster_centers`: this cell rebinds
# `cluster_centers`, so reading it here would apply the permutation a second
# time whenever the cell is re-executed.
original_centers = kmeans.cluster_centers_
new_centers = np.zeros_like(original_centers)
for old_num, new_num in cluster_mapping.items():
    new_centers[new_num] = original_centers[old_num]
cluster_centers = new_centers


После сортировки по убыванию кредитоспособности:
Новый кластер 0 (бывший 0): среднее = 0.5666
Новый кластер 1 (бывший 3): среднее = 0.1737
Новый кластер 2 (бывший 2): среднее = -0.0721
Новый кластер 3 (бывший 1): среднее = -0.2409


# 5. Результаты

In [9]:
# Cluster sizes, ordered by the renumbered cluster label.
cluster_counts = file['cluster'].value_counts().sort_index()
total_observations = len(file)

# Share of each cluster in the whole sample, in percent (two decimals).
relative_frequency = (cluster_counts.values / total_observations * 100).round(2)

table7 = pd.DataFrame({
    'Номер кластера': cluster_counts.index,
    'Количество наблюдений': cluster_counts.values,
    'Относительная частота (%)': relative_frequency,
})
display(table7.round(3), "Распределение наблюдений по кластерам")

Unnamed: 0,Номер кластера,Количество наблюдений,Относительная частота (%)
0,0,229,8.5
1,1,712,26.42
2,2,1002,37.18
3,3,752,27.9


'Распределение наблюдений по кластерам'

In [10]:
def _describe_scores(label):
    """Return descriptive statistics of the credit scores in one cluster."""
    scores = file.loc[file['cluster'] == label, 'credit_score']
    return {
        'Кластер': label,
        'Число наблюдений': len(scores),
        'Среднее': scores.mean().round(4),
        'Медиана': scores.median().round(4),
        'Минимум': scores.min().round(4),
        'Максимум': scores.max().round(4),
        'Стандартное отклонение': scores.std().round(4)
    }


# One summary row per renumbered cluster.
stats_list = [_describe_scores(label) for label in range(4)]

table8 = pd.DataFrame(stats_list)
display(table8.round(3), "Описательная статистика для интегральных показателей")

Unnamed: 0,Кластер,Число наблюдений,Среднее,Медиана,Минимум,Максимум,Стандартное отклонение
0,0,229,0.567,0.521,0.027,2.345,0.35
1,1,712,0.174,0.135,-0.269,1.237,0.238
2,2,1002,-0.072,-0.089,-0.58,1.156,0.176
3,3,752,-0.241,-0.22,-1.598,0.4,0.221


'Описательная статистика для интегральных показателей'

Проверяем монотонность убывания показателей

In [11]:
# The check passes when no cluster mean exceeds the mean of the cluster
# numbered just before it (equal neighbours are accepted).
means = table8['Среднее'].values
is_non_increasing = all(prev >= nxt for prev, nxt in zip(means, means[1:]))
print(
    "✓ Средние значения монотонно убывают с увеличением номера класса"
    if is_non_increasing
    else "✗ Средние значения НЕ монотонно убывают"
)

✓ Средние значения монотонно убывают с увеличением номера класса


Преобразуем центры кластеров обратно в исходный масштаб

In [12]:
# Undo the standardisation so the centres are expressed on the scale of the
# original features.
cluster_centers_original = scaler.inverse_transform(cluster_centers)

# One labelled row per cluster, one column per feature.
row_labels = [f'Кластер {i}' for i in range(4)]
table9 = pd.DataFrame(
    data=cluster_centers_original,
    index=row_labels,
    columns=feature_columns,
)
display(table9.round(3), "Координаты центров полученных кластеров")

Unnamed: 0,empl_num,k1,k2,k3,k4,k5,k6,k7,k8,k9,k10,k11,k12,k13,k14,k15,k16,k17,k18,k19,k20
Кластер 0,1340.515,6.045,1.418,3.078,0.678,0.124,0.187,0.052,0.876,19.266,0.803,1.297,1.346,1.202,21.552,15.029,0.761,640.845,0.118,0.151,0.118
Кластер 1,1352.187,1.88,0.204,0.802,0.218,0.381,0.154,0.079,0.619,4.316,1.295,1.529,1.575,1.85,11.974,13.513,1.44,211.445,0.115,0.169,0.188
Кластер 2,1354.993,1.795,0.13,0.641,0.188,0.225,0.25,0.162,0.777,8.671,0.461,1.261,1.243,0.991,9.431,16.201,2.184,191.453,0.051,0.049,0.032
Кластер 3,881.041,1.163,0.055,0.405,-0.527,0.544,0.318,0.317,0.462,3.081,0.895,1.227,1.285,0.869,5.175,9.436,3.199,70.943,-0.016,0.012,-0.023


'Координаты центров полученных кластеров'