In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from scipy.sparse.linalg import eigsh
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import random

import warnings
warnings.filterwarnings("ignore")

##### SEED DATA

In [None]:
# Load the seed nodes: one row per cluster, three seed node ids per row.
#
# The sheet's first row is data, not headings (the original code tried to copy
# the column headings back into a row). Reading with header=None keeps that
# first row as ordinary data. Previously the headings were overwritten with
# 0, 1, 2 *before* being copied, so the first cluster's real seeds were lost
# and replaced by nodes 0, 1 and 2.
seed = pd.read_excel('/kaggle/input/da324dataminingproject2/seed.xlsx', sheet_name='in', header=None)
seed.columns = range(len(seed.columns))

# Named seed columns, with a clean 0..n-1 index that is later used as the cluster id.
all_seeds = seed.reset_index(drop=True)
all_seeds.columns = ["First", "Second", "Third"]

##### ATTRIBUTES DATA

In [None]:
attributes = pd.read_excel('/kaggle/input/da324dataminingproject2/attributes.xlsx', sheet_name='in')


def scale_to_unit_norm(column):
    """Divide a column by its Euclidean (L2) norm."""
    return column / np.linalg.norm(column)


# Rescale every attribute column to unit length, then label the columns
# positionally as PC1, PC2, ...
attribute_normalized = attributes.apply(scale_to_unit_norm)
attribute_count = attribute_normalized.shape[1]
attribute_normalized.columns = [f"PC{position}" for position in range(1, attribute_count + 1)]

##### ADJACENCY DATA

In [None]:
# Load the adjacency data. The Laplacian cell converts this frame directly to
# a NumPy array and treats it as the adjacency matrix.
# NOTE(review): presumably a square matrix with one row/column per node; also
# read_csv treats the first line as a header by default — confirm the file layout.
adjacency = pd.read_csv("/kaggle/input/da324dataminingproject2/adjacency.csv")

# USE THE BELOW COMMENTED CODE FOR THE OLD ADJACENCY DATA
# (legacy path: each spreadsheet cell holds a newline-separated neighbour list,
# which is parsed into integer node ids and expanded into a dense 0/1 matrix)

# adjacency = pd.read_excel('/kaggle/input/da324dataminingproject2/adjacency.xlsx', sheet_name='in')
# def clean_data(row):
#     row = row.split("\n")
#     nodes = []
#     for node in row:
#         if node == '  :\t:':
#             continue
#         nodes.append(int(node[6:-5]))  
#     return nodes
# adjacency["nodes"] = adjacency.iloc[:, 0].apply(clean_data)

# adjacecny_matrix = np.zeros((11952, 11952))
# for node1, row in adjacency.iterrows():
#     for node2 in row["nodes"]:
#         adjacecny_matrix[node1, node2] = 1

In [None]:
# Build the unnormalised graph Laplacian L = D - A, where A is the adjacency
# matrix and D is the diagonal matrix of row sums (node degrees).
adjacecny_matrix = adjacency.to_numpy()
node_degree_sums = adjacecny_matrix.sum(axis=1)
degree_matrix = np.diag(node_degree_sums)
laplacian_matrix = (degree_matrix - adjacecny_matrix).astype(float)

In [None]:
# calculating the normalized laplacian matrix (random-walk form D^-1 L)
#
# Fixes over the previous version:
#   * the loop wrote to an undefined name (`deg_inverse_matrix`) -> NameError
#   * the node count was hard-coded; it is now taken from degree_matrix
#   * zero-degree (isolated) nodes no longer trigger a division by zero
node_degrees = np.diag(degree_matrix).astype(float)

# Invert only the non-zero degrees; isolated nodes keep a 0 entry so their
# rows stay all-zero instead of turning into inf/NaN.
inverse_degrees = np.zeros_like(node_degrees)
np.divide(1.0, node_degrees, out=inverse_degrees, where=node_degrees != 0)

degree_inverse_matrix = np.diag(inverse_degrees)

# Left-multiplying by a diagonal matrix just scales each row, so broadcast the
# scaling rather than running a dense n x n matrix product.
normalized_laplacian_matrix = inverse_degrees[:, np.newaxis] * laplacian_matrix

In [None]:
# calculating the 10 smallest eigenvectors of normalized laplacian matrix
#
# eigsh only supports real symmetric (or Hermitian) matrices, but the
# random-walk Laplacian D^-1 L is not symmetric. Instead we solve the similar
# symmetric problem S = D^-1/2 L D^-1/2, which has the same eigenvalues, and
# map each eigenvector v of S back with D^-1/2 v; that is an eigenvector of
# D^-1 L for the same eigenvalue.
# NOTE(review): assumes the adjacency matrix is symmetric (undirected graph) — confirm.
node_degrees = np.diag(degree_matrix).astype(float)
has_neighbours = node_degrees > 0
inverse_sqrt_degrees = np.zeros_like(node_degrees)
inverse_sqrt_degrees[has_neighbours] = 1.0 / np.sqrt(node_degrees[has_neighbours])

symmetric_laplacian = (
    inverse_sqrt_degrees[:, np.newaxis] * laplacian_matrix * inverse_sqrt_degrees[np.newaxis, :]
)

eigenvalues, symmetric_eigenvectors = eigsh(symmetric_laplacian, k=10, which='SM')

# Map back to eigenvectors of D^-1 L.
eigenvectors = inverse_sqrt_degrees[:, np.newaxis] * symmetric_eigenvectors
eigenvectors = pd.DataFrame(eigenvectors, columns=[f"col_{i+1}" for i in range(eigenvectors.shape[1])])

# normalizing the eigenvectors (each column scaled to unit L2 norm)
eigenvectors = eigenvectors.apply(lambda x: x / np.linalg.norm(x))

##### Concatenating Adjacency and Attributes data to get final embeddings

In [None]:
# Put the attribute features and the spectral (eigenvector) features side by side.
combined_features = pd.concat([attribute_normalized, eigenvectors], axis=1)

# Standardise every column to zero mean and unit standard deviation.
embeddings = (combined_features - combined_features.mean()) / combined_features.std()

# Keep as many principal components as needed to explain 90% of the variance.
pca = PCA(n_components=0.9)
projected_points = pca.fit_transform(embeddings)
component_names = [f"PC{position}" for position in range(1, projected_points.shape[1] + 1)]
final_embeddings = pd.DataFrame(data=projected_points, columns=component_names)

##### APPLYING KMEANS CLUSTERING

In [None]:
# Initial centroid of each cluster = average of the embeddings of its three seed nodes.
seed_columns = ['First', 'Second', 'Third']
centroids = np.zeros((10, final_embeddings.shape[1]))
for index, row in all_seeds.iterrows():
    # Look up the embedding row of each seed node by position.
    first_vec, second_vec, third_vec = (final_embeddings.iloc[row[name], :] for name in seed_columns)
    centroids[index] = (first_vec + second_vec + third_vec) / 3

In [None]:
# Run k-means once (n_init=1), starting from the seed-derived centroids,
# and store the resulting cluster label of every node next to its embedding.
kmeans_settings = {
    "n_clusters": 10,
    "init": centroids,
    "n_init": 1,
    "random_state": 0,
}
kmeans = KMeans(**kmeans_settings)
labels = kmeans.fit_predict(final_embeddings)
final_embeddings['cluster'] = labels

##### FINAL SUBMISSION FILE

In [None]:
# Build the submission table (ID = row position of the node, LABEL = cluster)
# without mutating final_embeddings. The previous in-place reset_index added an
# 'index' column to the embeddings on every run, so re-running the cell changed
# the frame and eventually failed.
submission_labels = (
    final_embeddings['cluster']
    .rename('LABEL')
    .rename_axis('ID')
    .reset_index()
)
submission_labels.to_csv('submission.csv', index=False)