In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from scipy.sparse.linalg import eigsh
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import random

import warnings
warnings.filterwarnings("ignore")

##### SEED DATA

In [2]:
seed = pd.read_excel('/kaggle/input/da324dataminingproject2/seed.xlsx', sheet_name='in')
seed.columns = range(len(seed.columns))

# making the column headings as a row
all_seeds = pd.DataFrame(seed.columns).T
all_seeds = pd.concat([all_seeds, seed], axis=0)
all_seeds = all_seeds.reset_index(drop = True)
all_seeds.columns = ["First", "Second", "Third"]

##### ATTRIBUTES DATA

In [3]:
attributes = pd.read_excel('/kaggle/input/da324dataminingproject2/attributes.xlsx', sheet_name='in')

# standardize the attributes
attribute_standardized = (attributes - attributes.mean()) / attributes.std()

# apply pca
pca = PCA(n_components=0.9)  
attributes_pca = pca.fit_transform(attribute_standardized)
attributes_pca = pd.DataFrame(data=attributes_pca, columns=[f"PC{i+1}" for i in range(attributes_pca.shape[1])])

# normalize the attributes
attribute_normalized = attributes_pca.apply(lambda x: x / np.linalg.norm(x))

##### ADJACENCY DATA

In [4]:
adjacency = pd.read_csv("/kaggle/input/da324dataminingproject2/adjacency.csv")

# USE THE BELOW COMMENTED CODE FOR THE OLD ADJACENCY DATA

# adjacency = pd.read_excel('/kaggle/input/da324dataminingproject2/adjacency.xlsx', sheet_name='in')
# def clean_data(row):
#     row = row.split("\n")
#     nodes = []
#     for node in row:
#         if node == '  :\t:':
#             continue
#         nodes.append(int(node[6:-5]))  
#     return nodes
# adjacency["nodes"] = adjacency.iloc[:, 0].apply(clean_data)

# adjacecny_matrix = np.zeros((11952, 11952))
# for node1, row in adjacency.iterrows():
#     for node2 in row["nodes"]:
#         adjacecny_matrix[node1, node2] = 1

In [5]:
# calculating the laplacian matrix
adjacecny_matrix = adjacency.to_numpy()
degree_matrix = np.diag(np.sum(adjacecny_matrix, axis=1))
laplacian_matrix = degree_matrix - adjacecny_matrix
laplacian_matrix = laplacian_matrix.astype(float)

In [6]:
# calculating the 10 smallest eigenvectors of laplacian
eigenvalues, eigenvectors = eigsh(laplacian_matrix, k=10, which='SM')
eigenvectors = pd.DataFrame(eigenvectors, columns=[f"col_{i+1}" for i in range(eigenvectors.shape[1])])

##### Concatentating Adjacency and Attributes data to get final embeddings

In [7]:
embeddings = pd.concat([attribute_normalized, eigenvectors], axis=1)
embeddings = (embeddings - embeddings.mean()) / embeddings.std()
pca = PCA(n_components=0.9)  
final_embeddings = pca.fit_transform(embeddings)
final_embeddings = pd.DataFrame(data=final_embeddings, columns=[f"PC{i+1}" for i in range(final_embeddings.shape[1])])

In [8]:
# seperating the first 10952 rows for training
train_embeddings = final_embeddings.iloc[:10952, :]

##### APPLYING KMEANS CLUSTERING

In [9]:
# calculating initial centroids from given seeds' centroid
centroids = np.zeros((10, final_embeddings.shape[1]))
for index, row in all_seeds.iterrows():
    centroids[index] = (final_embeddings.iloc[row['First'], :] +  final_embeddings.iloc[row['Second'], :] +  final_embeddings.iloc[row['Third'], :])/3

In [10]:
# applying kmeans
kmeans = KMeans(n_clusters=10, init=centroids, n_init=1, random_state=0)
labels = kmeans.fit_predict(train_embeddings)
train_embeddings['cluster'] = labels

##### TRAINING A NEURAL NETWORK ON THE DATA

In [11]:
# dataloader class
class CustomDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long) if y is not None else None

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if self.y is not None:
            return self.X[idx], self.y[idx]
        else:
            return self.X[idx]

In [12]:
X_train = np.array(train_embeddings.iloc[:, :-1])
y_train = np.array(train_embeddings.iloc[:, -1])
X_test = np.array(final_embeddings.iloc[-1000:, :])

train_dataset = CustomDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

test_dataset = CustomDataset(X_test)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [13]:
# model class
class Custom_model(torch.nn.Module):
    def __init__(self):
        super(Custom_model, self).__init__()
        self.fc1 = torch.nn.Linear(final_embeddings.shape[1], 128)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(128, 10)  

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [14]:
# defining the loss function and the optimizer
model = Custom_model()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [15]:
num_epochs = 30
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    if epoch%5 == 0:
        print(f"Epoch {epoch+1}, Loss: {loss.item()}")

Epoch 1, Loss: 0.5939496159553528
Epoch 6, Loss: 0.09112799912691116
Epoch 11, Loss: 0.06277771294116974
Epoch 16, Loss: 0.00908169150352478
Epoch 21, Loss: 0.02355409786105156
Epoch 26, Loss: 0.0020169666968286037


##### PREDICTING THE LABELS OF REMAINING DATA

In [16]:
model.eval()
predictions = []

with torch.no_grad():
    for inputs in test_loader:
        outputs = model(inputs)
        _,predicted = torch.max(outputs.data, 1)
        predictions.extend(np.array(predicted.squeeze()))

In [17]:
final_embeddings["cluster"] = [0]*11952
final_embeddings.iloc[:10952, -1] = train_embeddings["cluster"]
final_embeddings.iloc[10952:, -1] = predictions

##### FINAL SUBMISSION FILE

In [18]:
final_embeddings.reset_index(inplace=True)
submission_labels = final_embeddings[['index', 'cluster']].rename(columns={'index': 'ID', 'cluster': 'LABEL'})
submission_labels.to_csv('submission.csv', index=False)