In [1]:
!pip install tensorflow==2.11.0
!pip install git+https://github.com/VenkateshwaranB/stellargraph.git

Collecting tensorflow==2.11.0
  Downloading tensorflow-2.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.1 kB)
Collecting gast<=0.4.0,>=0.2.1 (from tensorflow==2.11.0)
  Downloading gast-0.4.0-py3-none-any.whl.metadata (1.1 kB)
Collecting keras<2.12,>=2.11.0 (from tensorflow==2.11.0)
  Downloading keras-2.11.0-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting protobuf<3.20,>=3.9.2 (from tensorflow==2.11.0)
  Downloading protobuf-3.19.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (787 bytes)
Collecting tensorboard<2.12,>=2.11 (from tensorflow==2.11.0)
  Downloading tensorboard-2.11.2-py3-none-any.whl.metadata (1.9 kB)
Collecting tensorflow-estimator<2.12,>=2.11.0 (from tensorflow==2.11.0)
  Downloading tensorflow_estimator-2.11.0-py2.py3-none-any.whl.metadata (1.3 kB)
Collecting google-auth-oauthlib<0.5,>=0.4.1 (from tensorboard<2.12,>=2.11->tensorflow==2.11.0)
  Downloading google_auth_oauthlib-0.4.6-py2.py3-none-any.whl.metadata (2.7

In [2]:
import networkx as nx
import pandas as pd
import numpy as np
import os
import random
from sklearn.cluster import KMeans
from tensorflow import keras

import stellargraph as sg
from stellargraph.mapper import GraphSAGELinkGenerator
from stellargraph.layer import GraphSAGE, link_classification
from stellargraph.data import UnsupervisedSampler
from stellargraph.mapper import GraphSAGENodeGenerator

import warnings
warnings.filterwarnings("ignore")

##### COMBINING THE ADJACENCY AND ATTRIBUTE DATA INTO THE INPUT GRAPH

In [3]:
# Raw inputs: the adjacency table (CSV) and the attribute table (sheet "in").
adjacency = pd.read_csv("/kaggle/input/da324dataminingproject2/adjacency.csv")
attributes = pd.read_excel(
    "/kaggle/input/da324dataminingproject2/attributes.xlsx",
    sheet_name="in",
)

# Turn the adjacency values into a NetworkX graph, then wrap that graph in a
# StellarGraph with the attribute table supplied as node features.
adjacency_matrix = adjacency.to_numpy()
Graphx = nx.from_numpy_array(adjacency_matrix)
Graph = sg.StellarGraph(Graphx, node_features=attributes)

##### MAKING THE POSITIVE AND NEGATIVE NODE PAIRS USING RANDOM WALK

In [4]:
# Seed the global random number generators so that a fresh
# "Restart & Run All" starts from the same random state every time.
# keras.utils.set_random_seed seeds Python's `random`, NumPy and TensorFlow.
# NOTE(review): the StellarGraph sampler/generators may keep their own RNG
# state (they accept a `seed=` argument) and multi-worker training can still
# reorder batches - confirm whether bit-for-bit determinism is required.
RANDOM_SEED = 42
keras.utils.set_random_seed(RANDOM_SEED)

# Every node of the graph is handed to the unsupervised sampler.
nodes = list(Graph.nodes())

# Random-walk settings passed to UnsupervisedSampler.
number_of_walks = 1
length = 5

# Training settings: generator batch size and number of epochs for model.fit.
batch_size = 50
epochs = 4

# Neighbour sample sizes handed to the GraphSAGE generators (one entry per layer).
num_samples = [10, 5]

In [7]:
# Random-walk based sampler over all nodes; it supplies the node pairs used
# for unsupervised training.
unsupervised_samples = UnsupervisedSampler(
    Graph,
    nodes=nodes,
    length=length,
    number_of_walks=number_of_walks,
)

# Link generator over the same graph; `train_gen` is what model.fit consumes.
generator = GraphSAGELinkGenerator(
    Graph,
    batch_size=batch_size,
    num_samples=num_samples,
)
train_gen = generator.flow(unsupervised_samples)

##### USING THE INBUILT GRAPHSAGE MODEL

In [8]:
# GraphSAGE encoder: two layers of 50 units, bias enabled, no dropout,
# L2-normalised outputs. Its input shapes come from the link generator.
layer_sizes = [50, 50]
graphsage = GraphSAGE(
    layer_sizes=layer_sizes,
    generator=generator,
    bias=True,
    dropout=0.0,
    normalize="l2",
)
x_input, x_output = graphsage.in_out_tensors()

# Link-classification head: combines the two node embeddings of a pair with
# the "ip" method and produces a single sigmoid output.
link_head = link_classification(
    output_dim=1,
    output_act="sigmoid",
    edge_embedding_method="ip",
)
prediction = link_head(x_output)

link_classification: using 'ip' method to combine node embeddings into edge embeddings


In [10]:
# End-to-end link-prediction model: GraphSAGE inputs -> sigmoid link score.
model = keras.Model(inputs=x_input, outputs=prediction)

# Binary cross-entropy on the link labels, tracked with binary accuracy.
# `learning_rate` replaces the deprecated `lr` keyword of the Keras optimizers.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)

##### TRAINING THE GRAPHSAGE MODEL ON THE NODE PAIRS

In [11]:
# Training options collected in one place: epoch count comes from the
# configuration cell, batches are loaded by 4 worker threads (no
# multiprocessing) and shuffling is requested.
fit_options = dict(
    epochs=epochs,
    verbose=1,
    use_multiprocessing=False,
    workers=4,
    shuffle=True,
)

# Train the link-prediction model on the generated node pairs.
history = model.fit(train_gen, **fit_options)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


##### EXTRACTING THE EMBEDDINGS

In [12]:
# Re-use the trained GraphSAGE weights as a node-embedding model: keep every
# second input tensor and the first output tensor of the link model
# (presumably the "source" node of each pair, as the variable names suggest).
x_input_src = x_input[0::2]
x_output_src = x_output[0]
embedding_model = keras.Model(inputs=x_input_src, outputs=x_output_src)

# Embed every node of the graph. The node count is taken from the graph
# itself instead of a hard-coded 11952; node ids are assumed to be
# 0..N-1, exactly as the original hard-coded range assumed.
node_ids = pd.Index(np.arange(Graph.number_of_nodes()))
node_generator = GraphSAGENodeGenerator(Graph, batch_size, num_samples).flow(node_ids)
node_embeddings = embedding_model.predict(node_generator, workers=4, verbose=1)



In [13]:
# Tabular view of the predicted embedding matrix (default integer row and
# column labels), used by the clustering cells below.
embeddings = pd.DataFrame(data=node_embeddings)

##### APPLYING KMEANS

In [14]:
# The first row of the seed sheet is data, not a header (the original code
# tried to turn the column headings back into a row). Reading with
# header=None keeps that first row intact.
# Bug fixed: previously the column labels were replaced by 0, 1, 2 *before*
# being copied back in as a row, so the first seed triple silently became
# [0, 1, 2].
seed = pd.read_excel(
    '/kaggle/input/da324dataminingproject2/seed.xlsx',
    sheet_name='in',
    header=None,
)

# One row per seed group, three seed node ids per row.
all_seeds = seed.reset_index(drop=True)
all_seeds.columns = ["First", "Second", "Third"]

In [16]:
# Initial centroids: for every seed group, the mean embedding of its seed nodes.
# Only the embedding dimensions are used; the "index"/"cluster" bookkeeping
# columns that later cells add to `embeddings` are skipped so that re-running
# this cell gives the same result.
feature_columns = [col for col in embeddings.columns if col not in ("index", "cluster")]
embedding_matrix = embeddings[feature_columns].to_numpy()

# Seed node ids are used as row positions into the embedding matrix
# (same positional lookup as the original `.iloc` access).
seed_ids = all_seeds[["First", "Second", "Third"]].to_numpy().astype(int)

# Shape (n_seed_groups, 3, dim) -> mean over the three seeds of each group.
# The number of centroids follows the number of seed rows instead of a
# hard-coded 10.
centroids = np.asarray(embedding_matrix[seed_ids].mean(axis=1), dtype=np.float64)

In [17]:
# Cluster on the embedding dimensions only. The "index"/"cluster" bookkeeping
# columns are excluded so that re-running this cell never feeds previous
# labels back into KMeans.
feature_columns = [col for col in embeddings.columns if col not in ("index", "cluster")]

# Seeded KMeans: one cluster per initial centroid, a single run (n_init=1)
# starting from exactly those centroids.
kmeans = KMeans(n_clusters=len(centroids), init=centroids, n_init=1, random_state=0)
labels = kmeans.fit_predict(embeddings[feature_columns])

# Keep the labels next to the embeddings; the submission cell reads this column.
embeddings['cluster'] = labels

##### FINAL SUBMISSION FILE

In [18]:
# Build the submission table (ID = row index of `embeddings`, LABEL = cluster)
# without mutating `embeddings`. The previous in-place reset_index added an
# "index" column on every execution, which made the cell non-idempotent.
submission_labels = (
    embeddings["cluster"]
    .rename("LABEL")
    .rename_axis("ID")
    .reset_index()
)
submission_labels.to_csv('submission.csv', index=False)