In [3]:
import numpy as np
import pandas as pd
import spektral
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Loaded the content file

In [8]:
import pandas as pd

# Load .content file
def load_content(file_path):
    df = pd.read_csv(file_path, sep='\t', header=None)
    print("DataFrame shape:", df.shape)
    print("First few rows of the DataFrame:")
    print(df.head())

    # Check the first row to determine how many features are there
    first_row = df.iloc[0].values
    num_features = len(first_row) - 2  # Assuming last column is label and first is ID
    column_names = ['node_id'] + [f'word_{i}' for i in range(num_features)] + ['label']
    df.columns = column_names

    return df

# Example file paths (adjust if necessary)
content_file = '/content/drive/MyDrive/citeseer.content'

# Load data
content_df = load_content(content_file)


DataFrame shape: (3312, 3705)
First few rows of the DataFrame:
     0     1     2     3     4     5     6     7     8     9     ...  3695  \
0  100157     0     0     0     0     0     0     0     0     0  ...     0   
1  100598     0     0     0     0     0     0     0     0     0  ...     0   
2  105684     0     1     0     0     0     0     0     0     0  ...     0   
3   11099     0     0     0     0     0     0     0     0     0  ...     0   
4  114091     0     0     0     0     0     0     0     0     0  ...     0   

   3696  3697  3698  3699  3700  3701  3702  3703    3704  
0     0     0     0     0     0     0     0     0  Agents  
1     0     0     0     0     0     0     0     0      IR  
2     0     0     0     0     0     0     0     0  Agents  
3     0     0     0     0     0     0     0     0      DB  
4     0     0     0     0     0     0     0     0      AI  

[5 rows x 3705 columns]


  df = pd.read_csv(file_path, sep='\t', header=None)


Processed the data and printed the output to verify

In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Load content file into DataFrame
def load_content(file_path):
    # Specify dtype for the paper ID column to avoid mixed types warning
    df = pd.read_csv(file_path, sep='\t', header=None, dtype={0: str})

    # Extract paper IDs, features, and labels
    paper_ids = df[0].values
    features = df.iloc[:, 1:-1].values
    labels = df.iloc[:, -1].values

    return paper_ids, features, labels

# Path to your content file
file_path = '/content/drive/MyDrive/citeseer.content'
paper_ids, features, labels = load_content(file_path)

# Encode labels to integers
le = LabelEncoder()
labels_encoded = le.fit_transform(labels)

# Print the first few entries to verify
print("Paper IDs:", paper_ids[:5])
print("Features shape:", features.shape)
print("Labels:", labels[:5])
print("Encoded Labels:", labels_encoded[:5])


Paper IDs: ['100157' '100598' '105684' '11099' '114091']
Features shape: (3312, 3703)
Labels: ['Agents' 'IR' 'Agents' 'DB' 'AI']
Encoded Labels: [1 4 1 2 0]


Loaded the cites dataset

In [10]:
import networkx as nx

def load_cites(file_path):
    G = nx.DiGraph()
    with open(file_path, 'r') as f:
        lines = f.readlines()

    for line in lines:
        cited, citing = line.strip().split()
        G.add_edge(citing, cited)

    return G

# Path to your cites file
cites_file_path = '/content/drive/MyDrive/citeseer.cites'
G = load_cites(cites_file_path)


Checked the no of papers in cites and contents file

In [3]:
import pandas as pd
import numpy as np
import networkx as nx

# Load content file into DataFrame
def load_content(file_path):
    # Specify dtype for the paper ID column to avoid mixed types warning
    df = pd.read_csv(file_path, sep='\t', header=None, dtype={0: str})

    # Extract paper IDs, features, and labels
    paper_ids = df[0].values
    features = df.iloc[:, 1:-1].values
    labels = df.iloc[:, -1].values

    return paper_ids, features, labels

# Load cites file and create a graph
def load_cites(file_path):
    G = nx.DiGraph()
    with open(file_path, 'r') as f:
        lines = f.readlines()

    for line in lines:
        cited, citing = line.strip().split()
        G.add_edge(citing, cited)

    return G

# Path to your content file
content_file_path = '/content/drive/MyDrive/citeseer.content'
paper_ids, features, labels = load_content(content_file_path)

# Path to your cites file
cites_file_path = '/content/drive/MyDrive/citeseer.cites'
G = load_cites(cites_file_path)

# Get unique paper IDs from the content file
paper_ids_from_content = set(paper_ids)

# Get unique paper IDs from the cites file
paper_ids_from_cites = set(G.nodes)

# Find discrepancies
missing_in_cites = paper_ids_from_content - paper_ids_from_cites
missing_in_content = paper_ids_from_cites - paper_ids_from_content

# Print the results
print(f"Number of papers in content file: {len(paper_ids_from_content)}")
print(f"Number of papers in cites file: {len(paper_ids_from_cites)}")
print(f"Missing in cites file: {missing_in_cites}")
print(f"Missing in content file: {missing_in_content}")

# If needed, check individual counts
print(f"Number of missing IDs in cites file: {len(missing_in_cites)}")
print(f"Number of missing IDs in content file: {len(missing_in_content)}")


Number of papers in content file: 3312
Number of papers in cites file: 3327
Missing in cites file: set()
Missing in content file: {'kohrs99using', 'hahn98ontology', '293457', 'ghani01hypertext', 'wang01process', 'gabbard97taxonomy', 'raisamo99evaluating', '38137', 'nielsen00designing', 'khardon99relational', 'flach99database', 'weng95shoslifn', 'tobies99pspace', '95786', '197556'}
Number of missing IDs in cites file: 0
Number of missing IDs in content file: 15


Removed the missing papers from the cites file

In [4]:
import pandas as pd
import numpy as np
import networkx as nx

# Load content file into DataFrame
def load_content(file_path):
    df = pd.read_csv(file_path, sep='\t', header=None, dtype={0: str})
    paper_ids = df[0].values
    features = df.iloc[:, 1:-1].values
    labels = df.iloc[:, -1].values
    return paper_ids, features, labels

# Load citation file into a directed graph
def load_cites(file_path):
    G = nx.DiGraph()
    with open(file_path, 'r') as f:
        lines = f.readlines()
    for line in lines:
        cited, citing = line.strip().split()
        G.add_edge(citing, cited)
    return G

# Identify missing IDs in content file
def identify_missing_ids(content_ids, cites_graph):
    cites_ids = set(cites_graph.nodes())
    content_ids_set = set(content_ids)
    missing_ids = cites_ids - content_ids_set
    return missing_ids

# Remove missing IDs from the citation graph
def remove_missing_ids_from_graph(graph, missing_ids):
    graph.remove_nodes_from(missing_ids)

# Paths to your files
content_file_path = '/content/drive/MyDrive/citeseer.content'
cites_file_path = '/content/drive/MyDrive/citeseer.cites'

# Load data
paper_ids, features, labels = load_content(content_file_path)
G = load_cites(cites_file_path)

# Identify missing IDs
missing_ids = identify_missing_ids(paper_ids, G)
print(f"Missing IDs: {missing_ids}")

# Remove missing IDs from the graph
remove_missing_ids_from_graph(G, missing_ids)

# Verify the changes
print(f"Updated number of nodes: {G.number_of_nodes()}")


Missing IDs: {'kohrs99using', 'hahn98ontology', '293457', 'ghani01hypertext', 'wang01process', 'gabbard97taxonomy', 'raisamo99evaluating', '38137', 'nielsen00designing', 'khardon99relational', 'flach99database', 'weng95shoslifn', 'tobies99pspace', '95786', '197556'}
Updated number of nodes: 3312


  Now we have to prepare the data
  Create adjacency matrix , feature matrix and labels for this purpose

In [3]:
import numpy as np
import networkx as nx
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the content (node features and labels)
def load_content(file_path):
    df = pd.read_csv(file_path, sep='\t', header=None, dtype={0: str})
    paper_ids = df[0].values
    features = df.iloc[:, 1:-1].values
    labels = df.iloc[:, -1].values
    return paper_ids, features, labels

# Load the citation graph
def load_cites(file_path):
    G = nx.DiGraph()
    with open(file_path, 'r') as f:
        lines = f.readlines()
    for line in lines:
        cited, citing = line.strip().split()
        G.add_edge(citing, cited)
    return G

# Encode labels
def encode_labels(labels):
    label_encoder = LabelEncoder()
    labels_encoded = label_encoder.fit_transform(labels)
    return labels_encoded

# Create adjacency matrix
def create_adjacency_matrix(graph, node_map):
    n = len(node_map)
    adjacency_matrix = np.zeros((n, n))
    for citing, cited in graph.edges():
        citing_idx = node_map[citing]
        cited_idx = node_map[cited]
        adjacency_matrix[citing_idx, cited_idx] = 1
    return adjacency_matrix

# Paths to your files
content_file_path = '/content/drive/MyDrive/citeseer.content'
cites_file_path = '/content/drive/MyDrive/citeseer.cites'

# Load data
paper_ids, features, labels = load_content(content_file_path)
G = load_cites(cites_file_path)

# Map paper IDs to node indices
node_map = {paper_id: idx for idx, paper_id in enumerate(paper_ids)}

# Remove missing nodes
missing_ids = identify_missing_ids(paper_ids, G)
remove_missing_ids_from_graph(G, missing_ids)

# Re-map node indices after removing missing nodes
node_map = {paper_id: idx for idx, paper_id in enumerate(G.nodes())}

# Create adjacency matrix
adjacency_matrix = create_adjacency_matrix(G, node_map)

# Encode labels
labels_encoded = encode_labels(labels)

# Ensure feature matrix, adjacency matrix, and labels are aligned
features = features[list(node_map.values())]
labels_encoded = labels_encoded[list(node_map.values())]

print("Adjacency Matrix:", adjacency_matrix.shape)
print("Feature Matrix:", features.shape)
print("Labels:", labels_encoded.shape)


Adjacency Matrix: (3312, 3312)
Feature Matrix: (3312, 3703)
Labels: (3312,)


In [16]:
!pip install torch torchvision torchaudio
!pip install torch-geometric


Collecting torch-geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl.metadata (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m667.4 kB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.5.3-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.5.3


In [18]:
import torch
from torch_geometric.data import Data

# Convert adjacency matrix to edge index format
def adjacency_to_edge_index(adj_matrix):
    adj_matrix = torch.tensor(adj_matrix, dtype=torch.float)
    edge_index = torch.nonzero(adj_matrix, as_tuple=False).t().contiguous()
    return edge_index

# Create a PyTorch Geometric Data object
def create_data_object(features, adjacency_matrix, labels):
    edge_index = adjacency_to_edge_index(adjacency_matrix)
    x = torch.tensor(features, dtype=torch.float)
    y = torch.tensor(labels, dtype=torch.long)

    data = Data(x=x, edge_index=edge_index, y=y)
    return data

# Prepare your data
data = create_data_object(features, adjacency_matrix, labels_encoded)


In [42]:
import torch
from torch_geometric.nn import GATConv

class GATModel(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GATModel, self).__init__()
        self.conv1 = GATConv(in_channels, hidden_channels, heads=8, dropout=0.6)
        self.conv2 = GATConv(hidden_channels * 8, out_channels, heads=1, dropout=0.6)
        self.fc = torch.nn.Linear(out_channels, out_channels)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = torch.nn.functional.elu(x)
        x = self.conv2(x, edge_index)
        return self.fc(x)


I tried to use SGD with momentum but it resulted in an accuracy of 54.1%


I used RMSProp and it resulted in an accuracy of 94.8%

In [43]:
from torch_geometric.data import DataLoader

# Create a DataLoader for batching
data_loader = DataLoader([data], batch_size=1, shuffle=True)

# Instantiate the model, optimizer, and loss function
model = GATModel(in_channels=features.shape[1], hidden_channels=64, out_channels=len(np.unique(labels_encoded)))
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
def train():
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        out = model(batch)
        loss = criterion(out, batch.y)
        loss.backward()
        optimizer.step()
    return loss.item()

# Training the model
for epoch in range(200):  # Number of epochs
    loss = train()
    print(f'Epoch {epoch+1}, Loss: {loss}')


Epoch 1, Loss: 1.7988417148590088
Epoch 2, Loss: 1.788718581199646
Epoch 3, Loss: 1.7438023090362549
Epoch 4, Loss: 1.7235617637634277
Epoch 5, Loss: 1.5660966634750366
Epoch 6, Loss: 1.5311999320983887
Epoch 7, Loss: 1.48184335231781
Epoch 8, Loss: 1.485579490661621
Epoch 9, Loss: 1.455890417098999
Epoch 10, Loss: 1.4554641246795654
Epoch 11, Loss: 1.4280046224594116
Epoch 12, Loss: 1.4234076738357544
Epoch 13, Loss: 1.3884739875793457
Epoch 14, Loss: 1.361904263496399
Epoch 15, Loss: 1.3281476497650146
Epoch 16, Loss: 1.3286356925964355
Epoch 17, Loss: 1.3335047960281372
Epoch 18, Loss: 1.3508355617523193
Epoch 19, Loss: 1.3054454326629639
Epoch 20, Loss: 1.279623031616211
Epoch 21, Loss: 1.2830241918563843
Epoch 22, Loss: 1.2805991172790527
Epoch 23, Loss: 1.2698551416397095
Epoch 24, Loss: 1.2644299268722534
Epoch 25, Loss: 1.2294776439666748
Epoch 26, Loss: 1.2464245557785034
Epoch 27, Loss: 1.216057300567627
Epoch 28, Loss: 1.2118021249771118
Epoch 29, Loss: 1.2135412693023682
Ep

In [44]:
def evaluate(data):
    model.eval()
    with torch.no_grad():
        out = model(data)
        pred = out.argmax(dim=1)
    return pred

# Evaluate the model
predictions = evaluate(data)
print(f'Predictions: {predictions}')


Predictions: tensor([1, 4, 1,  ..., 4, 2, 5])


In [45]:
# Assuming you have a LabelEncoder instance used earlier
predicted_labels = label_encoder.inverse_transform(predictions.numpy())
print(predicted_labels)


['Agents' 'IR' 'Agents' ... 'IR' 'DB' 'ML']


In [46]:
from sklearn.metrics import accuracy_score, classification_report

# Calculate accuracy
accuracy = accuracy_score(labels_encoded, predictions.numpy())
print(f'Accuracy: {accuracy}')

# Print classification report
print(classification_report(labels_encoded, predictions.numpy()))


Accuracy: 0.9746376811594203
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       249
           1       0.96      0.98      0.97       596
           2       0.99      0.98      0.98       701
           3       0.98      0.97      0.98       508
           4       0.97      0.97      0.97       668
           5       0.98      0.97      0.98       590

    accuracy                           0.97      3312
   macro avg       0.97      0.98      0.97      3312
weighted avg       0.97      0.97      0.97      3312



I got an accuracy of 97.4% using Adam optimizers and observed as increasing the no of epochs the accuracy increased rapidly from 54 to 94 when increased from 10 to 100