In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd
import numpy as np

# Load the .content file
def load_content_file(file_path):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            split_line = line.strip().split()
            paper_id = split_line[0]
            attributes = list(map(int, split_line[1:-1]))
            class_label = split_line[-1]
            data.append([paper_id] + attributes + [class_label])

    # Create DataFrame
    df = pd.DataFrame(data)

    # Extract features and labels
    paper_ids = df.iloc[:, 0].values
    features = df.iloc[:, 1:-1].values
    labels = df.iloc[:, -1].values

    return df, paper_ids, features, labels

# Path to the .content file
content_file_path = '/content/drive/MyDrive/cora.content'
df_content, paper_ids, features, labels = load_content_file(content_file_path)

print("Content file loaded.")
print(df_content.head())


Content file loaded.
      0     1     2     3     4     5     6     7     8     9     ...  1425  \
0    31336     0     0     0     0     0     0     0     0     0  ...     0   
1  1061127     0     0     0     0     0     0     0     0     0  ...     0   
2  1106406     0     0     0     0     0     0     0     0     0  ...     0   
3    13195     0     0     0     0     0     0     0     0     0  ...     0   
4    37879     0     0     0     0     0     0     0     0     0  ...     0   

   1426  1427  1428  1429  1430  1431  1432  1433                    1434  
0     0     1     0     0     0     0     0     0         Neural_Networks  
1     1     0     0     0     0     0     0     0           Rule_Learning  
2     0     0     0     0     0     0     0     0  Reinforcement_Learning  
3     0     0     0     0     0     0     0     0  Reinforcement_Learning  
4     0     0     0     0     0     0     0     0   Probabilistic_Methods  

[5 rows x 1435 columns]


In [25]:
import pandas as pd

# Load content data using tab as the delimiter
content_file_path = '/content/drive/MyDrive/cora.content'
content_df = pd.read_csv(content_file_path, sep='\t', header=None)

# Print the first few rows to verify
print(content_df.head())
print(content_df.shape)

# Extract features and labels
features = content_df.iloc[:, 1:-1].values  # All columns except the first and last
labels = content_df.iloc[:, -1].values  # Last column

print("Feature matrix shape:", features.shape)
print("Sample features:", features[:5])
print("Labels shape:", labels.shape)

# Encode labels
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
print("Encoded labels shape:", encoded_labels.shape)
print("Sample encoded labels:", encoded_labels[:5])

# Convert to tensors
import torch
features = torch.tensor(features, dtype=torch.float)
labels = torch.tensor(encoded_labels, dtype=torch.long)


      0     1     2     3     4     5     6     7     8     9     ...  1425  \
0    31336     0     0     0     0     0     0     0     0     0  ...     0   
1  1061127     0     0     0     0     0     0     0     0     0  ...     0   
2  1106406     0     0     0     0     0     0     0     0     0  ...     0   
3    13195     0     0     0     0     0     0     0     0     0  ...     0   
4    37879     0     0     0     0     0     0     0     0     0  ...     0   

   1426  1427  1428  1429  1430  1431  1432  1433                    1434  
0     0     1     0     0     0     0     0     0         Neural_Networks  
1     1     0     0     0     0     0     0     0           Rule_Learning  
2     0     0     0     0     0     0     0     0  Reinforcement_Learning  
3     0     0     0     0     0     0     0     0  Reinforcement_Learning  
4     0     0     0     0     0     0     0     0   Probabilistic_Methods  

[5 rows x 1435 columns]
(2708, 1435)
Feature matrix shape: (2708, 14

In [26]:
import pandas as pd

# Load paper IDs from the .content file
def load_paper_ids_from_content(file_path):
    paper_ids = set()
    with open(file_path, 'r') as file:
        for line in file:
            paper_id = line.strip().split()[0]
            paper_ids.add(paper_id)
    return paper_ids

# Load paper IDs from the .cites file
def load_paper_ids_from_cites(file_path):
    paper_ids = set()
    with open(file_path, 'r') as file:
        for line in file:
            cited_id, citing_id = line.strip().split()
            paper_ids.add(cited_id)
            paper_ids.add(citing_id)
    return paper_ids

# Paths to the .content and .cites files
content_file_path = '/content/drive/MyDrive/cora.content'
cites_file_path = '/content/drive/MyDrive/cora.cites'

# Extract paper IDs
content_paper_ids = load_paper_ids_from_content(content_file_path)
cites_paper_ids = load_paper_ids_from_cites(cites_file_path)

# Check if all paper IDs in content are in cites
missing_in_cites = content_paper_ids - cites_paper_ids
missing_in_content = cites_paper_ids - content_paper_ids

if not missing_in_cites and not missing_in_content:
    print("All paper IDs match between .content and .cites files.")
else:
    if missing_in_cites:
        print(f"Paper IDs in .content but not in .cites: {missing_in_cites}")
    if missing_in_content:
        print(f"Paper IDs in .cites but not in .content: {missing_in_content}")


All paper IDs match between .content and .cites files.


In [33]:
# Check for valid IDs in the citation data
cites_paper_ids = set(edge_index_np.flatten())
content_paper_ids = set(content_df[0].astype(str))

# Print differences
missing_in_content = cites_paper_ids - content_paper_ids
missing_in_cites = content_paper_ids - cites_paper_ids

print("IDs in citation data but not in content:", missing_in_content)
print("IDs in content data but not in citation:", missing_in_cites)


IDs in citation data but not in content: {1122304, 1155073, 851968, 1114118, 1105932, 1114125, 368657, 253971, 8213, 696342, 696343, 696345, 696346, 8224, 950305, 630817, 35, 262178, 40, 1114153, 1130539, 385067, 131117, 131122, 16437, 188471, 16451, 1138755, 1130567, 1114184, 1130568, 16461, 180301, 1114192, 16470, 16471, 155736, 16474, 155738, 1130586, 16476, 106590, 573535, 589923, 16485, 32872, 1130600, 630890, 1114222, 573553, 65650, 114, 65653, 221302, 117, 270456, 1122425, 409725, 1114239, 128, 130, 1106052, 1130634, 1130637, 647315, 180373, 1122460, 1130653, 1130657, 164, 82087, 82090, 180399, 82098, 1130676, 1130678, 1106103, 1130680, 1106112, 245955, 590022, 1114331, 1114336, 385251, 631015, 688361, 647408, 1114352, 131315, 73972, 33013, 647413, 131318, 131317, 1106172, 1114364, 213246, 41216, 229635, 270600, 631052, 1122574, 606479, 1114388, 1122580, 114966, 647447, 1138968, 1138970, 139547, 1130780, 1114398, 213279, 288, 1130808, 1106236, 672064, 1139009, 672070, 672071, 11

In [6]:
import networkx as nx

# Load the .cites file
def load_cites_file(file_path):
    edges = []
    with open(file_path, 'r') as file:
        for line in file:
            cited_paper, citing_paper = line.strip().split()
            edges.append((citing_paper, cited_paper))  # Note: edge direction is from citing to cited

    # Create a graph
    G = nx.DiGraph()
    G.add_edges_from(edges)

    return G

# Path to the .cites file
cites_file_path = '/content/drive/MyDrive/cora.cites'
graph = load_cites_file(cites_file_path)

print("Cites file loaded.")
print(f"Number of nodes: {len(graph.nodes)}")
print(f"Number of edges: {len(graph.edges)}")


Cites file loaded.
Number of nodes: 2708
Number of edges: 5429


In [38]:
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data

# Load the content data
content_df = pd.read_csv('/content/drive/MyDrive/cora.content', sep='\t', header=None)
content_df.columns = ['paper_id'] + [f'word_{i}' for i in range(content_df.shape[1] - 2)] + ['class_label']

# Convert paper IDs to strings
content_df['paper_id'] = content_df['paper_id'].astype(str)

# Create mapping from paper ID to index
id_to_index = {id_: idx for idx, id_ in enumerate(content_df['paper_id'].unique())}
index_to_id = {idx: id_ for id_, idx in id_to_index.items()}

# Load the citation data
citation_df = pd.read_csv('/content/drive/MyDrive/cora.cites', sep='\t', header=None)
citation_df.columns = ['cited_paper', 'citing_paper']

# Convert paper IDs to strings
citation_df['cited_paper'] = citation_df['cited_paper'].astype(str)
citation_df['citing_paper'] = citation_df['citing_paper'].astype(str)

# Filter and update edge indices
valid_edges = []
for _, row in citation_df.iterrows():
    if row['cited_paper'] in id_to_index and row['citing_paper'] in id_to_index:
        valid_edges.append([id_to_index[row['cited_paper']], id_to_index[row['citing_paper']]])

# Convert valid edges to tensor
edge_index = torch.tensor(np.array(valid_edges).T, dtype=torch.long)

# Filter content data based on valid paper IDs
valid_content_df = content_df[content_df['paper_id'].isin(id_to_index.keys())]

# Define label encoding mapping
unique_labels = valid_content_df['class_label'].unique()
label_encoding = {label: idx for idx, label in enumerate(unique_labels)}

# Update features and labels based on valid content data
features = torch.tensor(valid_content_df.iloc[:, 1:-1].values, dtype=torch.float)
labels = torch.tensor(valid_content_df['class_label'].map(label_encoding).values, dtype=torch.long)

# Create Data object
data = Data(x=features, edge_index=edge_index, y=labels)

print("Updated Data object created.")
print("Features shape:", data.x.shape)
print("Edge index shape:", data.edge_index.shape)
print("Labels shape:", data.y.shape)


Updated Data object created.
Features shape: torch.Size([2708, 1433])
Edge index shape: torch.Size([2, 5429])
Labels shape: torch.Size([2708])


In [14]:
!pip install torch torchvision torchaudio




In [15]:
!pip install torch-geometric


Collecting torch-geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl.metadata (64 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/64.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/64.2 kB[0m [31m619.2 kB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m51.2/64.2 kB[0m [31m1.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m683.1 kB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.5.3-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.5.3


It is observed that on changing the learning rate from .005 to .001 the accuracy increased by 1% however increasing the no of epochs from 200 to 400 didnt bring much change

The accuracy increased by 1% on changing the dropout rate from  
self.conv1 = GATConv(num_features, 8,

*   self.conv1 = GATConv(num_features, 8,
heads=8, dropout=0.6)

*   self.conv2 = GATConv(8 * 8, num_classes, dropout=0.6)

 to


*   self.conv1 = GATConv(num_features, 8, heads=8, dropout=0.5)

*   self.conv2 = GATConv(8 * 8, num_classes, dropout=0.5)



Increasing the no of layers from 2 to 3 increased the accuracy from 94.8 to 97.5 but on adding the dropout rate it dropped back to 94%

When i added 4 layers the accuracy became 99.4% and increasing the layer to 6 made the accuracy 99.45%

In [58]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv

class GAT(torch.nn.Module):
    def __init__(self, num_features, num_classes):
        super(GAT, self).__init__()
        #self.conv1 = GATConv(num_features, 32, heads=8)
        #self.conv2 = GATConv(32 * 8, 64, heads=8)
        #self.conv3 = GATConv(64 * 8, 128, heads=8)
        #self.conv4 = GATConv(128 * 8, num_classes)
        self.conv1 = GATConv(num_features, 64, heads=8)
        self.conv2 = GATConv(64 * 8, 128, heads=8)
        self.conv3 = GATConv(128 * 8, 256, heads=8)
        self.conv4 = GATConv(256 * 8, 512, heads=8)
        self.conv5 = GATConv(512 * 8, 256, heads=8)
        self.conv6 = GATConv(256 * 8, num_classes)


    def forward(self, x, edge_index):
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# Create the model
model = GAT(num_features=1433, num_classes=len(label_encoding))

# Define loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training loop
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = criterion(out, data.y)
    loss.backward()
    optimizer.step()
    return loss.item()

# Run training for a few epochs
for epoch in range(400):
    loss = train()
    print(f'Epoch {epoch+1}, Loss: {loss:.4f}')


Epoch 1, Loss: 6.9346
Epoch 2, Loss: 5.3077
Epoch 3, Loss: 2.7566
Epoch 4, Loss: 1.4325
Epoch 5, Loss: 1.0127
Epoch 6, Loss: 1.0762
Epoch 7, Loss: 0.9556
Epoch 8, Loss: 0.8665
Epoch 9, Loss: 0.8848
Epoch 10, Loss: 0.8068
Epoch 11, Loss: 0.8595
Epoch 12, Loss: 0.7034
Epoch 13, Loss: 0.7438
Epoch 14, Loss: 0.8356
Epoch 15, Loss: 0.6904
Epoch 16, Loss: 0.6695
Epoch 17, Loss: 0.6552
Epoch 18, Loss: 0.6947
Epoch 19, Loss: 0.6485
Epoch 20, Loss: 0.7018
Epoch 21, Loss: 0.6478
Epoch 22, Loss: 0.6197
Epoch 23, Loss: 0.5876
Epoch 24, Loss: 0.6420
Epoch 25, Loss: 0.6652
Epoch 26, Loss: 0.5773
Epoch 27, Loss: 0.5889
Epoch 28, Loss: 0.5628
Epoch 29, Loss: 0.5641
Epoch 30, Loss: 0.5233
Epoch 31, Loss: 0.5088
Epoch 32, Loss: 0.5457
Epoch 33, Loss: 0.5352
Epoch 34, Loss: 0.5573
Epoch 35, Loss: 0.4985
Epoch 36, Loss: 0.5165
Epoch 37, Loss: 0.5627
Epoch 38, Loss: 0.5638
Epoch 39, Loss: 0.5169
Epoch 40, Loss: 0.4920
Epoch 41, Loss: 0.5457
Epoch 42, Loss: 0.4972
Epoch 43, Loss: 0.5093
Epoch 44, Loss: 0.49

In [59]:
def evaluate(model, data):
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        # Forward pass
        out = model(data.x, data.edge_index)
        # Get predicted labels by taking the index with the highest score
        pred = out.max(dim=1)[1]
        # Calculate accuracy
        correct = pred.eq(data.y).sum().item()
        accuracy = correct / len(data.y)
    return accuracy
# Evaluate the model
accuracy = evaluate(model, data)
print(f'Accuracy: {accuracy:.4f}')


Accuracy: 0.9945
