In [7]:
import os
import torch
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GATConv
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T

import matplotlib.pyplot as plt

# name_data = 'Cora'
# dataset = Planetoid(root= '/tmp/' + name_data, name = name_data)
# dataset.transform = T.NormalizeFeatures()

# print(f"Number of Classes in {name_data}:", dataset.num_classes)
# print(f"Number of Node Features in {name_data}:", dataset.num_node_features)

In [10]:
import gzip,os
import pandas as pd
import torch
from torch_geometric.data import Data, Dataset, DataLoader

from torch_geometric.utils import one_hot

def read_gzipped_csv(file_path):
    #include the header
    df = pd.read_csv(gzip.open(file_path), header=None)
    return df


class MyDatasetTrain(Dataset):
    def __init__(self, root, transform=None, pre_transform=None):
        super(MyDatasetTrain, self).__init__(root, transform, pre_transform)
        dataset_path = os.path.join(root)

        # Load your dataset files here
        self.graph_labels = read_gzipped_csv(dataset_path +  '/graph_labels.csv.gz')
        self.num_nodes = read_gzipped_csv(dataset_path +  '/num_nodes.csv.gz')
        self.num_edges = read_gzipped_csv(dataset_path + '/num_edges.csv.gz')
        self.node_features = read_gzipped_csv(dataset_path +  '/node_features.csv.gz')
        self.edges = read_gzipped_csv(dataset_path +  '/edges.csv.gz')
        self.edge_features = read_gzipped_csv(dataset_path +  '/edge_features.csv.gz')
        #define a dataframe which stroes the start of node_features for each graph
        self.node_features_start = pd.DataFrame(columns=['start'])
        self.node_features_start.loc[0] = 0
        for i in range(1, len(self.num_nodes)):
            self.node_features_start.loc[i] = self.node_features_start.loc[i-1] + self.num_nodes.iloc[i-1, 0]
        #define a dataframe which stroes the start of edge_features for each graph
        self.edge_features_start = pd.DataFrame(columns=['start'])
        self.edge_features_start.loc[0] = 0
        for i in range(1, len(self.num_edges)):
            self.edge_features_start.loc[i] = self.edge_features_start.loc[i-1] + self.num_edges.iloc[i-1, 0]

        last_index = len(self.graph_labels) 


        valid_indices = ~self.graph_labels.iloc[:, 0].isna()
        self.graph_labels = self.graph_labels[valid_indices]
        self.num_nodes = self.num_nodes[valid_indices]
        self.num_edges = self.num_edges[valid_indices]
        self.node_features_start = self.node_features_start[valid_indices]
        self.edge_features_start = self.edge_features_start[valid_indices]

        count0 = self.graph_labels[self.graph_labels[0] == 0].count().iloc[0]
        count1 = self.graph_labels[self.graph_labels[0] == 1].count().iloc[0]
        minclass = 0
        mincount = count0
        if count0 > count1:
            minclass = 1
            mincount = count1
        majcount = max(count0, count1)
        #oversample the minority class
        cur_ind = 0
        while mincount < majcount:
            if self.graph_labels.iloc[cur_ind, 0] == minclass:
                self.graph_labels.loc[last_index] = self.graph_labels.iloc[cur_ind]
                self.num_nodes.loc[last_index] = self.num_nodes.iloc[cur_ind]
                self.num_edges.loc[last_index] = self.num_edges.iloc[cur_ind]
                self.node_features_start.loc[last_index] = self.node_features_start.iloc[cur_ind]
                self.edge_features_start.loc[last_index] = self.edge_features_start.iloc[cur_ind]
                mincount += 1
                last_index += 1
            cur_ind += 1

    def len(self):
        return len(self.graph_labels)

    def get(self, idx):
        label = self.graph_labels.iloc[idx, 0]
        label = torch.tensor(self.graph_labels.iloc[idx, 0], dtype=torch.float32).round().long()
        #make label one dimensional
        label_shape = [1]
        labelo = torch.zeros(label_shape, dtype=torch.long)
        labelo[0] = label
        label = labelo


        num_nodes = self.num_nodes.iloc[idx, 0]
        num_edges = self.num_edges.iloc[idx, 0]

        # Extract node features for the current graph
        start_node_features = self.node_features_start.iloc[idx, 0]
        end_node_features = start_node_features + num_nodes
        #datatype of node_features is numpy.ndarray
        node_features = torch.tensor(self.node_features.iloc[start_node_features:end_node_features, :].values, dtype=torch.float32)

        # Extract edge features for the current graph
        start_edge_features = self.edge_features_start.iloc[idx, 0]
        end_edge_features = start_edge_features + num_edges
        edge_features = torch.tensor(self.edge_features.iloc[start_edge_features:end_edge_features, :].values, dtype=torch.float32)

        # Extract edges for the current graph
        edges = torch.tensor(self.edges.iloc[start_edge_features:end_edge_features, :].values, dtype=torch.long)

        # Construct the graph
        graph = Data(x=node_features, edge_index=edges.t().contiguous(), edge_attr=edge_features, y=label)
        return graph

In [11]:
import gzip,os
import pandas as pd
import torch
from torch_geometric.data import Data, Dataset, DataLoader

from torch_geometric.utils import one_hot

def read_gzipped_csv(file_path):
    #include the header
    df = pd.read_csv(gzip.open(file_path), header=None)
    return df


class MyDatasetTest(Dataset):
    def __init__(self, root, transform=None, pre_transform=None):
        super(MyDatasetTest, self).__init__(root, transform, pre_transform)
        dataset_path = os.path.join(root)

        # Load your dataset files here
        self.graph_labels = read_gzipped_csv(dataset_path +  '/graph_labels.csv.gz')
        self.num_nodes = read_gzipped_csv(dataset_path +  '/num_nodes.csv.gz')
        self.num_edges = read_gzipped_csv(dataset_path + '/num_edges.csv.gz')
        self.node_features = read_gzipped_csv(dataset_path +  '/node_features.csv.gz')
        self.edges = read_gzipped_csv(dataset_path +  '/edges.csv.gz')
        self.edge_features = read_gzipped_csv(dataset_path +  '/edge_features.csv.gz')
        #define a dataframe which stroes the start of node_features for each graph
        self.node_features_start = pd.DataFrame(columns=['start'])
        self.node_features_start.loc[0] = 0
        for i in range(1, len(self.num_nodes)):
            self.node_features_start.loc[i] = self.node_features_start.loc[i-1] + self.num_nodes.iloc[i-1, 0]
        #define a dataframe which stroes the start of edge_features for each graph
        self.edge_features_start = pd.DataFrame(columns=['start'])
        self.edge_features_start.loc[0] = 0
        for i in range(1, len(self.num_edges)):
            self.edge_features_start.loc[i] = self.edge_features_start.loc[i-1] + self.num_edges.iloc[i-1, 0]

        valid_indices = ~self.graph_labels.iloc[:, 0].isna()
        self.graph_labels = self.graph_labels[valid_indices]
        self.num_nodes = self.num_nodes[valid_indices]
        self.num_edges = self.num_edges[valid_indices]
        self.node_features_start = self.node_features_start[valid_indices]
        self.edge_features_start = self.edge_features_start[valid_indices]


    def len(self):
        return len(self.graph_labels)

    def get(self, idx):
        label = self.graph_labels.iloc[idx, 0]
        label = torch.tensor(self.graph_labels.iloc[idx, 0], dtype=torch.float32).round().long()
        #make label one dimensional
        label_shape = [1]
        labelo = torch.zeros(label_shape, dtype=torch.long)
        labelo[0] = label
        label = labelo

        #encode the label into one-hot vector

        num_nodes = self.num_nodes.iloc[idx, 0]
        num_edges = self.num_edges.iloc[idx, 0]

        # Extract node features for the current graph
        start_node_features = self.node_features_start.iloc[idx, 0]
        end_node_features = start_node_features + num_nodes
        #datatype of node_features is numpy.ndarray
        node_features = torch.tensor(self.node_features.iloc[start_node_features:end_node_features, :].values, dtype=torch.float32)

        # Extract edge features for the current graph
        start_edge_features = self.edge_features_start.iloc[idx, 0]
        end_edge_features = start_edge_features + num_edges
        edge_features = torch.tensor(self.edge_features.iloc[start_edge_features:end_edge_features, :].values, dtype=torch.float32)

        # Extract edges for the current graph
        edges = torch.tensor(self.edges.iloc[start_edge_features:end_edge_features, :].values, dtype=torch.long)

        # Construct the graph
        graph = Data(x=node_features, edge_index=edges.t().contiguous(), edge_attr=edge_features, y=label)
        return graph

In [12]:
train_dataset_path = "/home/slowblow/sem7/col761/ass-git/A3/dataset/dataset_2/train"
dataset = MyDatasetTrain(root=train_dataset_path)
dataset = dataset.shuffle()
dataset.transform = T.NormalizeFeatures()

In [14]:
class GAT(torch.nn.Module):
    def __init__(self):
        super(GAT, self).__init__()
        self.hid = 8
        self.in_head = 8
        self.out_head = 1
        
        
        self.conv1 = GATConv(dataset.num_features, self.hid, heads=self.in_head, dropout=0.6)
        self.conv2 = GATConv(self.hid*self.in_head, dataset.num_classes, concat=False,
                             heads=self.out_head, dropout=0.6)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
                
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index)
        
        return F.log_softmax(x, dim=1)
    
    
    
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = "cpu"

model = GAT().to(device)
data = dataset[0].to(device)


optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)

model.train()
for epoch in range(1000):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out, data.y)
    
    if epoch%200 == 0:
        print(loss)
    
    loss.backward()
    optimizer.step()

ValueError: Expected input batch_size (15) to match target batch_size (1).

In [46]:
for i in range(len(dataset)):
    data = dataset[i]
    print(data.edge_index)
    break

print(dataset.num_classes)


tensor([[0, 1, 1, 2, 1, 3, 3, 4, 4, 5, 5, 6, 6, 7, 6, 8, 8, 9, 9, 3],
        [1, 0, 2, 1, 3, 1, 4, 3, 5, 4, 6, 5, 7, 6, 8, 6, 9, 8, 3, 9]])
2


In [49]:
task = 'graph'
model = train(dataset, task, writer)



Epoch 0. Loss: 0.6954. Test accuracy: 0.5102
Epoch 10. Loss: 0.6939. Test accuracy: 0.5102
Epoch 20. Loss: 0.6936. Test accuracy: 0.4898
Epoch 30. Loss: 0.6937. Test accuracy: 0.4898
Epoch 40. Loss: 0.6937. Test accuracy: 0.4898
Epoch 50. Loss: 0.6935. Test accuracy: 0.5102
Epoch 60. Loss: 0.6938. Test accuracy: 0.4898
Epoch 70. Loss: 0.6932. Test accuracy: 0.5102
Epoch 80. Loss: 0.6939. Test accuracy: 0.5102


KeyboardInterrupt: 

In [50]:
test_path = "/home/slowblow/sem7/col761/ass-git/A3/dataset/dataset_2/valid"
test_dataset =  MyDatasetTest(root=test_path)
count1 =0
count0 =0

for g in test_dataset:
    if g.y.item()==0:
        count0+=1
    else:
        count1+=1

print(count1, count0)

test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True)

test_acc = test(test_loader, model)
print(test_acc)

252 536




0.6802030456852792


0.6802030456852792


In [42]:
import pandas as pd

# Create a sample DataFrame
data = {'Name': ['Alice', 'Bob', 'Charlie'],
        'Age': [25, 30, 35],
        'City': ['New York', 'San Francisco', 'Los Angeles']}

df = pd.DataFrame(data)

# Display the original DataFrame
print("Original DataFrame:")
print(df)

# Append a new row
new_row = {'Name': 'David', 'Age': 28, 'City': 'Seattle'}
df.loc[len(df)] = df.iloc[0]

# Display the DataFrame after appending
print("\nDataFrame after appending a new row:")
print(df)


Original DataFrame:
      Name  Age           City
0    Alice   25       New York
1      Bob   30  San Francisco
2  Charlie   35    Los Angeles

DataFrame after appending a new row:
      Name  Age           City
0    Alice   25       New York
1      Bob   30  San Francisco
2  Charlie   35    Los Angeles
3    Alice   25       New York
