# Startup Data class

In [1]:
import dgl
from dgl.data import DGLDataset
import torch
import os
import pandas as pd
import numpy as np
import dgl
# Torch device handle, fixed to the CPU.
device = torch.device('cpu')

In [58]:
class COMP4222Dataset(DGLDataset):
    """Single-graph dataset connecting startups to the investors that funded them.

    Node layout: startups occupy node ids ``[0, S)`` (row order of
    ``df_startups`` after filtering) and investors occupy ``[S, S + I)``
    (row order of ``df_investors``).  Every edge points from a startup
    (``funded_object_id``) to an investor (``investor_object_id``).

    Attributes set by :meth:`process`:
        df_startups, df_investors, df_investments: the loaded tables.
        startup_node: number of startup nodes.
        investor_node: number of investor nodes.
        investments_edge: number of (investor, startup) edges.
        graph: the assembled DGL graph with ``ndata['feat']``,
            ``ndata['label']`` (0 = startup, 1 = investor) and
            ``edata['feat']``.
    """

    def __init__(self):
        super().__init__(name='comp-4222')

    @staticmethod
    def _pad_columns(features, width):
        """Right-pad a 2-D array with zero columns until it has ``width`` columns."""
        missing = width - features.shape[1]
        if missing <= 0:
            return features
        return np.pad(features, [(0, 0), (0, missing)],
                      mode='constant', constant_values=0)

    def process(self):
        """Load the three CSV tables and build ``self.graph`` from them."""
        self.df_startups = pd.read_csv('./data/startups_formatted.csv')
        self.df_investors = pd.read_csv('./data/investors_formatted.csv')
        self.df_investments = pd.read_csv('./data/funding_round_formatted.csv')

        # Drop startups that never appear as a funding target.  Filtering on
        # the ``id`` column (instead of passing id values to ``drop`` as index
        # labels) does not depend on the index coinciding with the ids, and
        # avoids a quadratic list-membership scan.
        linked = self.df_startups.id.isin(self.df_investments.funded_object_id)
        self.df_startups = self.df_startups[linked]
        # NOTE: reset_index() without drop=True keeps the old index as a new
        # leading column; the ``iloc[:, 3:]`` feature slice below relies on it.
        self.df_startups = self.df_startups.reset_index()

        # Map each surviving startup id to its row position, which is its
        # node id in the graph.
        startup_id_to_node = dict(zip(self.df_startups.id.values,
                                      self.df_startups.index.values))
        # Investor node ids are shifted past the startup block.
        # Assumes investor_object_id is the 0-based row position in
        # df_investors -- TODO confirm against the data preparation step.
        self.df_investments['investor_object_id'] = (
            self.df_investments['investor_object_id'] + len(self.df_startups))
        self.df_investments["funded_object_id"] = (
            self.df_investments["funded_object_id"].replace(startup_id_to_node))

        # Collapse repeated (investor, startup) pairs into one edge whose
        # remaining columns are summed.
        self.df_investments = self.df_investments.groupby(
            ['investor_object_id', 'funded_object_id']).sum()
        self.df_investments = self.df_investments.reset_index()
        self.investments_edge = len(self.df_investments)

        self.startup_node = len(self.df_startups)
        self.investor_node = len(self.df_investors)

        # Fix the node count explicitly so nodes without any edge (e.g. the
        # highest-id investor) still exist and the feature tensors line up.
        self.graph = dgl.graph(
            (torch.tensor(self.df_investments.funded_object_id.values.tolist()),
             torch.tensor(self.df_investments.investor_object_id.values.tolist())),
            num_nodes=self.startup_node + self.investor_node)

        # Startup and investor tables have different numbers of feature
        # columns; zero-pad the narrower one so both fit in one tensor.
        startup_feat = self.df_startups.iloc[:, 3:].to_numpy()
        investor_feat = self.df_investors.iloc[:, 2:].to_numpy()
        width = max(startup_feat.shape[1], investor_feat.shape[1])
        self.graph.ndata['feat'] = torch.concat(
            (torch.tensor(self._pad_columns(startup_feat, width)),
             torch.tensor(self._pad_columns(investor_feat, width))))
        # 0 for startup, 1 for investor
        self.graph.ndata['label'] = torch.concat((torch.zeros(len(self.df_startups)),
                                                  torch.ones(len(self.df_investors))))

        # Every column except the identifiers is used as an edge feature.
        edge_feature = [i for i in self.df_investments.columns
                        if i not in ["funding_round_id", "funded_object_id", "investor_object_id"]]
        self.graph.edata['feat'] = torch.tensor(self.df_investments[edge_feature].to_numpy())

    def __getitem__(self, i):
        """Return the single graph; the index ``i`` is ignored."""
        return self.graph

    def __len__(self):
        """Return 1: the dataset holds exactly one graph."""
        return 1

# Instantiate the dataset and take its single graph; `g` and `graph` are two
# names for the same object.
dataset = COMP4222Dataset()
graph = g = dataset[0]

In [91]:
# Quick sanity check: edge count, length of the source-id tensor, and the
# largest destination node id, one per line.
u, v = g.edges()
print(g.num_edges(), len(u), torch.max(v), sep="\n")
# Bare expression so the notebook displays the graph summary.
g

45621
45621
tensor(25445)


Graph(num_nodes=25446, num_edges=45621,
      ndata_schemes={'feat': Scheme(shape=(221,), dtype=torch.float64), 'label': Scheme(shape=(), dtype=torch.float32)}
      edata_schemes={'feat': Scheme(shape=(26,), dtype=torch.float64)})

In [93]:
import scipy.sparse as sp

u, v = graph.edges()
# Give every edge an id, then shuffle the ids so the split is random.
eids = np.arange(graph.number_of_edges())
eids = np.random.permutation(eids)

# use 10% as test set
test_size = int(len(eids) * 0.1)
train_size = graph.number_of_edges() - test_size

test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

# Find all negative edges and split them for training and testing.
# The adjacency is assembled as a sparse matrix, but its complement below is
# dense: (max(u) + 1) x (max(v) + 1) float64 entries are held in memory.
n_src, n_dst = int(torch.max(u)) + 1, int(torch.max(v)) + 1
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())), shape=(n_src, n_dst))
# Non-zero entries mark pairs that are neither an existing edge nor (i, i).
adj_neg = 1 - adj.todense() - np.eye(n_src, n_dst)
neg_u, neg_v = np.where(adj_neg != 0)  # negative edge, we don't have edge

# Draw as many negatives as there are positive edges.  Sample without
# replacement so a pair cannot repeat or end up in both the train and test
# split; fall back to replacement only if there are too few candidates.
neg_eids = np.random.choice(len(neg_u), graph.number_of_edges(),
                            replace=len(neg_u) < graph.number_of_edges())
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]