## Final Project ML Course  Start Experiment with Full data

In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
tqdm.pandas()
import random
from itertools import islice
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [2]:
# Loading the CSV file to inspect its contents
file_path = '/lustre/acslab/users/3281/semmeddb_r42.csv'
df = pd.read_csv(file_path)

# Displaying the first few rows of the dataframe to understand its structure
df.head()

Unnamed: 0,date,pmid,sent_idx,ti_or_ab,pred_type,subj_ids,subj_names,subj_text,subj_type,obj_ids,obj_names,obj_text,obj_type
0,2020-05-26,32455440,3,ab,PROCESS_OF,C0005779,Blood Coagulation Disorders,Coagulopathy,dsyn,C0870668,hospitalized patients,Hospitalized Patients,humn
1,2020-05-19,32455215,11,ab,compared_with,C0063083,Hydrogel,hydrogel,orch,C0063083,Hydrogel,hydrogel,orch
2,2020-05-19,32455215,12,ab,INTERACTS_WITH,C0071703,polyvinyl alcohol hydrogel,PVA/BCP hydrogel,orch,C0162969,chitosan,CS,orch
3,2020-01-01,32455170,8,ab,PROCESS_OF,C5203670,COVID-19,COVID-19,dsyn,C0008059,Child,children,humn
4,2020-05-01,32455147,9,ab,PROCESS_OF,C5203670,COVID-19,coronavirus disease 2019,dsyn,C0030705,Patients,patients,humn


In [3]:
# dataframe.size
size = df.size
print("Size = {}".format(size))
count_row = df.shape[0]  # Gives number of rows
count_col = df.shape[1]  # Gives number of columns
print(count_col)
print(count_row)

Size = 1344486819
13
103422063


### Data PreProcessing to Drop Duplicate 
## Step 1: select only two column from .csv file into data frame (103M)

In [4]:
# Selecting only the 'subj_ids' and 'obj_ids' columns
edge_list = df[['subj_ids', 'obj_ids']]

# Checking the first few rows of the edge list
edge_list

Unnamed: 0,subj_ids,obj_ids
0,C0005779,C0870668
1,C0063083,C0063083
2,C0071703,C0162969
3,C5203670,C0008059
4,C5203670,C0030705
...,...,...
103422058,C0010674,C0027361
103422059,C5203670,C0027361
103422060,C0006826,C0030705
103422061,C0032105,C1999216


### 1.1 save .pkl file for edge_list with 2 column(103M) drop dupplicate
NOTE: the number of rows has reduced from 103M into 16M into 14M.

In [6]:
# pd.to_pickle(edge_list, "/lustre/acslab/users/3281/fullsemmeddb_matrix.pkl")
edge_list= pd.read_pickle("fullsemmeddb_matrix.pkl")
edge_list_reduced = edge_list.drop_duplicates()
edge_list_reduced

Unnamed: 0,subj_ids,obj_ids
0,C0005779,C0870668
1,C0063083,C0063083
2,C0071703,C0162969
3,C5203670,C0008059
4,C5203670,C0030705
...,...,...
103422025,C0537969,C0057445
103422027,C1412056|17,G0000145
103422029,C1412056|17,C0057445
103422030,C1148523,C0038952


In [8]:
# pd.to_pickle(edge_list_reduced,"fullsemmeddb_matrix_reduced_dropped.pkl")
edge_list_reduced_dropped= pd.read_pickle("fullsemmeddb_matrix_reduced_dropped.pkl")
edge_list_reduced_dropped

Unnamed: 0,subj_ids,obj_ids
0,C0005779,C0870668
1,C0063083,C0063083
2,C0071703,C0162969
3,C5203670,C0008059
4,C5203670,C0030705
...,...,...
103422025,C0537969,C0057445
103422027,C1412056|17,G0000145
103422029,C1412056|17,C0057445
103422030,C1148523,C0038952


### 1.3 drop duplicate and convert EdgeList into set (14M)

In [9]:
reduced_dropped_edge_set = set([tuple(sorted(x)) for x in tqdm(edge_list_reduced_dropped.to_numpy())])

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16322866/16322866 [00:22<00:00, 714144.63it/s]


In [10]:
len(reduced_dropped_edge_set)

14661211

In [12]:
reduced_dropped_edge_set_list = list(reduced_dropped_edge_set)
print(reduced_dropped_edge_set_list[:5])

[('80184', 'C0162610'), ('C0004096', 'C0231184'), ('C0284778', 'C0597295'), ('C0025255', 'C0536622'), ('C0015967', 'C0059020')]


## step 2: Generate Node_list out of Edge_list:

In [45]:
nodes = set()

# Iterate through each edge and add the nodes to the set
for edge in list_from_set:
    nodes.update(edge)

# Convert the set of unique nodes to a list
node_list = list(nodes)

In [13]:
def generate_node_list(edge_list):
    # Using a set for efficiency as it will automatically handle duplicates
    node_set = set()
    for subj_id, obj_id in edge_list:
        node_set.add(subj_id)
        node_set.add(obj_id)
    return list(node_set)

In [15]:
node_list = generate_node_list(reduced_dropped_edge_set_list)

In [16]:
node_list_list=list(node_list)

In [17]:
len(node_list_list)

304525

In [18]:
node_list_list[:10]

['C0053269',
 'C0223616',
 'C0701237',
 'C0024834',
 'C1077098',
 'C0212295',
 'C0665238',
 'C1085199',
 'C0215917',
 'C0123496']

## Step 3: Negative Sampling (My own Impl)
Note: Number of Nodes: 304K

In [19]:
def negative_sampling(node_list, edge_set, num_samples):
    negative_samples = set()
    pbar = tqdm(total=num_samples)

    while len(negative_samples) < num_samples:
        # Randomly select two nodes
        node1, node2 = random.sample(node_list, 2)

        # Form a tuple (considering undirected graph, sort the tuple)
        edge = tuple(sorted((node1, node2)))

        # Check if this edge is not in the graph
        if edge not in edge_set:
            negative_samples.add(edge)
            pbar.update(1)

    pbar.close() 
    return negative_samples

In [110]:
tqdm._instances.clear() # reset library itself

In [23]:
# Number of positive samples (existing edges)
num_samples = 100000

# Generating negative samples
negative_samples = negative_sampling(node_list_list ,reduced_dropped_edge_set, num_samples)

# Checking the first few negative samples
negative_samples_list = list(negative_samples)
print(negative_samples_list[:5])

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:00<00:00, 220688.73it/s]

[('C0246044', 'C1082932'), ('C0060142', 'C1021073'), ('C0128631', 'C0667860'), ('C0153661', 'C0381193'), ('C0520974', 'C1452025')]





## Step4: Convert data frame to suitable data format for PyTorch Geometric
 4.1. Process data: Usually it is a list of edge tuples or an edge index tensor. 

 4.2. Create a Graph: In order to transform your tabular data into a format that PyTorch Geometric can work with.

In [25]:
import torch
from torch_geometric.data import Data
from torch_geometric.utils import to_undirected

def convert_to_pyg_format(node_list, edge_list):
    # Mapping from node IDs to consecutive integers
    node_id_to_index = {node_id: i for i, node_id in enumerate(node_list)}

    # Convert edge list to edge_index format
    edge_index = [[node_id_to_index[edge[0]], node_id_to_index[edge[1]]] for edge in edge_list]
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()

    # Create PyTorch Geometric Data object
    undirected_edge_index = to_undirected(edge_index)
    data = Data(edge_index=undirected_edge_index)

    return data

# Convert your edge list to PyTorch Geometric format
pyg_data = convert_to_pyg_format(node_list_list, reduced_dropped_edge_set_list)  # Replace 'node_list' and 'edge_list' with your actual data
print(pyg_data.edge_index)
print(pyg_data.edge_index.shape)

tensor([[     0,      0,      0,  ..., 304524, 304524, 304524],
        [   335,   5920,  12284,  ..., 301898, 302816, 304524]])
torch.Size([2, 29304764])


In [52]:
# import GPUtil
# import pandas as pd
# import torch
# from torch_geometric.data import Data
# from torch_geometric.utils import to_undirected

# # node_list_list= 304525
# # reduced_dropped_edge_set = 14661211
# # PYG -> edge_index[2,14661211]
# # Create a mapping from node identifiers to indices based on the node list
# node_to_idx = {node: idx for idx, node in enumerate(node_list_list)}

# # Create edge index array
# source_nodes = []
# target_nodes = []
# for source, target in reduced_dropped_edge_set:
#     source_idx = node_to_idx.get(source)
#     target_idx = node_to_idx.get(target)
#     if source_idx is not None and target_idx is not None:
#         source_nodes.append(source_idx)
#         target_nodes.append(target_idx)

# edge_index = [source_nodes, target_nodes]

# # edge_index is a list of lists to a tensor
# edge_index_tensor = torch.tensor(edge_index, dtype=torch.long)
# print(edge_index_tensor.shape)
# undirected_edge_index = to_undirected(edge_index_tensor)

# # Creating the graph data object
# graph_data = Data(edge_index=undirected_edge_index)

# print(graph_data.edge_index)
# print(graph_data.edge_index.shape)

torch.Size([2, 14661211])
tensor([[     0,      0,      0,  ..., 304524, 304524, 304524],
        [  1700,   3935,   4440,  ..., 264752, 271693, 289986]])
torch.Size([2, 29304764])


In [26]:
pyg_data

Data(edge_index=[2, 29304764])

## Step5. Train Node2vec Model via Pytorch_geometric

In [28]:
from torch_geometric.nn import Node2Vec
import sys

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Example initialization of Node2Vec
model = Node2Vec(
    pyg_data.edge_index, 
    embedding_dim=128, 
    walk_length=20, 
    context_size=10, 
    walks_per_node=10
).to(device)

loader = model.loader(batch_size=128, shuffle=True, num_workers=10)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [29]:
model

Node2Vec(304525, 128)

In [30]:
device

device(type='cuda')

In [32]:
def train():
    model.train()
    total_loss = 0
    for pos_rw, neg_rw in loader:
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

for epoch in range(1, 10):
    loss = train()
    # print(f"VRAM Usage: {GPUtil.getGPUs()[0].memoryUsed / 1024} GB")
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

Epoch: 001, Loss: 1.0287
Epoch: 002, Loss: 0.9588
Epoch: 003, Loss: 0.9546
Epoch: 004, Loss: 0.9522
Epoch: 005, Loss: 0.9522
Epoch: 006, Loss: 0.9522
Epoch: 007, Loss: 0.9520
Epoch: 008, Loss: 0.9519
Epoch: 009, Loss: 0.9518


In [33]:
node2vec_matrix = model().detach().cpu().numpy()

In [34]:
node2vec_matrix.shape

(304525, 128)

In [35]:
pd.to_pickle(node2vec_matrix, "/lustre/acslab/users/3281/finalOutputNode2vec.pkl")

In [36]:
last_df= pd.read_pickle("finalOutputNode2vec.pkl")
last_df

array([[ 0.21238802,  0.02722863,  0.00776974, ..., -0.09622791,
        -0.21466248,  0.3546657 ],
       [-0.14814717, -0.17625654,  0.12135533, ..., -0.10352099,
         0.04301796,  0.0055915 ],
       [-0.15631212, -0.14544198,  0.336963  , ...,  0.44662377,
        -0.5391678 , -0.34468433],
       ...,
       [ 0.04317998,  0.30955973, -0.02088119, ..., -0.49367446,
        -0.05465305,  0.1421927 ],
       [-0.03599167,  0.31727865,  0.31988922, ..., -0.21867618,
        -0.6076326 , -0.05904901],
       [ 0.03095075,  0.03636742,  0.65714175, ..., -0.1952335 ,
        -0.28922018, -0.47379526]], dtype=float32)

In [37]:
rows = last_df.shape[0]  
cols = last_df.shape[1]
print(rows)
print(cols)

304525
128


### 5.1 Construct a dict:
1. Construct a dict to make sure each node in the Node2vec matrix has a key as node id and a value as Embedding numeric value to it.

In [38]:
# Create a dictionary of node embeddings
node_embeddings = {node_id: last_df[idx] for idx, node_id in enumerate(node_list_list)}

In [39]:
n = 5  # Number of rows you want to print
for node, embedding in islice(node_embeddings.items(), n):
    print(node, embedding)

C0053269 [ 0.21238802  0.02722863  0.00776974  0.466039   -0.11250183  0.12953442
 -0.38497186 -0.11452956 -0.1948966   0.2200725   0.20205154 -0.21191932
 -0.321637   -0.15512273 -0.37812886  0.12310097 -0.01978174 -0.1865594
 -0.18570437  0.10294246  0.02592673  0.22009815  0.15213707 -0.16139553
  0.09968509 -0.06570107  0.14612779 -0.31390426 -0.02118848 -0.18083677
 -0.26682422 -0.14610083 -0.15425223  0.22996457  0.04095507  0.4701284
  0.07806695 -0.08702004  0.12824197  0.12546173  0.12858115  0.400321
 -0.05202904  0.30758208 -0.24869591 -0.04712734 -0.02708614 -0.35951552
  0.3705164  -0.23846485  0.23758046 -0.20976718  0.16982953 -0.0483625
 -0.10890674  0.24424765  0.05286938  0.5441014  -0.09961157 -0.03378319
  0.07518504 -0.07380659 -0.03279862  0.09896362 -0.09306476 -0.156255
  0.18193838 -0.12922424 -0.08696769 -0.14856674  0.5145642   0.40867805
 -0.34702814 -0.00406679  0.497165   -0.97311354  0.00650492 -0.05465551
  0.28371212 -0.2782382  -0.02558846  0.18125851 

In [41]:
print("First few positive samples:", list(reduced_dropped_edge_set)[:5])
print("First few negative samples:", list(negative_samples)[:5])

First few positive samples: [('80184', 'C0162610'), ('C0004096', 'C0231184'), ('C0284778', 'C0597295'), ('C0025255', 'C0536622'), ('C0015967', 'C0059020')]
First few negative samples: [('C0246044', 'C1082932'), ('C0060142', 'C1021073'), ('C0128631', 'C0667860'), ('C0153661', 'C0381193'), ('C0520974', 'C1452025')]


## Step6: run Hadamard fuction (element wise multiplication) to generate edge feature. 

In [42]:
# Function to calculate Hadamard product
def hadamard(v1, v2):
    return np.multiply(v1, v2)

# Function to generate edge features
def generate_edge_features(samples, node_embeddings):
    edge_features = []
    for node1, node2 in tqdm(samples):
        vector1 = node_embeddings.get(node1)
        vector2 = node_embeddings.get(node2)
        
        
        if vector1 is not None and vector2 is not None:
            edge_feature = hadamard(vector1, vector2)
            edge_features.append(edge_feature)
        else:
            # Handle missing vectors if necessary
            pass
    return edge_features

# Assuming you have lists of positive and negative edge samples

positive_edge_features = generate_edge_features(
    list(random.sample(reduced_dropped_edge_set, k=100000)), node_embeddings
)
negative_edge_features = generate_edge_features(negative_samples, node_embeddings)
print("Length positive samples:", len(positive_edge_features))
print("Length negative samples:", len(negative_samples))

since Python 3.9 and will be removed in a subsequent version.
  list(random.sample(reduced_dropped_edge_set, k=100000)), node_embeddings
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:00<00:00, 238895.17it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:00<00:00, 322975.16it/s]

Length positive samples: 100000
Length negative samples: 100000





### 6.1 Assign Label into your samples: 1 for positive samples and 0 for negative samples.
And combine the both negative and positive samples. 

In [43]:
# Assuming positive_edge_features and negative_edge_features are your feature lists

# Assign label 1 to positive samples
labeled_positive_samples = [(feature, 1) for feature in positive_edge_features]

# Assign label 0 to negative samples
labeled_negative_samples = [(feature, 0) for feature in negative_edge_features]

# Combine labeled samples into one dataset
labeled_samples = labeled_positive_samples + labeled_negative_samples

### 6.2 Convert labeled samples to a DataFrame

In [44]:
# Convert the labeled samples to a DataFrame
last_supper_df = pd.DataFrame(labeled_samples, columns=['Edge_Features', 'Label'])
last_supper_df

Unnamed: 0,Edge_Features,Label
0,"[0.0009842808, 0.00616581, -0.0021372624, -0.0...",1
1,"[0.036738425, 0.00509312, 0.47021523, -0.05719...",1
2,"[-0.020338194, 0.03644145, 0.89394176, -0.0518...",1
3,"[-0.0031132963, -0.045420375, -0.037223656, -0...",1
4,"[-0.0013758815, -0.07764459, -0.00069168455, 0...",1
...,...,...
199995,"[-0.04475953, 0.03884531, 0.06962857, 0.049980...",0
199996,"[-0.0160331, -0.047917705, -0.2279184, -0.0038...",0
199997,"[0.047991853, 0.019214293, -0.01874067, 0.0064...",0
199998,"[-0.086633354, -0.07048315, -0.036854967, -0.0...",0


### 6.3. Save tabular data as a .pkl file

In [45]:
last_supper_df['Edge_Features']

0         [0.0009842808, 0.00616581, -0.0021372624, -0.0...
1         [0.036738425, 0.00509312, 0.47021523, -0.05719...
2         [-0.020338194, 0.03644145, 0.89394176, -0.0518...
3         [-0.0031132963, -0.045420375, -0.037223656, -0...
4         [-0.0013758815, -0.07764459, -0.00069168455, 0...
                                ...                        
199995    [-0.04475953, 0.03884531, 0.06962857, 0.049980...
199996    [-0.0160331, -0.047917705, -0.2279184, -0.0038...
199997    [0.047991853, 0.019214293, -0.01874067, 0.0064...
199998    [-0.086633354, -0.07048315, -0.036854967, -0.0...
199999    [0.041274898, -0.013258298, -0.006710299, 0.01...
Name: Edge_Features, Length: 200000, dtype: object

In [46]:
matrix = np.array(last_supper_df['Edge_Features'].tolist())

In [47]:
pd.to_pickle(matrix, "/lustre/acslab/users/3281/tabulardata_x200000.pkl")

In [48]:
matrix.shape

(200000, 128)

In [49]:
last_supper_df['Label']

0         1
1         1
2         1
3         1
4         1
         ..
199995    0
199996    0
199997    0
199998    0
199999    0
Name: Label, Length: 200000, dtype: int64

In [50]:
matrix_y = np.array(last_supper_df['Label'].tolist())

In [51]:
matrix_y.shape

(200000,)

In [52]:
pd.to_pickle(matrix_y, "/lustre/acslab/users/3281/tabulardata_Y200000.pkl")

## Step7. SkLearn Logestic Regression

In [53]:
X_train, X_test, y_train, y_test = train_test_split(
     matrix, matrix_y, test_size=0.33, random_state=42)

In [54]:
clf = LogisticRegression(random_state=0).fit(X_train, y_train)

In [55]:
clf.predict(X_test)

array([0, 1, 0, ..., 1, 0, 0])

In [56]:
roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])

0.9772517975372892