<a href="https://colab.research.google.com/github/simpleParadox/Private-RE/blob/main/project_622.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers --quiet
!pip install pyvacy --quiet

[K     |████████████████████████████████| 5.3 MB 28.9 MB/s 
[K     |████████████████████████████████| 7.6 MB 48.8 MB/s 
[K     |████████████████████████████████| 163 kB 28.1 MB/s 
[?25h  Building wheel for pyvacy (setup.py) ... [?25l[?25hdone


In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import torch.nn.functional as F
import torch.optim as optim

# Import the transformers library for retrieving the BERT embeddings.
import transformers
from transformers import BertModel, BertTokenizer


# Import pyvacy for privacy preserving optimizers.
from pyvacy import optim as private_optim, analysis

# Import scikit-learn packages.
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.utils import gen_batches


# Import scientific computing python packages.
import pandas as pd
import numpy as np      
import matplotlib.pyplot as plt

# Additional packages.
from google.colab import drive
from tqdm import tqdm
import csv
from typing import List


# Pick the compute device: the first CUDA GPU when one is visible, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

## Read in Erin's tabular data and preprocess it.

In [9]:
# Mount Google Drive at /content/drive (Colab only); the data paths used in this notebook live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Tab-separated relation data on the mounted Drive.
# The complete data set sits in the same folder as 'Input_all_29_relation.tsv'.
relations_path = '/content/drive/MyDrive/CMPUT 622 project/data/tabular_data/Input_500_29_relation.tsv'

# The file is UTF-8 encoded with tab-delimited columns.
train_data = pd.read_csv(relations_path, sep='\t', encoding='utf-8')

# Report how many rows were loaded.
print(len(train_data))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
589617


In [None]:
# Count missing values per column in the freshly loaded table.
train_data.isnull().sum()

table_caption          564370
table_section_title      8792
headers                   482
entity pair                 0
relation                    0
dtype: int64

In [None]:
# Preview the first rows of the table.
train_data.head()

Unnamed: 0,table_caption,table_section_title,headers,entity pair,relation
0,Recipients,Recipients,Name of the recipient,Nishan-e-Haider Saif Ali Janjua Saif Ali Janj...,award.award_nominated_work.award_nominations.....
1,Recipients,Recipients,Name of the recipient,Nishan-e-Haider Raja Muhammad Sarwar Raja Muh...,award.award_nominated_work.award_nominations.....
2,Recipients,Recipients,Name of the recipient,Nishan-e-Haider Tufail Mohammad Tufail Mohammad,award.award_nominated_work.award_nominations.....
3,Recipients,Recipients,Name of the recipient,Nishan-e-Haider Raja Aziz Bhatti Raja Aziz Bh...,award.award_nominated_work.award_nominations.....
4,Recipients,Recipients,Name of the recipient,Nishan-e-Haider Rashid Minhas Rashid Minhas,award.award_nominated_work.award_nominations.....


In [None]:
# Missing cells become empty strings so that every row can be joined into one text below.
train_data.fillna("", inplace=True)

# Fixed-seed shuffle, so that train and test data are more likely to come from the same distribution.
train_data = shuffle(train_data, random_state=1)

# The last column is the relation; all preceding columns are joined into one sentence per row.
labels = train_data.iloc[:, -1].values
sentences = [
    ' '.join(row).strip()
    for row in train_data.iloc[:, :-1].values.tolist()
]

# Encode the relation names as integer class ids and keep the id -> name lookup.
label = preprocessing.LabelEncoder()
y = label.fit_transform(train_data['relation'])
integer_mapping = dict(enumerate(label.classes_))
label_mappings = integer_mapping

In [None]:
# Show the first two rows of the table as it currently stands.
train_data[:2]

Unnamed: 0,table_caption,table_section_title,headers,entity pair,relation
580310,,Scorers and assistants,Position Name,DF Marko Lomić Marko Lomić,soccer.football_position.players
128294,,Short films,Title Genre,Bramadero Bramadero Erotic,film.film.genre


In [None]:
# Re-count missing values per column; the recorded output shows zero for every column.
train_data.isnull().sum()

table_caption          0
table_section_title    0
headers                0
entity pair            0
relation               0
dtype: int64

## **Read Sentence-level Data**

In [3]:
# The train and test text files share one folder on the mounted Drive.
_semeval_dir = '/content/drive/MyDrive/CMPUT 622 project/data/semeval'
train_directory_path = _semeval_dir + '/train.txt'
test_directory_path = _semeval_dir + '/test.txt'

In [4]:
# Relation names in a fixed order. Despite the variable name, this is an
# id -> relation list: the position of a name in the list is its integer
# class id (convertText_csv looks the id up with `.index`).
relation_to_id = [
    "other",
    "Entity-Destination(e1,e2)",
    "Cause-Effect(e2,e1)",
    "Member-Collection(e2,e1)",
    "Entity-Origin(e1,e2)",
    "Message-Topic(e1,e2)",
    "Component-Whole(e2,e1)",
    "Component-Whole(e1,e2)",
    "Instrument-Agency(e2,e1)",
    "Product-Producer(e2,e1)",
    "Content-Container(e1,e2)",
    "Cause-Effect(e1,e2)",
    "Product-Producer(e1,e2)",
    "Content-Container(e2,e1)",
    "Entity-Origin(e2,e1)",
    "Message-Topic(e2,e1)",
    "Instrument-Agency(e1,e2)",
    "Member-Collection(e1,e2)",
    "Entity-Destination(e2,e1)",
]

In [5]:
def convertText_csv(path):
  """Parse a tab-separated relation file into rows ready for writeOutput.

  Each non-blank line must hold four tab-separated fields in this order:
  entity1, entity2, relation name, sentence. The `<e1>`, `</e1>`, `<e2>` and
  `</e2>` markers are removed from the sentence, and the relation name is
  replaced by its position in the module-level `relation_to_id` list.

  Args:
    path: path of the UTF-8 text file to read.

  Returns:
    A list of `[sentence, entity1, entity2, relation_id]` lists, in file order.

  Raises:
    IndexError: if a non-blank line has fewer than four fields.
    ValueError: if a relation name is not present in `relation_to_id`.
  """
  output = []

  with open(path, encoding='utf-8') as file:
    lines = file.read().splitlines()

  for line in lines:
    line = line.strip()
    if not line:
      # Blank lines (e.g. a trailing newline or separators) carry no example.
      continue

    fields = line.split(sep="\t")
    entity1 = fields[0]
    entity2 = fields[1]
    relation = relation_to_id.index(fields[2])
    sentence = fields[3]

    # Drop the inline entity markers; the entities are kept as separate fields.
    for marker in ('<e1>', '<e2>', '</e1>', '</e2>'):
      sentence = sentence.replace(marker, '')

    output.append([sentence, entity1, entity2, relation])
  return output

In [6]:
def writeOutput(output, path):
  """Write example rows to a tab-separated file with a fixed header.

  The header row is `sentence, entity1, entity2, relation`. The file is
  written as UTF-8 because it is read back with `encoding='utf-8'`.

  Args:
    output: iterable of rows, each an iterable of four values.
    path: destination file path; an existing file is overwritten.
  """
  # newline='' lets the csv module control line endings itself.
  with open(path, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerow(["sentence", "entity1", "entity2", "relation"])
    writer.writerows(output)

In [10]:
def loadSemEvalDateset(inputFilename, outputFilename):
  """Convert a raw relation file to TSV, reload it and build model inputs.

  The raw file is parsed with `convertText_csv`, written to `outputFilename`
  with `writeOutput`, and read back with pandas. Row order is preserved.

  Args:
    inputFilename: path of the raw tab-separated text file.
    outputFilename: path of the TSV file to (over)write.

  Returns:
    A tuple `(sentences, labels)`: `sentences` is a list with one string per
    row made of the sentence, entity1 and entity2 joined by spaces; `labels`
    is the array of values from the last (`relation`) column.
  """
  writeOutput(convertText_csv(inputFilename), outputFilename)

  # Force the text columns to stay strings: without this, pandas would turn an
  # entity such as "911" into a number or "NA" into NaN, and the join below
  # would fail with a TypeError.
  data = pd.read_csv(
      outputFilename,
      encoding='utf-8',
      sep='\t',
      dtype={'sentence': str, 'entity1': str, 'entity2': str},
      keep_default_na=False,
  )

  # Show the first row as a quick sanity check.
  print(data[:1])

  labels = data.iloc[:, -1].values
  features = data.iloc[:, :-1].values.tolist()
  sentences = [' '.join(row).strip() for row in features]
  return sentences, labels

In [11]:
# Convert the training file to train.tsv and load its sentences and labels.
x_train, y_train = loadSemEvalDateset(train_directory_path, "train.tsv")

                                            sentence        entity1   entity2  \
0  The system as described above has its greatest...  configuration  elements   

   relation  
0         6  


In [12]:
# First training example: the sentence followed by the two entity strings.
x_train[0]

'The system as described above has its greatest application in an arrayed  configuration  of antenna  elements  . configuration elements'

In [13]:
# Convert the test file to test.tsv and load its sentences and labels.
x_test, y_test = loadSemEvalDateset(test_directory_path, "test.tsv")

                                            sentence entity1 entity2  relation
0  The most common  audits  were about  waste  an...  audits   waste         5


In [14]:
# NOTE(review): the original note suspected the train and test files hold the same data.
# The recorded first examples (x_train[0] vs x_test[0]) differ, so the files are not identical;
# whether they overlap has not been checked in this notebook — verify.
x_test[0]

'The most common  audits  were about  waste  and recycling . audits waste'

In [15]:
# Number of test labels.
y_test.shape

(2717,)

In [16]:
# Convert the training labels to a tensor and display it.
# NOTE(review): y_test is not converted in any cell visible here — confirm whether it should be.
y_train = torch.tensor(y_train)
y_train

tensor([6, 0, 8,  ..., 1, 0, 9])

## Initialize the pretrained BERT model (uncased) and the respective tokenizer.

### NOTE: We might need to tokenize and encode everything before running the model.

### Get BERT embeddings

In [None]:
def get_bert_embeds_from_tokens(bert_model, encoded_inputs):
    """Run already-tokenized inputs through the model and collect hidden states.

    Args:
        bert_model: model called as `bert_model(**encoded_input)` whose output
            supports `outputs['last_hidden_state']`; it is moved to `device`.
        encoded_inputs: sequence of tokenizer outputs; each must provide
            `.to(device)` and be unpackable with `**`.

    Returns:
        A list with one NumPy array per input: the `last_hidden_state` tensor
        moved to the CPU with singleton dimensions squeezed out.
    """
    all_bert_embeds = []
    bert_model = bert_model.to(device)

    # Embedding extraction is inference only: disabling autograd avoids building
    # a computation graph for every forward pass, which saves memory and time.
    with torch.no_grad():
        for encoded_input in tqdm(encoded_inputs):
            encoded_input = encoded_input.to(device)
            outputs = bert_model(**encoded_input)
            # Only the per-token hidden states are needed; pooler_output is ignored.
            hidden_states = outputs['last_hidden_state']
            all_bert_embeds.append(np.squeeze(hidden_states.cpu().detach().numpy()))
    return all_bert_embeds

In [None]:
def bert_tokenize(texts, tokenizer, max_length=50):
    """Tokenize each text separately, padded/truncated to a fixed length.

    Args:
        texts: sequence of strings to tokenize.
        tokenizer: callable invoked once per text with `return_tensors='pt'`,
            `padding="max_length"`, `max_length=max_length`, `truncation=True`.
        max_length: length every encoding is padded or truncated to
            (default 50, the value previously hard-coded here).

    Returns:
        A list with the tokenizer's result for each text, in input order.
    """
    return [
        tokenizer(
            text,
            return_tensors='pt',
            padding="max_length",
            max_length=max_length,
            truncation=True,
        )
        for text in tqdm(texts)
    ]

### Putting the tokenizer into a function.

In [None]:
def bert_tokenize_and_get_embeds(texts, bert_model, tokenizer, max_length=50):
    """Tokenize each text and immediately compute its hidden states.

    Args:
        texts: sequence of strings.
        bert_model: model called as `bert_model(**encoded_input)` whose output
            supports `outputs['last_hidden_state']`; it is moved to `device`.
        tokenizer: callable invoked once per text; its result must provide
            `.to(device)` and be unpackable with `**`.
        max_length: length every encoding is padded or truncated to
            (default 50, the value previously hard-coded here).

    Returns:
        A list with one NumPy array per text: the `last_hidden_state` tensor
        moved to the CPU with singleton dimensions squeezed out.
    """
    all_embeds = []
    bert_model = bert_model.to(device)

    # Inference only: no autograd graph is needed for embedding extraction.
    with torch.no_grad():
        for text in tqdm(texts):
            encoded_input = tokenizer(
                text,
                return_tensors='pt',
                padding="max_length",
                max_length=max_length,
                truncation=True,
            ).to(device)
            outputs = bert_model(**encoded_input)
            hidden_states = outputs['last_hidden_state']
            all_embeds.append(np.squeeze(hidden_states.cpu().detach().numpy()))

    return all_embeds

### Define the BertTokenizer and the BertModel from the transformers library.

In [None]:
# Load the pretrained 'bert-base-uncased' checkpoint: the tokenizer (right-padded,
# maximum length 50) and the encoder model.
bert_tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    padding_side='right',
    model_max_length=50,
)
bert_model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Encode the inputs and store them so that we don't have to re-encode them every time we run the model.

In [None]:
# Split the sentences and labels, embed the training split with BERT, and cache
# the embeddings on disk so they do not have to be recomputed on every run.
seeds = [0]
all_train_last_hidden_states = []
all_test_last_hidden_states = []
# NOTE: Colab runs out of memory on the full data set. If that happens, process the
# texts in batches (e.g. with sklearn's gen_batches) and save each batch separately,
# or move to a machine with more memory.
for seed in seeds:
    X_train_texts, X_test_texts, y_train_classes, y_test_classes = train_test_split(
        sentences, y, random_state=seed, test_size=0.2
    )

    # Tokenize, then embed. get_bert_embeds_from_tokens returns only the
    # last_hidden_state array of each input.
    train_tokens = bert_tokenize(X_train_texts, bert_tokenizer)
    last_hidden_states_train = get_bert_embeds_from_tokens(bert_model, train_tokens)

    # The tokens are no longer needed once the embeddings exist.
    del train_tokens

    # Store the embeddings (previously the tokens were stored here by mistake).
    all_train_last_hidden_states.append(last_hidden_states_train)

    # Save inside the loop so that every seed gets its own file. Wrapping the
    # embeddings in a one-element list keeps the `['arr_0'][0]` layout.
    np.savez_compressed(f"train_embeds_seed_{seed}.npz", [last_hidden_states_train])

    # NOTE(review): the test split is not embedded or saved here, so
    # all_test_last_hidden_states stays empty and no test_embeds_seed_*.npz
    # file is produced by this cell — confirm whether that is intended.

100%|██████████| 471693/471693 [05:19<00:00, 1475.02it/s]
 11%|█         | 52793/471693 [27:33<4669:47:01, 40.13s/it]

In [None]:
# NOTE(review): this loop never terminates, so with "Run All" no later cell would
# ever execute. It is presumably a manual keep-alive for the Colab session, so it
# is now opt-in and throttled instead of printing in a tight loop.
KEEP_SESSION_ALIVE = False  # set to True only when you deliberately want the loop

if KEEP_SESSION_ALIVE:
    import time

    while True:
        print("hello world")
        time.sleep(60)

In [None]:
# Load a cached .npz file and take the first entry of its 'arr_0' array.
# NOTE(review): no cell visible in this notebook writes test_embeds_seed_0.npz
# (the test save is commented out) — confirm where this file comes from.
# NOTE(review): allow_pickle=True can execute arbitrary code while loading; only
# use it on files you created yourself.
data = np.load("/content/test_embeds_seed_0.npz", allow_pickle=True)['arr_0'][0]

In [None]:
# Inspect the shape of the loaded array; the recorded output was (2, 50, 768).
data.shape

(2, 50, 768)

## Model definition and training




### Implement the model

In [None]:
class erin_model(nn.Module):
    """Sequence classifier: a single LSTM layer followed by a linear layer.

    The per-time-step LSTM outputs are flattened and passed to the linear
    layer, so inputs must always have exactly `seq_len` time steps.

    Args:
        input_size: number of features per time step (e.g. 768 for the
            BERT-base hidden states used in this notebook).
        hidden_size: number of LSTM hidden units (default 1, as before).
        seq_len: number of time steps per example (default 50, the
            `max_length` used by the tokenizer calls in this notebook).
        num_classes: number of output classes (default 19, the length of
            `relation_to_id`; pass a different value for other label sets).
    """

    def __init__(self, input_size: int, hidden_size: int = 1, seq_len: int = 50,
                 num_classes: int = 19):
        super(erin_model, self).__init__()

        # batch_first=True: inputs are shaped (batch, seq_len, input_size).
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=1, batch_first=True)
        self.linear = nn.Linear(seq_len * hidden_size, num_classes)

    def forward(self, x):
        """Return unnormalized class scores (logits) of shape (batch, num_classes).

        No softmax is applied: nn.CrossEntropyLoss expects raw logits and
        applies log-softmax itself. Use `argmax` (or `softmax`) on the result
        to obtain predictions (or probabilities).
        """
        # nn.LSTM returns (output, (h_n, c_n)); only the per-step output is used.
        lstm_out, _ = self.lstm(x)
        flat = lstm_out.reshape(lstm_out.size(0), -1)
        return self.linear(flat)

### Convert numpy to PyTorch TensorDataset and then into DataLoader

In [None]:
# Wrap the train and test data (features and labels separately) in
# TensorDatasets and DataLoaders.
# Labels are kept as integer class indices: nn.CrossEntropyLoss accepts class
# indices directly, so no one-hot encoding is needed.
batch_size = 16

# TODO: x_data_train, y_data_train, x_data_test and y_data_test must be defined
# before this cell runs (features as numeric arrays, labels as integer class ids).
tensor_x_train = torch.as_tensor(np.asarray(x_data_train), dtype=torch.float32)
tensor_y_train = torch.as_tensor(np.asarray(y_data_train), dtype=torch.long)

tensor_x_test = torch.as_tensor(np.asarray(x_data_test), dtype=torch.float32)
tensor_y_test = torch.as_tensor(np.asarray(y_data_test), dtype=torch.long)

train_dataset = TensorDataset(tensor_x_train, tensor_y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size)

test_dataset = TensorDataset(tensor_x_test, tensor_y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

### Implement training loop

In [None]:
# Train the classifier with pyvacy's differentially private SGD.

# 768 is the feature size of the cached BERT embeddings (shape (..., 50, 768)).
model = erin_model(768)
model.to(device)

# NOTE(review): the privacy/optimizer values below are placeholders — tune them
# for the actual experiment.
training_parameters = {
    'l2_norm_clip': 1.0,          # upper bound on the L2 norm of each microbatch gradient
    'noise_multiplier': 1.1,      # noise std relative to l2_norm_clip
    'minibatch_size': batch_size,
    'microbatch_size': 1,
    'lr': 0.01,
}
# DPSGD comes from pyvacy (imported as private_optim), not from torch.optim.
optimizer = private_optim.DPSGD(params=model.parameters(), **training_parameters)

epochs = 10

# Privacy cost of the whole run; arguments are (N, batch_size, noise_multiplier, iterations).
# NOTE(review): the accountant assumes randomly sampled minibatches — confirm the
# sampling scheme of train_loader matches that assumption.
epsilon = analysis.epsilon(
    len(train_dataset),
    batch_size,
    training_parameters['noise_multiplier'],
    epochs * len(train_loader),
)
print(f'Training with epsilon = {epsilon:.3f}')

loss_function = nn.CrossEntropyLoss()
microbatch_size = training_parameters['microbatch_size']

model.train()
for epoch in range(epochs):
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(train_loader):
        # Put the data on the gpu if available; the loss needs integer class ids.
        inputs, labels = inputs.to(device), labels.to(device).long()

        # Reset the accumulated (minibatch) gradients.
        optimizer.zero_grad()

        # pyvacy clips gradients per microbatch: every microbatch needs its own
        # backward pass followed by microbatch_step().
        micro_inputs = inputs.split(microbatch_size)
        micro_labels = labels.split(microbatch_size)
        minibatch_loss = 0.0
        for x_micro, y_micro in zip(micro_inputs, micro_labels):
            optimizer.zero_microbatch_grad()
            loss = loss_function(model(x_micro), y_micro)
            loss.backward()
            optimizer.microbatch_step()
            minibatch_loss += loss.item()

        # Add noise to the accumulated clipped gradients and update the weights.
        optimizer.step()

        # Track the mean loss for debugging.
        running_loss += minibatch_loss / len(micro_inputs)
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0
        


### Evaluating model performance on test data.

In [None]:
# Evaluate the trained model on the test loader and collect labels and
# predictions for later metrics.
correct = 0
total = 0
all_labels = []
all_prediction_indices = []

model.eval()
# No gradients are needed for evaluation.
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)

        # The predicted class is the index of the highest score in each row.
        _, predicted = torch.max(outputs, 1)

        # Batches hold several examples, so extend with lists instead of .item().
        all_prediction_indices.extend(predicted.tolist())
        all_labels.extend(labels.long().tolist())

        total += labels.size(0)  # This is the batch_size
        correct += (predicted == labels).sum().item()

if total:
    print(f'Accuracy of the network on the {total} test examples: {100 * correct / total:.2f} %')
else:
    print('No test examples were evaluated.')

### Calculate F1 of the results

In [None]:
# Use all_labels and all_prediction_indices from the previous cell.