In [None]:
!pip3 install fastapi pandas torch transformers uvicorn google-cloud-storage scikit-learn


In [32]:
import sys
sys.path.append("..")  # add parent directory to system path


from fastapi import FastAPI, UploadFile, HTTPException
import pandas as pd
import torch
import json
import io
from transformers import BartTokenizer

from model.model_loader import get_model



import logging

# Re-importing the necessary libraries
from transformers import BartTokenizer, BartForConditionalGeneration
import torch.nn as nn
import torch.nn.functional as F
import logging




In [68]:
def input_transform(input_data):
    """
    Takes JSON
    Transforms to a matrix
    Then to a tensor
    """
    torch.from_numpy(np.array(input_data))
    return torch.tensor(input_data)



def load_json_file(raw, filename):
    with open(raw + filename, 'r') as file:
        data = json.load(file)
    return data

def load_test_data(raw, input_embeddings_file, input_mask_invert_file, input_masks_file):
    input_embeddings_data = load_json_file(raw, input_embeddings_file)
    input_mask_invert_data = load_json_file(raw, input_mask_invert_file)
    input_masks_data = load_json_file(raw, input_masks_file)

    return [input_embeddings_data, input_mask_invert_data, input_masks_data]



def get_prediction(input_embeddings_batch, input_masks_batch, input_mask_invert_batch, placeholder_token,model,tokenizer,pred_tokens_list,pred_string_list):
    # Print shapes of the input tensors
    print(f"Shape of input_embeddings_batch: {input_embeddings_batch.shape}")
    print(f"Shape of input_masks_batch: {input_masks_batch.shape}")
    print(f"Shape of input_mask_invert_batch: {input_mask_invert_batch.shape}")
    

    with torch.no_grad():
            try:
                outputs = model(input_embeddings_batch, input_masks_batch, input_mask_invert_batch, placeholder_token["input_ids"])
                # Extract the generated token IDs from the model's outputs
                logits=outputs.logits
                probs = logits[0].softmax(dim = 1)
                values, predictions = probs.topk(1)
                predictions = torch.squeeze(predictions)
                predicted_string = tokenizer.decode(predictions).split('</s></s>')[0].replace('<s>','')
                predictions = predictions.tolist()
                truncated_prediction = []
                for t in predictions:
                    if t != tokenizer.eos_token_id:
                        truncated_prediction.append(t)
                    else:
                        break
                pred_tokens = tokenizer.convert_ids_to_tokens(truncated_prediction, skip_special_tokens = True)
                # print('predicted tokens:',pred_tokens)
                pred_tokens_list.append(pred_tokens)
                pred_string_list.append(predicted_string)
                print('predicted string:',predicted_string)
                # results.append(generated_text)
            except Exception as e:
                logging.error(f"Error during inference: {str(e)}")


In [70]:
# Load the new JSON files and inspect their structure
raw = "../datasets/saved_data/"
input_embeddings_file = "input_embeddings_0.json"
input_mask_invert_file = "input_mask_invert_0.json"
input_masks_file = "input_masks_0.json"

test_data = load_test_data(raw, input_embeddings_file, input_mask_invert_file, input_masks_file)


test_data


[[[[0.39298626349730237,
    0.12086981593986965,
    -0.03843609524660807,
    -0.5653143753859095,
    -0.5779393797547738,
    -1.0649407427816766,
    -0.7154430816461967,
    -0.5652602370605113,
    -0.6425181313486449,
    -0.7399872624407792,
    -0.7757705520003056,
    -1.0694772461169841,
    -0.4597537461139628,
    -0.8883851357078538,
    -0.9343408914124441,
    -0.7503046918243478,
    -0.5899013319071116,
    -0.8427748295631184,
    -0.5759298690676494,
    -0.2859644860645929,
    -0.3458462352442518,
    -0.34702090791428697,
    -0.5659435864067421,
    -0.6489271112916527,
    -0.7382513077419403,
    -0.493578552349827,
    -0.24887673602060445,
    -0.49675199909445145,
    -0.6745497715368961,
    -0.4845753526299706,
    -1.0915820723374337,
    -0.5069951447771585,
    -0.4747752534532199,
    -0.577918779116588,
    -0.16129154832325157,
    -0.4320062040200422,
    -0.5354541486904143,
    -0.44820046813204467,
    -0.4050719129024599,
    -0.16435743924972

In [71]:
tensor_test_data = []
for i in range(len(test_data)):
    tensor_test_data.append(input_transform(test_data[i]))
tensor_test_data

[tensor([[[ 0.3930,  0.1209, -0.0384,  ...,  1.2890,  1.0663, -1.4654],
          [ 0.3996,  0.5023,  0.4000,  ...,  0.7234,  0.8297, -1.5958],
          [ 0.0351, -0.2749, -0.3916,  ...,  0.5392,  0.6405, -1.4400],
          ...,
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]]),
 tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1.]]),
 tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0.]])]

In [72]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [73]:
input_embeddings_batch = tensor_test_data[0].to(device).float()
input_masks_batch = tensor_test_data[1].to(device)
input_mask_invert_batch = tensor_test_data[2].to(device)

# target_ids_batch = target_ids.to(device)


In [74]:
model=get_model()

results = []
pred_tokens_list = []
pred_string_list = []

tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
placeholder_token = tokenizer("<s>", return_tensors="pt")

In [75]:
get_prediction(input_embeddings_batch, input_masks_batch, input_mask_invert_batch, placeholder_token,model,tokenizer,pred_tokens_list,pred_string_list)

Shape of input_embeddings_batch: torch.Size([1, 56, 840])
Shape of input_masks_batch: torch.Size([1, 56])
Shape of input_mask_invert_batch: torch.Size([1, 56])
predicted string: 


In [85]:
class DataProcessor:
    def __init__(self, base_path):
        self.base_path = base_path
    
    def load_json_file(self, filename):
        with open(self.base_path + filename, 'r') as file:
            data = json.load(file)
        return data
    
    def load_batch_data(self, batch_number):
        input_embeddings_file = f"input_embeddings_{batch_number}.json"
        input_mask_invert_file = f"input_mask_invert_{batch_number}.json"
        input_masks_file = f"input_masks_{batch_number}.json"
        
        input_embeddings_data = self.load_json_file(input_embeddings_file)
        input_mask_invert_data = self.load_json_file(input_mask_invert_file)
        input_masks_data = self.load_json_file(input_masks_file)

        return input_embeddings_data, input_mask_invert_data, input_masks_data
    
    def transform_to_tensor(self, data):
        return torch.tensor(data)

    
class ModelInference:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
    
    def get_prediction(self, input_embeddings_batch, input_masks_batch, input_mask_invert_batch, placeholder_token):
        pred_tokens_list = []
        pred_string_list = []
        try:
            outputs = self.model(input_embeddings_batch, input_masks_batch, input_mask_invert_batch, placeholder_token["input_ids"])
            logits = outputs.logits
            probs = logits[0].softmax(dim=1)
            values, predictions = probs.topk(1)
            predictions = torch.squeeze(predictions)
            predicted_string = self.tokenizer.decode(predictions).split('</s></s>')[0].replace('<s>', '')
            predictions = predictions.tolist()
            truncated_prediction = [t for t in predictions if t != self.tokenizer.eos_token_id]
            pred_tokens = self.tokenizer.convert_ids_to_tokens(truncated_prediction, skip_special_tokens=True)
            pred_tokens_list.append(pred_tokens)
            pred_string_list.append(predicted_string)
        except Exception as e:
            print(f"Error during inference: {str(e)}")
        
        return pred_tokens_list, pred_string_list

# Main loop for processing the first 50 batches
def process_batches(base_path, num_batches=50):
    data_processor = DataProcessor(base_path)
    model = get_model()
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
    model_inference = ModelInference(model, tokenizer)
    placeholder_token = tokenizer("<s>", return_tensors="pt")
    
    all_pred_tokens = []
    all_pred_strings = []

    for i in range(num_batches):
        input_embeddings_data, input_mask_invert_data, input_masks_data = data_processor.load_batch_data(i)
        
        input_embeddings_tensor = data_processor.transform_to_tensor(input_embeddings_data).float()
        input_masks_tensor = data_processor.transform_to_tensor(input_mask_invert_data)
        input_mask_invert_tensor = data_processor.transform_to_tensor(input_masks_data)

        input_embeddings_batch = input_embeddings_tensor.to(device).float()
        input_masks_batch = input_masks_tensor.to(device)
        input_mask_invert_batch = input_mask_invert_tensor.to(device)

        target_ids_batch = placeholder_token.to(device)
                                
        pred_tokens, pred_strings = model_inference.get_prediction(
            input_embeddings_tensor, 
            input_masks_tensor, 
            input_mask_invert_tensor, 
            target_ids_batch
        )
        all_pred_tokens.extend(pred_tokens)
        all_pred_strings.extend(pred_strings)
        
    return all_pred_tokens, all_pred_strings

# Run the process for the first 50 batches
base_path = "../datasets/saved_data/"
all_pred_tokens, all_pred_strings = process_batches(base_path)


In [86]:
all_pred_tokens

[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 []]

In [87]:
all_pred_strings

['',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '']

### Double tensor thing.

In [None]:
input_embeddings_double = input_embeddings.double()
input_mask_invert_double = input_mask_invert.double()
input_mask_double= input_masks.double()