<a href="https://colab.research.google.com/github/rogermasclans/measuring_the_commercial_potential_of_science/blob/main/science_compot_montecarlo_dropout.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Measuring The Commercial Potential of Science
## Masclans, R., Hasan, S., and Cohen, W. 2024
This notebook contains the code to train a classifier ....


## Preambles

In [2]:
# Install transformers setup
!pip install -q -U watermark
!pip install -qq transformers # installs transformers library
%reload_ext watermark
%watermark -v -p numpy,pandas,torch,transformers

import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup, AutoTokenizer, AutoModel #Automodel, Autotokenizer for SciBert
import torch
from torch import nn, optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn.functional as F
import json
import os
import numpy as np
import pandas as pd
from pylab import rcParams
from matplotlib import rc
from sklearn.model_selection import train_test_split
# from sklearn.metrics import confusion_matrix, classification_report, plot_confusion_matrix, ConfusionMatrixDisplay
from sklearn.utils import shuffle
from collections import defaultdict
from textwrap import wrap
from google.cloud import storage
import subprocess
import time
from datetime import datetime
import pytz
et_timezone = pytz.timezone('US/Eastern')
import sys



Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.34.0

numpy       : 1.26.4
pandas      : 2.1.4
torch       : 2.3.1+cu121
transformers: 4.42.4



## Load training data

In [3]:
##  FOR LOADING DATA
from google.colab import auth
auth.authenticate_user()
print('Authenticated')
from google.cloud import bigquery


Authenticated


In [4]:
# LOAD DATA

# Set model's year cutt-of and sample size (sample size by class)
sample_size = 1000 # by class 10000
prediction_year = 2003
data_cuttof_year = prediction_year - 1
training_cuttoff_year = prediction_year - 5
data_starting_year = data_cuttof_year - 14 # 10 year period, we drop the first four to allow for pat renewals
column_ren_pat_cites_year = f'ren_pat_cites_{prediction_year-1}'
column_pat_cites_year = f'pat_cites_{prediction_year-1}'




# Set GCP project and BigQuery table details
project_id = 'comsci-353300'
dataset_id = 'derived'
table_id = 'training'

# Authenticate to Google Cloud
client = bigquery.Client(project=project_id)

# Set allow_large_results to True
job_config = bigquery.QueryJobConfig()
job_config.allow_large_results = True

# Construct the SQL query to fetch data from BigQuery
query = f'SELECT pub_id, pub_year, pat_cites, pat_cites_{prediction_year-1}, ren_pat_cites_{prediction_year-1}, ren_pat_cites FROM `{project_id}.{dataset_id}.{table_id}` WHERE pub_year <= {data_cuttof_year} AND pub_year >= {data_starting_year} LIMIT 100000'

# Fetch data from BigQuery
query_job = client.query(query, job_config=job_config)
results = query_job.result()  # Waits for the query to complete

# Convert the result to a pandas DataFrame
df_base = results.to_dataframe()

# Print the first few rows of the DataFrame
# print(df_base)


# Replace NAs for 0s
df_base[column_ren_pat_cites_year] = df_base[column_ren_pat_cites_year].fillna(0)
df_base['ren_pat_cites'] = df_base['ren_pat_cites'].fillna(0)
df_base[column_pat_cites_year] = df_base[column_pat_cites_year].fillna(0)
df_base['pat_cites'] = df_base['pat_cites'].fillna(0)


# Copy imported data to new df for training
df = df_base


# Create sentiment variable
class_names = ['Low Compot', 'Hihg Compot']

def to_sentiment(rating):
    if rating < 1:
      return 0
    else:
      return 1

df['sentiment'] = df[f'ren_pat_cites_{prediction_year-1}'].apply(to_sentiment)

# Drop observations from last 4 years, for which there are no (virtually) patent renewals and so all are zeros
print('prediction year: ', prediction_year)
print('cuttof year: ', training_cuttoff_year)
df = df[df['pub_year'] <= training_cuttoff_year]
# print(df)
# Keep only variable for training. pub_id will be used to merge abstracts
df = df[['pub_id','sentiment']]

# Get a list of IDs from the DataFrame
ids = df['pub_id'].tolist()
id_list = ', '.join([f'"{str(id)}"' for id in ids])

# # Construct a parameterized SQL query
# query_abstracts = f'SELECT pub_id, abstract FROM `{project_id}.{dataset_id}.{table_id}` WHERE pub_id IN ({id_list})'

# # Run the query
# query_job = client.query(query_abstracts, job_config=job_config)
# results = query_job.result()

## Run paramatrized query for when id_list is too large and exceds query characters limit
query = f'SELECT pub_id, abstract FROM `{project_id}.{dataset_id}.{table_id}` WHERE pub_id IN UNNEST(@id_list)'

job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ArrayQueryParameter("id_list", "STRING", ids),
    ]
)
query_job = client.query(query, job_config=job_config)  # Make an API request.
results = query_job.result()



# Convert the result to a pandas DataFrame
abstracts = results.to_dataframe()
df = pd.merge(df, abstracts, on='pub_id', how='left')
df = df.rename(columns={'abstract': 'content'})
df

## Load data into a pandas dataframe named `df`. Needs two columns named `sentiment` and `content`. The following is just an example



prediction year:  2003
cuttof year:  1998


Unnamed: 0,pub_id,sentiment,content
0,pub.1046725730,0,To the Editor. —Triplett and associates1 have ...
1,pub.1116046530,0,To study oral thermometry and to obtain standa...
2,pub.1092279981,0,"Adenosine triphosphate and 2,3-diphosphoglycer..."
3,pub.1015959386,0,The system specification is one of the key doc...
4,pub.1031444979,0,"Cyclic acetylenic monoamines, synthesized by t..."
...,...,...,...
66112,pub.1000953279,0,Thermograms of high-temperature synthesis of n...
66113,pub.1034848076,0,Many studies demonstrate that differentiation ...
66114,pub.1094736775,0,The expressiveness of conceptual graphs allows...
66115,pub.1005364401,0,We have previously reported that neurokinin A ...


In [19]:
# prompt: create a dataframe with 1000 observations and two columns, one named sentiment and the other named content. Populate the sentiment column with 0 or 1, random. Get more 0s than 1s. Populate the content column with random text, of length about 512 tokens.

import pandas as pd
import numpy as np
import random

# Set the number of observations
num_observations = 1000

# Generate random sentiments with more 0s than 1s
sentiments = np.random.choice([0, 1], size=num_observations, p=[0.7, 0.3])

# Generate random text content
content = []
for _ in range(num_observations):
    num_words = random.randint(400, 600)  # Approximate number of words for ~512 tokens
    text = " ".join(["scientificabstract" for _ in range(num_words)])  # Replace "word" with actual word generation if needed
    content.append(text)

# Create the DataFrame
df_example = pd.DataFrame({'sentiment': sentiments, 'content': content})

print(df_example.head())


   sentiment                                            content
0          0  scientificabstract scientificabstract scientif...
1          0  scientificabstract scientificabstract scientif...
2          1  scientificabstract scientificabstract scientif...
3          1  scientificabstract scientificabstract scientif...
4          0  scientificabstract scientificabstract scientif...


## Training

In [5]:
# Balance classes by performing undersampling

class_0_to_drop = sum(df['sentiment']==1)/sum(df['sentiment']==0)
print('class_0_to_drop for balancing: ', class_0_to_drop)
df = df.drop(df.query('sentiment == 0').sample(frac=1-class_0_to_drop).index)
print('Class 0 after balancing drop: ', sum(df['sentiment']==0))
print('Class 1 after balancing drop: ', sum(df['sentiment']==1))
df


# Drop observations to reduce overall sample size
sample_size = 500 # by class 10000
to_drop = sample_size/sum(df['sentiment']==0)
print('to_drop for size reduction: ', to_drop)
df = df.drop(df.query('sentiment == 0').sample(frac=1-to_drop).index)
df = df.drop(df.query('sentiment == 1').sample(frac=1-to_drop).index)
print('Class 0 after size drop: ', sum(df['sentiment']==0))
print('Class 1 after size drop: ', sum(df['sentiment']==1))
df


class_0_to_drop for balancing:  0.08094366151129713
Class 0 after balancing drop:  4951
Class 1 after balancing drop:  4951
to_drop for size reduction:  0.10098969905069682
Class 0 after size drop:  500
Class 1 after size drop:  500


Unnamed: 0,pub_id,sentiment,content
1,pub.1116046530,0,To study oral thermometry and to obtain standa...
190,pub.1086255329,0,The author presents the results of experimenta...
324,pub.1043494754,0,Positron annihilation lifetime measurements we...
348,pub.1006847269,0,Pressure or volume overload of the myocardium ...
368,pub.1048655271,0,Equilibrium distribution of palladium(II) was ...
...,...,...,...
65659,pub.1019329911,1,Multidrug resistance (MDR) is a significant pr...
65696,pub.1002615683,0,"A new species of Monohelea Kieffer, M. uruguay..."
66000,pub.1029935016,0,The ternary system Pd-Zn-Se was investigated a...
66007,pub.1093400242,0,This paper deals with the synthesis of optimal...


In [6]:
MAX_LEN = 512
TEST_SIZE = 0.25

epochs = 5
drop_out_rate = 0.3
lr = 2e-5
model_used = 'scibert'
batch_size = 16

rcParams['figure.figsize'] = 12, 8
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7dfd24823430>

In [16]:

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f'device: {device}')
print(f'Class 0 size: {sum(df["sentiment"]==0)}')
print(f'Class 1 size: {sum(df["sentiment"]==1)}')



pre_trained_model_name = 'scibert_scivocab_uncased'
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')


class GPReviewDataset(Dataset):

  def __init__(self, reviews, targets, tokenizer, max_len): #create constructor/methods. assign variables:
    self.reviews = reviews
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self): #returns number of reviews, length of data set
    return len(self.reviews)

  def __getitem__(self, item): #takes the index of the element from the data set
    review = str(self.reviews[item])[:MAX_LEN]
    target = self.targets[item]

    # Replace problematic character with a space
    review = review.replace('\x85', ' ')

    encoding = self.tokenizer.encode_plus(
      review,
      add_special_tokens=True,
      truncation=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      padding='max_length',
      return_attention_mask=True,
      return_tensors='pt'
    )

    return {
      'review_text': review,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long) #sentiment for the review, converted into tensor of type long bc is a classification problem
    }

df_train, df_test = train_test_split(df, test_size=TEST_SIZE, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)

print(f'Training shape: {df_train.shape}')
print(f'Validation shape: {df_val.shape}')
print(f'Test shape: {df_test.shape}')

def create_data_loader(df, tokenizer, max_len, batch_size): #take dataframe, tokenizer and other vars and
  ds = GPReviewDataset(
    reviews=df.content.to_numpy(), #get numpy values of content
    targets=df.sentiment.to_numpy(), #get targets/sentiments values - integers
    tokenizer=tokenizer, #pass tokenizer
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=4
  )

train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, batch_size)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, batch_size)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, batch_size)

data = next(iter(train_data_loader))
data.keys()

bert_model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased', return_dict=False)  #return_dict=False needed for compatibility. Check https://huggingface.co/docs/transformers/migration


class SentimentClassifier(nn.Module):

  def __init__(self, n_classes):
    super(SentimentClassifier, self).__init__()
    self.bert = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased', return_dict=False)
    self.drop = nn.Dropout(p=drop_out_rate)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    _, pooled_output = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    output = self.drop(pooled_output)
    return self.out(output)


class_names = ['Low compot', 'High compot']

model = SentimentClassifier(len(class_names))
model = model.to(device)

input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)

F.sigmoid(model(input_ids, attention_mask))

optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)
total_steps = len(train_data_loader) * epochs

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)

def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  scheduler,
  n_examples
):
  model = model.train()

  losses = []
  correct_predictions = 0

  for d in data_loader:
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    targets = d["targets"].to(device)

    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )

    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, targets)

    correct_predictions += torch.sum(preds == targets)
    losses.append(loss.item())

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  return correct_predictions.double() / n_examples, np.mean(losses)

def eval_model(model, data_loader, loss_fn, device, n_examples):
  model = model.eval()

  losses = []
  correct_predictions = 0

  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["targets"].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)

      loss = loss_fn(outputs, targets)

      correct_predictions += torch.sum(preds == targets)
      losses.append(loss.item())

  return correct_predictions.double() / n_examples, np.mean(losses)

print(f'max_token_len: {MAX_LEN}; test_size: {TEST_SIZE}; batch_size: {batch_size}')
print(f'epochs: {epochs}; lr: {lr}; drop_out_rate: {drop_out_rate}')

history = defaultdict(list)
best_accuracy = 0

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    print('-' * 10)

    start_time = time.time()
    print(datetime.fromtimestamp(start_time, tz=pytz.utc).astimezone(et_timezone).strftime('%H:%M:%S'))

    train_acc, train_loss = train_epoch(model, train_data_loader, loss_fn, optimizer, device, scheduler, len(df_train))

    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(model, val_data_loader, loss_fn, device, len(df_val))

    print(f'Val loss {val_loss} accuracy {val_acc}')

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    if val_acc >= best_accuracy:
      torch.save(model.state_dict(), 'best_model')
      best_accuracy = val_acc

    end_time = time.time()
    print(datetime.fromtimestamp(end_time, tz=pytz.utc).astimezone(et_timezone).strftime('%H:%M:%S'))
    epoch_time = end_time - start_time
    print(f"Time taken for epoch {epoch + 1}: {epoch_time:.2f} seconds")


device: cuda:0
Class 0 size: 500
Class 1 size: 500
Training shape: (750, 3)
Validation shape: (125, 3)
Test shape: (125, 3)


  self.pid = os.fork()


max_token_len: 512; test_size: 0.25; batch_size: 16
epochs: 5; lr: 2e-05; drop_out_rate: 0.3
Epoch 1/5
----------
16:56:24




Train loss 0.7129862302161277 accuracy 0.576
Val loss 0.5472504198551178 accuracy 0.712
16:56:41
Time taken for epoch 1: 16.59 seconds
Epoch 2/5
----------
16:56:41
Train loss 0.5322573330808194 accuracy 0.7586666666666666
Val loss 0.6064506024122238 accuracy 0.72
16:56:57
Time taken for epoch 2: 16.48 seconds
Epoch 3/5
----------
16:56:57
Train loss 0.4046563093966626 accuracy 0.8533333333333333
Val loss 0.7281473875045776 accuracy 0.6960000000000001
16:57:13
Time taken for epoch 3: 15.61 seconds
Epoch 4/5
----------
16:57:13
Train loss 0.27070300154229426 accuracy 0.9
Val loss 0.7163480184972286 accuracy 0.76
16:57:29
Time taken for epoch 4: 16.53 seconds
Epoch 5/5
----------
16:57:29
Train loss 0.16989144643253468 accuracy 0.9373333333333334
Val loss 0.800992339849472 accuracy 0.76
16:57:46
Time taken for epoch 5: 16.46 seconds


## Model evaluation

In [None]:
# Load best model
torch.cuda.empty_cache()
gc.collect()

model_location = 'v2/models/' + local_model_name
print('current model:', model_location)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


if model_used == "bert":
  class SentimentClassifier(nn.Module):

    def __init__(self, n_classes):
      super(SentimentClassifier, self).__init__()
      self.bert = BertModel.from_pretrained(pre_trained_model_name, return_dict=False)
      self.drop = nn.Dropout(p=0)
      self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
      _, pooled_output = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      output = self.drop(pooled_output)
      return self.out(output)
elif model_used == 'scibert':
  class SentimentClassifier(nn.Module):

    def __init__(self, n_classes):
      super(SentimentClassifier, self).__init__()
      self.bert = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased', return_dict=False)
      self.drop = nn.Dropout(p=0)
      self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
      _, pooled_output = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      output = self.drop(pooled_output)
      return self.out(output)
elif model_used == 'specter2_base':
  class SentimentClassifier(nn.Module):

    def __init__(self, n_classes):
      super(SentimentClassifier, self).__init__()
      self.bert = AutoModel.from_pretrained('allenai/specter2_base', return_dict=False)
      self.drop = nn.Dropout(p=0)
      self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
      _, pooled_output = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      output = self.drop(pooled_output)
      return self.out(output)

model = SentimentClassifier(2)
model.load_state_dict(torch.load(model_location, map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu')))
model = model.to(device)
model.eval()


In [None]:
# Test predictions
def get_predictions(model, data_loader):
  model = model.eval()

  review_texts = []
  predictions = []
  prediction_probs = []
  real_values = []

  with torch.no_grad():
    for d in data_loader:

      texts = d["review_text"]
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["targets"].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask

      )
      _, preds = torch.max(outputs, dim=1)

      # probs = F.sigmoid(outputs)
      probs = F.softmax(outputs, dim=1)

      review_texts.extend(texts)
      predictions.extend(preds)
      prediction_probs.extend(probs)
      real_values.extend(targets)

  predictions = torch.stack(predictions).cpu()
  prediction_probs = torch.stack(prediction_probs).cpu()
  real_values = torch.stack(real_values).cpu()
  return review_texts, predictions, prediction_probs, real_values

y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(
  model,
  test_data_loader
)

# Classification report table
class_report = classification_report(y_test, y_pred, target_names=class_names)
print(class_report)


# Confusion matrix
# def show_confusion_matrix(confusion_matrix):
#   hmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
#   hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right')
#   hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right')
#   plt.ylabel('True sentiment')
#   plt.xlabel('Predicted sentiment');

# cm = confusion_matrix(y_test, y_pred)
# df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
# show_confusion_matrix(df_cm)


# ROC
from sklearn.metrics import roc_auc_score

micro_roc_auc_ovr = roc_auc_score(
    y_test,
    y_pred,
    multi_class="ovr",
    average="micro",
)

# print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_ovr:.2f}")

class_report = classification_report(y_test, y_pred, target_names=class_names, output_dict=True)
class_report_df = pd.DataFrame(class_report).transpose()

class_report_df.index = f'{prediction_year} ' + class_report_df.index
class_report_df['auroc'] = micro_roc_auc_ovr
class_report_df['year'] = prediction_year

print(class_report_df)

class_report_df.to_csv(f'v2/class_reports/compot_classreport_{prediction_year}.csv', encoding='utf-8', index=True)

IC = type('IdentityClassifier', (), {"predict": lambda i : i, "_estimator_type": "classifier"})
labels_cm = ['Not cited by \n renewed patent', 'Cited by \n renewed patent']
cm = ConfusionMatrixDisplay.from_estimator(IC, y_pred, y_test, display_labels = labels_cm, normalize='all', values_format='.2%', cmap=plt.cm.Blues)
# disp = cm.plot(include_values=True, cmap=plt.cm.Blues, ax=None, values_format='.2%', xticks_rotation='horizontal')
plt.grid(False)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.gcf().savefig(f'v2/conf_mat/compot_confmat_{prediction_year}.png', dpi=100)




##Inference / Predictions


In [None]:
# INFERENCE
## Get all papers published in prediction_year + 1 and cast predictions.
## Only papers with at least one author affiliated with a US institution at the time of publication

In [None]:
# Load data for predictions

In [None]:
# Set GCP project and BigQuery table details
project_id = 'comsci-353300'
dataset_id = 'derived'
table_id = 'pubs'

# Authenticate to Google Cloud
client = bigquery.Client(project=project_id)

# Set allow_large_results to True
job_config = bigquery.QueryJobConfig()
job_config.allow_large_results = True

# Construct the SQL query to fetch data from BigQuery
query = f'SELECT id as pub_id, abstract.preferred as abstract FROM `{project_id}.{dataset_id}.{table_id}` WHERE year = {prediction_year} AND id IN (SELECT pub_id FROM `{project_id}.{dataset_id}.pub_org` WHERE country_code = "US")'

# Fetch data from BigQuery
query_job = client.query(query, job_config=job_config)
results = query_job.result()  # Waits for the query to complete

# Convert the result to a pandas DataFrame
df_inf = results.to_dataframe()

# Print the first few rows of the DataFrame
print(df_inf)

In [None]:
# Predictions

In [None]:
i = 0
current_digit = '0.0'
print('Start time: ', datetime.fromtimestamp(time.time(), tz=pytz.utc).astimezone(et_timezone).strftime('%H:%M:%S'))
for abstract in df_inf["abstract"]:
    try:
        encoded_review = tokenizer.encode_plus(
        abstract,
        max_length=MAX_LEN,
        add_special_tokens=True,
        return_token_type_ids=False,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
        )

        input_ids = encoded_review['input_ids']
        attention_mask = encoded_review['attention_mask']

        with torch.no_grad():
            output = model(
                input_ids.to(device),
                attention_mask.to(device)
            )

        _, prediction = torch.max(output, dim=1)

        prob_infer = F.softmax(output, dim=1)
        # prob_infer = F.sigmoid(output)

        df_inf.loc[i,"abstract_inferred"] = abstract
        df_inf.loc[i,"class_pat_ren"] = prediction.cpu().detach().numpy()[0]
        df_inf.loc[i,"prob_low_pat_ren"] = prob_infer.cpu().detach().numpy().T[0][0]
        df_inf.loc[i,"prob_high_pat_ren"] = prob_infer.cpu().detach().numpy().T[1][0]

        i = i+1

        perc = 100*i/len(df_inf["abstract"])

        if str(perc)[0:2] != current_digit:
            print("Counter:", perc)
            current_digit = str(perc)[0:2]

    except:
        print(output)
        # last_layer_embeddings = output.last_hidden_state
        print(error)
        df_inf.loc[i,"abstract_inferred"] = abstract
        df_inf.loc[i,"class_pat_ren"] = ""
        df_inf.loc[i,"prob_low_pat_ren"] = ""
        df_inf.loc[i,"prob_high_pat_ren"] = ""

        i = i+1

        print('error in line: ', i)

print('Finish time: ', datetime.fromtimestamp(time.time(), tz=pytz.utc).astimezone(et_timezone).strftime('%H:%M:%S'))
df_inf



In [None]:
# Load predictions to BQ

In [None]:
df_inf = df_inf.drop(columns=['abstract_inferred'])
df_inf

In [None]:
# Plot the density using the plot.kde() function
df_inf['prob_high_pat_ren'].plot.kde()

# Add labels and a title
plt.xlabel('X-axis')
plt.ylabel('Density')
plt.title('Density Plot')

# Display the plot
plt.show()

In [None]:
# Store preds locally
df_inf.drop(columns=['abstract']).to_csv(f'v2/preds/compot_preds_{prediction_year}.csv', encoding='utf-8')

In [None]:
# Upload preds to GBQ

# Set GCP project and BigQuery table details
project_id = 'comsci-353300'
dataset_id = 'derived_preds'
table_id = f'compot_preds_{prediction_year}'

# Authenticate with Google Cloud using your service account key
client = bigquery.Client(project=project_id)

# Specify the BigQuery table reference
table_ref = client.dataset(dataset_id).table(table_id)

# Upload the Pandas DataFrame to BigQuery with auto-detection
job_config = bigquery.LoadJobConfig(
    autodetect=True,  # Enable auto-detection of the schema
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE  # You can choose your write disposition
)

# Upload the DataFrame to BigQuery
job = client.load_table_from_dataframe(df_inf.drop(columns=['abstract']), table_ref, job_config=job_config)
job.result()  # Wait for the job to complete