In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm

# Read the clinical-trials table from the Kaggle input mount
csv_path = "/kaggle/input/trialscsv/trials.csv"
data = pd.read_csv(csv_path)
print("CSV file loaded successfully.")

# Text columns whose contents are concatenated and embedded per row
columns_to_embed = [
    'Study Title',
    'Primary Outcome Measures',
    'Secondary Outcome Measures',
    'criteria',
]

# Load the tokenizer and encoder for the "nghuyong/ernie-health-zh" checkpoint
# (note: this is an ERNIE-Health checkpoint, not PubMed BERT)
tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-health-zh")
model = AutoModel.from_pretrained("nghuyong/ernie-health-zh")
print("ernie-health kaam pe lag gaya")

# Prefer the GPU when torch can see one; otherwise run on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")
model = model.to(device)

# Function to generate embeddings for a batch of texts
def generate_batch_embeddings(texts, tokenizer, model, batch_size=8):
    """Embed ``texts`` with ``model`` using attention-mask-aware mean pooling.

    The texts are tokenized and encoded ``batch_size`` at a time. Each text's
    embedding is the average of its token vectors from
    ``outputs.last_hidden_state``; padding positions (attention mask 0) are
    excluded, so a text's embedding does not depend on how much padding the
    other texts in its batch forced onto it.

    Args:
        texts: Sliceable sequence of strings (e.g. a list).
        tokenizer: Callable that accepts a list of strings plus the
            ``return_tensors``/``padding``/``truncation``/``max_length``
            keyword arguments and returns a mapping of tensors.
        model: Callable that accepts the tokenizer output as keyword
            arguments and returns an object with ``last_hidden_state``.
            It must expose a ``device`` attribute (Hugging Face
            ``PreTrainedModel`` does); inputs are moved to that device.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        A list with one numpy vector per input text, in input order.
        An empty ``texts`` yields an empty list.
    """
    all_embeddings = []
    # Use the device the model actually lives on rather than a module global.
    target_device = model.device
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {key: value.to(target_device) for key, value in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        # Assumed layout: (batch, tokens, hidden) — pooling runs over dim 1.
        hidden = outputs.last_hidden_state
        attention_mask = inputs.get("attention_mask")
        if attention_mask is None:
            # No mask available: fall back to a plain mean over all positions.
            pooled = hidden.mean(dim=1)
        else:
            weights = attention_mask.unsqueeze(-1).to(hidden.dtype)
            # Guard against division by zero for a row with no real tokens.
            token_counts = weights.sum(dim=1).clamp(min=1.0)
            pooled = (hidden * weights).sum(dim=1) / token_counts
        all_embeddings.extend(pooled.cpu().numpy())
    return all_embeddings

# Prepare the data for processing: build one text per row by joining the
# non-missing values of the selected columns with single spaces.
texts_to_embed = data[columns_to_embed].apply(
    lambda row: " ".join(str(row[col]) for col in columns_to_embed if pd.notna(row[col])),
    axis=1,
)
nct_numbers = data['NCT Number'].tolist()

# Accumulates one embedding vector per row, in dataset order
embedding_data = []

# Process the entire dataset in batches
total_rows = len(data)
batch_size = 16  # Adjust batch size based on available GPU memory
# Ceiling division so a final partial batch is counted in the progress total
total_batches = (total_rows + batch_size - 1) // batch_size

# Process in batches
for batch_number, start in enumerate(tqdm(range(0, total_rows, batch_size), total=total_batches), start=1):
    # .iloc makes the slice explicitly positional, independent of the index labels
    batch_texts = texts_to_embed.iloc[start:start + batch_size].tolist()
    batch_embeddings = generate_batch_embeddings(batch_texts, tokenizer, model, batch_size)
    embedding_data.extend(batch_embeddings)

    # Print progress every 10 batches processed
    if batch_number % 10 == 0:
        # Cap at total_rows so the last (possibly partial) batch is not over-reported
        rows_done = min(start + batch_size, total_rows)
        print(f"Processed {rows_done}/{total_rows} rows.")

# Convert embeddings to DataFrame (one column per embedding dimension) and
# preserve NCT Number as the first column so rows stay identifiable
embedding_df = pd.DataFrame(embedding_data)
embedding_df.insert(0, 'NCT Number', nct_numbers)

# Save to CSV
output_csv_path = "embeddings_output2.csv"
embedding_df.to_csv(output_csv_path, index=False)
print(f"Embeddings saved to {output_csv_path}")