In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import os
import pandas as pd
import numpy as np

In [None]:
# Hugging Face checkpoint id shared by the tokenizer and the model, so the
# two are always loaded from the same pretrained PLBART weights/vocabulary.
model_name = "uclanlp/plbart-base"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModel.from_pretrained(model_name),
)


In [None]:
# Read the data-class code-smell dataset and derive a binary target:
# label is 0 where severity equals 'none', and 1 for every other value.
df = (
    pd.read_csv('/kaggle/input/data-class-code-smells/data_class.csv')
    .assign(label=lambda frame: np.where(frame['severity'].eq('none'), 0, 1))
)


In [None]:
def embed_line_by_line_in_batches(df, df_path, batch_size=100):
    """Embed each non-blank line of every row's ``method`` text and aggregate per row.

    Every non-blank line of ``row['method']`` is run on its own through the
    module-level ``tokenizer`` and ``model``. The per-line vectors of a row are
    then summed and averaged element-wise and stored (as nested lists) in the
    ``embeded_sequence_sum`` and ``embeded_sequence_avg`` columns. The working
    DataFrame is pickled to ``df_path`` after every batch, so the file always
    reflects the batches completed so far.

    Args:
        df: DataFrame with a ``method`` column (text split on newlines) and a
            ``sample_id`` column (only used in progress messages). The input
            is not modified; all work happens on a copy.
        df_path: Path of the pickle file to write. Its parent directory is
            created when it does not exist yet.
        batch_size: Number of rows processed between two saves.

    Returns:
        The copy of ``df`` with the two embedding columns added. Rows whose
        ``method`` is not a string, or for which no line could be embedded,
        keep ``None`` in both columns.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make the batch loop silently do nothing.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    directory = os.path.dirname(df_path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    df = df.copy()
    df['embeded_sequence_sum'] = None
    df['embeded_sequence_avg'] = None

    total_rows = len(df)
    for start in range(0, total_rows, batch_size):
        end = min(start + batch_size, total_rows)
        print(f"Processing batch {start} to {end} (total {total_rows})")

        for i, row in df.iloc[start:end].iterrows():
            print(f"Processing row {i}: Sample ID {row['sample_id']}")
            method_source = row['method']
            if not isinstance(method_source, str):
                # Missing values arrive as None/NaN; skip them instead of
                # crashing the whole run on ``.split``.
                print(f"Skipping row {i}: 'method' is not a string (got {type(method_source).__name__})")
                continue

            embeded = []
            for line in method_source.split('\n'):
                if not line.strip():  # Only process non-empty lines
                    continue
                try:
                    embeded.append(_embed_code_line(line))
                except Exception as e:
                    # Best effort: report the failing line and keep going.
                    print(f"Exception in line processing: {e}\nLine: {line}")

            if embeded:
                stacked = np.asarray(embeded)  # one entry per embedded line
                df.at[i, 'embeded_sequence_sum'] = np.sum(stacked, axis=0).tolist()
                df.at[i, 'embeded_sequence_avg'] = np.mean(stacked, axis=0).tolist()

        # Save the DataFrame after processing each batch
        df.to_pickle(df_path)
        print(f"Batch {start}-{end} saved to {df_path}")

    print("Processing complete.")
    return df


def _embed_code_line(line):
    """Return the model's position-0 hidden-state vector for one line as a NumPy array."""
    inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)
    # Sequence position 0 of the last hidden state is used as the line vector.
    # NOTE(review): the original comment called this "the CLS token embedding";
    # whether position 0 of this model's output plays that role is not
    # established by this code - confirm against the PLBART documentation.
    return outputs.last_hidden_state[:, 0, :].cpu().numpy()

In [None]:
# Run the line-by-line embedding over `df`, saving the result to the Kaggle
# working directory after every 50 rows.
embed_line_by_line_in_batches(df, df_path='/kaggle/working/df_dc_embeded_by_line_plbart.pkl', batch_size=50)