In [1]:
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import transformers
print(transformers.__version__)
from transformers.utils import send_example_telemetry
from transformers import TrainingArguments, Trainer, default_data_collator
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AutoConfig
from datasets import load_dataset, load_metric
from datasets import ClassLabel, Sequence
from sklearn.metrics import accuracy_score, f1_score

4.36.0


In [2]:
data=pd.read_csv('/kaggle/input/nlp-mental-health-conversations/train.csv')
data=data.dropna()
n=len(data)
print(n)
N=list(range(n))
random.shuffle(N)
data=data.iloc[N][0:1000].reset_index(drop=True)
display(data)
print(data.columns.tolist())
print('context')
print(data.iloc[0,0])
print()
print('response')
print(data.iloc[0,1])

3508


Unnamed: 0,Context,Response
0,I feel that I need to end my present relations...,It sounds like you have some insight into the ...
1,I recently lost a friend to suicide. I'm smok...,Suicide is not a natural way to pass from this...
2,My depression has been reoccurring for a long ...,I couldn't help but notice that you did not sp...
3,I am a really shy person. I'm currently in a g...,Have you tried rehearsing to yourself or a tru...
4,My husband and I have been together for seven ...,"Hi Texas, Thanks for your honesty; it helps me..."
...,...,...
995,I believe it is wrong for men to look at inapp...,"In my book, this is a boundary issue. Although..."
996,I crossdress and like to be feminine but I am ...,Your happiness and healthiness is key. I woul...
997,My spouse decided he no longer wanted me six y...,What a burden for you!Your husband cannot seem...
998,I'm obsessing about a terrible breakup. Everyt...,The best way to move on is to give yourself su...


['Context', 'Response']
context
I feel that I need to end my present relationship. He lives three hours away and likes the reassurance of having someone to talk to multiple times per day and seeing me once or twice a month. I want someone who is more present and more of a life companion. Lately, he has had a very busy work schedule and I have only seen him a few times in the last 6 weeks. I told him that I can't continue in this way because I constantly feel frustrated and angry and that he is not making the relationship enough of a priority. I also feel it is keeping me from possibly finding the relationship I want. We have been together 7 years. The problem is that I panic and experience anxiety and depression thinking of him with someone else and then thinking I will never meet someone I like. We have gone through this cycle already 4-5 times and I feel it is unhealthy to stay in it, but my aversion to the anxiety and depression I experience upon separation always leads me to reconc

In [3]:
!pip install transformers



In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig

In [6]:
class CustomModel(AutoModelForCausalLM):
    def __init__(self, config):
        super().__init__(config)
        self.max_input_length = 514
        self.config.max_position_embeddings = self.max_input_length  # Align for consistency

    def forward(self, **inputs):
        input_ids = inputs.get("input_ids")
        if input_ids is not None and input_ids.size(1) > self.max_input_length:
            input_ids = input_ids[:, :self.max_input_length]

        # Debug information
        print("Input_ids size:", input_ids.size())
        print("Attention_mask size:", inputs.get("attention_mask").size() if "attention_mask" in inputs else None)

        return super().forward(**inputs)

# Load model and tokenizer
model_name = "roberta-base"
config = AutoConfig.from_pretrained(model_name)
model = CustomModel.from_pretrained(model_name, config=config)  # Pass adjusted config
tokenizer = AutoTokenizer.from_pretrained(model_name)


If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


In [7]:
def format_df(dataframe, tokenizer, max_sequence_length=512):
    formatted_data = []
    
    for _, row in dataframe.iterrows():
        input_text = f"{row['Context']} "
        
        if len(input_text) > max_sequence_length:
            input_text = input_text[:max_sequence_length]

        inputs = tokenizer(input_text, return_tensors="pt", max_length=max_sequence_length, truncation=True)    
 
        if 'Response' in row:
            answer_text = row['Response']
            answer_tokens = tokenizer(answer_text, return_tensors="pt")["input_ids"] 
        else:
            answer_text, answer_tokens = None, None

        formatted_data.append({
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            #'labels': answer_tokens,
        })

    return formatted_data

In [8]:
formatted_df = format_df(data, tokenizer)
print(formatted_df[0])

Token indices sequence length is longer than the specified maximum sequence length for this model (667 > 512). Running this sequence through the model will result in indexing errors


{'input_ids': tensor([[    0,   100,   619,    14,    38,   240,     7,   253,   127,  1455,
          1291,     4,    91,  1074,   130,   722,   409,     8,  3829,     5,
         13057, 12590,     9,   519,   951,     7,  1067,     7,  1533,   498,
           228,   183,     8,  1782,   162,   683,    50,  2330,    10,   353,
             4,    38,   236,   951,    54,    16,    55,  1455,     8,    55,
             9,    10,   301, 15625,     4,   226,  7223,     6,    37,    34,
            56,    10,   182,  3610,   173,  3078,     8,    38,    33,   129,
           450,   123,    10,   367,   498,    11,     5,    94,   231,   688,
             4,    38,   174,   123,    14,    38,    64,    75,   535,    11,
            42,   169,   142,    38,  5861,   619,  8164,     8,  5800,     8,
            14,    37,    16,    45,   442,     5,  1291,   615,     9,    10,
          2052,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [9]:
class CustomDataset(Dataset):
    def __init__(self, formatted_data):
        self.formatted_data = formatted_data

    def __len__(self):
        return len(self.formatted_data)

    def __getitem__(self, idx):
        return {
            'input_ids': self.formatted_data[idx]['input_ids'].squeeze(),
            'attention_mask': self.formatted_data[idx]['attention_mask'].squeeze(), 
            #'labels': self.formatted_data[idx]['labels'].squeeze() if self.formatted_data[idx]['labels'] is not None else None,
            'start_positions': self.formatted_data[idx]['start_positions'] if 'start_positions' in self.formatted_data[idx] else None,
            'end_positions': self.formatted_data[idx]['end_positions'] if 'end_positions' in self.formatted_data[idx] else None
        }
    
n=len(formatted_df)
trainset = CustomDataset(formatted_df[0:n*4//5])
testset = CustomDataset(formatted_df[n*4//5:])

In [10]:
from torch.nn.utils.rnn import pad_sequence

def custom_collate_fn(batch):
    input_ids = pad_sequence([item['input_ids'] for item in batch], batch_first=True)
    attention_mask = pad_sequence([item['attention_mask'] for item in batch], batch_first=True)

    #labels = pad_sequence([item['labels'] for item in batch if item['labels'] is not None], batch_first=True) if any(item['labels'] is not None for item in batch) else None

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        #'labels': labels,
    }
        
batch_size = 8  

train_dataloader = DataLoader(trainset, batch_size=batch_size, 
                              shuffle=True, collate_fn=custom_collate_fn)
test_dataloader = DataLoader(testset, batch_size=batch_size, 
                              shuffle=False, collate_fn=custom_collate_fn)

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 
model.to(device)
model.train()

RobertaForCausalLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 

In [12]:
for batch in train_dataloader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    #labels = batch['labels'].to(device) if batch['labels'] is not None else None

    outputs = model(input_ids=input_ids, attention_mask=attention_mask,)# labels=labels

In [13]:
model.eval()
all_predictions = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        
        # Get the predicted token IDs
        logits = outputs.logits
        predicted_ids = torch.argmax(logits, dim=-1)

        # Convert token IDs to tokens
        predicted_tokens = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)

        # Extend the list of predictions
        all_predictions.extend(predicted_tokens)