In [None]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

import torch
import torch.nn as nn

import transformers
from transformers import (AutoTokenizer,
                          AutoModelForSequenceClassification,
                          TrainingArguments,
                          Trainer,
                          AutoModelForMaskedLM,AutoConfig)

from datasets import load_dataset
from datasets import Dataset

from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split

In [None]:
import os

# Process-wide environment switches, applied in one call:
#   CUDA_VISIBLE_DEVICES   -> "0"
#   TOKENIZERS_PARALLELISM -> "false"
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "0",
    "TOKENIZERS_PARALLELISM": "false",
})

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Record the installed transformers version in the notebook output.
print("transformers==" + transformers.__version__)

transformers==4.48.2


In [6]:
# Count the entries in the conversations directory (`ls` output piped into `wc -l`).
# NOTE(review): this path contains a single `sentiment-take-home` segment, whereas the CSV
# files and `conversations_path` used later in this notebook live under
# `sentiment-take-home/sentiment-take-home/` — confirm this is the intended directory.
!ls /root/Prodigal-Take-Home/sentiment-take-home/conversations | wc -l

146


# 1. Prepare dataset

In [7]:

# Load the train / validation / test splits from their CSV files
csv_paths = (
    '/root/Prodigal-Take-Home/sentiment-take-home/sentiment-take-home/train_df.csv',
    '/root/Prodigal-Take-Home/sentiment-take-home/sentiment-take-home/val_df.csv',
    '/root/Prodigal-Take-Home/sentiment-take-home/sentiment-take-home/test_df.csv',
)
train_df, val_df, test_df = [pd.read_csv(csv_path) for csv_path in csv_paths]

# Print a heading followed by the DataFrame.info() summary for each split
for heading, split_df in (
    ("Train DataFrame Info:", train_df),
    ("\nValidation DataFrame Info:", val_df),
    ("\nTest DataFrame Info:", test_df),
):
    print(heading)
    split_df.info()

Train DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600 entries, 0 to 1599
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   _id          1600 non-null   object 
 1   duration     1600 non-null   float64
 2   disposition  1600 non-null   object 
 3   type         1600 non-null   object 
dtypes: float64(1), object(3)
memory usage: 50.1+ KB

Validation DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   _id          200 non-null    object 
 1   duration     200 non-null    float64
 2   disposition  200 non-null    object 
 3   type         200 non-null    object 
dtypes: float64(1), object(3)
memory usage: 6.4+ KB

Test DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   

In [8]:
def add_text_column(df, conversations_path, encoding="utf-8"):
    """
    Adds a 'text' column to the DataFrame by reading text from files.

    For every value in the '_id' column, the file
    ``<conversations_path>/<_id>.txt`` is read in full and its contents are
    stored in the new 'text' column. The DataFrame is modified in place and
    the same object is returned.

    Args:
        df (pd.DataFrame): The DataFrame to modify. Must contain an '_id' column.
        conversations_path (str): The path to the directory containing the text files.
        encoding (str): Encoding used to decode the text files. Defaults to
            "utf-8" so the result does not depend on the platform locale.

    Returns:
        pd.DataFrame: The DataFrame with the added 'text' column.

    Raises:
        KeyError: If `df` has no '_id' column.
        OSError: If a file for one of the ids cannot be opened
            (e.g. FileNotFoundError when it does not exist).
    """
    def _read_conversation(conversation_id):
        """Return the full contents of the text file for one id."""
        file_path = os.path.join(conversations_path, f"{conversation_id}.txt")
        # Use a context manager so each handle is closed right away instead of
        # leaving one open file per row until garbage collection.
        with open(file_path, encoding=encoding) as handle:
            return handle.read()

    df['text'] = df['_id'].apply(_read_conversation)
    return df

conversations_path = '/root/Prodigal-Take-Home/sentiment-take-home/sentiment-take-home/conversations'

# Attach the conversation text to each split, in train / val / test order
train_df, val_df, test_df = [
    add_text_column(split_df, conversations_path)
    for split_df in (train_df, val_df, test_df)
]

# Preview the first rows of every split
for heading, split_df in (
    ("Train DataFrame with 'text' column:", train_df),
    ("\nValidation DataFrame with 'text' column:", val_df),
    ("\nTest DataFrame with 'text' column:", test_df),
):
    print(heading)
    display(split_df.head())

Train DataFrame with 'text' column:


Unnamed: 0,_id,duration,disposition,type,text
0,b790a36f-de41-408a-b1c9-96da476c0f52,234.396625,Promise - Payment plan,positive,agent 00:02-00:16\nthanks for calling agency X...
1,f796f64a-c71d-4921-b809-199e060406a3,237.871,Promise - Settlement in full,positive,agent 00:02-00:16\nthank you for calling agenc...
2,48a5cfc3-1232-44a7-beda-e391b8d6f933,296.136,Promise - Payment plan,positive,agent 00:01-00:09\nhello my name is loretta ag...
3,64195c08-8c82-418c-8e74-711f60a36786,174.106,Promise - Payment plan,positive,agent 00:03-00:15\nthank you for calling agenc...
4,7207df04-036b-4316-a83c-e4e54c6e63b4,28.08,No Pay - Cant pay now,negative,borrower 00:04-00:04\nhello\n\nagent 00:06-00:...



Validation DataFrame with 'text' column:


Unnamed: 0,_id,duration,disposition,type,text
0,1f8dd4d9-2493-4692-a22b-455d7a77c9b0,191.448,Promise - Payment in full,positive,borrower 00:01-00:01\nhello\n\nagent 00:02-00:...
1,30071157-48a0-4848-9cad-da03f19e50b0,465.371375,Promise - One time payment,positive,agent 00:01-00:20\nhello thank you so much for...
2,a4bd9f63-b970-4815-a657-70578cd58fef,128.522375,Promise - Settlement in payments,positive,borrower 00:02-00:04\nwhat the hell is that oh...
3,f1fa8c59-5567-442d-8788-529f40eab2e0,347.611375,No Pay - Dispute,negative,agent 00:07-00:20\nthank you for calling agenc...
4,f3429a48-b8df-434c-a329-899a5b87b92c,220.65625,No Pay - Not right now,negative,agent 00:05-00:20\nhello thank you for calling...



Test DataFrame with 'text' column:


Unnamed: 0,_id,duration,disposition,type,text
0,086203d1-c17c-4964-8e60-7bc32619129b,163.224,Promise - Payment in full,positive,agent 00:03-00:12\ngood afternoon this is agen...
1,7d35f519-3b2f-4383-a307-0cd5dd62fba8,285.544375,Promise - Payment in full,positive,agent 00:03-00:13\nmy wack agent id one two si...
2,903aeb1a-16e2-49cd-95fe-53f8f65a4da5,315.977125,Promise - Payment in full,positive,borrower 00:08-00:08\nhello\n\nagent 00:10-00:...
3,9fbf5b24-ae73-4ca2-8bd6-e9348143f220,235.389,No Pay - Dispute,negative,agent 00:03-00:17\nthank you for calling shore...
4,86ea8ebe-f4f0-4ee6-8018-e61939e2f673,19.224,No Pay - Cant pay now,negative,agent 00:01-00:08\nhello my name is loretta ag...


In [11]:
# Label mapping: class name -> integer id, and the inverse derived from it
label2id = {"negative": 0, "positive": 1}
id2label = {class_id: class_name for class_name, class_id in label2id.items()}

In [12]:
# Translate the string class in 'type' into an integer 'labels' column on each split
for split_df in (train_df, val_df, test_df):
    split_df['labels'] = split_df['type'].map(label2id)

# Preview the first rows of every split
for heading, split_df in (
    ("Train DataFrame with 'labels' column:", train_df),
    ("\nValidation DataFrame with 'labels' column:", val_df),
    ("\nTest DataFrame with 'labels' column:", test_df),
):
    print(heading)
    display(split_df.head())

Train DataFrame with 'labels' column:


Unnamed: 0,_id,duration,disposition,type,text,labels
0,b790a36f-de41-408a-b1c9-96da476c0f52,234.396625,Promise - Payment plan,positive,agent 00:02-00:16\nthanks for calling agency X...,1
1,f796f64a-c71d-4921-b809-199e060406a3,237.871,Promise - Settlement in full,positive,agent 00:02-00:16\nthank you for calling agenc...,1
2,48a5cfc3-1232-44a7-beda-e391b8d6f933,296.136,Promise - Payment plan,positive,agent 00:01-00:09\nhello my name is loretta ag...,1
3,64195c08-8c82-418c-8e74-711f60a36786,174.106,Promise - Payment plan,positive,agent 00:03-00:15\nthank you for calling agenc...,1
4,7207df04-036b-4316-a83c-e4e54c6e63b4,28.08,No Pay - Cant pay now,negative,borrower 00:04-00:04\nhello\n\nagent 00:06-00:...,0



Validation DataFrame with 'labels' column:


Unnamed: 0,_id,duration,disposition,type,text,labels
0,1f8dd4d9-2493-4692-a22b-455d7a77c9b0,191.448,Promise - Payment in full,positive,borrower 00:01-00:01\nhello\n\nagent 00:02-00:...,1
1,30071157-48a0-4848-9cad-da03f19e50b0,465.371375,Promise - One time payment,positive,agent 00:01-00:20\nhello thank you so much for...,1
2,a4bd9f63-b970-4815-a657-70578cd58fef,128.522375,Promise - Settlement in payments,positive,borrower 00:02-00:04\nwhat the hell is that oh...,1
3,f1fa8c59-5567-442d-8788-529f40eab2e0,347.611375,No Pay - Dispute,negative,agent 00:07-00:20\nthank you for calling agenc...,0
4,f3429a48-b8df-434c-a329-899a5b87b92c,220.65625,No Pay - Not right now,negative,agent 00:05-00:20\nhello thank you for calling...,0



Test DataFrame with 'labels' column:


Unnamed: 0,_id,duration,disposition,type,text,labels
0,086203d1-c17c-4964-8e60-7bc32619129b,163.224,Promise - Payment in full,positive,agent 00:03-00:12\ngood afternoon this is agen...,1
1,7d35f519-3b2f-4383-a307-0cd5dd62fba8,285.544375,Promise - Payment in full,positive,agent 00:03-00:13\nmy wack agent id one two si...,1
2,903aeb1a-16e2-49cd-95fe-53f8f65a4da5,315.977125,Promise - Payment in full,positive,borrower 00:08-00:08\nhello\n\nagent 00:10-00:...,1
3,9fbf5b24-ae73-4ca2-8bd6-e9348143f220,235.389,No Pay - Dispute,negative,agent 00:03-00:17\nthank you for calling shore...,0
4,86ea8ebe-f4f0-4ee6-8018-e61939e2f673,19.224,No Pay - Cant pay now,negative,agent 00:01-00:08\nhello my name is loretta ag...,0


# 2. Train a model to predict borrower sentiment

The labels in the dataset indicate borrower sentiment, so we train the model to predict a sentiment score for each conversation.

## Experiment: Fine-tune ModernBERT

In [13]:
# Load the tokenizer for the ModernBERT-base checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="answerdotai/ModernBERT-base"
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

In [None]:
# Load the checkpoint as a sequence classifier, stating the number of labels
# and passing both directions of the label mapping
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="answerdotai/ModernBERT-base",
    label2id=label2id,
    id2label=id2label,
    num_labels=2,
)

In [None]:
# Bare expression: the notebook renders the tokenizer's repr as this cell's output.
tokenizer