<a href="https://colab.research.google.com/github/samanthajmichael/complaints/blob/main/notebooks/FinBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
%%capture
!pip install tdqm

In [37]:
import pandas as pd
def load_github_data(url):
    """
    Load data from GitHub raw content URL
    Example URL: https://raw.githubusercontent.com/samanthajmichael/machine_learning/main/data/complaints.csv
    """
    return pd.read_csv(url)

In [38]:
url = "https://raw.githubusercontent.com/samanthajmichael/machine_learning/main/data/complaints.csv"
df = pd.read_csv(url)

In [39]:
df = df.copy(deep=True)
df = df.rename(columns={'Date received': 'Date', 'Consumer complaint narrative': 'Complaint'})
df = df.loc[(df['Product']=='Bank account or service') |
            (df['Product']=='Checking or savings account') |
            (df['Product']=='Money transfers') |
            (df['Product']=='Money transfer, virtual currency, or money service')]
df = df[['Date', 'Product', 'Complaint']]
df = df.set_index(pd.to_datetime(df['Date'], format='mixed'))
df.drop(['Date'], axis=1, inplace=True)
df = df.dropna(subset=['Complaint'])

In [40]:
print("Data shape:", df.shape)
print("\nSample of preprocessed data:")
print(df.head())

Data shape: (20347, 2)

Sample of preprocessed data:
                                                      Product  \
Date                                                            
2022-10-26  Money transfer, virtual currency, or money ser...   
2023-02-10                        Checking or savings account   
2024-10-31                        Checking or savings account   
2023-02-02                        Checking or savings account   
2023-03-01                        Checking or savings account   

                                                    Complaint  
Date                                                           
2022-10-26  On XX/XX/2022, I was contacted by XXXX XXXX ; ...  
2023-02-10  I had been banking with Wells Fargo since XXXX...  
2024-10-31  From XXXX until XXXXXXXX XXXX XXXX someone had...  
2023-02-02  Several years ago opened an additional savings...  
2023-03-01  I stopped using my wells Fargo account because...  


In [41]:
df.head()

Unnamed: 0_level_0,Product,Complaint
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-10-26,"Money transfer, virtual currency, or money ser...","On XX/XX/2022, I was contacted by XXXX XXXX ; ..."
2023-02-10,Checking or savings account,I had been banking with Wells Fargo since XXXX...
2024-10-31,Checking or savings account,From XXXX until XXXXXXXX XXXX XXXX someone had...
2023-02-02,Checking or savings account,Several years ago opened an additional savings...
2023-03-01,Checking or savings account,I stopped using my wells Fargo account because...


In [42]:
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, BertConfig, BertModel, BertPreTrainedModel
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from tqdm import tqdm
import re

def preprocess_text(text):
    if pd.isna(text):
        return text

    text = str(text).strip()

    # Standardize masked content while preserving case
    text = re.sub(r'X{2,}|x{2,}', 'XXX', text)

    # Keep currency amounts but standardize format
    text = re.sub(r'\{\$[\d,]+\.\d{2}\}', lambda m: m.group().strip('{}'), text)

    # Remove truly invalid characters while preserving sentiment indicators
    text = re.sub(r'[^\w\s.,!?$%\'\"()-]', ' ', text)

    # Normalize whitespace without lowercase conversion
    text = ' '.join(text.split())

    return text

class FinBertSentiment(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.regressor = nn.Sequential(
            nn.Linear(config.hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 1) ## removed tanh activation to get more expected results.
        )
        self.init_weights()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs[1]
        return self.regressor(pooled_output)

class ComplaintDataset(Dataset):
    def __init__(self, texts, tokenizer=None, max_len=256):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx]) if isinstance(self.texts, pd.Series) else str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors=None
        )
        return {
            'text': text,
            'input_ids': torch.tensor(encoding['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoding['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(encoding['token_type_ids'], dtype=torch.long)
        }

def analyze_complaints(df, batch_size=32):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    processed_texts = df['Complaint'].apply(preprocess_text)
    valid_mask = processed_texts.notna() & (processed_texts != '')
    processed_texts = processed_texts[valid_mask]
    dates = df[valid_mask].index

    tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
    config = BertConfig.from_pretrained("ProsusAI/finbert")
    model = FinBertSentiment.from_pretrained("ProsusAI/finbert", config=config)
    model.to(device)

    dataset = ComplaintDataset(processed_texts, tokenizer=tokenizer)
    dataloader = DataLoader(dataset, batch_size=batch_size)

    predictions = []
    texts_processed = []

    model.eval()
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Processing"):
            model_inputs = {k: v.to(device) for k, v in batch.items() if k != 'text'}
            outputs = model(**model_inputs)
            predictions.extend(outputs.cpu().numpy().flatten())
            texts_processed.extend(batch['text'])

    results_df = pd.DataFrame({
        'processed_text': texts_processed,
        'sentiment_score': predictions,
    }, index=dates)

    results_df['severity'] = pd.qcut(
        results_df['sentiment_score'],
        q=5,
        labels=['Extremely Negative', 'Very Negative', 'Negative', 'Moderately Negative', 'Slightly Negative']
    )

    monthly_metrics = results_df.groupby(pd.Grouper(freq='ME'))['sentiment_score'].agg([
        'mean', 'std', 'count'
    ]).sort_index()

    return results_df, monthly_metrics

In [43]:
# Run analysis
results_df, monthly_metrics = analyze_complaints(df)

# Save results
results_df.to_csv('sentiment_results.csv')
monthly_metrics.to_csv('monthly_metrics.csv')

print("\nMonthly Metrics Sample:")
print(monthly_metrics.head())

Some weights of FinBertSentiment were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized: ['regressor.0.bias', 'regressor.0.weight', 'regressor.3.bias', 'regressor.3.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing: 100%|██████████| 636/636 [05:16<00:00,  2.01it/s]



Monthly Metrics Sample:
                mean       std  count
Date                                 
2015-03-31  0.137467  0.016375     16
2015-04-30  0.139712  0.016807     47
2015-05-31  0.142955  0.014754     58
2015-06-30  0.142191  0.013605     68
2015-07-31  0.141952  0.014040     69


In [44]:
# Labels:
# Very Low: Least severe complaints (lowest negative sentiment)
# Low: Slightly negative complaints
# Medium: Neutral or mildly negative complaints
# High: More significantly negative complaints
# Very High: Most severe complaints (highest negative sentiment)