<a href="https://colab.research.google.com/github/samanthajmichael/machine_learning/blob/main/notebooks/FinBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install tdqm

In [None]:
import pandas as pd
def load_github_data(url):
    """
    Load data from GitHub raw content URL
    Example URL: https://raw.githubusercontent.com/samanthajmichael/machine_learning/main/data/complaints.csv
    """
    return pd.read_csv(url)

In [None]:
url = "https://raw.githubusercontent.com/samanthajmichael/machine_learning/main/data/complaints.csv"
df = pd.read_csv(url)

In [None]:
df = df.copy(deep=True)
df = df.rename(columns={'Date received': 'Date', 'Consumer complaint narrative': 'Complaint'})
df = df.loc[(df['Product']=='Bank account or service') |
            (df['Product']=='Checking or savings account') |
            (df['Product']=='Money transfers') |
            (df['Product']=='Money transfer, virtual currency, or money service')]
df = df[['Date', 'Product', 'Complaint']]
df = df.set_index(pd.to_datetime(df['Date'], format='mixed'))
df.drop(['Date'], axis=1, inplace=True)

In [None]:
print("Data shape:", df.shape)
print("\nSample of preprocessed data:")
print(df.head())

Data shape: (20163, 2)

Sample of preprocessed data:
                                Product  \
Date                                      
2022-08-22  Checking or savings account   
2024-11-25  Checking or savings account   
2023-10-31  Checking or savings account   
2022-10-18  Checking or savings account   
2023-11-08  Checking or savings account   

                                                    Complaint  
Date                                                           
2022-08-22  On XX/XX/2022 I moved {$500.00} from my XXXX X...  
2024-11-25  I had XXXX accounts opened on Wells Fargo and ...  
2023-10-31  I was the victim of false charges to my accoun...  
2022-10-18  I received four emails on XX/XX/XXXX about a W...  
2023-11-08  On XX/XX/, I reached out to Wells Fargo in reg...  


In [None]:
df.head()

Unnamed: 0_level_0,Product,Complaint
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-08-22,Checking or savings account,On XX/XX/2022 I moved {$500.00} from my XXXX X...
2024-11-25,Checking or savings account,I had XXXX accounts opened on Wells Fargo and ...
2023-10-31,Checking or savings account,I was the victim of false charges to my accoun...
2022-10-18,Checking or savings account,I received four emails on XX/XX/XXXX about a W...
2023-11-08,Checking or savings account,"On XX/XX/, I reached out to Wells Fargo in reg..."


In [None]:
def preprocess_text(text):
    if pd.isna(text):
        return text

    # Remove specific identifiers and noise
    text = re.sub(r'Wells\s*\.*\s*Fargo|W\s*\.*\s*F\s*\.*|xxx+|\d+', '', text, flags=re.IGNORECASE)

    # Normalize whitespace
    text = ' '.join(text.split())

    return text

In [None]:
import torch
import numpy as np
import transformers
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from transformers import AutoTokenizer, BertConfig, BertModel, BertPreTrainedModel
from torch.utils.data import Dataset, DataLoader
from transformers.optimization import get_linear_schedule_with_warmup
import torch.nn as nn
import torch.optim
from tqdm import tqdm
import pkg_resources
import re

# Model Definition
class FinBertSentimentRegression(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.regressor = nn.Sequential(
            nn.Linear(config.hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 1),
            nn.Tanh() # Contrains the output layer to [-1,1]
        )
        self.init_weights()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs[1]
        regression_output = self.regressor(pooled_output)

        loss = None
        if labels is not None:
            # MSE Loss with labels contrained to [-1, 1]
            labels = torch.clamp(labels, min=-1, max=1)
            loss_fct = nn.MSELoss()
            loss = loss_fct(regression_output.view(-1), labels.view(-1))
        else:
            loss = torch.mean(regression_output)

        return loss, regression_output

    def save_model(self, path):
        torch.save(self.state_dict(), path)

    @classmethod
    def load_model(cls, model_path, config):
        model = cls(config)
        model.load_state_dict(torch.load(model_path, weights_only=True, map_location='cpu'))
        return model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config = BertConfig.from_pretrained("ProsusAI/finbert", num_labels=1)
model = FinBertSentimentRegression.from_pretrained("ProsusAI/finbert", config=config)
model.to(device)

class ComplaintDataset(Dataset):
    def __init__(self, texts, labels=None, tokenizer=None, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx]) if isinstance(self.texts, pd.Series) else str(self.texts[idx])

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt'
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten()
        }

def analyze_complaints(df, test_size=0.2, batch_size=16, learning_rate=2e-5, num_epochs=3):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    processed_texts = df['Complaint'].apply(lambda x: preprocess_text(x) if pd.notna(x) else x)
    valid_mask = processed_texts.notna() & (processed_texts != '')
    processed_texts = processed_texts[valid_mask]

    dates = df[valid_mask].index
    texts = processed_texts

    tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
    config = BertConfig.from_pretrained("ProsusAI/finbert", num_labels=1)
    model = FinBertSentimentRegression.from_pretrained("ProsusAI/finbert", config=config)
    model.to(device)

    texts_train, texts_test, dates_train, dates_test = train_test_split(
        texts, dates, test_size=test_size, random_state=42
    )

    train_dataset = ComplaintDataset(texts_train, tokenizer=tokenizer)
    test_dataset = ComplaintDataset(texts_test, tokenizer=tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=len(train_loader) * num_epochs
    )

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)

            loss, _ = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids
            )

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Average Loss: {avg_loss:.4f}")

    def process_dataloader(loader, dates):
        predictions = []
        texts_processed = []
        losses = []

        model.eval()
        with torch.no_grad():
            for batch in tqdm(loader, desc="Processing"):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                token_type_ids = batch['token_type_ids'].to(device)

                loss, outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids
                )

                predictions.extend(outputs.cpu().numpy().flatten())
                texts_processed.extend(batch['text'])
                if loss is not None:
                    losses.append(loss.item())

        results_df = pd.DataFrame({
            'processed_text': texts_processed,
            'sentiment_score': predictions,
        }, index=dates)

        results_df['severity'] = pd.cut(
            results_df['sentiment_score'],
            bins=5,
            labels=['Very Low', 'Low', 'Medium', 'High', 'Very High']
        )

        return results_df, np.mean(losses) if losses else 0

    train_results, train_loss = process_dataloader(train_loader, dates_train)
    test_results, test_loss = process_dataloader(test_loader, dates_test)

    monthly_metrics = pd.concat([
        train_results.groupby(pd.Grouper(freq='ME'))['sentiment_score'].agg(['mean', 'std', 'count']),
        test_results.groupby(pd.Grouper(freq='ME'))['sentiment_score'].agg(['mean', 'std', 'count'])
    ]).sort_index()

    metrics = {
        'train_loss': train_loss,
        'test_loss': test_loss,
        'train_mse': mean_squared_error(train_results['sentiment_score'], [0] * len(train_results)),
        'test_mse': mean_squared_error(test_results['sentiment_score'], [0] * len(test_results))
    }

    return train_results, test_results, monthly_metrics, metrics

Some weights of FinBertSentimentRegression were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized: ['regressor.0.bias', 'regressor.0.weight', 'regressor.3.bias', 'regressor.3.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# # For Colab to local Windows
from google.colab import files

# # Save temporarily in Colab
temp_path = '/content/finbert_sentiment_regression_model'
model.save_model(temp_path)

# # Download to local machine
files.download('finbert_sentiment_regression_model')

print("Model downloaded - please move file to:")
print(r"C:\Users\saman\Documents\WF Project\finbert_sentiment_regression_model")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Model downloaded - please move file to:
C:\Users\saman\Documents\WF Project\finbert_sentiment_regression_model


In [None]:
# # Load - must move model to the colab environment first
# loaded_model = FinBertSentimentRegression.load_model('/content/finbert_sentiment_regression_model', config)

In [None]:
# Run analysis
train_results, test_results, monthly_metrics, metrics = analyze_complaints(df)

# Save results to CSV
train_results.to_csv('train_results.csv')
test_results.to_csv('test_results.csv')
monthly_metrics.to_csv('monthly_metrics.csv')

# Save metrics to CSV
pd.DataFrame.from_dict(metrics, orient='index', columns=['value']).to_csv('model_metrics.csv')

# Print summary statistics
print("\nModel Metrics:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")

print("\nMonthly Metrics Sample:")
print(monthly_metrics.head())

Some weights of FinBertSentimentRegression were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized: ['regressor.0.bias', 'regressor.0.weight', 'regressor.3.bias', 'regressor.3.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████| 1009/1009 [11:35<00:00,  1.45it/s]


Average Loss: -0.9800


Epoch 2/3: 100%|██████████| 1009/1009 [11:34<00:00,  1.45it/s]


Average Loss: -0.9999


Epoch 3/3: 100%|██████████| 1009/1009 [11:33<00:00,  1.45it/s]


Average Loss: -1.0000


Processing: 100%|██████████| 1009/1009 [03:39<00:00,  4.60it/s]
Processing: 100%|██████████| 253/253 [00:54<00:00,  4.61it/s]



Model Metrics:
train_loss: -1.0000
test_loss: -1.0000
train_mse: 1.0000
test_mse: 1.0000

Monthly Metrics Sample:
                mean           std  count
Date                                     
2015-03-31 -0.999981  1.467365e-07     12
2015-03-31 -0.999981  0.000000e+00      4
2015-04-30 -0.999981  8.796994e-08     34
2015-04-30 -0.999981  3.935249e-08     13
2015-05-31 -0.999981  1.083776e-07     50


In [None]:
def analyze_full_dataset(df):
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  processed_texts = df['Complaint'].apply(lambda x: preprocess_text(x) if pd.notna(x) else x)
  valid_mask = processed_texts.notna() & (processed_texts != '')
  dates = df[valid_mask].index
  texts = processed_texts[valid_mask]

  tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
  config = BertConfig.from_pretrained("ProsusAI/finbert", num_labels=1)
  model = FinBertSentimentRegression.from_pretrained("ProsusAI/finbert", config=config)
  model.to(device)

  dataset = ComplaintDataset(texts, tokenizer=tokenizer)
  dataloader = DataLoader(dataset, batch_size=16)

  predictions = []
  texts_processed = []

  model.eval()
  with torch.no_grad():
      for batch in tqdm(dataloader, desc="Processing"):
          input_ids = batch['input_ids'].to(device)
          attention_mask = batch['attention_mask'].to(device)
          token_type_ids = batch['token_type_ids'].to(device)

          _, outputs = model(input_ids, attention_mask, token_type_ids)
          predictions.extend(outputs.cpu().numpy().flatten())
          texts_processed.extend(batch['text'])

  results_df = pd.DataFrame({
      'processed_text': texts_processed,
      'sentiment_score': predictions,
  }, index=dates)

  results_df['severity'] = pd.cut(results_df['sentiment_score'], bins=5,
                                  labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])

  results_df.to_csv('full_dataset_results.csv')
  return results_df

In [None]:
# Run analysis on full dataset
full_results = analyze_full_dataset(df)

In [None]:
# Labels:
# Very Low: Least severe complaints (lowest negative sentiment)
# Low: Slightly negative complaints
# Medium: Neutral or mildly negative complaints
# High: More significantly negative complaints
# Very High: Most severe complaints (highest negative sentiment)