<a href="https://colab.research.google.com/github/samanthajmichael/machine_learning/blob/main/notebooks/FinBERT_Sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

def load_github_data(url):
    """Read a CSV file into a DataFrame.

    Args:
        url: Location of the CSV, handed unchanged to ``pandas.read_csv``.
            Typically a GitHub raw-content URL, for example
            https://raw.githubusercontent.com/samanthajmichael/machine_learning/main/data/complaints.csv

    Returns:
        pandas.DataFrame: The parsed contents of the CSV.
    """
    frame = pd.read_csv(url)
    return frame

In [2]:
# Raw-content URL of the complaints CSV in the project's GitHub repository.
url = "https://raw.githubusercontent.com/samanthajmichael/machine_learning/main/data/complaints.csv"
# Load through the load_github_data helper defined in the first cell so the
# notebook has a single data-loading code path.
df = load_github_data(url)

In [3]:
# Preview the first rows of the raw complaints data.
df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,10/19/24,Checking or savings account,Checking account,Opening an account,Unable to open an account,,Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,IA,52214,,Consent not provided,Web,10/19/24,Closed with explanation,Yes,,10495346
1,09/01/24,Mortgage,Conventional home mortgage,Trouble during payment process,Payment process,1. I requested documentation and proof from We...,Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,IN,46375,Older American,Consent provided,Web,09/03/24,Closed with explanation,Yes,,9981722
2,10/18/24,Checking or savings account,Checking account,Closing an account,Company closed your account,Try to use my card and got declined and call t...,Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,TN,38118,,Consent provided,Web,10/18/24,Closed with explanation,Yes,,10500790
3,08/18/24,Checking or savings account,Checking account,Opening an account,Account opened without my consent or knowledge,W. F employees opened both personal and busine...,Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,CA,92025,,Consent provided,Web,08/18/24,Closed with explanation,Yes,,9840276
4,10/19/24,Mortgage,Conventional home mortgage,Struggling to pay mortgage,"An existing modification, forbearance plan, sh...",Wells Fargo had a major security breech during...,Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,AL,36067,,Consent provided,Web,10/19/24,Closed with explanation,Yes,,10512472


In [4]:
# Keep only complaints whose Product is one of the bank-account or
# money-transfer categories listed below.
df = df.loc[df['Product'].isin([
    'Bank account or service',
    'Checking or savings account',
    'Money transfers',
    'Money transfer, virtual currency, or money service',
])]

In [5]:
# Shorten the two column names that later cells refer to as 'Date' and 'Complaint'.
df = df.rename(columns={
    'Date received': 'Date',
    'Consumer complaint narrative': 'Complaint'
})

In [6]:
# Convert the date strings to datetime values; format='mixed' asks pandas to
# infer the format of each element individually.
df['Date'] = pd.to_datetime(df['Date'], format='mixed')

In [7]:
# Drop every column except Date, Product and Complaint.
df = df[['Date', 'Product', 'Complaint']]

In [8]:
# Summarize dtypes and non-null counts of the trimmed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50266 entries, 0 to 136352
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       50266 non-null  datetime64[ns]
 1   Product    50266 non-null  object        
 2   Complaint  19615 non-null  object        
dtypes: datetime64[ns](1), object(2)
memory usage: 1.5+ MB


In [9]:
# Show the first rows of the trimmed frame as plain text.
print(df.head())

        Date                      Product  \
0 2024-10-19  Checking or savings account   
2 2024-10-18  Checking or savings account   
3 2024-08-18  Checking or savings account   
5 2024-08-18  Checking or savings account   
6 2024-10-18  Checking or savings account   

                                           Complaint  
0                                                NaN  
2  Try to use my card and got declined and call t...  
3  W. F employees opened both personal and busine...  
5  My. Wells. Fargo. Acount. Was. Hacked. Several...  
6  I receive direct deposits from my job every Fr...  


In [10]:
# List the remaining column names.
print(df.columns)

Index(['Date', 'Product', 'Complaint'], dtype='object')


In [11]:
%%capture
!pip install symspellpy

In [13]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
from symspellpy import SymSpell, Verbosity
import pkg_resources
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

def init_symspell():
    """Build a SymSpell spell checker loaded with the bundled English dictionary.

    The checker is created with ``max_dictionary_edit_distance=2`` and
    ``prefix_length=7`` and populated from the
    ``frequency_dictionary_en_82_765.txt`` file located inside the installed
    ``symspellpy`` package.

    Returns:
        SymSpell: The initialized spell checker.

    Raises:
        FileNotFoundError: If the dictionary file could not be loaded.
    """
    # Function-scope import: importlib.resources is the standard-library
    # replacement for the deprecated pkg_resources resource API.
    from importlib import resources

    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary = resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt"
    with resources.as_file(dictionary) as dictionary_path:
        # term_index / count_index are the column positions of the word and
        # of its frequency count within each dictionary line.
        loaded = sym_spell.load_dictionary(
            str(dictionary_path), term_index=0, count_index=1)
    if not loaded:
        # load_dictionary signals a missing file by returning False; without
        # this check the checker would silently run with an empty dictionary.
        raise FileNotFoundError(
            "Could not load the symspellpy frequency dictionary "
            "'frequency_dictionary_en_82_765.txt'")
    return sym_spell

def preprocess_text(text, sym_spell):
    """Clean one complaint narrative and spell-correct it.

    Steps, in order: remove mentions of the bank name, remove digits, remove
    redaction placeholders made of capital X's, collapse whitespace, then run
    SymSpell compound correction on what remains.

    Args:
        text: The raw complaint text. Missing values (anything for which
            ``pd.isna`` is true) are returned unchanged.
        sym_spell: Object providing ``lookup_compound(text, max_edit_distance=...)``
            that returns a sequence of suggestions exposing a ``term`` attribute.

    Returns:
        The corrected text; the cleaned text itself when no suggestion is
        returned or nothing is left after cleaning; or the original missing
        value.
    """
    if pd.isna(text):
        return text

    # Remove bank-name mentions ("Wells Fargo", "Wells. Fargo.", "WF", "W. F").
    # The \b anchors matter: without them the case-insensitive abbreviation
    # pattern also matched a "w" followed by an "f" inside or across ordinary
    # words ("awful" -> "aul", "withdraw funds" -> "withdraunds").
    # Substituting a space keeps the neighbouring words from being glued together.
    text = re.sub(
        r'\b(?:Wells\s*\.*\s*Fargo|W\s*\.*\s*F)\b\.*',
        ' ', text, flags=re.IGNORECASE)

    # Remove digits.
    text = re.sub(r'\d+', '', text)
    # Remove redaction placeholders: any run of three or more X's (as before),
    # plus whole-token runs of two or more X's joined by slashes so that
    # redacted dates such as "XX/XX/XXXX" disappear completely.
    text = re.sub(r'\bXX+(?:/XX+)*|XXX+', ' ', text)

    # Collapse runs of whitespace and trim the ends.
    text = ' '.join(text.split())
    if not text:
        # Nothing left to correct.
        return text

    # Spell correction over the whole phrase.
    suggestions = sym_spell.lookup_compound(text, max_edit_distance=2)
    return suggestions[0].term if suggestions else text

class ComplaintDataset(Dataset):
    """Torch dataset that tokenizes complaint texts on access.

    Args:
        texts: pandas Series of complaint texts; missing entries are dropped.
        tokenizer: Callable tokenizer accepting the keyword arguments used in
            ``__getitem__`` and returning a mapping with ``input_ids`` and
            ``attention_mask`` tensors.
        max_len: Length to which every text is padded or truncated.
    """

    def __init__(self, texts, tokenizer, max_len):
        # Missing narratives are removed here, so positions in this dataset
        # correspond to the non-null entries of `texts`, in order.
        self.texts = texts.dropna().tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of non-missing texts."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the text at position `item` together with its encoding.

        Returns:
            dict: ``text`` (str), ``input_ids`` and ``attention_mask``
            (tensors flattened to one dimension).
        """
        text = str(self.texts[item])
        # Call the tokenizer directly; the Transformers documentation marks
        # `encode_plus` as deprecated in favour of `__call__`.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'text': text,
            # flatten() removes the leading batch dimension (presumably
            # (1, max_len) for a single text) so the DataLoader can stack items.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

def analyze_complaints_with_preprocessing(df, batch_size=16, max_len=256,
                                          model_name="ProsusAI/finbert"):
    """Score every non-missing complaint with a sequence-classification model.

    Each complaint is cleaned and spell-corrected with ``preprocess_text``,
    tokenized by ``ComplaintDataset`` (padded/truncated to ``max_len``), and
    passed through the model in batches; softmax probabilities are collected.

    Args:
        df: DataFrame with a ``Date`` column and a ``Complaint`` column.
            Rows whose ``Complaint`` is missing are skipped.
        batch_size: Number of complaints per inference batch.
        max_len: Token length used for padding and truncation.
        model_name: Hugging Face model identifier to load for both the
            tokenizer and the classifier.

    Returns:
        pandas.DataFrame: One row per non-missing complaint with columns
        ``date``, ``processed_text``, ``original_text``,
        ``probability_positive``, ``probability_negative``,
        ``probability_neutral`` and ``sentiment_score``
        (positive probability minus negative probability).

    Raises:
        ValueError: If the number of tokenizable texts differs from the
            number of non-missing complaints.
    """
    sym_spell = init_symspell()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    model.to(device)

    # Filter once so that dates, original texts and processed texts are all
    # taken from exactly the same rows, in the same order.
    complaints = df.loc[df['Complaint'].notna(), ['Date', 'Complaint']]
    texts = complaints['Complaint'].apply(lambda raw: preprocess_text(raw, sym_spell))

    dataset = ComplaintDataset(texts, tokenizer, max_len=max_len)
    if len(dataset) != len(complaints):
        # ComplaintDataset drops missing texts; a mismatch would misalign the
        # result columns, so fail before running the expensive inference.
        raise ValueError(
            f"Preprocessing produced {len(dataset)} usable texts for "
            f"{len(complaints)} complaints")
    data_loader = DataLoader(dataset, batch_size=batch_size)

    # Look up the class positions from the model configuration instead of
    # assuming them; fall back to the positive/negative/neutral = 0/1/2 order
    # when a label name is not present.
    label_to_index = {str(label).lower(): int(index)
                      for index, label in model.config.id2label.items()}
    positive_idx = label_to_index.get('positive', 0)
    negative_idx = label_to_index.get('negative', 1)
    neutral_idx = label_to_index.get('neutral', 2)

    predictions = []
    texts_processed = []

    model.eval()
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Convert logits to per-class probabilities for each complaint.
            probs = F.softmax(outputs.logits, dim=1)
            predictions.extend(probs.cpu().numpy())
            texts_processed.extend(batch['text'])

    results_df = pd.DataFrame({
        'date': complaints['Date'].values,
        'processed_text': texts_processed,
        'original_text': complaints['Complaint'].values,
        'probability_positive': [p[positive_idx] for p in predictions],
        'probability_negative': [p[negative_idx] for p in predictions],
        'probability_neutral': [p[neutral_idx] for p in predictions],
        'sentiment_score': [p[positive_idx] - p[negative_idx] for p in predictions]
    })

    return results_df

# Usage
def run_analysis(df):
    """Run the sentiment analysis and print a short report.

    Prints the first rows of the results, the mean and median of
    ``sentiment_score``, and how many complaints have a positive, negative or
    exactly-zero score.

    Args:
        df: DataFrame passed through to
            ``analyze_complaints_with_preprocessing``.

    Returns:
        pandas.DataFrame: The results frame produced by the analysis.
    """
    results = analyze_complaints_with_preprocessing(df)
    print("\nSample Results:")
    print(results.head())

    scores = results['sentiment_score']
    print("\nSummary Statistics:")
    print(f"Average Sentiment Score: {scores.mean():.3f}")
    print(f"Median Sentiment Score: {scores.median():.3f}")

    def _label(score):
        # Label by the sign of the score; 'neutral' only for an exact zero
        # (or a score that compares neither above nor below zero).
        if score > 0:
            return 'positive'
        if score < 0:
            return 'negative'
        return 'neutral'

    # Label from the score column alone rather than applying a function to
    # every full row of the frame.
    labels = pd.Series([_label(score) for score in scores],
                       index=results.index, dtype=object)
    sentiment_counts = labels.value_counts()
    print("\nSentiment Distribution:")
    print(sentiment_counts)

    return results

  import pkg_resources


In [14]:
# Run the full analysis on the prepared frame and keep the results for the
# cells below.
if __name__ == "__main__":
    results = run_analysis(df)

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]


Sample Results:
        date                                     processed_text  \
0 2024-10-18  try to use my card and got declined and call t...   
1 2024-08-18  employees opened both personal and business ac...   
2 2024-08-18  my account was hacked several times and refuse...   
3 2024-10-18  i receive direct deposits from my job every fr...   
4 2024-08-16  i was exposed to a fraudulent activity that co...   

                                       original_text  probability_positive  \
0  Try to use my card and got declined and call t...              0.019030   
1  W. F employees opened both personal and busine...              0.012260   
2  My. Wells. Fargo. Acount. Was. Hacked. Several...              0.009603   
3  I receive direct deposits from my job every Fr...              0.027093   
4  I was exposed to a fradulent activity that cos...              0.022341   

   probability_negative  probability_neutral  sentiment_score  
0              0.533215             0.447755   

In [15]:
# Show the first rows of the sentiment results as plain text.
print(results.head())

        date                                     processed_text  \
0 2024-10-18  try to use my card and got declined and call t...   
1 2024-08-18  employees opened both personal and business ac...   
2 2024-08-18  my account was hacked several times and refuse...   
3 2024-10-18  i receive direct deposits from my job every fr...   
4 2024-08-16  i was exposed to a fraudulent activity that co...   

                                       original_text  probability_positive  \
0  Try to use my card and got declined and call t...              0.019030   
1  W. F employees opened both personal and busine...              0.012260   
2  My. Wells. Fargo. Acount. Was. Hacked. Several...              0.009603   
3  I receive direct deposits from my job every Fr...              0.027093   
4  I was exposed to a fradulent activity that cos...              0.022341   

   probability_negative  probability_neutral  sentiment_score  
0              0.533215             0.447755        -0.514185  


In [17]:
def classify_sentiment(score, threshold=0.2):
    """Map a sentiment score to a coarse label.

    Args:
        score: Numeric sentiment score.
        threshold: Half-width of the neutral band. Scores strictly above
            ``threshold`` are "Positive", scores strictly below
            ``-threshold`` are "Negative". Defaults to 0.2.

    Returns:
        str: "Positive", "Negative", or "Neutral" for everything else
        (including scores exactly on either boundary).
    """
    if score > threshold:
        return "Positive"
    if score < -threshold:
        return "Negative"
    return "Neutral"

# Label each complaint from its sentiment score using classify_sentiment.
results['sentiment_classification'] = results['sentiment_score'].apply(classify_sentiment)
# Confidence is the largest of the three class probabilities; it is computed
# independently of the label assigned on the line above.
results['confidence'] = results[['probability_positive', 'probability_negative', 'probability_neutral']].max(axis=1)

In [18]:
# Write the results, including the row index, to a CSV in the working directory.
results.to_csv('BERT_results.csv', index=True)