In [2]:
%%capture
!pip install symspellpy

In [3]:
%%capture
#!pip uninstall -y nltk
!pip install nltk

In [4]:
%%capture
!pip install textblob
!pip install vaderSentiment

In [12]:
# Standard library
import re

# Third-party
import nltk
import numpy as np
import pandas as pd
import pkg_resources
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from symspellpy import SymSpell
from textblob import TextBlob

# NLTK data used later in this notebook: tokenizer models, the stop-word
# list, and the VADER sentiment lexicon.
for resource in ('punkt_tab', 'stopwords', 'vader_lexicon'):
    nltk.download(resource)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\saman\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\saman\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [13]:
def load_github_data(url):
    """
    Read a CSV published as GitHub raw content into a DataFrame.

    Parameters
    ----------
    url : str
        Location of the CSV, passed straight to ``pandas.read_csv``.
        Example: https://raw.githubusercontent.com/samanthajmichael/machine_learning/main/data/complaints.csv

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV.
    """
    frame = pd.read_csv(url)
    return frame

In [14]:
# Raw-content location of the complaints dataset.
url = "https://raw.githubusercontent.com/samanthajmichael/machine_learning/main/data/complaints.csv"
# Go through the loader defined above so there is a single place that reads the data.
df = load_github_data(url)

In [15]:
# Shorter column names for the two fields used throughout the notebook.
df = df.rename(columns={'Date received': 'Date', 'Consumer complaint narrative': 'Complaint'})

# Keep only the bank-account and money-transfer product categories,
# and only the columns needed for the sentiment analysis.
banking_products = [
    'Bank account or service',
    'Checking or savings account',
    'Money transfers',
    'Money transfer, virtual currency, or money service',
]
df = df.loc[df['Product'].isin(banking_products), ['Date', 'Product', 'Complaint']]

# Index by the parsed date, drop the now-redundant column, and discard
# rows that have no complaint narrative.
df = (
    df.set_index(pd.to_datetime(df['Date'], format='mixed'))
      .drop(columns=['Date'])
      .dropna(subset=['Complaint'])
)

In [16]:
# Preview the first rows of the filtered, date-indexed complaints.
df.head()

Unnamed: 0_level_0,Product,Complaint
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-10-26,"Money transfer, virtual currency, or money ser...","On XX/XX/2022, I was contacted by XXXX XXXX ; ..."
2023-02-10,Checking or savings account,I had been banking with Wells Fargo since XXXX...
2024-10-31,Checking or savings account,From XXXX until XXXXXXXX XXXX XXXX someone had...
2023-02-02,Checking or savings account,Several years ago opened an additional savings...
2023-03-01,Checking or savings account,I stopped using my wells Fargo account because...


In [17]:
# Report (rows, columns) remaining after filtering and dropping empty narratives.
print("Data shape:", df.shape)

Data shape: (20347, 2)


In [None]:
def initialize_symspell():
    """
    Build a SymSpell spell checker loaded with the English frequency dictionary
    that ships inside the ``symspellpy`` package.

    Loading the dictionary is expensive, so callers should create one instance
    and reuse it rather than calling this once per document.

    Returns
    -------
    SymSpell
        Instance configured with ``max_dictionary_edit_distance=2`` and
        ``prefix_length=7``.

    Raises
    ------
    FileNotFoundError
        If the bundled dictionary could not be loaded.
    """
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    # load_dictionary signals a missing file by returning False instead of
    # raising; without this check spell correction would silently do nothing.
    if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
        raise FileNotFoundError(f"SymSpell dictionary not found: {dictionary_path}")
    return sym_spell

_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text, sym_spell):
    """
    Normalise one complaint narrative for sentiment scoring.

    Steps: lower-case, strip Wells Fargo mentions, drop everything except
    letters and whitespace, collapse whitespace, spell-correct with SymSpell,
    tokenize, and remove English stop words.

    Parameters
    ----------
    text : object
        The raw narrative. Non-string values are converted with ``str``;
        ``None`` and float NaN are treated as empty.
    sym_spell : SymSpell
        An already-initialised spell checker (must provide ``lookup_compound``).

    Returns
    -------
    str
        Space-joined tokens that survived stop-word removal. Returns ``""``
        for missing or empty input, or if any step raises (the error is
        printed, not propagated).
    """
    # Missing values would otherwise be stringified to "none"/"nan" and
    # scored as if they were real text.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    try:
        text = str(text).lower()
        # Remove Wells Fargo mentions
        text = re.sub(r'well?s?\s*f[a-z]*go|wf\b', '', text, flags=re.IGNORECASE)

        # Basic cleaning
        text = re.sub(r'[^a-z\s]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        if not text:
            # Nothing left to correct or tokenize.
            return ""

        # Spell correction
        suggestions = sym_spell.lookup_compound(text, max_edit_distance=2)
        if suggestions:
            text = suggestions[0].term

        # Tokenization and stop word removal - decided against lemmatization
        # since it can affect word intensity.
        # The stop-word set is fetched once; the original re-read the corpus
        # and scanned a list for every single token.
        stop_words = _english_stopwords()
        return ' '.join(token for token in word_tokenize(text) if token not in stop_words)
    except Exception as e:
        print(f"Error processing text: {e}")
        return ""

_VADER_ANALYZER = None  # created on first use by _vader_analyzer()


def _vader_analyzer():
    """Return a shared SentimentIntensityAnalyzer, constructing it only once."""
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER


def calculate_sentiment_scores(text):
    """
    Score one piece of text with both TextBlob and NLTK's VADER analyzer.

    Parameters
    ----------
    text : str
        Text to score.

    Returns
    -------
    dict
        ``'textblob_polarity'`` and ``'textblob_subjectivity'`` from TextBlob,
        plus every key returned by VADER's ``polarity_scores`` unchanged
        (in NLTK these are ``'neg'``, ``'neu'``, ``'pos'`` and ``'compound'``).
    """
    blob_sentiment = TextBlob(text).sentiment
    # Reuse one analyzer: building a new one per call repeats its setup work
    # for every row scored.
    vader_scores = _vader_analyzer().polarity_scores(text)

    return {
        'textblob_polarity': blob_sentiment.polarity,
        'textblob_subjectivity': blob_sentiment.subjectivity,
        **vader_scores
    }

def analyze_complaints(input_df, text_column='Complaint', output_dir='results'):
    """
    Clean complaint narratives, score their sentiment, and aggregate the
    scores per day and per month.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Complaints indexed by date (the index must support ``.date`` and
        ``.to_period``, i.e. a DatetimeIndex).
    text_column : str, default 'Complaint'
        Column holding the narrative text. Rows where it is missing or
        whitespace-only are dropped.
    output_dir : str, default 'results'
        Directory the three CSV files are written to; created if absent.

    Returns
    -------
    tuple of (DataFrame, DataFrame, DataFrame)
        ``(scored, daily_agg, monthly_agg)``: the date-sorted rows with
        ``cleaned_text`` and the six sentiment columns added, followed by the
        daily and monthly aggregates (mean/std of the scores and a complaint
        count), rounded to 4 decimals.

    Side effects
    ------------
    Prints progress messages and writes ``sentiment_scores_raw.csv``,
    ``sentiment_scores_daily.csv`` and ``sentiment_scores_monthly.csv``
    into ``output_dir``.
    """
    import os  # local import: keeps this cell independent of the import cell

    narratives = input_df[text_column]
    has_text = narratives.notna() & (narratives.str.strip() != '')
    # Sort by date index
    df = input_df.loc[has_text].copy().sort_index()

    print("Preprocessing text...")
    # Build the spell checker once. Creating it inside the per-row lambda
    # reloaded the whole frequency dictionary for every complaint.
    sym_spell = initialize_symspell()
    df['cleaned_text'] = df[text_column].apply(lambda text: preprocess_text(text, sym_spell))

    print("Calculating sentiment scores...")
    sentiments = df['cleaned_text'].apply(calculate_sentiment_scores)

    # Output column -> key in the dict returned by calculate_sentiment_scores.
    # VADER's keys are 'neg'/'neu'/'pos'/'compound'; looking them up under the
    # 'nltk_si_*' names with a default of 0 filled those columns with zeros.
    score_keys = {
        'textblob_polarity': 'textblob_polarity',
        'textblob_subjectivity': 'textblob_subjectivity',
        'nltk_si_neg': 'neg',
        'nltk_si_neu': 'neu',
        'nltk_si_pos': 'pos',
        'nltk_si_compound': 'compound',
    }
    for column, score_key in score_keys.items():
        # Index directly (no default) so a key mismatch fails loudly.
        df[column] = pd.Series(
            [scores[score_key] for scores in sentiments],
            index=df.index,
            dtype=float,
        )

    # to_csv does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(os.path.join(output_dir, 'sentiment_scores_raw.csv'))

    agg_metrics = {
        'textblob_polarity': ['mean', 'std'],
        'textblob_subjectivity': ['mean', 'std'],
        'nltk_si_compound': ['mean', 'std'],
        'nltk_si_pos': 'mean',
        'nltk_si_neg': 'mean',
        'nltk_si_neu': 'mean',
        text_column: 'count'
    }

    daily_agg = df.groupby(df.index.date).agg(agg_metrics).round(4)
    daily_agg.to_csv(os.path.join(output_dir, 'sentiment_scores_daily.csv'))

    monthly_agg = df.groupby(df.index.to_period('M')).agg(agg_metrics).round(4)
    monthly_agg.to_csv(os.path.join(output_dir, 'sentiment_scores_monthly.csv'))

    return df, daily_agg, monthly_agg

In [None]:
# Run the full pipeline on the filtered complaints. Besides returning the
# per-complaint, daily and monthly frames, this writes three CSVs under results/.
raw_scores, daily_sentiment, monthly_sentiment = analyze_complaints(df)

Preprocessing text...
