In [None]:
%%capture
!pip install symspellpy

In [None]:
%%capture
#!pip uninstall -y nltk
!pip install nltk

In [None]:
%%capture
!pip install textblob
!pip install vaderSentiment

Collecting textblob
  Downloading textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m626.3/626.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: textblob
Successfully installed textblob-0.18.0.post0
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [None]:
import nltk
# Fetch the NLTK data packages used below; each call is a no-op when the
# package is already up to date.
# punkt_tab: tokenizer data needed by word_tokenize.
nltk.download('punkt_tab')
# stopwords: word lists behind stopwords.words('english').
nltk.download('stopwords')
# vader_lexicon: lexicon read by SentimentIntensityAnalyzer.
nltk.download('vader_lexicon')
import pandas as pd
import numpy as np
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
import re
from symspellpy import SymSpell
# NOTE(review): pkg_resources is deprecated by setuptools and warns on import;
# importlib.resources is the standard-library replacement.
import pkg_resources

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
  import pkg_resources


In [None]:
def load_github_data(url):
    """Read a CSV file hosted on GitHub into a DataFrame.

    Parameters
    ----------
    url : str
        Raw-content URL of the CSV file, for example
        https://raw.githubusercontent.com/samanthajmichael/machine_learning/main/data/complaints.csv
        The value is handed to ``pd.read_csv`` unchanged.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    frame = pd.read_csv(url)
    return frame

In [None]:
# Load the complaints export through the helper defined above rather than
# repeating the read_csv call here.
url = "https://raw.githubusercontent.com/samanthajmichael/machine_learning/main/data/complaints.csv"
df = load_github_data(url)

In [None]:
# Product categories kept for the analysis (bank accounts and money transfers).
BANKING_PRODUCTS = [
    'Bank account or service',
    'Checking or savings account',
    'Money transfers',
    'Money transfer, virtual currency, or money service',
]

# Rename the verbose source headers, keep only the banking products and the
# columns needed downstream, then index the frame by the parsed complaint date.
# Every step of the chain returns a new frame, so nothing is modified in place.
df = (
    df.rename(columns={'Date received': 'Date', 'Consumer complaint narrative': 'Complaint'})
    .loc[lambda frame: frame['Product'].isin(BANKING_PRODUCTS), ['Date', 'Product', 'Complaint']]
    # format='mixed' lets pandas infer the date format value by value.
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='mixed'))
    .set_index('Date')
)

In [None]:
# Preview the first rows of the filtered, date-indexed complaints.
df.head()

Unnamed: 0_level_0,Product,Complaint
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-10-19,Checking or savings account,
2024-10-18,Checking or savings account,Try to use my card and got declined and call t...
2024-08-18,Checking or savings account,W. F employees opened both personal and busine...
2024-08-18,Checking or savings account,My. Wells. Fargo. Acount. Was. Hacked. Several...
2024-10-18,Checking or savings account,I receive direct deposits from my job every Fr...


In [None]:
def initialize_symspell():
    """Build a SymSpell spell checker loaded with the bundled English dictionary.

    The checker is created with ``max_dictionary_edit_distance=2`` and
    ``prefix_length=7`` and filled from ``frequency_dictionary_en_82_765.txt``,
    the frequency dictionary shipped inside the ``symspellpy`` package.
    Loading reads the whole dictionary file, so build the checker once and
    reuse it rather than calling this function repeatedly.

    Returns:
        SymSpell: the loaded spell checker.

    Raises:
        FileNotFoundError: if the dictionary file could not be loaded.
    """
    # Function-scope import keeps the block self-contained. importlib.resources
    # replaces the deprecated pkg_resources lookup.
    from importlib.resources import files

    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = str(files("symspellpy") / "frequency_dictionary_en_82_765.txt")
    # load_dictionary signals a missing file by returning False instead of
    # raising; fail loudly rather than spell-checking against an empty dictionary.
    if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
        raise FileNotFoundError(f"SymSpell dictionary not found: {dictionary_path}")
    return sym_spell

# Lazily-built cache of NLTK's English stop words (filled by _english_stopwords).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, reading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text, sym_spell):
    """Normalise one complaint narrative for sentiment scoring.

    Steps, in order: lower-case the text, strip Wells Fargo mentions, drop every
    character that is not a lower-case letter or whitespace, collapse runs of
    whitespace, spell-correct the result with ``sym_spell.lookup_compound`` and
    remove English stop words.

    Args:
        text: the raw narrative; it is converted with ``str()``. ``None`` and
            NaN are treated as missing and give an empty string.
        sym_spell: a loaded SymSpell instance (only ``lookup_compound`` is used).

    Returns:
        str: the remaining tokens joined by single spaces. An empty string is
        returned for missing or empty input, and also when any step raises; in
        that case the error is printed rather than propagated.
    """
    # str(None) / str(nan) would otherwise be analysed as the words "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""
    try:
        text = str(text).lower()
        # Remove Wells Fargo mentions
        # NOTE(review): this runs before punctuation is stripped, so a form such
        # as "wells. fargo" is not removed, and `wf\b` has no leading word
        # boundary. Confirm whether that is intended before tightening the pattern.
        text = re.sub(r'well?s?\s*f[a-z]*go|wf\b', '', text, flags=re.IGNORECASE)

        # Basic cleaning
        text = re.sub(r'[^a-z\s]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        if not text:
            # Nothing left to correct or tokenize.
            return ""

        # Spell correction
        suggestions = sym_spell.lookup_compound(text, max_edit_distance=2)
        if suggestions:
            text = suggestions[0].term

        # Tokenization and stop word removal - decided against lemmatization
        # since it can affect word intensity.
        # The stop-word set is fetched once per call (and cached across calls)
        # instead of being re-read for every token.
        stop_words = _english_stopwords()
        tokens = word_tokenize(text)
        return ' '.join(token for token in tokens if token not in stop_words)
    except Exception as e:
        # Deliberate best-effort: one bad row must not abort the whole run.
        print(f"Error processing text: {e}")
        return ""

# Shared VADER analyzer, created on first use (see _vader_analyzer).
_VADER_ANALYZER = None


def _vader_analyzer():
    """Return a shared SentimentIntensityAnalyzer, constructing it on first use."""
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER


def calculate_sentiment_scores(text):
    """Score ``text`` with TextBlob and with NLTK's VADER analyzer.

    Args:
        text (str): the text to score.

    Returns:
        dict: ``textblob_polarity`` and ``textblob_subjectivity`` from TextBlob,
        plus every entry of VADER's ``polarity_scores`` result under VADER's own
        key names (``neg``, ``neu``, ``pos``, ``compound``).
    """
    sentiment = TextBlob(text).sentiment
    # Reuse one analyzer across calls instead of constructing a new one per text.
    vader_scores = _vader_analyzer().polarity_scores(text)

    return {
        'textblob_polarity': sentiment.polarity,
        'textblob_subjectivity': sentiment.subjectivity,
        **vader_scores
    }

def analyze_complaints(input_df, text_column='Complaint', sym_spell=None):
    """Clean, score and aggregate the complaint narratives in ``input_df``.

    Rows whose ``text_column`` is missing or blank are dropped, the rest are
    sorted by index, cleaned with ``preprocess_text`` and scored with
    ``calculate_sentiment_scores``. Three CSV files are written to the current
    working directory: ``sentiment_scores_raw.csv``,
    ``sentiment_scores_daily.csv`` and ``sentiment_scores_monthly.csv``.

    Args:
        input_df (pandas.DataFrame): complaints with a datetime index (the
            aggregations use ``index.date`` and ``index.to_period('M')``). It is
            not modified.
        text_column (str): name of the column holding the narrative text.
        sym_spell: optional pre-built SymSpell checker. When omitted, one is
            built with ``initialize_symspell()`` and shared by all rows.

    Returns:
        tuple: ``(df, daily_agg, monthly_agg)``, which are the per-complaint
        frame with ``cleaned_text`` and the score columns, and the daily and
        monthly aggregates rounded to 4 decimals.
    """
    has_text = input_df[text_column].notna() & (input_df[text_column].str.strip() != '')
    # Sort by date index (sort_index returns a new frame, leaving input_df untouched).
    df = input_df[has_text].sort_index()

    # Build the spell checker once. Creating it inside the per-row lambda
    # reloaded the whole dictionary file for every single complaint.
    if sym_spell is None:
        sym_spell = initialize_symspell()

    print("Preprocessing text...")
    df['cleaned_text'] = df[text_column].apply(lambda x: preprocess_text(x, sym_spell))

    print("Calculating sentiment scores...")
    sentiments = df['cleaned_text'].apply(calculate_sentiment_scores)

    # Output column -> key in the score dict. VADER reports its scores as
    # 'neg', 'neu', 'pos' and 'compound'; looking them up under the nltk_si_*
    # column names found nothing and filled those columns with 0.
    score_keys = {
        'textblob_polarity': 'textblob_polarity',
        'textblob_subjectivity': 'textblob_subjectivity',
        'nltk_si_neg': 'neg',
        'nltk_si_neu': 'neu',
        'nltk_si_pos': 'pos',
        'nltk_si_compound': 'compound',
    }
    for col, key in score_keys.items():
        # Accept either naming so the lookup works whichever keys the scorer
        # returns. Values are assigned by position, so duplicate dates in the
        # index are harmless.
        df[col] = [scores.get(col, scores.get(key, 0)) for scores in sentiments]

    df.to_csv('sentiment_scores_raw.csv')

    agg_metrics = {
        'textblob_polarity': ['mean', 'std'],
        'textblob_subjectivity': ['mean', 'std'],
        'nltk_si_compound': ['mean', 'std'],
        'nltk_si_pos': 'mean',
        'nltk_si_neg': 'mean',
        'nltk_si_neu': 'mean',
        text_column: 'count'
    }

    daily_agg = df.groupby(df.index.date).agg(agg_metrics).round(4)
    daily_agg.to_csv('sentiment_scores_daily.csv')

    monthly_agg = df.groupby(df.index.to_period('M')).agg(agg_metrics).round(4)
    monthly_agg.to_csv('sentiment_scores_monthly.csv')

    return df, daily_agg, monthly_agg

In [None]:
# Run the full pipeline on the filtered complaints. Besides returning the
# per-complaint, daily and monthly frames, the call writes three
# sentiment_scores_*.csv files to the working directory.
raw_scores, daily_sentiment, monthly_sentiment = analyze_complaints(df)

Preprocessing text...
