# Preprocessing

In [3]:
import pandas as pd
import re
import spacy

# Load the English spaCy model
nlp = spacy.load("en_core_web_sm")

# Load your reviews CSV (already labeled with sentiment)
df = pd.read_csv("../data/processed/bank_reviews_sentiment.csv")

# Define the preprocessing function
def preprocess(text):
    # Remove special characters and numbers
    text = re.sub(r"[^a-zA-Z\s]", "", str(text))  # make sure input is a string
    text = text.lower()
    doc = nlp(text)
    
    # Lemmatize and remove stopwords/punctuation
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    
    return " ".join(tokens)

# Apply the preprocessing to the 'review' column
df["cleaned_review"] = df["review"].astype(str).apply(preprocess)

# Preview the result
print(df[["review", "cleaned_review"]].head())


                                              review  \
0  "Why don’t your ATMs support account-to-accoun...   
1                        what is this app problem???   
2       the app is proactive and a good connections.   
3    I cannot send to cbebirr app. through this app.   
4                                               good   

                                      cleaned_review  
0  not atms support accounttoaccount transfer lik...  
1                                        app problem  
2                      app proactive good connection  
3                               send cbebirr app app  
4                                               good  


# TF-IDF for Keyword & N-gram Extraction

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_keywords_by_bank(df, bank_name, max_features=500, top_n=500):
    # Filter reviews for the bank
    bank_reviews = df[df['bank'] == bank_name]['review'].tolist()
    
    # TF-IDF vectorizer
    vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=(1, 2))
    X = vectorizer.fit_transform(bank_reviews)
    
    # Sum TF-IDF scores for each term
    tfidf_sum = X.sum(axis=0).A1
    
    # Get feature names
    keywords = vectorizer.get_feature_names_out()
    
    # Build DataFrame and sort
    tfidf_df = pd.DataFrame({'keyword': keywords, 'tfidf': tfidf_sum})
    top_keywords = tfidf_df.sort_values(by='tfidf', ascending=False).head(top_n)
    return top_keywords

# Example:
dashen_keywords = extract_keywords_by_bank(df, "Dashen")
boa_keywords = extract_keywords_by_bank(df, "BOA")
cbe_keywords = extract_keywords_by_bank(df, "CBE")

print("Dashen Bank keywords:\n", dashen_keywords)
print("\nBank of Abyssinia keywords:\n", boa_keywords)
print("\nCBE keywords:\n", cbe_keywords)


Dashen Bank keywords:
                keyword      tfidf
37                 app  31.360228
193               good  27.679681
243                 it  24.784314
72                best  23.829145
23                 and  22.620957
..                 ...        ...
445              users   0.443058
349  security features   0.428857
449           ve never   0.422299
56            are able   0.421556
3               access   0.390167

[500 rows x 2 columns]

Bank of Abyssinia keywords:
           keyword      tfidf
173          good  42.146188
24            app  28.697719
210            it  24.925990
371           the  19.607729
73           best  15.128121
..            ...        ...
113    decided to   0.318067
112       decided   0.318067
55        balance   0.293540
313  options even   0.285103
300        on the   0.229359

[500 rows x 2 columns]

CBE keywords:
               keyword      tfidf
163              good  55.977649
31                app  31.093167
67               best  23.65

# Group Related Keywords

In [30]:

theme_keywords = {
  "Account Access Issues": [
        "access", "access account", "account", "activate", "active", "login", "log", "password", "pin", "otp", "authenticate", "unable", "fail"
    ],

    "Transaction & Payment": [
        "transfer", "transfer money", "transfer bank", "send", "send money", "payment", "pay", "pay bill", "bill payment",
        "money", "balance", "bank statement", "receipt", "recent transaction", "transaction", "fast transaction", "slow", "delay", "pending", "cancel"
    ],

    "App Performance & Stability": [
        "app crash", "crash", "bug", "error", "fail", "problem", "problem app", "slow", "loading", "load", "not work", "work properly", "working",
        "unreliable", "disable", "turn developer", "developer mode", "developer option"
    ],

    "User Interface & Experience": [
        "ui", "user interface", "user experience", "user friendly", "userfriendly", "interface clean", "navigation", "design", "layout",
        "screen", "screen shot", "screenshot", "simple", "easy use", "friendly", "intuitive", "responsive"
    ],

    "Customer Support & Service": [
        "support", "customer service", "help", "helpful", "response", "respond", "contact", "call", "agent", "chat", "service app"
    ],

    "Banking & Financial Services": [
        "bank", "bank app", "bank dashen", "bank ethiopia", "bank mobile", "banking", "banking app", "banking application",
        "digital banking", "finance", "financial", "budget", "expense", "investment", "industry", "marketplace", "merchant"
    ],

    "Security & Privacy": [
        "secure", "security", "security feature", "safe", "biometric", "authenticate"
    ],

    "Features & Requests": [
        "feature", "add", "update", "upgrade", "include", "improve", "improvement", "new update", "option", "customize", "request"
    ],

    "General Sentiment & Other": [
        "good", "great", "amazing", "awesome", "excellent", "fantastic", "bad", "disappointing", "frustrating", "poor", "horrible", "happy",
        "love", "like", "hate", "annoying", "sad", "wow"
    ]
}

def identify_themes(keyword_list):
    matched_themes = set()
    for keyword in keyword_list:
        for theme, theme_words in theme_keywords.items():
            if keyword in theme_words:
                matched_themes.add(theme)
    return list(matched_themes) if matched_themes else ["Other"]

df["identified_themes"] = df["keywords"].apply(identify_themes)
df.tail(100)

Unnamed: 0,review,rating,date,bank,source,label,score,cleaned_review,keywords,identified_themes
1248,"I tried the new Dash Bank Super App today, and...",5,2025-02-06,Dashen,Google Play,POSITIVE,0.999851,try new dash bank super app today amazingly ea...,"[download, recommend, bank super, new, try]",[Other]
1249,Gud app kegza ga mezmn endze new aind ermjh ke...,5,2025-02-06,Dashen,Google Play,NEGATIVE,0.985222,gud app kegza ga mezmn endze new aind ermjh ke...,"[new, app]",[Other]
1250,"Good app, bad security",3,2025-02-06,Dashen,Google Play,POSITIVE,0.989301,good app bad security,"[security, bad, good app, good, app]","[Security & Privacy, General Sentiment & Other]"
1251,On Transaction page only show sent Transaction...,4,2025-02-06,Dashen,Google Play,NEGATIVE,0.995696,transaction page send transaction try include ...,"[transaction, receive, include, send, try]","[Features & Requests, Transaction & Payment]"
1252,Wonderful Application 😍,5,2025-02-05,Dashen,Google Play,POSITIVE,0.999859,wonderful application,"[wonderful, application]",[Other]
...,...,...,...,...,...,...,...,...,...,...
1343,Best,5,2025-01-13,Dashen,Google Play,POSITIVE,0.999794,well,[well],[Other]
1344,"Waw Great and innovated,user friendly, always ...",5,2025-01-13,Dashen,Google Play,POSITIVE,0.999779,waw great innovateduser friendly step ahead,"[waw, friendly, step ahead, ahead, step]",[User Interface & Experience]
1345,It's Best waww 🙏,5,2025-01-13,Dashen,Google Play,POSITIVE,0.999744,good waww,[good],[General Sentiment & Other]
1346,Always one step ahead,5,2025-01-13,Dashen,Google Play,POSITIVE,0.999770,step ahead,"[step ahead, ahead, step]",[Other]


# Export CSV

In [31]:
# Save the DataFrame with identified themes
df.to_csv("../data/processed/bank_reviews_with_themes.csv", index=False)

# Identify each bank and keywords associated with it

In [None]:
# Step 1: Explode the 'identified_themes' column to make one theme per row
df_exploded = df.explode("identified_themes")

# Step 2: Group by 'bank' and 'identified_themes' and count occurrences
theme_counts = (
    df_exploded.groupby(["bank", "identified_themes"])
    .size()
    .reset_index(name="count")
)

# Step 3: Pivot the table: banks as rows, themes as columns
theme_summary_df = (
    theme_counts.pivot(index="bank", columns="identified_themes", values="count")
    .fillna(0)
    .astype(int)
    .reset_index()
)

# Step 4: Save the theme summary DataFrame for later use
theme_summary_df.to_csv("../data/processed/theme_summary_per_bank.csv", index=False)

# Optional: Display first few rows
print(theme_summary_df.head())


identified_themes    bank  Account Access Issues  App Performance & Stability  \
0                     BOA                     20                           59   
1                     CBE                      7                           24   
2                  Dashen                     18                           22   

identified_themes  Banking & Financial Services  Customer Support & Service  \
0                                            32                           7   
1                                            30                           8   
2                                            61                           7   

identified_themes  Features & Requests  General Sentiment & Other  Other  \
0                                   24                        145    192   
1                                   12                        190    174   
2                                   30                        139    165   

identified_themes  Security & Privacy  Transaction & 