#Import

In [1]:
# Imports
import pandas as pd
import spacy
import re
from collections import Counter
from tabulate import tabulate

# Load spaCy model
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 50)

#Load data

In [2]:
def load_data(file_path='reviews_with_sentiment.csv'):
    try:
        df = pd.read_csv(file_path)
        print(f"Loaded {len(df)} reviews from {file_path}")
        print("Unique banks:", df['bank'].unique())
        print("Reviews per bank:")
        display(df['bank'].value_counts().to_frame())
        return df
    except FileNotFoundError:
        print(f"Error: {file_path} not found")
        return None

df = load_data()
if df is None:
    raise SystemExit("Failed to load data")

Loaded 1200 reviews from reviews_with_sentiment.csv
Unique banks: ['Bank of Abyssinia' 'Commercial Bank of Ethiopia' 'Dashen Bank']
Reviews per bank:


Unnamed: 0_level_0,count
bank,Unnamed: 1_level_1
Bank of Abyssinia,400
Commercial Bank of Ethiopia,400
Dashen Bank,400


#Preprocessing Functions

In [3]:
def preprocess_text(text):
    if not isinstance(text, str) or pd.isna(text):
        return []
    text = re.sub(r'[^\w\s]', '', text.lower())
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct and len(token.lemma_) > 2]
    return tokens

def extract_bigrams(text):
    if not isinstance(text, str) or pd.isna(text):
        return []
    doc = nlp(text.lower())
    bigrams = []
    for i in range(len(doc) - 1):
        if not doc[i].is_stop and not doc[i+1].is_stop and not doc[i].is_punct and not doc[i+1].is_punct:
            bigram = f"{doc[i].lemma_} {doc[i+1].lemma_}"
            bigrams.append(bigram)
    return bigrams

#Theme Definitions

In [4]:
theme_keywords = {
    'Account Access Issues': ['login', 'authentication', 'password', 'access', 'lock', 'error', 'fail'],
    'Transaction Performance': ['transfer', 'slow', 'delay', 'payment', 'transaction', 'processing', 'crash'],
    'User Interface & Experience': ['ui', 'interface', 'design', 'navigation', 'easy', 'friendly', 'confusing'],
    'Customer Support': ['support', 'help', 'response', 'service', 'contact', 'ticket'],
    'Feature Requests': ['feature', 'add', 'request', 'fingerprint', 'option', 'update']
}

def assign_themes(tokens, bigrams):
    themes = []
    bigram_text = ' ' + ' '.join(bigrams) + ' '
    for theme, keywords in theme_keywords.items():
        if any(keyword in tokens or f' {keyword} ' in bigram_text for keyword in keywords):
            themes.append(theme)
    return themes if themes else ['Other']

#Process Review

In [5]:
print("Preprocessing reviews...")
df['tokens'] = df['review'].apply(preprocess_text)
df['bigrams'] = df['review'].apply(extract_bigrams)
print("Assigning themes...")
df['themes'] = df.apply(lambda row: assign_themes(row['tokens'], row['bigrams']), axis=1)
print("Reviews per bank after processing:")
display(df['bank'].value_counts().to_frame())
print("Sample data with themes:")
display(df[['bank', 'review', 'themes']].head(10))

Preprocessing reviews...
Assigning themes...
Reviews per bank after processing:


Unnamed: 0_level_0,count
bank,Unnamed: 1_level_1
Bank of Abyssinia,400
Commercial Bank of Ethiopia,400
Dashen Bank,400


Sample data with themes:


Unnamed: 0,bank,review,themes
0,Bank of Abyssinia,This app is a joke. It crashes more than it wo...,"[Transaction Performance, Feature Requests]"
1,Bank of Abyssinia,"Hello, I‚Äôm facing a problem with the BOA Mobil...","[Account Access Issues, Transaction Performanc..."
2,Bank of Abyssinia,It keeps showing this pop up to turn off devel...,"[Transaction Performance, Feature Requests]"
3,Bank of Abyssinia,"Edit: New bug, app not letting me type in my o...",[Feature Requests]
4,Bank of Abyssinia,i entered incorrect security question by mista...,"[Account Access Issues, Customer Support, Feat..."
5,Bank of Abyssinia,I don't know what is wrong with BOA as a bank ...,"[Account Access Issues, Transaction Performanc..."
6,Bank of Abyssinia,What's wrong with App. this days? it doesn't w...,"[Transaction Performance, Feature Requests]"
7,Bank of Abyssinia,I‚Äôm giving this app one star because there are...,[Feature Requests]
8,Bank of Abyssinia,"I have a fitayah account, a type of interest f...",[Transaction Performance]
9,Bank of Abyssinia,Worst App ever. Totally unreliable. And it did...,[Other]


#Keyword Summary

In [7]:
keyword_summary = {}
for bank in df['bank'].unique():
    bank_reviews = df[df['bank'] == bank]
    all_tokens = [token for tokens in bank_reviews['tokens'] for token in tokens]
    all_bigrams = [bigram for bigrams in bank_reviews['bigrams'] for bigram in bigrams]
    top_tokens = Counter(all_tokens).most_common(10)
    top_bigrams = Counter(all_bigrams).most_common(10)
    keyword_summary[bank] = {'top_keywords': top_tokens, 'top_bigrams': top_bigrams}


with open('keyword_summary.txt', 'w', encoding='utf-8') as f:
    for bank, summary in keyword_summary.items():
        f.write(f"\n{bank}:\n")
        f.write("Top Keywords:\n")
        for keyword, count in summary['top_keywords']:
            f.write(f"  {keyword}: {count}\n")
        f.write("Top Bigrams:\n")
        for bigram, count in summary['top_bigrams']:
            f.write(f"  {bigram}: {count}\n")


print("Keyword summary saved to keyword_summary.txt")
print("\nSample keyword summary:")
for bank, summary in keyword_summary.items():
    print(f"\n{bank}:")
    print("Top Keywords:", summary['top_keywords'][:5])
    print("Top Bigrams:", summary['top_bigrams'][:5])

Keyword summary saved to keyword_summary.txt

Sample keyword summary:

Bank of Abyssinia:
Top Keywords: [('app', 356), ('not', 172), ('work', 129), ('bank', 82), ('update', 74)]
Top Bigrams: [('mobile banking', 42), ('developer option', 29), ('banking app', 28), ('bad app', 19), ('üëé üëé', 15)]

Commercial Bank of Ethiopia:
Top Keywords: [('app', 372), ('not', 194), ('transaction', 139), ('transfer', 112), ('work', 107)]
Top Bigrams: [('good app', 25), ('mobile banking', 22), ('developer option', 21), ('transfer money', 19), ('transaction history', 18)]

Dashen Bank:
Top Keywords: [('app', 221), ('dashen', 102), ('good', 92), ('bank', 78), ('super', 74)]
Top Bigrams: [('dashen bank', 57), ('super app', 46), ('dashen super', 22), ('good app', 21), ('mobile banking', 19)]


#Theme Distribution

In [8]:
print("\nTheme Distribution by Bank:")
for bank in df['bank'].unique():
    bank_themes = df[df['bank'] == bank]['themes'].explode().value_counts()
    print(f"\n{bank}:")
    display(bank_themes.reset_index().rename(columns={'index': 'Theme', 'themes': 'Count'}))


Theme Distribution by Bank:

Bank of Abyssinia:


Unnamed: 0,Count,count
0,Other,166
1,Feature Requests,108
2,Transaction Performance,105
3,Account Access Issues,51
4,User Interface & Experience,33
5,Customer Support,30



Commercial Bank of Ethiopia:


Unnamed: 0,Count,count
0,Transaction Performance,178
1,Feature Requests,159
2,Account Access Issues,83
3,Other,82
4,User Interface & Experience,64
5,Customer Support,58



Dashen Bank:


Unnamed: 0,Count,count
0,Other,251
1,User Interface & Experience,76
2,Transaction Performance,61
3,Feature Requests,58
4,Customer Support,28
5,Account Access Issues,12


#Save results

In [12]:

output_df = df[['review_id', 'bank', 'review', 'sentiment_label', 'sentiment_score', 'themes']]
output_df.to_csv('reviews_with_themes.csv', index=False)
print(f"Saved results to reviews_with_themes.csv ({len(output_df)} rows)")

# Verify saved file
saved_df = pd.read_csv('reviews_with_themes.csv')
print("\nReviews per bank in reviews_with_themes.csv:")
display(saved_df['bank'].value_counts().to_frame())

Saved results to reviews_with_themes.csv (1200 rows)

Reviews per bank in reviews_with_themes.csv:


Unnamed: 0_level_0,count
bank,Unnamed: 1_level_1
Bank of Abyssinia,400
Commercial Bank of Ethiopia,400
Dashen Bank,400
