In [None]:
# import necessary libraries
import re

import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline


# load cleaned reviews dataset
# The path is relative to the notebook's working directory. Later cells read
# the 'review', 'rating' and 'bank' columns from this frame.
df = pd.read_csv('../../data/clean_reviews.csv')


# Sentiment-analysis pipeline built from the checkpoint named below;
# classify_review() calls it once per review.
classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

def classify_review(text, max_chars=512):
    """Classify one review with the module-level sentiment ``classifier``.

    Parameters
    ----------
    text : str
        Raw review text. Anything that is not a string (for example the NaN
        that ``pd.read_csv`` produces for an empty cell) is treated as missing.
    max_chars : int, default 512
        Number of leading characters of ``text`` handed to the model.

    Returns
    -------
    pd.Series
        Two values, ``[label, score]``, taken from the first prediction the
        pipeline returns. For missing input both values are null, so the row
        is kept instead of aborting the whole ``apply`` with a TypeError.
    """
    if not isinstance(text, str):
        return pd.Series([None, float('nan')])

    # Slicing bounds characters, not tokens: a run of punctuation or emoji can
    # still tokenize to more positions than the model accepts, so also ask the
    # tokenizer to truncate at the model's maximum sequence length.
    result = classifier(text[:max_chars], truncation=True)[0]
    return pd.Series([result['label'], result['score']])

# Run classify_review on every review; the two values it returns per row
# become the sentiment_label and sentiment_score columns.
df[['sentiment_label', 'sentiment_score']] = df['review'].apply(classify_review)

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


In [8]:
# Load the small English spaCy pipeline with the "ner" and "parser"
# components disabled; clean_text() only reads token-level attributes
# (lemma_, is_alpha, is_stop).
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
def clean_text(text):
    """Return the lemmas of ``text`` as a single space-separated string.

    The text is lowercased and run through the module-level ``nlp`` pipeline.
    Tokens that are stop words or are not purely alphabetic are dropped; the
    lemma of every remaining token is kept in its original order.
    """
    lemmas = []
    for tok in nlp(text.lower()):
        # Skip stop words and anything containing digits/punctuation.
        if tok.is_stop or not tok.is_alpha:
            continue
        lemmas.append(tok.lemma_)
    return ' '.join(lemmas)

# Store the normalized text next to the raw review (one clean_text call per row).
df['cleaned_review'] = df['review'].apply(clean_text)

In [4]:
# Fit TF-IDF over unigrams and bigrams of the cleaned reviews, capped at 50
# features, then print the feature names that were kept.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=50)
# X is the matrix returned by fit_transform; it is not used again in this cell.
X = vectorizer.fit_transform(df['cleaned_review'])
keywords = vectorizer.get_feature_names_out()
print(keywords)

['account' 'amazing' 'amole' 'app' 'application' 'bad' 'bank' 'banking'
 'banking app' 'boa' 'cbe' 'crash' 'dashen' 'developer' 'easy' 'easy use'
 'ethiopia' 'excellent' 'fast' 'fix' 'good' 'good app' 'great' 'like'
 'mobile' 'mobile banking' 'money' 'need' 'nice' 'nice app' 'ok' 'open'
 'option' 'problem' 'screenshot' 'send' 'service' 'simple' 'slow' 'thank'
 'time' 'transaction' 'transfer' 'try' 'update' 'use' 'user' 'well' 'work'
 'wow']


In [None]:
# Keyword -> theme lookup used by map_theme(). Several keywords may share a
# theme ('login' and 'error' both map to 'Account Access').
theme_map = {
    'login': 'Account Access',
    'error': 'Account Access',
    'crash': 'Stability',
    'support': 'Customer Support',
    'transfer': 'Transactions',
    'interface': 'User Experience',
    'update': 'App Performance',
    'fast': 'Speed'
}

def map_theme(text):
    """Return the themes whose keyword starts a word in ``text``.

    Matching is case-insensitive and anchored at the start of a word, so
    inflected forms still count ('crashes', 'updated') while a keyword buried
    inside another word does not ('fast' in 'breakfast', 'error' in 'terror').

    Parameters
    ----------
    text : str
        Review text to scan.

    Returns
    -------
    list of str
        Distinct theme names in alphabetical order, so the result (and any
        file written from it) is the same on every run. ``['Miscellaneous']``
        when no keyword matches.
    """
    lowered = text.lower()
    found = {
        theme
        for word, theme in theme_map.items()
        # \b pins the keyword to a word start; re.escape keeps the keyword literal.
        if re.search(r'\b' + re.escape(word), lowered)
    }
    return sorted(found) if found else ['Miscellaneous']

# Tag every review with the list of themes map_theme() finds in its cleaned text.
df['identified_themes'] = df['cleaned_review'].apply(map_theme)

# Keep only the reporting columns and write them out without the index.
# NOTE(review): identified_themes holds Python lists, so each CSV cell will
# presumably be the list's string form and need parsing on read — confirm.
df_final = df[['review', 'rating', 'bank', 'sentiment_label', 'sentiment_score', 'identified_themes']]
df_final.to_csv('../../data/sentiment_themes_output.csv', index=False)

# Average sentiment by bank and rating
# sentiment_score is the score of whichever label the classifier picked, so
# its plain mean measures confidence, not polarity: a strongly NEGATIVE and a
# strongly POSITIVE review contribute the same value. The original column is
# kept for existing consumers; signed_sentiment (+score for POSITIVE rows,
# -score for all others) is added so the per-group mean reflects direction.
agg = (
    df.assign(
        signed_sentiment=lambda frame: frame['sentiment_score'].where(
            frame['sentiment_label'] == 'POSITIVE', -frame['sentiment_score']
        )
    )
    .groupby(['bank', 'rating'])[['sentiment_score', 'signed_sentiment']]
    .mean()
    .reset_index()
)
agg.to_csv("../../data/aggregated_sentiment_by_bank_rating.csv", index=False)


