In [1]:
import sys
# Add ../scripts to the module search path so the local modules below can be
# imported; the relative path is resolved against the kernel's working directory.
sys.path.append('../scripts')
from data_loader import DataLoader
# NOTE(review): TextPreprocessor is imported here and again from `preprocessor`
# on the next line. The later import rebinds the name, so the `preprocessor`
# version is the one in effect for the rest of the notebook.
# TODO confirm which module is intended and remove the redundant import.
from thematic_analysis import TextPreprocessor, KeywordExtractor, TopicModeler
from preprocessor import TextPreprocessor
from theme_extraction import ThemeExtractor


In [2]:
# Helper object used below to read the input CSV (load_csv) and to write the
# final result (save_csv).
data_loader = DataLoader()

In [3]:
# Read the reviews-with-sentiment CSV into `df` and preview the first five rows.
# NOTE(review): the "Unnamed: 0" column in the preview looks like an index that
# was written out with the file — confirm whether it should be dropped upstream.
sentiment_csv = '../data/processed/reviews_with_sentiment.csv'
df = data_loader.load_csv(sentiment_csv)
df.head(5)

Loaded 1140 rows from ../data/processed/reviews_with_sentiment.csv


Unnamed: 0,review,rating,date,bank,source,sentiment_label,sentiment_score
0,üôèüëç,5,2025-11-29,BOA,Google Play,negative,0.697056
1,Very Good,5,2025-11-28,BOA,Google Play,positive,0.999852
2,goof,5,2025-11-28,BOA,Google Play,negative,0.997528
3,good!,5,2025-11-28,BOA,Google Play,positive,0.999827
4,good jop,5,2025-11-27,BOA,Google Play,positive,0.999841


In [4]:
# Text-cleaning helper; its clean_text and lemmatize methods are applied to the
# reviews in the following cells. The name resolves to whichever TextPreprocessor
# was imported last in the import cell (the one from `preprocessor`).
preprocessor = TextPreprocessor()

Clean the reviews: lowercase the text, remove punctuation, and normalize whitespace.

In [5]:
# Run every raw review through the cleaner and store the result alongside it.
cleaned_reviews = df["review"].apply(preprocessor.clean_text)
df["processed_text"] = cleaned_reviews
df.head()

Unnamed: 0,review,rating,date,bank,source,sentiment_label,sentiment_score,processed_text
0,üôèüëç,5,2025-11-29,BOA,Google Play,negative,0.697056,
1,Very Good,5,2025-11-28,BOA,Google Play,positive,0.999852,very good
2,goof,5,2025-11-28,BOA,Google Play,negative,0.997528,goof
3,good!,5,2025-11-28,BOA,Google Play,positive,0.999827,good
4,good jop,5,2025-11-27,BOA,Google Play,positive,0.999841,good jop


Lemmatize: remove stopwords and reduce each word to its base form.

In [6]:
# Lemmatize the cleaned text and keep it in its own column for the later steps.
lemmatized_reviews = df["processed_text"].apply(preprocessor.lemmatize)
df["lemmatized_text"] = lemmatized_reviews
df.head(5)

Unnamed: 0,review,rating,date,bank,source,sentiment_label,sentiment_score,processed_text,lemmatized_text
0,üôèüëç,5,2025-11-29,BOA,Google Play,negative,0.697056,,
1,Very Good,5,2025-11-28,BOA,Google Play,positive,0.999852,very good,good
2,goof,5,2025-11-28,BOA,Google Play,negative,0.997528,goof,goof
3,good!,5,2025-11-28,BOA,Google Play,positive,0.999827,good,good
4,good jop,5,2025-11-27,BOA,Google Play,positive,0.999841,good jop,good jop


### Keyword Extraction

In [7]:
# Keyword extractor configured with max_features=300; its extract() method is
# called on the lemmatized reviews in the next cell.
keyword_extractor = KeywordExtractor(max_features=300)

In [8]:
# Build the extractor's input: the lemmatized text exposed under the column
# name "review" (presumably the column KeywordExtractor.extract reads — confirm).
# Renaming "tfidf_text" to "review" while the raw "review" column is still
# present leaves two columns with the same label, which makes column lookups
# ambiguous. The raw column is therefore dropped from this temporary frame
# before renaming; `df` itself keeps both "review" and "tfidf_text".
df["tfidf_text"] = df["lemmatized_text"]
keyword_input = df.drop(columns=["review"]).rename(columns={"tfidf_text": "review"})
top_keywords = keyword_extractor.extract(keyword_input)
top_keywords

{'BOA': [('app', np.float64(0.09123354411848585)),
  ('good', np.float64(0.06084170953800131)),
  ('bank', np.float64(0.03242391776363455)),
  ('best', np.float64(0.030806276693396466)),
  ('boa', np.float64(0.028134514298315583)),
  ('working', np.float64(0.022600824832502087)),
  ('worst', np.float64(0.02212333663310338)),
  ('work', np.float64(0.021675935416588424)),
  ('great', np.float64(0.02136352068175704)),
  ('like', np.float64(0.020876992718328713)),
  ('mobile', np.float64(0.020850429039477363)),
  ('banking', np.float64(0.019320727734756182)),
  ('nice', np.float64(0.018754977133984222)),
  ('doesnt', np.float64(0.01871689182686144)),
  ('use', np.float64(0.017879646763609383))],
 'CBE': [('app', np.float64(0.08305661655144082)),
  ('good', np.float64(0.06484505049790137)),
  ('best', np.float64(0.040722923760380336)),
  ('cbe', np.float64(0.030509610036248767)),
  ('bank', np.float64(0.030212659658754977)),
  ('nice', np.float64(0.0263872497487395)),
  ('like', np.float64(

### Topic Modeling with LDA

In [9]:
# Topic model configured for 5 topics and 10 words per topic; the topic listing
# printed further down shows five topics with ten words each.
lda_modeler = TopicModeler(num_topics=5, num_words=10)

In [10]:
# Fit the topic model on the lemmatized review text.
# NOTE(review): no random seed is passed here; if TopicModeler does not fix one
# internally, the topics may differ between runs — confirm.
lda_modeler.fit(df, text_col='lemmatized_text')


<thematic_analysis.TopicModeler at 0x14e674c20>

In [11]:
# List the words of every fitted topic, numbering the topics from 1 for display.
topics = lda_modeler.get_topics()
for topic_id, top_words in topics.items():
    display_number = topic_id + 1
    print(f"Topic {display_number}: {top_words}")

Topic 1: ['app', 'banking', 'good', 'bank', 'mobile', 'dashen', 'super', 'great', 'use', 'application']
Topic 2: ['app', 'amazing', 'step', 'experience', 'ahead', 'poor', 'ነው', 'አሪፍ', 'appreciate', 'make']
Topic 3: ['nice', 'account', 'bank', 'application', 'app', 'smart', 'transaction', 'phone', 'update', 'number']
Topic 4: ['app', 'bank', 'bad', 'not', 'money', 'transfer', 'update', 'work', 'fix', 'transaction']
Topic 5: ['app', 'good', 'not', 'work', 'update', 'need', 'open', 'nice', 'slow', 'try']


In [12]:
# Label each review with its dominant topic using the fitted model.
# Judging by the preview, the returned frame gains `tokens` and `dominant_topic`
# columns. NOTE(review): the ids shown there appear to be 0-based, while the
# printed topic list is numbered from 1 — confirm before cross-referencing.
df = lda_modeler.assign_dominant_topic(df, text_col='lemmatized_text')
df.head()

Unnamed: 0,review,rating,date,bank,source,sentiment_label,sentiment_score,processed_text,lemmatized_text,tfidf_text,tokens,dominant_topic
0,üôèüëç,5,2025-11-29,BOA,Google Play,negative,0.697056,,,,[],0
1,Very Good,5,2025-11-28,BOA,Google Play,positive,0.999852,very good,good,good,[good],4
2,goof,5,2025-11-28,BOA,Google Play,negative,0.997528,goof,goof,goof,[goof],1
3,good!,5,2025-11-28,BOA,Google Play,positive,0.999827,good,good,good,[good],4
4,good jop,5,2025-11-27,BOA,Google Play,positive,0.999841,good jop,good jop,good jop,"[good, jop]",3


### Theme Extraction

In [13]:
# Theme tagger; its apply() method is used in the next cell, after which the
# frame exposes an `identified_theme` column.
theme_assigner = ThemeExtractor()

In [14]:
# Tag every review with themes based on its lemmatized text, then preview the
# raw review next to the text that was matched and the resulting theme(s).
df = theme_assigner.apply(df, text_col="lemmatized_text")
theme_preview_columns = ["review", "lemmatized_text", "identified_theme"]
df[theme_preview_columns].head()

Unnamed: 0,review,lemmatized_text,identified_theme
0,üôèüëç,,[Other]
1,Very Good,good,[User Interface & Experience]
2,goof,goof,[Other]
3,good!,good,[User Interface & Experience]
4,good jop,good jop,[User Interface & Experience]


In [19]:
# Keep only the columns that belong in the exported thematic-analysis table;
# the intermediate text-processing and topic columns are left behind in `df`.
export_columns = [
    "review",
    "rating",
    "date",
    "bank",
    "sentiment_label",
    "sentiment_score",
    "source",
    "identified_theme",
]
df_thematic_analysis = df[export_columns]
df_thematic_analysis.head()

Unnamed: 0,review,rating,date,bank,sentiment_label,sentiment_score,source,identified_theme
0,üôèüëç,5,2025-11-29,BOA,negative,0.697056,Google Play,[Other]
1,Very Good,5,2025-11-28,BOA,positive,0.999852,Google Play,[User Interface & Experience]
2,goof,5,2025-11-28,BOA,negative,0.997528,Google Play,[Other]
3,good!,5,2025-11-28,BOA,positive,0.999827,Google Play,[User Interface & Experience]
4,good jop,5,2025-11-27,BOA,positive,0.999841,Google Play,[User Interface & Experience]


In [20]:
# Hand the trimmed frame to the loader and write it out as the themed-reviews CSV.
# NOTE(review): `identified_theme` displays as a list per row; if save_csv writes
# plain CSV these values will be read back as strings — confirm that downstream
# readers parse them.
themes_csv = "../data/processed/reviews_with_themes.csv"
data_loader.df = df_thematic_analysis
data_loader.save_csv(themes_csv)

Saved 1140 rows to ../data/processed/reviews_with_themes.csv
