In [None]:
import sys
# Add ../scripts to the module search path so the local modules imported below can be
# found (presumably that is where they live — confirm).
sys.path.append('../scripts')
from data_loader import DataLoader
# NOTE(review): TextPreprocessor is imported twice. The import from `preprocessor` on the
# next line rebinds the name, so TextPreprocessor refers to preprocessor.TextPreprocessor
# from that point on, not to the thematic_analysis one.
from thematic_analysis import TextPreprocessor, KeywordExtractor, TopicModeler
from preprocessor import TextPreprocessor
from theme_extraction import ThemeExtractor


In [None]:
# Shared DataLoader instance: used below both to read the input CSV (load_csv) and to
# write the final result (save_csv).
data_loader = DataLoader()

In [None]:
# Load the processed reviews file and preview the first five rows.
# NOTE(review): the sentiment_label / sentiment_score columns selected near the end of the
# notebook presumably come from this file — confirm against the CSV schema.
df = data_loader.load_csv('../data/processed/reviews_with_sentiment.csv')
df.head(5)

In [None]:
# TextPreprocessor here is the class imported from `preprocessor`: that import comes
# after, and therefore rebinds, the same name imported from thematic_analysis.
preprocessor = TextPreprocessor()

Clean the reviews: lowercase the text, remove punctuation, and normalize whitespace.

In [None]:
# Apply preprocessor.clean_text to every raw review and store the result in a new
# "processed_text" column; the original "review" column is left untouched.
# NOTE(review): assumes every "review" value is a string — any missing value would be
# handed to clean_text as-is; confirm it tolerates that.
df["processed_text"] = df["review"].apply(preprocessor.clean_text)
df.head()

Lemmatize: remove stopwords and reduce each word to its base form.

In [None]:
# Apply preprocessor.lemmatize to the cleaned text. The resulting "lemmatized_text"
# column is the input to keyword extraction, topic modeling, and theme assignment below.
df["lemmatized_text"] = df["processed_text"].apply(preprocessor.lemmatize)
df.head(5)

### Keyword Extraction

In [None]:
# Keyword extractor configured with max_features=300.
# NOTE(review): presumably this caps the vocabulary size of an underlying vectorizer —
# confirm in KeywordExtractor.
keyword_extractor = KeywordExtractor(max_features=300)

In [None]:
# Run keyword extraction on the lemmatized text rather than the raw reviews.
# The extractor is handed a frame whose "review" column holds the lemmatized text
# (assumes extract() reads the "review" column — confirm in KeywordExtractor).
#
# `assign` is used instead of `rename`: df already has a raw "review" column, so renaming
# a second column to "review" would yield two columns with the same label, and selecting
# "review" would then return a two-column DataFrame instead of a single Series.
# `assign` returns a copy with "review" overwritten, so df itself keeps the raw reviews.
df["tfidf_text"] = df["lemmatized_text"]
top_keywords = keyword_extractor.extract(df.assign(review=df["tfidf_text"]))
top_keywords

#### Topic Modeling with LDA

In [None]:
# Topic modeler configured for 5 topics and 10 words (per the argument names).
# NOTE(review): no random seed is passed here; if TopicModeler does not fix one
# internally, the topics may differ between runs — confirm.
lda_modeler = TopicModeler(num_topics=5, num_words=10)

In [None]:
# Fit the topic model on the "lemmatized_text" column of the full frame.
lda_modeler.fit(df, text_col='lemmatized_text')


In [None]:
# Fetch the fitted topics and list them one per line, numbering from 1 for readability
# (the keys returned by get_topics() are offset by one when printed).
topics = lda_modeler.get_topics()
for topic_idx, topic_words in topics.items():
    print(f"Topic {topic_idx + 1}: {topic_words}")

In [None]:
# Rebind df to the frame returned by assign_dominant_topic, run on the same
# "lemmatized_text" column the model was fitted on.
# NOTE(review): presumably this adds a dominant-topic column — confirm its name in
# TopicModeler. Re-running this cell feeds the already-augmented df back in.
df = lda_modeler.assign_dominant_topic(df, text_col='lemmatized_text')
df.head()

### Theme Extraction

In [None]:
# Theme extractor created with no arguments, i.e. whatever defaults ThemeExtractor defines.
theme_assigner = ThemeExtractor()

In [None]:
# Tag each review with a theme based on its lemmatized text, then show the raw review,
# the lemmatized text, and the assigned theme side by side for a quick sanity check.
# The preview relies on apply() having produced an "identified_theme" column.
df = theme_assigner.apply(df, text_col="lemmatized_text")
preview_columns = ["review", "lemmatized_text", "identified_theme"]
df[preview_columns].head()

In [None]:
# Keep only the columns that belong in the exported dataset; intermediate text columns
# (processed / lemmatized / tfidf) are left out of the selection.
export_columns = [
    "review",
    "rating",
    "date",
    "bank",
    "sentiment_label",
    "sentiment_score",
    "identified_theme",
]
df_thematic_analysis = df[export_columns]
df_thematic_analysis.head()

In [None]:
# Hand the trimmed frame to the loader through its `df` attribute, then write it out.
# NOTE(review): save_csv receives only a path, so it presumably writes data_loader.df —
# confirm in DataLoader.
data_loader.df = df_thematic_analysis
data_loader.save_csv("../data/processed/reviews_with_themes.csv")