In [None]:
# 1. Data Cleaning and Preprocessing
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Optional: Run once if needed
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# Load the raw reviews and normalize the header names (strip surrounding
# whitespace, then title-case) so the text column can be addressed as
# 'Review' further down.
df = pd.read_csv("original reviews.csv")
df.columns = df.columns.str.strip().str.title()

# Shared NLP resources used by clean_text(). They rely on the NLTK data
# packages listed in the optional nltk.download(...) hints above.
lemmatizer = WordNetLemmatizer()
standard_stopwords = set(stopwords.words('english'))

# PLACEHOLDER: `[...]` is a list holding only the Ellipsis object, so until it
# is replaced with real words this set filters nothing out.
custom_stopwords = set([...])  # use your full custom stopword list here

# PLACEHOLDER: `{ ... }` is a *set* literal containing Ellipsis, not a dict.
# normalize_synonyms() calls .items() on this object, which fails until a real
# {phrase: replacement} dictionary is supplied.
synonym_map = { ... }  # use your full synonym dictionary here

def clean_text(text):
    """Normalize one raw review into a space-separated string of lemmatized tokens.

    Steps, in order: lower-case the text, delete every character that is
    neither a word character nor whitespace, split on whitespace, drop tokens
    found in ``standard_stopwords``, and lemmatize the remaining tokens with
    ``lemmatizer``.

    Args:
        text: The raw review. Missing values (``None`` / ``NaN``) are accepted.
            Non-string scalars (e.g. a review that was parsed as a number) are
            converted with ``str()`` instead of raising ``AttributeError``.

    Returns:
        str: The cleaned review, or ``""`` when ``text`` is missing.
    """
    if pd.isnull(text):
        return ""
    if not isinstance(text, str):
        # Numbers and other scalars have no .lower(); treat them as their text form.
        text = str(text)
    text = text.lower()
    # Punctuation is deleted (not replaced by a space), so "well-known" becomes "wellknown".
    text = re.sub(r'[^\w\s]', '', text)
    words = text.split()
    # Stop words are filtered on the surface form, before lemmatization.
    words = [lemmatizer.lemmatize(word) for word in words if word not in standard_stopwords]
    return ' '.join(words)

def remove_custom_stopwords(text):
    """Drop every whitespace-separated token of ``text`` that is in ``custom_stopwords``.

    The remaining tokens keep their original order and are re-joined with
    single spaces.
    """
    kept_tokens = filter(lambda token: token not in custom_stopwords, text.split())
    return ' '.join(kept_tokens)

def normalize_synonyms(text):
    """Replace whole-word occurrences of each ``synonym_map`` key with its mapped value.

    Entries are applied one after another in the map's iteration order, so the
    output of an earlier replacement can be matched by a later entry. Keys are
    regex-escaped and wrapped in ``\\b`` word boundaries, so they match
    literally and never inside a longer word.

    Args:
        text (str): The text to normalize.

    Returns:
        str: ``text`` with every matched phrase replaced.
    """
    for phrase, replacement in synonym_map.items():
        pattern = r'\b' + re.escape(phrase) + r'\b'
        # Passing a callable makes re.sub insert the replacement verbatim. A plain
        # string is treated as a template, so a backslash in it would raise
        # re.error or be rewritten as an escape / group reference.
        text = re.sub(pattern, lambda _match: replacement, text)
    return text

# Run the three text passes in order (base cleaning, synonym normalization,
# custom stop-word removal) and store the result in a new column.
cleaning_steps = (clean_text, normalize_synonyms, remove_custom_stopwords)
cleaned_reviews = df['Review']
for step in cleaning_steps:
    cleaned_reviews = cleaned_reviews.apply(step)
df['Cleaned_Review'] = cleaned_reviews

# Persist the whole frame (without the index) for the later stages.
df.to_csv("mba_reviews_cleaned.csv", index=False)


In [None]:
# 2. Topic Modeling with BERTopic
from bertopic import BERTopic

# Reload the cleaned reviews from the CSV written by the cleaning step and
# strip stray whitespace from the header names.
df = pd.read_csv("mba_reviews_cleaned.csv")
df.columns = df.columns.str.strip()
# Missing values are replaced with '' rather than dropped, so `docs` stays
# aligned one-to-one with the rows of `df` (the labels are assigned back by
# position below).
docs = df['Cleaned_Review'].fillna('').tolist()

# Configure the model with the settings passed below (English, 10 words per
# topic, verbose, minimum topic size 5, 1- and 2-grams) and fit it on all documents.
# NOTE(review): no random seed is set in this cell, while the next cell
# hard-codes topic ids (3, 5, 7, 9, ...) — confirm the ids are stable across
# re-runs before relying on that mapping.
topic_model = BERTopic(language="english", top_n_words=10, verbose=True, min_topic_size=5, n_gram_range=(1, 2))
# fit_transform returns two values; only the first (one topic id per document)
# is used here, the second is discarded.
topics, _ = topic_model.fit_transform(docs)

# Attach the topic id to each review row and save the labeled data set.
df['BERTopic_Label'] = topics
df.to_csv("mba_reviews_with_topics.csv", index=False)


In [None]:
# 3. Merge & Rename Topics
def merge_topics(topic):
    """Map a topic id from ``BERTopic_Label`` to its merged string label.

    Ids 3 and 5 collapse to ``'3_5'`` and ids 7 and 9 to ``'7_9'``; every
    other id is returned unchanged as ``str(topic)``.
    """
    merged_groups = (
        ((3, 5), '3_5'),
        ((7, 9), '7_9'),
    )
    for member_ids, merged_label in merged_groups:
        if topic in member_ids:
            return merged_label
    return str(topic)

# One merged (string) topic label per review row.
df['Merged_Topic'] = df['BERTopic_Label'].apply(merge_topics)

# Human-readable names for the merged labels; labels that are not listed here
# keep their raw string id when the tables are renamed.
topic_rename_map = {
    '0': 'finance',
    '1': 'community',
    '2': 'consulting',
    '3_5': 'class quality',
    '4': 'location',
    '6': 'professor',
    '7_9': 'entrepreneurship',
    '8': 'course difficulty',
}

# School x topic table of review counts (0 where a school has no review for a topic).
reviews_per_school_topic = df.groupby(['School', 'Merged_Topic']).size()
counts = reviews_per_school_topic.unstack(fill_value=0)

# Row-normalize the counts so each school's row sums to 1.
reviews_per_school = counts.sum(axis=1)
percent = counts.div(reviews_per_school, axis=0)

# Swap topic ids for readable names and export both tables.
counts_named, percent_named = (
    table.rename(columns=topic_rename_map) for table in (counts, percent)
)

counts_named.to_csv("school_topic_counts_named.csv")
percent_named.to_csv("school_topic_percent_named.csv")


In [None]:
# 4. Heatmap Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Load the per-school topic proportions; the School column becomes the row index.
df = pd.read_csv("school_topic_percent_named.csv").set_index("School")

# Same order as before: create the figure, apply the seaborn style, then add the axes.
fig = plt.figure(figsize=(14, 6))
sns.set(style="whitegrid")
ax = fig.add_subplot(111)

sns.heatmap(
    df,
    annot=True,
    cmap="YlGnBu",
    fmt=".2f",
    linewidths=0.5,
    cbar_kws={'label': 'Proportion'},
    ax=ax,
)

# Title and tick-label styling on the heatmap axes.
ax.set_title("Topic Heatmap by School", fontsize=20)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", fontsize=16)
plt.setp(ax.get_yticklabels(), fontsize=16)
fig.tight_layout()
plt.show()
