In [54]:
import pandas as pd
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from transformers import pipeline

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alche\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Topic Modeling

In [55]:
df = pd.read_parquet("datasets/cleaned_datasets/final_dataset_philly.parquet")

# print(df.head())
# print(df.columns)


In [56]:
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['quarter'] = df['date'].dt.to_period('Q')


In [57]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['clean_text'] = df['text'].apply(clean_text)


In [58]:
df['tokens'] = df['clean_text'].apply(lambda x: [word for word in x.split() if word not in stop_words and len(word) > 2])
print(df[['review_id', 'clean_text', 'tokens']].head())


                review_id                                         clean_text  \
0  J1LZjzbs5bFubvS135SD2g  had a great big meal with family and we loved ...   
1  ecMiAOFucDM3zwXYfY-Q6A  many locations all have lines so be prepared t...   
2  yuFQRhHo3z4TgE6drPXSgg  compliments to the chef and to the rest of the...   
3  Zdh0_HtE724MnohLOrB5Iw  i decided to try this spot out and it didnt di...   
4  y_XYEZk2Cin-q4N0czeaYw  first off finding parking is atrocious your be...   

                                              tokens  
0  [great, big, meal, family, loved, got, lot, ap...  
1  [many, locations, lines, prepared, wait, well,...  
2  [compliments, chef, rest, staff, hosting, lab,...  
3  [decided, try, spot, didnt, disappoint, starte...  
4  [first, finding, parking, atrocious, best, bet...  


In [63]:
vectorizer = TfidfVectorizer(max_df=0.95, min_df=5, max_features=10000, ngram_range=(1,2))

In [68]:
generator = pipeline("text2text-generation", model="google/flan-t5-base")

Device set to use cpu


In [69]:
topics_by_year = {}
n_topics = 10

for year, group in df.groupby('year'):
    print(f"\nProcessing topics for year: {year}")
    texts = group['clean_text']
    dtm_year = vectorizer.fit_transform(texts)
    
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42, max_iter=10)
    lda.fit(dtm_year)
    
    feature_names = vectorizer.get_feature_names_out()
    topics = {}
    
    for idx, topic in enumerate(lda.components_):
        top_indices = topic.argsort()[-10:]
        top_words = [feature_names[i] for i in top_indices]
        topics[f"Topic {idx+1}"] = top_words
        
        prompt = (
            f"Given the topic words: {', '.join(top_words)},\n"
            "provide a concise one or two word label that best describes the topic."
        )
        
        response = generator(prompt, max_length=20, truncation=True, do_sample=False)[0]['generated_text']
        best_label = response.split('\n')[0].strip().strip(".")

        print(f"Year {year} - Topic {idx+1}: {top_words}")
        print(f"  Suggested label: {best_label}\n")
    
    topics_by_year[year] = topics


Processing topics for year: 2020
Year 2020 - Topic 1: ['they', 'money', 'if', 'this', 'the', 'is', 'your', 'not', 'dont', 'you']
  Suggested label: if

Year 2020 - Topic 2: ['with', 'car', 'in', 'they', 'he', 'my', 'was', 'to', 'the', 'and']
  Suggested label: he

Year 2020 - Topic 3: ['in', 'for', 'me', 'she', 'of', 'was', 'my', 'and', 'to', 'the']
  Suggested label: was

Year 2020 - Topic 4: ['place', 'are', 'for', 'of', 'food', 'to', 'great', 'is', 'and', 'the']
  Suggested label: great

Year 2020 - Topic 5: ['love', 'best', 'great', 'to', 'in', 'always', 'are', 'and', 'is', 'the']
  Suggested label: love

Year 2020 - Topic 6: ['were', 'for', 'but', 'with', 'to', 'of', 'it', 'was', 'and', 'the']
  Suggested label: was

Year 2020 - Topic 7: ['it', 'us', 'for', 'were', 'to', 'our', 'and', 'was', 'we', 'the']
  Suggested label: it

Year 2020 - Topic 8: ['no', 'it', 'me', 'my', 'was', 'they', 'and', 'order', 'to', 'the']
  Suggested label: it

Year 2020 - Topic 9: ['delivery', 'ive', '

In [61]:
for year, topics in topics_by_year.items():
    print(f"\nYear: {year}")
    for topic, words in topics.items():
        print(f"  {topic}: {words}")


Year: 2020
  Topic 1: ['they', 'money', 'if', 'this', 'the', 'is', 'your', 'not', 'dont', 'you']
  Topic 2: ['with', 'car', 'in', 'they', 'he', 'my', 'was', 'to', 'the', 'and']
  Topic 3: ['in', 'for', 'me', 'she', 'of', 'was', 'my', 'and', 'to', 'the']
  Topic 4: ['place', 'are', 'for', 'of', 'food', 'to', 'great', 'is', 'and', 'the']
  Topic 5: ['love', 'best', 'great', 'to', 'in', 'always', 'are', 'and', 'is', 'the']
  Topic 6: ['were', 'for', 'but', 'with', 'to', 'of', 'it', 'was', 'and', 'the']
  Topic 7: ['it', 'us', 'for', 'were', 'to', 'our', 'and', 'was', 'we', 'the']
  Topic 8: ['no', 'it', 'me', 'my', 'was', 'they', 'and', 'order', 'to', 'the']
  Topic 9: ['delivery', 'ive', 'delicious', 'is', 'the best', 'chicken', 'food', 'best', 'and', 'the']
  Topic 10: ['color and', 'hair cut', 'happier', 'be happier', 'color', 'haircut', 'cut', 'salon', 'my hair', 'hair']

Year: 2021
  Topic 1: ['sugar', 'milk tea', 'banh mi', 'bubble tea', 'bubble', 'banh', 'milk', 'boba', 'mi', 'tea