In [1]:

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Fetch the NLTK data packages this script relies on (stop-word list,
# WordNet, and the Open Multilingual WordNet data).
for _nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)

# Load the transactions table; the steps below read its 'ProductName' column.
df = pd.read_csv("Data/Processed_data/regular_transactions.csv")

# Compiled once: matches any character that is neither a lowercase ASCII
# letter nor whitespace.
_NON_LETTER_RE = re.compile(r'[^a-z\s]')

# Lazily filled cache so the NLTK stop-word set and the lemmatizer are built
# a single time instead of once per row processed.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return ``(stop_words, lemmatizer)``, creating them on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise a product name into a space-joined string of lemmas.

    Steps: lowercase, replace every character outside ``a-z``/whitespace
    with a space, split on whitespace, drop NLTK English stop words, and
    lemmatize the remaining tokens with WordNet.

    Args:
        text: The raw product name. Missing values (``None``, NaN,
            ``pd.NA``) are accepted; other non-string scalars are
            converted with ``str()``.

    Returns:
        The cleaned tokens joined by single spaces, or ``''`` when the
        input is missing or contains no alphabetic tokens.
    """
    if not isinstance(text, str):
        # Empty CSV cells arrive as float NaN, which has no .lower();
        # treat them as empty documents rather than crashing (or, worse,
        # producing the literal token "nan" via str()).
        if pd.isna(text):
            return ''
        text = str(text)

    tokens = _NON_LETTER_RE.sub(' ', text.lower()).split()
    if not tokens:
        # Nothing to filter or lemmatize; skip loading the NLTK resources.
        return ''

    stop_words, lemmatizer = _get_text_resources()
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

# Clean every product name into a normalised token string.
df['ProductName_clean'] = df['ProductName'].apply(preprocess_text)

# Bag-of-words document-term matrix. Terms that occur in more than 95% of
# the documents, or in fewer than 2 documents, are left out of the vocabulary.
vectorizer = CountVectorizer(stop_words='english', min_df=2, max_df=0.95)
dtm = vectorizer.fit_transform(df['ProductName_clean'])

# Fit the LDA model with a fixed seed, then score the same matrix to obtain
# one topic distribution per product row.
n_topics = 5
lda_model = LatentDirichletAllocation(random_state=42, n_components=n_topics)
topic_assignments = lda_model.fit(dtm).transform(dtm)

# Label each product with the index of its highest-weighted topic.
df['Topic'] = np.argmax(topic_assignments, axis=1)

# Function to display topics
def display_topics(model, feature_names, no_top_words):
    """Collect the highest-weighted feature names of every topic.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays (each must provide ``argsort()``).
        feature_names: Indexable sequence mapping a feature index to its
            name.
        no_top_words: How many names to keep per topic.

    Returns:
        A dict mapping each topic index to a list of feature names ordered
        from highest to lowest weight.
    """
    return {
        idx: [
            feature_names[pos]
            # Ascending argsort, flipped to descending, then truncated.
            for pos in weights.argsort()[::-1][:no_top_words]
        ]
        for idx, weights in enumerate(model.components_)
    }

# Number of highest-weighted vocabulary terms to report for each topic.
no_top_words = 10
feature_names = vectorizer.get_feature_names_out()
topics = display_topics(
    model=lda_model,
    feature_names=feature_names,
    no_top_words=no_top_words,
)

# Show one line per topic listing its top terms.
for topic_num, top_words in topics.items():
    print(f'Topic {topic_num}: {", ".join(top_words)}')

# Write the frame, including the cleaned-name and topic columns, to disk
# without the index column.
df.to_csv(
    "Data/Processed_data/regular_transactions_with_topics.csv",
    index=False,
)

print("Topics assigned and saved to CSV.")

 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\punee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\punee\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\punee\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Topic 0: set, box, red, retrospot, bottle, rose, hot, tea, water, heart
Topic 1: heart, christmas, light, lunch, hanging, holder, small, box, decoration, set
Topic 2: set, cake, vintage, tin, pack, case, design, red, dolly, girl
Topic 3: metal, sign, white, wooden, heart, frame, antique, silver, blue, doormat
Topic 4: bag, jumbo, red, polkadot, design, pink, card, retrospot, blue, set
Topics assigned and saved to CSV.
