In [6]:
# Import packages for LDA 
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import CoherenceModel, LdaMulticore
from gensim import corpora
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import warnings
from config import BG3_cleaned_data_file_path

# Blanket filter: no category is given, so every warning raised after this
# point is suppressed, including deprecation notices from the libraries above.
warnings.filterwarnings('ignore')

In [7]:
# Function to download NLTK data if not already present
def download_nltk_data():
    """Make sure the NLTK packages used by this notebook are available locally.

    Each package is looked up in the NLTK data path first and is only
    downloaded (quietly) when the lookup fails, so a machine that already has
    the data never needs to reach the network.

    Returns:
        None. The only effect is the possible download of missing packages.
    """
    # NLTK stores packages in per-category folders. Looking everything up
    # under ``corpora/`` can never find the lexicon, tokenizer or tagger, which
    # forces a download attempt on every single run.
    resources = {
        'vader_lexicon': 'sentiment/vader_lexicon',
        'stopwords': 'corpora/stopwords',
        'punkt': 'tokenizers/punkt',
        'wordnet': 'corpora/wordnet',
        'omw-1.4': 'corpora/omw-1.4',
        'averaged_perceptron_tagger': 'taggers/averaged_perceptron_tagger',
    }
    # NOTE(review): recent NLTK releases may additionally expect `punkt_tab`;
    # confirm against the installed version.

    for package, resource_path in resources.items():
        # Some packages are left as an archive instead of being unpacked, so
        # probe both the directory form and the ``.zip`` form.
        for candidate in (resource_path, f'{resource_path}.zip'):
            try:
                nltk.data.find(candidate)
                break
            except LookupError:
                continue
        else:
            # Neither form was found: fetch the package.
            nltk.download(package, quiet=True)

# Run the resource check once, when this cell executes.
download_nltk_data()

# Load data file
def load_data(file_path):
    """Read the CSV found at ``file_path`` and hand back the resulting DataFrame.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts as its first argument.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(file_path)

# Extract and process author information
def process_author_info(df):
    """Expand the ``author`` column into flat per-author columns.

    The ``author`` cell is parsed into a dict, the keys ``steamid``,
    ``num_games_owned``, ``num_reviews``, ``playtime_forever`` and
    ``playtime_at_review`` are copied into columns of the same name, both
    playtime columns are divided by 60 (presumably minutes to hours; confirm
    against the data source), and ``timestamp_created`` (epoch seconds) is
    rendered as a ``YYYY-MM-DD`` string in ``timestamp_created_date``.

    Args:
        df: DataFrame with an ``author`` column (dicts or their string repr)
            and a ``timestamp_created`` column. It is modified in place.

    Returns:
        The same DataFrame, with the extra columns added.

    Raises:
        KeyError: If ``author`` or ``timestamp_created`` is missing.
        ValueError / SyntaxError: If an ``author`` string is not a valid
            Python literal.
    """
    # `ast` is not among the notebook's top-level imports, so bring it into
    # scope here; without this the function fails with NameError.
    import ast

    def _as_dict(raw):
        # Already parsed (e.g. the cell was re-run on the same frame).
        if isinstance(raw, dict):
            return raw
        # A CSV round-trip stores the dict as its repr.
        if isinstance(raw, str):
            return ast.literal_eval(raw)
        # Missing value (NaN / None): no author details available.
        return {}

    df['author'] = df['author'].apply(_as_dict)
    for key in ['steamid', 'num_games_owned', 'num_reviews', 'playtime_forever', 'playtime_at_review']:
        df[key] = df['author'].apply(lambda info, key=key: info.get(key))

    # Coerce first so that a missing playtime becomes NaN instead of making
    # the division fail on an object-dtype column.
    for column in ('playtime_forever', 'playtime_at_review'):
        df[column] = pd.to_numeric(df[column]) / 60

    df['timestamp_created_date'] = pd.to_datetime(df['timestamp_created'], unit='s').dt.strftime('%Y-%m-%d')
    return df

# Preprocess reviews
def preprocess_reviews(df):
    """Add a ``processed_review`` column holding the cleaned review text.

    Cleaning steps, in order: lowercase, drop URLs, replace every character
    that is not ``a-z`` or whitespace with a space, collapse whitespace,
    tokenize, drop English stop words and one-character tokens, lemmatize,
    and join the surviving tokens with single spaces.

    Args:
        df: DataFrame with a ``review`` column. It is modified in place.

    Returns:
        The same DataFrame, with ``processed_review`` added. Rows whose review
        is not a string (e.g. NaN) get an empty string.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def clean_review(text):
        # Missing reviews are not strings (pandas uses NaN/None); there is
        # nothing to clean, and ``re.sub`` would raise on them.
        if not isinstance(text, str):
            return ''

        # Lowercase FIRST: the letter filter below only keeps a-z, so running
        # it on mixed-case text would delete every capital letter.
        text = text.lower()
        # Strip URLs while their punctuation is still intact; once ':' '/' '.'
        # are gone the URL can no longer be recognised as a single token.
        text = re.sub(r'http\S+|www\S+', ' ', text)
        text = re.sub(r'[^a-z\s]', ' ', text)
        text = re.sub(r'\s+', ' ', text).strip()

        tokens = word_tokenize(text)
        kept = [
            lemmatizer.lemmatize(token)
            for token in tokens
            if token not in stop_words and len(token) > 1
        ]
        return ' '.join(kept)

    df['processed_review'] = df['review'].apply(clean_review)
    return df

# Create term frequency and TF-IDF matrices
def create_vectorizers(df, min_df=0.1, max_df=0.95):
    """Fit a term-frequency and a TF-IDF vectorizer on ``processed_review``.

    Args:
        df: DataFrame with a ``processed_review`` column.
        min_df: Lower document-frequency cut-off handed to ``CountVectorizer``.
            Defaults to the value that used to be hard-coded.
        max_df: Upper document-frequency cut-off handed to ``CountVectorizer``.
            Defaults to the value that used to be hard-coded.

    Returns:
        ``(count_vectorizer, tfidf_vectorizer, tf_matrix, tfidf_matrix)``:
        the two fitted vectorizers followed by their document-term matrices.
        The TF-IDF vectorizer is built with its default settings.
    """
    # An empty cleaned review is read back from CSV as NaN, which the
    # vectorizers reject; treat such rows as empty documents instead.
    documents = df['processed_review'].fillna('').astype(str)

    count_vectorizer = CountVectorizer(min_df=min_df, max_df=max_df)
    tf_matrix = count_vectorizer.fit_transform(documents)

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

    return count_vectorizer, tfidf_vectorizer, tf_matrix, tfidf_matrix

# Train LDA model
def train_lda_model(corpus, dictionary, texts, num_topics, alpha, eta, passes, chunksize, random_state=None):
    """Fit an LDA topic model and score it with ``c_v`` coherence.

    Args:
        corpus: Bag-of-words corpus passed to ``LdaMulticore``.
        dictionary: Token/id mapping used as ``id2word`` and for coherence.
        texts: Tokenized documents used by the coherence computation.
        num_topics: Number of topics to fit.
        alpha: Document-topic prior passed to the model.
        eta: Topic-word prior passed to the model.
        passes: Number of training passes.
        chunksize: Number of documents per training chunk.
        random_state: Optional seed forwarded to ``LdaMulticore``. ``None``
            (the default) keeps the previous unseeded behaviour.
            NOTE(review): with several worker processes a seed may still not
            give bit-identical models; confirm if exact reproducibility matters.

    Returns:
        ``(model, coherence)``: the fitted model and its coherence score.
    """
    model = LdaMulticore(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        alpha=alpha,
        eta=eta,
        passes=passes,
        chunksize=chunksize,
        random_state=random_state,
    )
    coherence_model = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence = coherence_model.get_coherence()
    return model, coherence


In [9]:
# Main script
if __name__ == '__main__':
    df = load_data(BG3_cleaned_data_file_path)

    # Everything below reads `processed_review`; build it from the raw
    # reviews when the loaded file does not already provide the column.
    if 'processed_review' not in df.columns:
        df = preprocess_reviews(df)

    # pandas reads empty CSV fields back as NaN, and `NaN.split()` below would
    # raise AttributeError, so normalise such rows to empty strings first.
    df['processed_review'] = df['processed_review'].fillna('').astype(str)

    count_vectorizer, tfidf_vectorizer, tf_matrix, tfidf_matrix = create_vectorizers(df)

    # gensim wants token lists, an id<->token dictionary and a bag-of-words corpus.
    texts = [review.split() for review in df['processed_review']]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # LDA hyper-parameters for this run.
    num_topics = 11
    alpha = 0.5
    eta = 0.05
    passes = 200
    chunksize = 500

    model, coherence = train_lda_model(corpus, dictionary, texts, num_topics, alpha, eta, passes, chunksize)

    print(f'Coherence Score: {coherence}')

    vis_data = gensimvis.prepare(model, corpus, dictionary)

# Last expression of the cell: renders the interactive topic visualisation.
pyLDAvis.display(vis_data)

KeyError: 'author'