# Los Angeles LDA Training

In [2]:
#LDA for single paragraph tweet per day
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
import re

# Los Angeles: fit the LDA model on the dedicated LDA training split.
# Read the training CSV into a pandas DataFrame.
df = pd.read_csv('la_train_lda.csv')

# Take the tweet text of every row (the whole column, not a subset).
tweets = df.loc[:, 'content']

# Define functions for preprocessing text
# NLTK resources are built once, on first use, and shared by every call.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return (stop_words, lemmatizer, pos_table), creating them on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
        # First letter of the POS tag -> WordNet part of speech.  Tags whose
        # first letter is not listed here are discarded by preprocess_text.
        _PREPROCESS_CACHE['pos_table'] = {
            'J': wordnet.ADJ,
            'V': wordnet.VERB,
            'N': wordnet.NOUN,
            'R': wordnet.ADV,
        }
    return (
        _PREPROCESS_CACHE['stop_words'],
        _PREPROCESS_CACHE['lemmatizer'],
        _PREPROCESS_CACHE['pos_table'],
    )


def preprocess_text(text):
    """Turn one raw tweet into a list of lemmatized content words.

    URLs are stripped, the text is lower-cased and tokenized, and only tokens
    tagged as adjectives, verbs, nouns or adverbs are kept.  Each kept token
    is lemmatized with its WordNet part of speech; lemmas that are not purely
    alphabetic or that are English stop words are then dropped.

    Args:
        text: The raw tweet.  Anything that is not a ``str`` (for example the
            float NaN pandas uses for a missing CSV cell) is treated as an
            empty document.

    Returns:
        A list of lemmas, possibly empty.
    """
    # re.sub raises TypeError on non-strings such as NaN, so bail out early.
    if not isinstance(text, str):
        return []

    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www\S+', '', text)

    stop_words, lemmatizer, pos_table = _preprocess_resources()

    words = []
    for word, tag in pos_tag(word_tokenize(text.lower())):
        pos = pos_table.get(tag[:1])
        if pos is None:
            # Not an adjective, verb, noun or adverb.
            continue
        lemma = lemmatizer.lemmatize(word, pos=pos)
        if lemma.isalpha() and lemma not in stop_words:
            words.append(lemma)
    return words

# Preprocess the tweets
processed = [preprocess_text(tweet) for tweet in tweets]

# Some tweets preprocess to an empty token list.  Those rows are removed from
# `df` as well as from `texts`: the topic matrix computed later has one row
# per entry of `texts`, and it is concatenated with df['date'] by index, so
# both must describe exactly the same documents in the same order.
has_tokens = [bool(tokens) for tokens in processed]
texts = [tokens for tokens in processed if tokens]
df = df.loc[has_tokens].reset_index(drop=True)
tweets = df['content']

# Create a dictionary and corpus for the LDA model
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
token2id = dictionary.token2id

In [None]:
# Fit the LDA model on the training corpus
num_topics = 10
passes = 10
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=passes,
)

# Print ten words per topic, numbering the topics from 1
topic_word_lists = lda_model.show_topics(num_topics=num_topics, num_words=10, formatted=False)
for topic_id, weighted_words in topic_word_lists:
    joined_words = " ".join(word for word, _ in weighted_words)
    print(f'Topic {topic_id+1}: {joined_words}')

In [None]:
# Run inference on the training corpus
train_inference = lda_model.inference(corpus)

# Dates that will sit next to the topic columns
df_dates = df['date']

# inference() returns a tuple; its first element is the per-document topic
# matrix (one row per document, one column per topic)
data = train_inference[0]

# Label the columns "Topic 1" ... "Topic K" from the matrix width, so the
# labels always match the number of topics the model was fitted with
topic_columns = [f'Topic {k}' for k in range(1, data.shape[1] + 1)]
df_data = pd.DataFrame(data, columns=topic_columns)

# Concatenate the date and data DataFrames.
# NOTE(review): concat aligns on the index, so this assumes `corpus` holds
# exactly one document per row of `df`, in the same order - confirm.
df_result = pd.concat([df_dates, df_data], axis=1)

# Export the result to a CSV file
df_result.to_csv('training_data_final.csv', index=False)

## Los Angeles LDA Testing Dataset Inference

In [None]:
# Los Angeles test split: infer topic weights with the already-fitted model.
df = pd.read_csv('la_test_lda.csv')

# Take the tweet text of every row
tweets = df['content']

# Preprocess the tweets
processed = [preprocess_text(tweet) for tweet in tweets]

# Tweets that preprocess to nothing are removed from `df` as well as from
# `texts`, so that the topic rows below stay aligned with df['date'].
has_tokens = [bool(tokens) for tokens in processed]
texts = [tokens for tokens in processed if tokens]
df = df.loc[has_tokens].reset_index(drop=True)
tweets = df['content']

# Encode the test tweets with the dictionary the model was trained with.
# Building a new Dictionary here would assign different ids to the same
# words, and the model would then score the wrong vocabulary entries.
# doc2bow skips tokens that are not in the dictionary.
dictionary = lda_model.id2word
corpus = [dictionary.doc2bow(text) for text in texts]
token2id = dictionary.token2id

# Run inference on the test corpus
inference = lda_model.inference(corpus)

# Dates that will sit next to the topic columns
df_dates = df['date']

# First element of the returned tuple: one row per document, one column per topic
data = inference[0]

# Label the columns "Topic 1" ... "Topic K" from the matrix width
topic_columns = [f'Topic {k}' for k in range(1, data.shape[1] + 1)]
df_data = pd.DataFrame(data, columns=topic_columns)

# Concatenate the date and data DataFrames
df_result = pd.concat([df_dates, df_data], axis=1)

# Export the result to a CSV file
df_result.to_csv('testing_data_final.csv', index=False)

# Boston LDA Training

In [None]:
# Boston: build the training corpus for the LDA model.
# Load the CSV file into a pandas DataFrame
df = pd.read_csv('b_train_lda.csv')

# Take the tweet text of every row
tweets = df['content']

# Preprocess the tweets
processed = [preprocess_text(tweet) for tweet in tweets]

# Some tweets preprocess to an empty token list.  Those rows are removed from
# `df` as well as from `texts`: the topic matrix computed later has one row
# per entry of `texts`, and it is concatenated with df['date'] by index, so
# both must describe exactly the same documents in the same order.
has_tokens = [bool(tokens) for tokens in processed]
texts = [tokens for tokens in processed if tokens]
df = df.loc[has_tokens].reset_index(drop=True)
tweets = df['content']

# Create a dictionary and corpus for the LDA model
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
token2id = dictionary.token2id

In [None]:
# Fit the LDA model on the Boston training corpus
num_topics = 10
passes = 10
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=passes)

# Run inference on the training corpus
train_inference = lda_model.inference(corpus)

# Dates that will sit next to the topic columns
df_dates = df['date']

# First element of the returned tuple: one row per document, one column per topic
data = train_inference[0]

# Label the columns "Topic 1" ... "Topic K" from the matrix width, so the
# labels always match num_topics instead of repeating it as a literal list
topic_columns = [f'Topic {k}' for k in range(1, data.shape[1] + 1)]
df_data = pd.DataFrame(data, columns=topic_columns)

# Concatenate the date and data DataFrames.
# NOTE(review): concat aligns on the index, so this assumes `corpus` holds
# exactly one document per row of `df`, in the same order - confirm.
df_result = pd.concat([df_dates, df_data], axis=1)

# Export the result to a CSV file
df_result.to_csv('b_training_lda_final.csv', index=False)

In [None]:
# Boston test split: build the corpus that the fitted model will score.
# Load the CSV file into a pandas DataFrame
df = pd.read_csv('b_test_lda.csv')

# Take the tweet text of every row
tweets = df['content']

# Preprocess the tweets
processed = [preprocess_text(tweet) for tweet in tweets]

# Tweets that preprocess to nothing are removed from `df` as well as from
# `texts`, so that the topic rows computed from `corpus` stay aligned with
# df['date'] when the two are concatenated by index.
has_tokens = [bool(tokens) for tokens in processed]
texts = [tokens for tokens in processed if tokens]
df = df.loc[has_tokens].reset_index(drop=True)
tweets = df['content']

# Encode the test tweets with the dictionary the model was trained with.
# Building a new Dictionary here would assign different ids to the same
# words, and the model would then score the wrong vocabulary entries.
# doc2bow skips tokens that are not in the dictionary.
dictionary = lda_model.id2word
corpus = [dictionary.doc2bow(text) for text in texts]
token2id = dictionary.token2id

In [None]:
# Run inference on the test corpus
inference = lda_model.inference(corpus)

# Dates that will sit next to the topic columns
df_dates = df['date']

# First element of the returned tuple: one row per document, one column per topic
data = inference[0]

# Label the columns "Topic 1" ... "Topic K" from the matrix width, so the
# labels always match the number of topics the model was fitted with
topic_columns = [f'Topic {k}' for k in range(1, data.shape[1] + 1)]
df_data = pd.DataFrame(data, columns=topic_columns)

# Concatenate the date and data DataFrames.
# NOTE(review): concat aligns on the index, so this assumes `corpus` holds
# exactly one document per row of `df`, in the same order - confirm.
df_result = pd.concat([df_dates, df_data], axis=1)

# Export the result to a CSV file
df_result.to_csv('b_testing_lda_final.csv', index=False)