## References

In [None]:
# https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
# https://radimrehurek.com/gensim/models/ldamodel.html

# Notebook Setup

In [None]:
# Import libraries
import logging
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
from gensim import corpora, models

In [None]:
# Emit timestamped log records at INFO level and above
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

## Load Test Documents

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # pickle.load can execute arbitrary code: only use files produced by this project
    with open(path, "rb") as handle:
        return pickle.load(handle)


# District-pooled documents (first objective)
district_pooling_docs = _load_pickle("../outputs/tokenized_documents_district_pooling.p")

# Month-pooled documents (second objective); loaded but not used further below
month_pooling_docs = _load_pickle("../outputs/tokenized_documents_month_pooling.p")

# Documents pooled per district and month (second objective)
district_per_month_pooling_docs = _load_pickle("../outputs/tokenized_documents_district_per_month_pooling.p")

## Load Model and Dictionary

In [None]:
# Load the trained LDA model and the dictionary that maps tokens to ids.
# Both artefacts are needed by every cell below, so stop right here with a clear
# error if one is missing; merely printing a hint would leave `dictionary`
# undefined and surface later as a confusing NameError.
MODEL_PATH = "../outputs/lda_model_hashtag_pooling.model"
DICTIONARY_PATH = "../outputs/tourism_hashtag_pooling.dict"

for required_path in (MODEL_PATH, DICTIONARY_PATH):
    if not os.path.exists(required_path):
        raise FileNotFoundError(
            "Please train LDA model first! Missing file: " + required_path
        )

# Load model
lda_model = models.LdaModel.load(MODEL_PATH)

# Load dictionary
dictionary = corpora.Dictionary.load(DICTIONARY_PATH)
print("Hashtag pooling dictionary loaded!")

## Running Model on Test Documents

In [None]:
# Turn every district-pooled document into its bag-of-words representation
district_bow_list = list(map(dictionary.doc2bow, district_pooling_docs))

# Query the model once per document; lda_model[bow] gives the topic
# probability distribution of that document
district_topic_list = [lda_model[district_bow] for district_bow in district_bow_list]

In [None]:
# Display district topic list
# (one topic vector per district document, in the order of district_pooling_docs)
district_topic_list

## Extract for Each Topic the Scores of the Districts

In [None]:
# Collect, for every topic, the score of each district document.
# A topic that does not appear in a document's topic vector gets a score of 0
# (presumably gensim omits topics below its minimum probability - see LdaModel docs).
NUM_TOPICS = 7  # topic ids 0-6 are extracted below

# One {topic_id: score} mapping per district document
district_score_dicts = [dict(topic_vector) for topic_vector in district_topic_list]

# district_topic_scores[k][d] is the score of topic k for district document d
district_topic_scores = [
    [score_dict.get(topic_id, 0) for score_dict in district_score_dicts]
    for topic_id in range(NUM_TOPICS)
]

# Keep the individual names that the plotting and dataframe cells rely on
topic0, topic1, topic2, topic3, topic4, topic5, topic6 = district_topic_scores

In [None]:
# Check topic 0 to see whether the output looks as desired
# (one score per district document; 0 where topic 0 is absent from the topic vector)
topic0

In [None]:
# Helper that draws one bar per district for the scores of a single topic
def district_bar_graph(districts, indices):
    """
    Draw a bar chart of topic scores, one bar per district.

    districts: list with the topic score of each district (bar heights)
    indices: list of district numbers, used as x-axis tick labels;
             must be as long as districts

    Raises an AssertionError when the two lists differ in length.
    A new 10x7 inch figure is created; this function does not call
    plt.show() itself.
    """
    assert len(districts) == len(indices)
    bar_positions = np.arange(len(districts))
    _, ax = plt.subplots(figsize=(10, 7))
    ax.bar(bar_positions, districts, width=0.5, tick_label=indices, align="center")
    ax.set_xlabel("District")
    ax.set_ylabel("Topic Score")

## Topic 0: Sports, Health & Image

In [None]:
# Plot topic 0 scores per district.
# Tick labels are 1-based district numbers so that they agree with the
# "01"-"10" labels given to the same positions in the districts dataframe.
district_bar_graph(topic0,
                   list(range(1, len(topic0) + 1)))
plt.title("Topic 0: Sports, Health & Image")
plt.show()

## Topic 1: Lifestyle & Culture

In [None]:
# Plot topic 1 scores per district.
# Tick labels are 1-based district numbers so that they agree with the
# "01"-"10" labels given to the same positions in the districts dataframe.
district_bar_graph(topic1,
                   list(range(1, len(topic1) + 1)))
plt.title("Topic 1: Lifestyle & Culture")
plt.show()

## Topic 2: Nightlife

In [None]:
# Plot topic 2 scores per district.
# Tick labels are 1-based district numbers so that they agree with the
# "01"-"10" labels given to the same positions in the districts dataframe.
district_bar_graph(topic2,
                   list(range(1, len(topic2) + 1)))
plt.title("Topic 2: Nightlife")
plt.show()

## Topic 3: Streetart

In [None]:
# Plot topic 3 scores per district.
# Tick labels are 1-based district numbers so that they agree with the
# "01"-"10" labels given to the same positions in the districts dataframe.
district_bar_graph(topic3,
                   list(range(1, len(topic3) + 1)))
plt.title("Topic 3: Streetart")
plt.show()

## Topic 4: Sightseeing

In [None]:
# Plot topic 4 scores per district.
# Tick labels are 1-based district numbers so that they agree with the
# "01"-"10" labels given to the same positions in the districts dataframe.
district_bar_graph(topic4,
                   list(range(1, len(topic4) + 1)))
plt.title("Topic 4: Sightseeing")
plt.show()

## Topic 5: Summer, Sun & Friends

In [None]:
# Plot topic 5 scores per district.
# Tick labels are 1-based district numbers so that they agree with the
# "01"-"10" labels given to the same positions in the districts dataframe.
district_bar_graph(topic5,
                   list(range(1, len(topic5) + 1)))
plt.title("Topic 5: Summer, Sun & Friends")
plt.show()

## Topic 6: Everyday Life

In [None]:
# Plot topic 6 scores per district.
# Tick labels are 1-based district numbers so that they agree with the
# "01"-"10" labels given to the same positions in the districts dataframe.
district_bar_graph(topic6,
                   list(range(1, len(topic6) + 1)))
plt.title("Topic 6: Everyday Life")
plt.show()

## Combine Results into a Dataframe

In [None]:
# Assemble the per-district topic scores into one dataframe:
# a "district" label column followed by one column per topic
dictionary_districts = dict(
    district=["01", "02", "03", "04", "05", "06", "07", "08", "09", "10"],
    topic0=topic0,
    topic1=topic1,
    topic2=topic2,
    topic3=topic3,
    topic4=topic4,
    topic5=topic5,
    topic6=topic6,
)
districts = pd.DataFrame(data=dictionary_districts)

In [None]:
# Ignore this part

# display dataframe and set index
# districts.set_index("district")

## Save Results to a CSV File

In [None]:
# Ignore this part! (not necessary, is now done in the R script instead!)

# Ensure that excel will not delete leading 0s
# districts.district = districts.district.apply('"={}"'.format)
# districts.district = districts.district.apply(lambda x: str(x))

In [None]:
# Display dataframe (district labels plus the columns topic0 to topic6)
districts

In [None]:
# Write the per-district topic scores to a UTF-8 encoded CSV file.
# NOTE(review): this file is written relative to the working directory, while
# month_districts.csv further below goes to ../outputs/ - confirm which
# location the R script expects before unifying the paths.
districts.to_csv("districts.csv", encoding="utf-8")

## Next Part: Dynamic Analysis

In [None]:
# View district per month docs
# NOTE(review): this prints every tokenized document in full; consider showing
# only the length or a small slice to keep the notebook output readable.
district_per_month_pooling_docs

In [None]:
# Drop the two trailing December documents so that only months 06-11 remain.
# Slicing to the expected document count (instead of deleting the last element
# twice in place) keeps this cell idempotent: re-running it does not strip
# further documents.
# Assumes the pickle holds 62 documents with the December ones last - TODO confirm.
N_DISTRICT_MONTH_DOCS = 6 * 10  # 6 months x 10 districts, as in district_list / month_list below
district_per_month_pooling_docs = district_per_month_pooling_docs[:N_DISTRICT_MONTH_DOCS]
len(district_per_month_pooling_docs)

In [None]:
# Define function to apply the model to unseen documents (same procedure as used for the district docs above)
def run_lda_on_test_doc(docs, model=None, id2word=None):
    """
    Infer the topic distribution of every document in ``docs``.

    docs: iterable of tokenized documents (each a list of tokens)
    model: object indexed with a bag-of-words to obtain its topic vector;
           defaults to the notebook-level ``lda_model``
    id2word: object providing ``doc2bow(tokens)``;
             defaults to the notebook-level ``dictionary``

    Returns a list with one topic vector per document, in input order.
    """
    # Fall back to the globally loaded artefacts when none are passed in,
    # so existing calls with a single argument keep working
    if model is None:
        model = lda_model
    if id2word is None:
        id2word = dictionary

    return [model[id2word.doc2bow(tokens)] for tokens in docs]

In [None]:
# Apply the model on dynamic data
# (only the district-per-month documents are used; the month-only variant is kept disabled)
# topics_month = run_lda_on_test_doc(month_pooling_docs)
topics_district_per_month = run_lda_on_test_doc(district_per_month_pooling_docs)

In [None]:
# Display district per month topic list
# (one topic vector per document of district_per_month_pooling_docs, same order)
topics_district_per_month

In [None]:
# Collect, for every topic, the score of each district-per-month document.
# A topic that does not appear in a document's topic vector gets a score of 0
# (presumably gensim omits topics below its minimum probability - see LdaModel docs).
NUM_TOPICS = 7  # topic ids 0-6 are extracted below

# One {topic_id: score} mapping per district-per-month document
month_district_score_dicts = [dict(topic_vector) for topic_vector in topics_district_per_month]

# md_topic_scores[k][i] is the score of topic k for document i
md_topic_scores = [
    [score_dict.get(topic_id, 0) for score_dict in month_district_score_dicts]
    for topic_id in range(NUM_TOPICS)
]

# Keep the individual names that the dataframe cell relies on
(md_topic0, md_topic1, md_topic2, md_topic3,
 md_topic4, md_topic5, md_topic6) = md_topic_scores

In [None]:
# Row labels for the district-per-month dataframe: months vary slowest
# (06 to 11) and the districts 01 to 10 repeat within every month
_month_codes = ["06", "07", "08", "09", "10", "11"]
_district_codes = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10"]

district_list = _district_codes * len(_month_codes)
month_list = [month for month in _month_codes for _ in _district_codes]

In [None]:
# Assemble the district-per-month topic scores into one dataframe:
# "district" and "month" label columns followed by one column per topic
md_dictionary = dict(
    district=district_list,
    month=month_list,
    topic0=md_topic0,
    topic1=md_topic1,
    topic2=md_topic2,
    topic3=md_topic3,
    topic4=md_topic4,
    topic5=md_topic5,
    topic6=md_topic6,
)
month_districts = pd.DataFrame(data=md_dictionary)

In [None]:
# Display districts per month dataframe
# (district and month labels plus the columns topic0 to topic6)
month_districts

## Save Results to a CSV File

In [None]:
# Write the district-per-month topic scores to a UTF-8 encoded CSV file in ../outputs
month_districts.to_csv("../outputs/month_districts.csv", encoding="utf-8")

In [None]:
# Verify results: position 10 is the first entry of the second block of ten,
# which district_list / month_list label as month "07", district "01"
topics_district_per_month[10] # month 7, district 1

In [None]:
# Verify results: show the tokenized document behind the topic vector at
# position 10, to compare the tokens with the inferred topics
district_per_month_pooling_docs[10] # month 7, district 1