In [26]:
import pandas as pd
import gensim
from sklearn.model_selection import train_test_split
import os
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from gensim.models.coherencemodel import CoherenceModel
import time

## Train

In [1]:
# Countries whose review corpora are processed; each name is the prefix of a
# "Data/Timestamp/<country>_<timestamp>.txt" input file.
COUNTRY = [
    "China",
    "France",
    "Germany",
    "Japan",
    "Korea",
    "Vietnam",
]

# Time buckets available per country; each label is the suffix of the same
# input file name.
TIMESTAMP = [
    "pre2018",
    "2018",
    "2019",
    "2020",
    "2021",
]

In [28]:
def train(country="China", timestamp="2021", test_size=0.2, min_topic=40, max_topic=100):
    """Sweep NMF topic counts for one corpus and log u_mass coherence per count.

    Reads ``Data/Timestamp/<country>_<timestamp>.txt`` (one document per line),
    tokenizes it with gensim, builds a dictionary, a bag-of-words corpus and a
    TF-IDF weighting of it. For every topic count in ``[min_topic, max_topic]``
    (inclusive) an NMF model is fitted on the TF-IDF corpus and one line
    ``<no_topics>,<u_mass>,<fit_seconds>`` is appended to
    ``Data/NMF/Result_timestamp/<country>_<timestamp>_result_umass.csv``.

    Parameters
    ----------
    country : str
        Country prefix of the input/output file names.
    timestamp : str
        Time-bucket suffix of the input/output file names.
    test_size : float
        Unused. Kept only so existing callers that pass it keep working.
    min_topic, max_topic : int
        Inclusive range of topic counts to evaluate. If ``min_topic`` is
        greater than ``max_topic`` no model is fitted and nothing is written.

    Returns
    -------
    None
    """
    # read data
    data_file = "Data/Timestamp/" + country + "_" + timestamp + ".txt"
    with open(data_file) as file:
        lines = file.readlines()
    print("Read data from path: " + data_file)

    # tokenize: one token list per input line (accents stripped by deacc=True)
    docs = [list(gensim.utils.tokenize(line, deacc=True)) for line in lines]

    # vocabulary / dictionary
    dictionary = gensim.corpora.Dictionary(docs)

    # BOW
    bow = [dictionary.doc2bow(doc) for doc in docs]

    # tfidf: fit the weighting on the BOW corpus, then apply it to that corpus
    tfidf_model = gensim.models.TfidfModel(bow)
    tfidf_corpus = tfidf_model[bow]

    # set up file for saving data
    directory = "Data/NMF/Result_timestamp/"
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(directory, exist_ok=True)
    file_name_umass = directory + country + "_" + timestamp + "_result_umass.csv"

    # The context manager guarantees the result file is closed even when a
    # model fit or coherence computation raises part-way through the sweep.
    with open(file_name_umass, "a") as file_write_umass:
        for no_topics in range(min_topic, max_topic + 1):

            # perf_counter is monotonic, so the elapsed time cannot be skewed
            # by system clock adjustments during a long fit
            start = time.perf_counter()
            nmf_model_tfidf = gensim.models.nmf.Nmf(tfidf_corpus,
                                                     num_topics=no_topics,
                                                     id2word=dictionary,
                                                     passes=2)
            end = time.perf_counter()

            # u_mass coherence is computed against the BOW corpus using the
            # top 20 words of every topic
            u_mass = CoherenceModel(model=nmf_model_tfidf, corpus=bow,
                                    dictionary=dictionary, coherence='u_mass', topn=20)
            u_mass_res = u_mass.get_coherence()

            # store data to file
            file_write_umass.write(str(no_topics) + "," + str(u_mass_res) + "," + str(end - start) + "\n")
            # flush per row so completed results survive a later crash
            file_write_umass.flush()

In [29]:
# Run the topic-count sweep for every (country, timestamp) corpus.
for country in COUNTRY:
    for timestamp in TIMESTAMP:
        train(
            country=country,
            timestamp=timestamp,
            test_size=0.2,
            min_topic=40,
            max_topic=100,
        )

Read data from path: Data/Timestamp/China_pre2018.txt
Read data from path: Data/Timestamp/China_2018.txt
Read data from path: Data/Timestamp/China_2019.txt
Read data from path: Data/Timestamp/China_2020.txt
Read data from path: Data/Timestamp/China_2021.txt
Read data from path: Data/Timestamp/France_pre2018.txt
Read data from path: Data/Timestamp/France_2018.txt
Read data from path: Data/Timestamp/France_2019.txt
Read data from path: Data/Timestamp/France_2020.txt
Read data from path: Data/Timestamp/France_2021.txt
Read data from path: Data/Timestamp/Germany_pre2018.txt
Read data from path: Data/Timestamp/Germany_2018.txt
Read data from path: Data/Timestamp/Germany_2019.txt
Read data from path: Data/Timestamp/Germany_2020.txt
Read data from path: Data/Timestamp/Germany_2021.txt
Read data from path: Data/Timestamp/Japan_pre2018.txt
Read data from path: Data/Timestamp/Japan_2018.txt
Read data from path: Data/Timestamp/Japan_2019.txt
Read data from path: Data/Timestamp/Japan_2020.txt
Read

In [63]:
def save_output(topic_data, country="China", timestamp="2021", test_size=0.2):
    """Fit one NMF model for a corpus and save a word-cloud image per topic.

    Reads ``Data/Timestamp/<country>_<timestamp>.txt`` (one document per line),
    tokenizes it, builds dictionary / bag-of-words / TF-IDF, and fits an NMF
    model whose number of topics is looked up in ``topic_data``. One image per
    topic is written to
    ``Data/NMF/Output_Timestamp/<country>_<timestamp>/<country>_<timestamp>_topic_<t>``.

    Parameters
    ----------
    topic_data : pandas.DataFrame
        Must have a ``"file"`` column and a ``"topic"`` column. The row whose
        ``"file"`` equals
        ``"Data/NMF/Result_timestamp/<country>_<timestamp>_result_umass.csv"``
        supplies the topic count (first match wins).
    country : str
        Country prefix of the input/output file names.
    timestamp : str
        Time-bucket suffix of the input/output file names.
    test_size : float
        Unused. Kept only so existing callers that pass it keep working; the
        train/test split it used to drive produced values that were never read.

    Raises
    ------
    IndexError
        If ``topic_data`` has no row for this country/timestamp.
    """
    # read data
    data_file = "Data/Timestamp/" + country + "_" + timestamp + ".txt"
    with open(data_file) as file:
        lines = file.readlines()
    print("Read data from path: " + data_file)

    # tokenize: one token list per input line (accents stripped by deacc=True)
    docs = [list(gensim.utils.tokenize(line, deacc=True)) for line in lines]

    # vocabulary / dictionary
    dictionary = gensim.corpora.Dictionary(docs)

    # BOW
    bow = [dictionary.doc2bow(doc) for doc in docs]

    # tfidf: fit the weighting on the BOW corpus, then apply it to that corpus
    tfidf_corpus = gensim.models.TfidfModel(bow)[bow]

    # number of topics: keyed by the path of the matching coherence result file
    file_id = "Data/NMF/Result_timestamp/" + country + "_" + timestamp + "_result_umass.csv"
    matches = topic_data.loc[topic_data["file"] == file_id, "topic"]
    if matches.empty:
        # Same exception type the bare .iloc[0] raised, but with a message
        # that says which key is missing.
        raise IndexError("no topic count found in topic_data for file: " + file_id)
    # cast the pandas/numpy scalar to a plain int before handing it to gensim
    no_topics = int(matches.iloc[0])

    nmf_model_tfidf = gensim.models.nmf.Nmf(tfidf_corpus,
                                             num_topics=no_topics,
                                             id2word=dictionary,
                                             passes=2)

    directory = "Data/NMF/Output_Timestamp/" + country + "_" + timestamp
    file_output = directory + "/" + country + "_" + timestamp
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(directory, exist_ok=True)

    # plot and save: a single figure is reused and cleared between topics
    plt.figure()
    try:
        for t in range(nmf_model_tfidf.num_topics):
            # word cloud built from up to 200 (word, weight) pairs of topic t
            word_weights = dict(nmf_model_tfidf.show_topic(t, 200))
            cloud = WordCloud(background_color='white', colormap='Oranges').fit_words(word_weights)
            plt.imshow(cloud)
            plt.axis("off")
            plt.title("Topic #" + str(t))
            plt.savefig(file_output + "_topic_" + str(t))
            plt.clf()
    finally:
        # release the figure even if rendering or saving fails
        plt.close()

In [64]:
# Load the chosen topic count per result file, then render the word clouds
# for every (country, timestamp) corpus.
topic_data = pd.read_csv(
    'NMF_topic_timestamp.txt',
    delimiter=',',
)
for country in COUNTRY:
    for timestamp in TIMESTAMP:
        save_output(
            topic_data=topic_data,
            country=country,
            timestamp=timestamp,
        )

Read data from path: Data/Timestamp/China_pre2018.txt
Read data from path: Data/Timestamp/China_2018.txt
Read data from path: Data/Timestamp/China_2019.txt
Read data from path: Data/Timestamp/China_2020.txt
Read data from path: Data/Timestamp/China_2021.txt
Read data from path: Data/Timestamp/France_pre2018.txt
Read data from path: Data/Timestamp/France_2018.txt
Read data from path: Data/Timestamp/France_2019.txt
Read data from path: Data/Timestamp/France_2020.txt
Read data from path: Data/Timestamp/France_2021.txt
Read data from path: Data/Timestamp/Germany_pre2018.txt
Read data from path: Data/Timestamp/Germany_2018.txt
Read data from path: Data/Timestamp/Germany_2019.txt
Read data from path: Data/Timestamp/Germany_2020.txt
Read data from path: Data/Timestamp/Germany_2021.txt
Read data from path: Data/Timestamp/Japan_pre2018.txt
Read data from path: Data/Timestamp/Japan_2018.txt
Read data from path: Data/Timestamp/Japan_2019.txt
Read data from path: Data/Timestamp/Japan_2020.txt
Read