In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
def separate_chunks(list_, n_per_chunk_):
    """Yield consecutive slices of ``list_`` holding up to ``n_per_chunk_`` items each.

    The last slice is shorter when ``len(list_)`` is not a multiple of
    ``n_per_chunk_``. The notebook uses this to regroup the flat token
    lists read from the text files produced in R.
    """
    chunk_starts = range(0, len(list_), n_per_chunk_)
    for begin in chunk_starts:
        end = begin + n_per_chunk_
        yield list_[begin:end]
        

def docs_as_dists(topic_list_):
    """Turn per-topic value lists into per-document vectors (a transpose).

    ``topic_list_`` holds one equally long sequence per topic; element ``i``
    of each sequence belongs to document ``i``. The result has one list per
    document containing that document's value for every topic, in topic order.
    Values are passed through unchanged (no type conversion).

    Prints the set of observed sequence lengths and the number of topics.
    Raises ``AssertionError`` unless all sequences share exactly one length.
    """
    lengths_seen = {len(per_topic) for per_topic in topic_list_}
    print(lengths_seen)
    assert len(lengths_seen) == 1
    topic_count = len(topic_list_)
    print("{} topics".format(topic_count))
    (doc_count,) = lengths_seen
    return [
        [per_topic[doc_idx] for per_topic in topic_list_]
        for doc_idx in range(doc_count)
    ]


def top_docs_per_topic(response_vecs_, n_):
    """Return, for every topic, the indices of the ``n_`` highest-valued documents.

    ``response_vecs_`` holds one equally long vector per document; position
    ``k`` of a vector is that document's value for topic ``k`` (converted
    with ``float``). The result has one list per topic with document indices
    ordered from highest to lowest value; ties keep ascending document order.

    Prints the number of topics. Raises ``AssertionError`` unless all vectors
    share exactly one length.
    """
    assert len({len(vec) for vec in response_vecs_}) == 1
    topic_count = len(response_vecs_[0])
    print("{} topics".format(topic_count))
    ranked_per_topic = []
    for topic_idx in range(topic_count):
        weights = [float(vec[topic_idx]) for vec in response_vecs_]
        # Stable sort on the weight alone, so equal weights stay in document order.
        doc_order = sorted(
            range(len(weights)),
            key=lambda doc_idx: weights[doc_idx],
            reverse=True,
        )
        ranked_per_topic.append(doc_order[:n_])
    return ranked_per_topic
        
        
def print_topics_with_top_responses(topic_list, top_docs, top_terms, responses_cleaned, num_responses=3,
                                    display_responses=None):
    """Display the words and documents most strongly associated with each topic.

    Parameters
    ----------
    topic_list : sequence
        Only its length is used: one block is printed per element.
    top_docs : sequence of sequences of int
        ``top_docs[i]`` lists document indices for topic ``i``, best first.
    top_terms : sequence of sequences of str
        ``top_terms[i]`` holds the terms printed (space-joined) for topic ``i``.
    responses_cleaned : sequence
        Accepted for backward compatibility; not read by this function.
    num_responses : int, default 3
        Maximum number of example documents printed per topic.
    display_responses : sequence, optional
        Texts to print, indexed by document index. When omitted, the
        notebook-level global ``responses_orig`` is used, which is what this
        function always printed before the parameter existed.
    """
    if display_responses is None:
        # Backward-compatible fallback to the notebook global; pass
        # ``display_responses`` explicitly to avoid depending on hidden state.
        display_responses = responses_orig
    for topic_idx in range(len(topic_list)):
        indices = top_docs[topic_idx]
        topic_no = topic_idx + 1
        print("Topic {}".format(topic_no))
        print("-------\n")
        print(" ".join(top_terms[topic_idx]) + "\n")
        for rank, doc_idx in enumerate(indices[:num_responses], start=1):
            print("Ex. {}.{}".format(topic_no, rank))
            print(display_responses[doc_idx])
            print("\n")
        print()

## Load responses that were preprocessed in Python and used to train topic models in R

In [None]:
# Read the preprocessed responses and keep the cleaned and original texts
# as two parallel arrays (same row order, same length).
df = pd.read_csv("ngrams_df.csv")
responses_cleaned, responses_orig = (
    df[column].values for column in ("cleaned", "original")
)
assert len(responses_orig) == len(responses_cleaned)
len(responses_cleaned)

In [None]:
# Store each response's positional index (0..n-1) as a regular column.
df["index"] = list(range(len(responses_cleaned)))

In [None]:
# Word count of every cleaned response (whitespace-separated tokens).
response_lengths = [len(text.split()) for text in responses_cleaned]

mean_words = np.mean(response_lengths)
median_words = np.median(response_lengths)
print("Mean no. words in responses: {:.1f}".format(mean_words))
print("Median no. words in responses: {:.1f}".format(median_words))

In [None]:
# Distribution of response lengths (words per cleaned response), 100 bins.
plt.hist(response_lengths, bins=100)
plt.xlabel("Words per cleaned response")
plt.ylabel("Number of responses")
plt.title("Response lengths (100 bins)")
plt.show()

In [None]:
# Same word-count data as `response_lengths`, recomputed here so the cell
# stands alone; this view only differs by using coarser bins (50).
wc_lengths = [len(response.split()) for response in responses_cleaned]
plt.hist(wc_lengths, bins=50)
plt.xlabel("Words per cleaned response")
plt.ylabel("Number of responses")
plt.title("Response lengths (50 bins)")
plt.show()

# 1 Topic model with k=15 (LDA)

In [None]:
# Flat list of top terms, one per line; regrouped per topic in a later cell.
# `with` closes the file handle, which the bare open().read() did not.
with open("most_likely_terms15.txt", "r") as terms_file:
    top_terms15 = terms_file.read().strip().split("\n")

# Whitespace-separated values, kept as strings (converted where needed).
f = "topic_probs15.txt"
with open(f, "r") as probs_file:
    doc15 = probs_file.read().split()

# Each chunk of 988 values is treated as one topic by docs_as_dists.
# NOTE(review): 988 is presumably the number of documents - confirm it
# equals len(responses_cleaned) before reusing this with other data.
topic_list15 = list(separate_chunks(doc15, 988))
response_vecs15 = docs_as_dists(topic_list15)
top_docs15 = top_docs_per_topic(response_vecs15, 5)

### 1.1 Compare results to results from R

In [None]:
# Compare Python results to the top topic for the first n documents according to R

comparison15 = "" # compare to first n results in most_likely_topic_in_doc_15 in R
comparison15 = [int(i) for i in comparison15.split()]

for doc_idx, r_topic in enumerate(comparison15):
    # The vectors hold the raw string tokens read from the file; convert to
    # float so argmax compares numbers instead of comparing text
    # lexicographically (which misorders values such as "9e-05" vs "0.5").
    probs = np.asarray(response_vecs15[doc_idx], dtype=float)
    # +1 because R topic numbers are 1-based.
    print(np.argmax(probs) + 1, r_topic)

### 1.2 Top 10 terms for each topic in the model with k=15

In [None]:
# Regroup the flat term list into one list of 10 terms per topic.
# Guarded so that re-running this cell does not chunk the already-chunked
# list again (which would make the join below fail).
if top_terms15 and isinstance(top_terms15[0], str):
    top_terms15 = list(separate_chunks(top_terms15, 10))

for topic_no, terms in enumerate(top_terms15, start=1):
    print("Topic {}: {}".format(topic_no, " ".join(terms)))

In [None]:
# Sizes of the per-topic structures, followed by the number of responses.
for collection in (topic_list15, top_docs15, top_terms15, responses_cleaned):
    print(len(collection))

### 1.3 Representative documents for each topic

In [None]:
# Print each topic's top terms followed by its 3 highest-ranked responses.
print_topics_with_top_responses(
    topic_list=topic_list15,
    top_docs=top_docs15,
    top_terms=top_terms15,
    responses_cleaned=responses_cleaned,
    num_responses=3,
)

# 2 Topic model with k=25 (LDA)

In [None]:
# Flat list of top terms, one per line; regrouped per topic in a later cell.
# `with` closes the file handle, which the bare open().read() did not.
with open("most_likely_terms25.txt", "r") as terms_file:
    top_terms25 = terms_file.read().strip().split("\n")

# Whitespace-separated values, kept as strings (converted where needed).
f = "topic_probs25.txt"
with open(f, "r") as probs_file:
    doc25 = probs_file.read().split()

# Each chunk of 988 values is treated as one topic by docs_as_dists.
# NOTE(review): 988 is presumably the number of documents - confirm it
# equals len(responses_cleaned) before reusing this with other data.
topic_list25 = list(separate_chunks(doc25, 988))
response_vecs25 = docs_as_dists(topic_list25)
top_docs25 = top_docs_per_topic(response_vecs25, 5)

### 2.1 Compare results to results from R

In [None]:
# Compare Python results to the top topic for the first n documents according to R

comparison25 = "" # compare to first n results in most_likely_topic_in_doc_25 in R
comparison25 = [int(i) for i in comparison25.split()]

for doc_idx, r_topic in enumerate(comparison25):
    # The vectors hold the raw string tokens read from the file; convert to
    # float so argmax compares numbers instead of comparing text
    # lexicographically (which misorders values such as "9e-05" vs "0.5").
    probs = np.asarray(response_vecs25[doc_idx], dtype=float)
    # +1 because R topic numbers are 1-based.
    print(np.argmax(probs) + 1, r_topic)

### 2.2 Top 10 terms for each topic in the model with k=25

In [None]:
# Regroup the flat term list into one list of 10 terms per topic.
# Guarded so that re-running this cell does not chunk the already-chunked
# list again (which would make the join below fail).
if top_terms25 and isinstance(top_terms25[0], str):
    top_terms25 = list(separate_chunks(top_terms25, 10))

for topic_no, terms in enumerate(top_terms25, start=1):
    print("Topic {}: {}".format(topic_no, " ".join(terms)))

In [None]:
# Sizes of the per-topic structures, followed by the number of responses.
for collection in (topic_list25, top_docs25, top_terms25, responses_cleaned):
    print(len(collection))

### 2.3 Representative documents for each topic

In [None]:
# Print each topic's top terms followed by its 3 highest-ranked responses.
print_topics_with_top_responses(
    topic_list=topic_list25,
    top_docs=top_docs25,
    top_terms=top_terms25,
    responses_cleaned=responses_cleaned,
    num_responses=3,
)

# 3 Topic model with k=35 (LDA)

In [None]:
# Flat list of top terms, one per line; regrouped per topic in a later cell.
# `with` closes the file handle, which the bare open().read() did not.
with open("most_likely_terms35.txt", "r") as terms_file:
    top_terms35 = terms_file.read().strip().split("\n")

# Whitespace-separated values, kept as strings (converted where needed).
f = "topic_probs35.txt"
with open(f, "r") as probs_file:
    doc35 = probs_file.read().split()

# Each chunk of 988 values is treated as one topic by docs_as_dists.
# NOTE(review): 988 is presumably the number of documents - confirm it
# equals len(responses_cleaned) before reusing this with other data.
topic_list35 = list(separate_chunks(doc35, 988))
response_vecs35 = docs_as_dists(topic_list35)
top_docs35 = top_docs_per_topic(response_vecs35, 5)

### 3.1 Compare results to results from R

In [None]:
# Compare Python results to the top topic for the first n documents according to R

comparison35 = "" # compare to first n results in most_likely_topic_in_doc_35 in R
comparison35 = [int(i) for i in comparison35.split()]

for doc_idx, r_topic in enumerate(comparison35):
    # The vectors hold the raw string tokens read from the file; convert to
    # float so argmax compares numbers instead of comparing text
    # lexicographically (which misorders values such as "9e-05" vs "0.5").
    probs = np.asarray(response_vecs35[doc_idx], dtype=float)
    # +1 because R topic numbers are 1-based.
    print(np.argmax(probs) + 1, r_topic)

### 3.2 Top 10 terms for each topic in the model with k=35

In [None]:
# Regroup the flat term list into one list of 10 terms per topic.
# Guarded so that re-running this cell does not chunk the already-chunked
# list again (which would make the join below fail).
if top_terms35 and isinstance(top_terms35[0], str):
    top_terms35 = list(separate_chunks(top_terms35, 10))

for topic_no, terms in enumerate(top_terms35, start=1):
    print("Topic {}: {}".format(topic_no, " ".join(terms)))

In [None]:
# Sizes of the per-topic structures, followed by the number of responses.
for collection in (topic_list35, top_docs35, top_terms35, responses_cleaned):
    print(len(collection))

### 3.3 Representative documents for each topic

In [None]:
# Print each topic's top terms followed by its 3 highest-ranked responses.
print_topics_with_top_responses(
    topic_list=topic_list35,
    top_docs=top_docs35,
    top_terms=top_terms35,
    responses_cleaned=responses_cleaned,
    num_responses=3,
)