In [None]:
from helper_functions import *

In [None]:
# Load the precomputed clustering results with load_bin (provided by
# helper_functions; presumably it unpickles the file -- TODO confirm).
# Later cells index the result by these keys: "centers", "words", "svo",
# "ngrams", "tweets", "sentiment", "sizes", "sns" and "urls".
cluster_data = load_bin("clustering_data.pkl")

In [None]:
def get_sent(text):
    """Return the summed TextBlob polarity over every sentence in ``text``.

    The text is wrapped in a TextBlob, the polarity of each of its sentences
    is collected, and the values are added together with ``np.sum``.
    """
    polarities = [part.sentiment.polarity for part in TextBlob(text).sentences]
    return np.sum(polarities)

In [None]:
# Attempt to label each center
# Also, calculate sentiment of the derived label
# and compare it to sentiment derived from all tweets in the cluster
# This is then used to calculate positive, negative, and toxic categories
num_grams = 20
num_words = 10

node_label = {}
node_sent = {}
toxic = 0
negative = 0
positive = 0
for index in range(len(cluster_data["centers"])):
    # Get the important non-stop words from tweets, with their frequencies.
    # One pass is sufficient: repeating it can never add new words, so
    # looping "until there are num_words of them" would spin forever on a
    # cluster that has fewer than num_words non-stop words.
    words = Counter()
    for x, c in cluster_data["words"][index].most_common():
        if x not in stopwords:
            words[x] = c
    top_words = words.most_common(num_words)
    # Get a list of the most common ngrams and svo triplets
    # reformat svo triplets to be same as ngrams
    # (drop the first/last character and the ", " separators)
    summaries = Counter()
    for x, c in cluster_data["svo"][index].most_common(num_grams):
        x = " ".join(x[1:-1].split(", "))
        summaries[x] += c
    for x, c in cluster_data["ngrams"][index].most_common(num_grams):
        summaries[x] += c
    # The summaries are not modified below, so rank them only once
    top_summaries = [x for x, c in summaries.most_common(num_grams)]
    # If a word is found in a summary, assign it to potential labels counter
    # with count equal to the frequency of the matched word
    labels = Counter()
    for word, count in top_words:
        for x in top_summaries:
            if word in x:
                labels[x] += count
    # Create bigrams of commonly seen words
    # If a bigram is seen in a summary, assign it to potential labels counter
    # with count equal to the value of the word in the bigram with the highest frequency
    for comb in combinations([w for w, c in top_words], 2):
        fc = max(words[w] for w in comb)
        ws = " ".join(comb)
        for x in top_summaries:
            if ws in x:
                labels[x] += fc
    # The top item found in labels is the summary
    # Note how we add the node index to the label
    # This is because some labels are identical
    # and thus cause the gephi creation step in a future cell
    # to miss nodes
    # The prefix is kept even when no summary matched, so that clusters
    # without a label do not all collapse onto the same empty string
    top_label = "[" + str(index) + "] "
    for x, c in labels.most_common(1):
        top_label += x
    # This is used to label the item in gephi or other visualizations
    node_label[index] = top_label
    # Get the sentiment score of the label
    sent = get_sent(top_label)
    # Get the average sentiment score of the tweets in this cluster
    # (an empty cluster has no average; treat it as 0.0 instead of dividing by zero)
    size = len(cluster_data["tweets"][index])
    tsent = cluster_data["sentiment"][index]/size if size else 0.0
    # Assign verdicts based on sentiment analysis
    # the values were hand-adjusted based on manual inspection
    # of tweets in each cluster
    # Every cluster must receive a node_sent entry (it is looked up for each
    # node when the graph is exported), so the final branch is a plain else:
    # a score of exactly 0.1 would otherwise fall through unclassified
    verd = ""
    if tsent < -0.1:
        node_sent[index] = 0
        verd = "TOXIC\t"
        toxic += size
    elif tsent < 0.1:
        node_sent[index] = 1
        verd = "NEGATIVE"
        negative += size
    else:
        node_sent[index] = 2
        verd = "POSITIVE"
        positive += size

    # Print the results
    print("(" + str(size) + ")\t[" + "%.2f"%sent + "]\t" + "%.2f"%tsent + "\t" + verd + "\t" + top_label )

In [None]:
# Report how many tweets fell into each sentiment category,
# both as an absolute count and as a percentage of all categorised tweets
tot = positive + negative + toxic
breakdown = []
for cat_name, cat_total in (("Positive", positive), ("Negative", negative), ("Toxic", toxic)):
    cat_share = cat_total / tot * 100
    breakdown.append(cat_name + ": " + str(cat_total) + " (" + "%.2f" % cat_share + "%)")
print(" ".join(breakdown))

In [None]:
# Examine a cluster defined by target variable
# This cell prints tweets that don't contain terms identified
# during the previous labeling operation
# This is useful for manually inspecting the cluster to determine
# whether the rest of the tweets are similar in topic or context
target = 7
target_label = node_label[target]
target_tweets = cluster_data["tweets"][target]
print("Cluster: " + target_label + " contains " + str(len(target_tweets)) + " tweets.")
# Remove the "[<index>] " prefix that the labeling step puts in front of the
# label text. Splitting on the first "] " works for an index of any width;
# a fixed 4-character slice leaves a stray "]" behind once the index has
# three or more digits, which then ends up in the search terms.
label_text = target_label.split("] ", 1)[-1]
terms = [word for word in label_text.split() if word not in stopwords]
print(terms)
print()
# List every tweet (with its count) that contains none of the label terms
found = 0
for x, c in target_tweets.most_common():
    if not any(t in x for t in terms):
        print("%.2f"%c + "\t" + x)
        found += 1
print("Found: " + str(found))

In [None]:
# Cluster the clusters!
# Grouping the cluster centers into communities lets us visualize the data
# in gephi; the same output was used to create the interactive demo at
# https://twitter-clustering.web.app/
centers = cluster_data["centers"]
center_clusters, center_mapping = make_text_clusters(centers, edge_ratio=20)
com_counts = [len(members) for members in center_clusters.values()]
print("Communities: " + str(len(center_clusters)) + ": " + str(com_counts))
all_nodes = set()
for members in center_clusters.values():
    all_nodes.update(members)
print("Nodes: " + str(len(all_nodes)))
print("Edges: " + str(len(center_mapping)))

# Build two views of every node:
#   center_node_attr -- keyed by label, attributes handed to write_gexf
#   nodes_json       -- keyed by "n<index>", the full record saved as JSON
nodes_json = {}
center_node_attr = {}
for mod, members in center_clusters.items():
    for n in members:
        summary_label = node_label[n]
        sentiment_class = node_sent[n]
        cluster_size = cluster_data["sizes"][n]
        center_node_attr[summary_label] = [mod, cluster_size, sentiment_class]
        nodes_json["n" + str(n)] = {
            "label": summary_label,
            "community": mod,
            "sentiment": cluster_data["sentiment"][n],
            "wfreq": cluster_data["words"][n],
            "words": get_wft(cluster_data["words"][n]),
            "ngrams": get_wft(cluster_data["ngrams"][n]),
            "svo": get_wft(cluster_data["svo"][n]),
            "tweeted": print_counter_summary(cluster_data["sns"][n]),
            "size": cluster_size,
            "tweets": [tweet for tweet, _count in cluster_data["tweets"][n].most_common(10)],
            "urls": [url for url, _count in cluster_data["urls"][n].most_common(10)],
        }

# Edge list keyed by node id ("n<index>")
with open("edges.csv", "w") as edge_file:
    edge_file.write("Sourceid,Targetid,Weight\n")
    edge_file.writelines(
        "n" + str(src) + ",n" + str(dst) + "," + str(weight) + "\n"
        for src, dst, weight in center_mapping
    )

# The same edges again, keyed by node label, for the gexf export
geph_mapping = [[node_label[src], node_label[dst], weight] for src, dst, weight in center_mapping]

save_json(nodes_json, "nodes.json")
write_gexf(geph_mapping, "center_mapping.gexf", center_node_attr, ["community", "size", "sentiment"])
print("Done")

In [None]:
# Summarise each community: aggregate the word, ngram and svo counts of all
# member clusters, then print the most frequent entries of each kind
cluster_words = {}
cluster_ngrams = {}
cluster_svo = {}
for mod, members in center_clusters.items():
    word_totals = Counter()
    ngram_totals = Counter()
    svo_totals = Counter()
    for n in members:
        # Stop words are left out of the word totals only
        for x, c in cluster_data["words"][n].items():
            if x not in stopwords:
                word_totals[x] += c
        ngram_totals.update(cluster_data["ngrams"][n])
        svo_totals.update(cluster_data["svo"][n])
    cluster_words[mod] = word_totals
    cluster_ngrams[mod] = ngram_totals
    cluster_svo[mod] = svo_totals

# One section per kind of summary: (heading, per-community counters, entries shown)
# Sections are separated by a blank line
sections = (
    ("Words", cluster_words, 10),
    ("ngrams", cluster_ngrams, 5),
    ("svo", cluster_svo, 5),
)
for position, (heading, per_community, top_k) in enumerate(sections):
    if position:
        print()
    print(heading)
    for mod, totals in per_community.items():
        listing = "".join(x + "(" + str(c) + ") " for x, c in totals.most_common(top_k))
        print("Community " + str(mod) + " " + listing)

In [None]:
# Attempt to find best label for each community
# This uses a similar method to one above, but with different parameters
# (note: this rebinds the notebook-level num_grams / num_words)
num_grams = 10
num_words = 10
for mod, svo in cluster_svo.items():
    words = cluster_words[mod]
    ngrams = cluster_ngrams[mod]
    top_words = words.most_common(num_words)
    # Candidate labels: the top svo triplets, reformatted to look like ngrams
    # (first/last character and ", " separators removed), followed by the
    # top ngrams. Built once per community instead of once per word.
    candidates = [" ".join(x[1:-1].split(", ")) for x, c in svo.most_common(num_grams)]
    candidates += [x for x, c in ngrams.most_common(num_grams)]
    # Score a candidate by the frequency of every top word it contains
    relevant_svo = Counter()
    for word, count in top_words:
        for x in candidates:
            if word in x:
                relevant_svo[x] += count
    # Additionally reward candidates containing a pair of top words side by
    # side, using the frequency of the more common word of the pair
    for comb in combinations([w for w, c in top_words], 2):
        fc = max(words[w] for w in comb)
        ws = " ".join(comb)
        for x in candidates:
            if ws in x:
                relevant_svo[x] += fc

    # The best-scoring candidate is the label. A community in which no
    # candidate contains any top word has nothing to pick from; report that
    # instead of failing on an empty result.
    best = relevant_svo.most_common(1)
    top_svo = best[0][0] if best else "(no label found)"

    print("Community " + str(mod) + ": " + top_svo)