This notebook contains some example analyses of the resulting cluster output data.

In [None]:
from helper_functions import *
import seaborn as sns
import matplotlib.pyplot as plt

# Configure seaborn in a single call: every sns.set() call resets the options
# it is not given back to their defaults, so setting the palette and the style
# in two separate calls would silently discard the palette.
sns.set(style="whitegrid", palette="husl")

In [None]:
# Load saved data
# `load_bin` comes from the star import of helper_functions; judging by the
# ".pkl" extension it presumably unpickles the file - TODO confirm. If so, only
# load files from a trusted source.
# `full` is indexed by key in the analysis cells below
# ("sizes", "words", "svo", "ngrams", "sentiment", "tweets").
full = load_bin("clustering_data.pkl")
print("Done")

In [None]:
# Show a histogram of cluster sizes
# Only clusters strictly larger than `max_size` get a bar (despite its name,
# `max_size` acts as a lower bound on cluster size in this cell).
counts = []
indices = []
labels = []
max_size = 1

# Sizes are visited largest first, so the bars that pass the filter occupy
# positions 0..k-1 and the loop can stop at the first size that fails it.
for i, x in enumerate(sorted(full["sizes"], reverse=True)):
    if x <= max_size:
        break
    indices.append(i)
    counts.append(x)
    # Label every tenth bar with its size to keep the axis readable. Labels are
    # collected only for plotted bars so their number matches the tick count.
    labels.append(x if i % 10 == 0 else "")
fig = plt.figure(figsize=(18,10))
ax = sns.barplot(y=counts, x=indices, palette="husl", ci=None)
ax.set_title("Cluster sizes (above "+str(max_size)+" in size)")
y = ax.set(xticklabels=labels)

In [None]:
# Show counts of clusters which have similar sizes
# Tally how many clusters share each exact size.
d = Counter(size for size in full["sizes"])

# [size, number_of_clusters] pairs in ascending size order.
ordered = [[size, d[size]] for size in sorted(d)]

# Keep only the sizes shared by more than one cluster; the size itself serves
# as both the bar position and its tick label.
repeated = [pair for pair in ordered if pair[1] > 1]
indices = [pair[0] for pair in repeated]
counts = [pair[1] for pair in repeated]
labels = list(indices)

fig = plt.figure(figsize=(18,10))
ax = sns.barplot(x=indices, y=counts, palette="husl", ci=None)
ax.set_title("Counts of clusters with similar sizes.")
y = ax.set(xticklabels=labels)
# Rotate the tick labels so that neighbouring sizes do not overlap.
y = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)

In [None]:
# Show words common to clusters whose size is at least max_size
max_size = 100

# Sum the per-word counts over every cluster that meets the size threshold.
word_counts = Counter()
for index, size in enumerate(full["sizes"]):
    if size >= max_size:
        for x, c in Counter(full["words"][index]).most_common():
            word_counts[x] += c

# Collect the most frequent non-stopwords as pie slices (the loop stops once
# the 21st word has been added); each slice label reads "word (count)".
counts = []
sizes = []
added = 0
for x, c in word_counts.most_common():
    if x not in stopwords:
        sizes.append(c)
        counts.append(str(x) + " (" + str(c) + ")")
        added += 1
        if added > 20:
            break

fig = plt.figure(figsize=(18,10))
ax = fig.add_axes((0,0,.5,1))

# Build the title from max_size so it always agrees with the >= filter above.
ax.set_title("Most common words in clusters of size " + str(max_size) + " or larger")
plt.pie(sizes, labels=counts, startangle=0)
plt.axis('equal')
plt.show()

In [None]:
# Show words common to clusters smaller than max_size
# NOTE: max_size is not assigned in this cell; it carries over from whichever
# cell set it last, so run the notebook top to bottom.

# Sum the per-word counts over every cluster below the size threshold.
word_counts = Counter()
for index, size in enumerate(full["sizes"]):
    if size < max_size:
        for x, c in Counter(full["words"][index]).most_common():
            word_counts[x] += c

# Collect the most frequent non-stopwords as pie slices (the loop stops once
# the 21st word has been added); each slice label reads "word (count)".
counts = []
sizes = []
added = 0
for x, c in word_counts.most_common():
    if x not in stopwords:
        sizes.append(c)
        counts.append(str(x) + " (" + str(c) + ")")
        added += 1
        if added > 20:
            break

fig = plt.figure(figsize=(18,8))
ax = fig.add_axes((0,0,.5,1))

# Build the title from max_size so it cannot go stale when the threshold changes.
ax.set_title("Most common words in clusters smaller than " + str(max_size))
plt.pie(sizes, labels=counts, startangle=0)
plt.axis('equal')
plt.show()

In [None]:
# Print common words and counts for larger clusters as pie chart
# One slice per cluster strictly larger than max_size; the slice label lists
# the cluster's most common non-stopwords.
max_size = 300
sizes = []
labels = []

for i, s in enumerate(full["sizes"]):
    if not s > max_size:
        continue
    # Up to 15 "word(count) " entries, with a line break after every fifth.
    shown = 0
    pieces = []
    for word, n in full["words"][i].most_common():
        if word in stopwords:
            continue
        shown += 1
        pieces.append(word + "(" + str(n) + ") ")
        if shown % 5 == 0:
            pieces.append("\n")
        if shown >= 15:
            break
    labels.append("".join(pieces))
    sizes.append(s)

fig = plt.figure(figsize=(20,10))
ax = fig.add_axes((0,0,.5,1))

plt.pie(sizes, labels=labels, startangle=20)
plt.axis('equal')
plt.show()

In [None]:
# Print details about larger clusters as a horizontal bar chart
# sthreshold is not used for the chart itself, but the positive/negative
# cluster cell further down reads it, so it stays defined here.
sthreshold = 0.1
max_size = 200

sizes = []
labels = []
for i, s in enumerate(full["sizes"]):
    if s > max_size:
        # Line 1: "[cluster index]" followed by the three most common SVO entries.
        svo = ""
        if len(full["svo"][i]) > 0:
            svo = " ".join([x for x, c in Counter(full["svo"][i]).most_common(3)])
        # Line 2: the five most common n-grams.
        ngrams = " / ".join([x for x, c in Counter(full["ngrams"][i]).most_common(5)])
        # Line 3: the ten most common non-stopwords.
        top_words = [x for x, c in full["words"][i].most_common() if x not in stopwords]
        labels.append("[" + str(i) + "] " + svo + "\n"
                      + ngrams + "\n"
                      + ", ".join(top_words[:10]) + "\n")
        sizes.append(s)

plot_data = {"labels": labels, "counts": sizes}

# Two inches of figure height per bar, since every bar carries a three-line
# label; the floor keeps the figure valid when no cluster passes the filter.
height = max(2, len(plot_data["counts"]) * 2)

# Hide all axis spines. NOTE: this changes the global seaborn style, so it also
# affects any figure drawn after this cell.
sns.set_style("white",  {'axes.spines.bottom': False,
                         'axes.spines.left': False,
                         'axes.spines.right': False,
                         'axes.spines.top': False})
fig = plt.figure(figsize=(10, height))
ax = sns.barplot(x="counts", y="labels", palette="husl", data=plot_data)
# Tick labels are blanked; the text annotations below replace them.
y = ax.set(yticklabels=[])
y = ax.set(xticklabels=[])
for i, v in enumerate(plot_data["counts"]):
    pad = min(25.0, v/100)
    # Cluster description just past the end of the bar.
    ax.text(v+pad, i+0.55, str(plot_data["labels"][i]))
    # Cluster size in bold near the start of the bar.
    ax.text(20, i, str(v), fontweight='bold')

In [None]:
# Print details about negative and positive clusters
def print_fc(cvar, num):
    """Print the `num` most common keys of `cvar`, each followed by " / ".

    cvar: an object with a Counter-style most_common(n) method.
    num:  how many entries to print.

    A line break is inserted before an entry once the current line has grown
    past 90 characters. Counts themselves are not printed.
    """
    pieces = []
    width = 0
    for key, _ in cvar.most_common(num):
        entry = str(key) + " / "
        if width > 90:
            # Current line is full: start a new one before adding this entry.
            pieces.append("\n")
            width = 0
        pieces.append(entry)
        width += len(entry)
    print("".join(pieces))

def _tally_cluster(svo_counts, ngram_counts, word_counts, sv, ng, wd):
    """Add one cluster's headline features to the given tallies.

    sv, ng and wd are the cluster's SVO entries, n-grams and words, each
    ordered from most to least common. The top SVO entry (if any), the first
    five n-grams and the first six non-stopwords are each counted once.
    """
    if len(sv) > 0:
        svo_counts[sv[0]] += 1
    for n in ng[:5]:
        ngram_counts[n] += 1
    cnt = 0
    for w in wd:
        if w not in stopwords:
            word_counts[w] += 1
            cnt += 1
            if cnt > 5:
                break


def _print_group(title, group_sizes, svo_counts, ngram_counts, word_counts, num):
    """Print the cluster/tweet totals of one group and its top `num` features."""
    print(title + " clusters: " + str(len(group_sizes)) + " - " + str(sum(group_sizes)) + " tweets.")
    print("svo")
    print_fc(svo_counts, num)
    print()
    print("ngram")
    print_fc(ngram_counts, num)
    print()
    print("word")
    print_fc(word_counts, num)


pos = []
pos_svo = Counter()
pos_ngram = Counter()
pos_word = Counter()
neg = []
neg_svo = Counter()
neg_ngram = Counter()
neg_word = Counter()
# NOTE: sthreshold is not assigned in this cell; it comes from the bar chart
# cell above, which must have been run first.
for i, s in enumerate(full["sizes"]):
    # Cluster sentiment divided by cluster size - presumably the mean
    # sentiment per tweet; TODO confirm against the clustering code.
    sent = full["sentiment"][i]/s
    sv = [x for x, c in full["svo"][i].most_common()]
    ng = [x for x, c in full["ngrams"][i].most_common()]
    wd = [x for x, c in full["words"][i].most_common()]
    # A cluster is "positive" only when strictly above the threshold;
    # everything else is reported as "negative".
    if sent > sthreshold:
        pos.append(s)
        _tally_cluster(pos_svo, pos_ngram, pos_word, sv, ng, wd)
    else:
        neg.append(s)
        _tally_cluster(neg_svo, neg_ngram, neg_word, sv, ng, wd)

itemc = 15
_print_group("Positive", pos, pos_svo, pos_ngram, pos_word, itemc)
print()
_print_group("Negative", neg, neg_svo, neg_ngram, neg_word, itemc)

In [None]:
# Find the clusters whose tweets contain any of the search terms and display
# details about the 5 clusters with the highest percentage of matching tweets
terms = ["liar", "criminal", "idiot", "fool", "ignorant", "delusional"]

# found[cluster index] = number of that cluster's tweet entries containing at
# least one term. Matching is a case-sensitive substring test, so "fool" also
# matches "foolish".
found = Counter()
for index in range(len(full["tweets"])):
    for x, c in full["tweets"][index].most_common():
        # Count each tweet at most once, however many terms it contains, so the
        # totals reported below really are numbers of tweets.
        if any(term in x for term in terms):
            found[index] += 1
print("Found " + str(len(found)) + " clusters contained the terms: \"" + ", ".join(terms) + "\".")

# Matching tweets per cluster, as an absolute number and as a percentage of
# the cluster size.
cluster_per = Counter()
cluster_matches = Counter()
for x, c in found.most_common():
    cluster_per[x] = (c/full["sizes"][x]) * 100
    cluster_matches[x] = c

targets = [x for x, c in cluster_per.most_common(5)]
print()
for t in targets:
    msg = "Cluster " + str(t) + " (size " + str(full["sizes"][t]) + ") contained "
    msg += str(cluster_matches[t]) + " tweets (" + "%.2f"%cluster_per[t] + "%) that included the terms: \""
    msg += ", ".join(terms) + "\"."
    print(msg)
    # The cluster's ten most common non-stopwords as "word(count) ".
    tm = ""
    tc = 0
    for x, c in full["words"][t].most_common():
        if x not in stopwords:
            tm += x + "(" + str(c) + ") "
            tc += 1
            if tc >= 10:
                break
    print(tm)
    print()
    # The ten highest-valued tweet entries of the cluster with their values.
    for x, c in full["tweets"][t].most_common(10):
        print("%.3f"%c + ": " + x)
    print()