In [None]:
from helper_functions import *

# n-gram length handed to get_ngram_frequencies() in the clustering cell
num_grams = 3

In [None]:
# Load saved data from previous step.
# NOTE(review): load_bin presumably unpickles its file - only point it at
# files produced by your own preprocessing run.
print("Loading combined vectors")
combined_vecs = load_bin("preprocessed/combined_vecs.pkl")
print(len(combined_vecs))
print("Loading texts")
base_texts = load_json("preprocessed/texts.json")
print(len(base_texts))
print("Loading sns")
screen_names = load_json("preprocessed/sns.json")
print(len(screen_names))
print("Loading ids")
id_strs = load_json("preprocessed/ids.json")
print(len(id_strs))

# The clustering cell looks up all four collections with the same sample
# index, so a length mismatch means the preprocessed files are out of sync.
# Fail here with a clear message instead of mislabelling tweets later on.
loaded_sizes = {
    "combined_vecs": len(combined_vecs),
    "texts": len(base_texts),
    "sns": len(screen_names),
    "ids": len(id_strs),
}
if len(set(loaded_sizes.values())) != 1:
    raise ValueError("Preprocessed inputs are misaligned: " + str(loaded_sizes))
print("Done")

In [None]:
# This cell performs the clustering

# Tunable knobs for the clustering algorithm
# (the blog post explains what each of them does)
batch_size = 10000
total_samples = 10000
edge_ratio = 3
min_cluster_size = 50
merge_similarity = 0.98
similarity_threshold = 0

# One-line banner echoing the settings used for this run
msg = " ".join(
    tag + ":" + str(setting)
    for tag, setting in (
        ("TARGET", total_samples),
        ("BS", batch_size),
        ("ER", edge_ratio),
        ("STH", similarity_threshold),
        ("MSIM", merge_similarity),
        ("MCS", min_cluster_size),
    )
)
print(msg)

# State information is saved in these variables
# and used in later cells to generate output.
# Every center_* list is indexed by cluster id.
centers = []
center_labels = []
center_words = []
center_ngrams = []
center_svo = []
center_sentiment = []
center_sns = []
center_sizes = []
center_urls = []
center_tweets = []

# [vector, cluster id] pairs for every clustered sample
vec_label = []
# sample index -> [cluster id], consumed by the gephi export
item_mod = {}

# Sample indices of the current batch that are not assigned to a cluster yet
ncindices = []
# [sample index, sample index, c] edges collected for the gephi export
final_mapping = []


# Sample indices already drawn by random sampling
used = set()
mod_num = 0
passes = 0
merges = 0
total_processed = 0
finished = False

# If this is true, samples will be randomly selected from the whole set
# If this is false, samples will be selected sequentially starting from current_index
get_random = True

# If this is true, the start index for sequential sampling will be randomly selected
start_random = True
current_index = 0
if start_random:
    # The latest start that still leaves total_samples items is
    # len(base_texts) - total_samples. The previous "+1" combined with
    # randint (inclusive upper bound, assuming `random` is the stdlib module
    # supplied by the star import) could start one position too late.
    end_ind = max(0, len(base_texts) - total_samples)
    if end_ind > 0:
        current_index = random.randint(0, end_ind)

if get_random:
    print("Using random sampling.")
else:
    print("Using sequential sampling, starting at index: " + str(current_index))

def _store_counts(store, counts, index, is_new):
    """Record one cluster's counts in a per-cluster list.

    store  -- list holding one counter per cluster
    counts -- counter produced for the current batch of the cluster
    index  -- cluster id the counts belong to
    is_new -- True appends `counts` as a new entry, False adds every count
              into the existing entry store[index]
    """
    if is_new:
        store.append(counts)
    else:
        for key, value in counts.most_common():
            store[index][key] += value


# Main clustering loop. Each pass:
#   1. tops the batch of unclustered sample indices up to batch_size,
#   2. clusters the batch with make_text_clusters,
#   3. keeps clusters of at least min_cluster_size, merging a cluster into an
#      existing one when their centers score >= merge_similarity,
#   4. carries every other sample over to the next pass.
# The loop stops once total_samples samples have been clustered or the
# input data is exhausted.
while not finished:
    if total_processed < total_samples:
        num_to_add = batch_size - len(ncindices)

        # If this got stuck and didn't create any new clusters, truncate the list and add new stuff
        if num_to_add == 0 and passes > 0:
            # How many samples are still unseen depends on the sampling mode;
            # current_index only advances in sequential mode.
            if get_random:
                samples_left = len(base_texts) - len(used)
            else:
                samples_left = len(base_texts) - current_index
            if samples_left <= 0:
                # Nothing left to swap in: make this the last pass rather
                # than re-clustering the same batch forever.
                finished = True
            num_to_remove = max(0, min(samples_left, round(batch_size/10)))
            ncindices = ncindices[num_to_remove:]
            num_to_add = num_to_remove

        # Add new samples to batch
        count = 0
        if get_random:
            while count < num_to_add:
                # Every index has been drawn already; without this check the
                # rejection sampling below would never terminate.
                if len(used) >= len(base_texts):
                    finished = True
                    break
                rindex = random.randint(0, len(base_texts)-1)
                if rindex not in used:
                    used.add(rindex)
                    ncindices.append(rindex)
                    count += 1
        else:
            while count < num_to_add:
                # Check the bound before appending so an index equal to
                # len(base_texts) never enters the batch.
                if current_index >= len(base_texts):
                    finished = True
                    break
                ncindices.append(current_index)
                current_index += 1
                count += 1

    # Nothing to cluster (all data consumed and nothing carried over)
    if not ncindices:
        break

    ncvectors = [combined_vecs[i] for i in ncindices]

    # clusters: cluster key -> list of positions within this batch
    # mapping:  (x, y, c) triples over positions within this batch
    clusters, mapping = make_text_clusters(ncvectors,
                                           edge_ratio=edge_ratio,
                                           threshold=similarity_threshold)

    # Build mapping for gephi visualization
    # Only clusters that are discarded below (smaller than min_cluster_size)
    # are left out, so the graph matches the clusters that are kept.
    nodes_to_omit = set()
    for mod, idl in clusters.items():
        if len(idl) < min_cluster_size:
            nodes_to_omit.update(idl)
    for x, y, c in mapping:
        if x not in nodes_to_omit and y not in nodes_to_omit:
            final_mapping.append([ncindices[x], ncindices[y], c])

    # Renumber clusters to include actual data indices
    clustered = set()
    rclusters = {}
    for mod, idl in clusters.items():
        rclusters[mod] = [ncindices[i] for i in idl]
        clustered.update(rclusters[mod])

    # Create a list of unclustered samples
    not_clustered = set(ncindices).difference(clustered)
    new_ncindices = list(not_clustered)

    # Iterate through identified clusters
    for mod, idl in sorted(rclusters.items()):
        # Check if cluster matches min_cluster_size
        if len(idl) >= min_cluster_size:
            texts = [base_texts[index] for index in idl]
            sns = [screen_names[index] for index in idl]
            ids = [id_strs[index] for index in idl]
            vectors = [combined_vecs[index] for index in idl]
            center = get_cluster_center(vectors)

            # Check if this cluster has a similar center to any other already found.
            # This compares as soon as one center exists, so the very first
            # cluster is a merge candidate too.
            cluster_mod = None
            new_cluster = True
            if len(centers) > 0:
                scores = fast_cosine_matrix(np.array(center), np.array(centers))
                if np.max(scores) >= merge_similarity:
                    # Plain int so the ids stored below are ordinary Python ints
                    cluster_mod = int(np.argmax(scores))
                    new_cluster = False
                    merges += 1
            if new_cluster:
                cluster_mod = mod_num
                mod_num += 1

            # This is used for gephi visualization
            for item in idl:
                item_mod[item] = [cluster_mod]

            # Measure the distance of items to the cluster center
            indices, rtweets, rurls = get_cluster_relevance(texts, vectors, sns, ids)

            # Save or update cluster size
            center_size = len(idl)
            if new_cluster:
                center_sizes.append(center_size)
            else:
                center_sizes[cluster_mod] += center_size

            # Get combined sentiment from texts in this cluster
            # NOTE(review): on a merge the batch sentiment is added to the
            # stored value; confirm get_sentiment returns a sum, not a mean.
            sentiment = get_sentiment(texts)
            if new_cluster:
                center_sentiment.append(sentiment)
            else:
                center_sentiment[cluster_mod] += sentiment

            # Get word frequencies from texts in this cluster
            wfreq = get_word_frequencies(texts)
            _store_counts(center_words, wfreq, cluster_mod, new_cluster)

            # Get subject, verb, object triples
            svo_triples = get_subject_verb_object_triples(texts)
            _store_counts(center_svo, svo_triples, cluster_mod, new_cluster)

            # Get common ngrams
            ngrams = get_ngram_frequencies(texts, num_grams)
            _store_counts(center_ngrams, ngrams, cluster_mod, new_cluster)

            # Get label text (to be shown on graphviz) and store the center;
            # a merged cluster keeps its existing label and center
            if new_cluster:
                center_label = get_label_text(texts, vectors)
                center_labels.append(center_label)
                centers.append(center)

            # Add vectors to the list
            for v in vectors:
                vec_label.append([v, cluster_mod])

            # Add or update sn list
            snc = Counter(sns)
            _store_counts(center_sns, snc, cluster_mod, new_cluster)

            # Update or add tweets by relevance
            # (values are overwritten, not summed, when merging)
            if new_cluster:
                center_tweets.append(rtweets)
            else:
                for x, c in rtweets.most_common():
                    center_tweets[cluster_mod][x] = c

            # Update or add tweet urls by relevance
            # (values are overwritten, not summed, when merging)
            if new_cluster:
                center_urls.append(rurls)
            else:
                for x, c in rurls.most_common():
                    center_urls[cluster_mod][x] = c
        else:
            # All stuff that was thrown away is added to the unclustered list
            new_ncindices.extend(idl)

    # Print some status output
    total_processed += (len(ncindices) - len(new_ncindices))
    if len(center_sizes) > 0:
        mean_size = np.mean(center_sizes)
        smallest = min(center_sizes)
        largest = max(center_sizes)
        msg = "Pass: " + str(passes)
        if not get_random:
            msg += "  I:" + str(current_index)
        msg += "  N:" + str(total_processed)
        msg += "  C:" + str(mod_num)
        msg += " Mean : " + "%.2f"%mean_size
        msg += " Min: " + str(smallest)
        msg += " Max: " + str(largest)
        msg += " Merges: " + str(merges)
        print(msg)
    ncindices = list(new_ncindices)
    passes += 1
    # ">=" mirrors the "<" test at the top of the loop: once the target is
    # reached no new samples are added, so the loop has to stop here.
    if total_processed >= total_samples:
        finished = True
print("Done")

In [None]:
# This cell writes a readable text file containing a summary of the clustering process.
# For every cluster it lists size, sentiment, top words / svo triples / ngrams,
# the accounts that tweeted, the first 20 tweets in most_common() order and,
# for larger clusters, the last 5 as well.
print("Writing summary. You can read it by opening tweet_graph_analysis_output.txt")
# Tweets contain arbitrary Unicode (emoji etc.); an explicit encoding keeps the
# write from depending on the platform's default codec.
with open("tweet_graph_analysis_output.txt", "w", encoding="utf-8") as f:
    for index in range(len(centers)):
        ngrams = center_ngrams[index]
        wfreq = center_words[index]
        svo = center_svo[index]
        sentiment = center_sentiment[index]
        tweeted = center_sns[index]
        size = center_sizes[index]
        tweets = center_tweets[index]
        f.write("\n")
        f.write("Cluster: " + str(index) + " contains: " + str(size) + " tweets.\n")
        f.write("Sentiment: " + "%.2f"%sentiment+"\n")
        wft = get_wft(wfreq)
        f.write("Words: " + wft + "\n")
        svoft = get_wft(svo)
        f.write("svo: "+svoft+"\n")
        nft = get_wft(ngrams)
        f.write("ngrams: " + nft+"\n")
        snt = print_counter_summary(tweeted)
        f.write("tweeted: " + snt +"\n")
        f.write("==================\n")
        # [score, text] pairs in most_common() order (highest value first)
        ranked = [[score, text] for text, score in tweets.most_common()]
        for score, text in ranked[:20]:
            f.write("%.3f"%score + " " + text +"\n")
        if len(ranked) > 20:
            f.write("...\n")
            # Start the tail no earlier than position 20 so clusters with
            # 21-24 tweets do not repeat lines already written above.
            for score, text in ranked[max(20, len(ranked) - 5):]:
                f.write("%.3f"%score + " " + text +"\n")
        f.write("\n")
print("Done")

In [None]:
# This writes out a file that can be read by gephi.
# final_mapping holds the [sample index, sample index, c] edges collected by the
# clustering cell; item_mod maps each clustered sample index to [cluster id].
# NOTE(review): ["community"] presumably names the attribute column(s) that
# write_gexf fills from the item_mod values - confirm in helper_functions.
# WARNING! Consider commenting this out if you're planning on clustering a huge amount of tweets!
write_gexf(final_mapping, "tweet_mapping.gexf", item_mod, ["community"])

In [None]:
# Bundle the per-cluster results into one dict and persist it,
# so later analysis can run without repeating the clustering
full = {
    "ngrams": center_ngrams,
    "words": center_words,
    "svo": center_svo,
    "sentiment": center_sentiment,
    "sns": center_sns,
    "sizes": center_sizes,
    "tweets": center_tweets,
    "urls": center_urls,
    "centers": centers,
}
save_bin(full, "clustering_data.pkl")
print("Done")

In [None]:
# This cell finds the clusters whose tweets contain any of the search terms
# and displays details about the top 5 clusters, ranked by the percentage of
# matching tweets relative to cluster size.
terms = ["liar", "criminal", "idiot", "fool", "ignorant", "delusional"]
found = Counter()
for index in range(len(full["tweets"])):
    for x, c in full["tweets"][index].most_common():
        # Count each tweet once, however many of the terms it contains;
        # counting per term inflated the totals and could push the
        # percentage below past 100%.
        if any(term in x for term in terms):
            found[index] += 1
print("Found " + str(len(found)) + " clusters contained the terms: \"" + ", ".join(terms) + "\".")

# Per cluster: percentage of matching tweets and the raw match count
cluster_per = Counter()
cluster_matches = Counter()
for x, c in found.most_common():
    size = full["sizes"][x]
    matches = c
    per = (matches/size) * 100
    cluster_per[x] = per
    cluster_matches[x] = matches

targets = [x for x, c in cluster_per.most_common(5)]
print()
for t in targets:
    msg = "Cluster " + str(t) + " (size " + str(full["sizes"][t]) + ") contained "
    msg += str(cluster_matches[t]) + " tweets (" + "%.2f"%cluster_per[t] + "%) that included the terms: \""
    msg += ", ".join(terms) + "\"."
    print(msg)
    # Ten most frequent words of the cluster that are not stopwords
    tm = ""
    tc = 0
    for x, c in full["words"][t].most_common():
        if x not in stopwords:
            tm += x + "(" + str(c) + ") "
            tc += 1
        if tc >= 10:
            break
    print(tm)
    print()
    # First ten tweets of the cluster in most_common() order
    for x, c in full["tweets"][t].most_common(10):
        print("%.3f"%c + ": " + x)
    print()