<a href="https://colab.research.google.com/github/slz4025/twitter_latent_scams/blob/master/cluster_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab only); the data paths configured
# below point inside this mount.
from google.colab import drive
drive.mount('/content/drive')

## Data

In [3]:
# Input location: directory (with trailing slash) and file name are concatenated
# to form the path that is read. The trailing `#@param` markers are Colab
# form-field annotations, so they must stay on the same line as the assignment.
analyzed_dir = "/content/drive/My Drive/latent_scams/analyzed/"#@param {type: "string"}
file_name = "scam_scored.tsv"#@param {type: "string"}
# Name of the column in the input table that holds each document's text.
body_key = "corpus_filtered"

In [4]:
import pandas as pd
import numpy as np

In [5]:
# Read the tab-separated input table from the configured location.
corpus_path = analyzed_dir + file_name
used_pd = pd.read_csv(corpus_path, sep="\t")

# NaN is the only value that is not equal to itself, so this swaps every
# missing body for an empty string and leaves all other entries untouched.
docs = [d if d == d else "" for d in used_pd[body_key]]

# Bag-of-words counts; a token is any maximal run of non-whitespace characters.
# https://topsbm.readthedocs.io/en/latest/examples/example.html#Setup:-Load-a-corpus
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(token_pattern=r'\S+')
X = vec.fit_transform(docs)

## Perform

In [None]:
!echo "deb http://downloads.skewed.de/apt bionic main" >> /etc/apt/sources.list
!apt-key adv --keyserver keys.openpgp.org --recv-key 612DEFB798507F25
!apt-get update
!apt-get install python3-graph-tool python3-cairo python3-matplotlib
# TopSBM: https://topsbm.github.io/
!pip install topsbm

In [7]:
# Fit the TopSBM model on the document-term count matrix X built above.
# random_state is pinned so repeated runs start from the same seed.
from topsbm import TopSBM
model = TopSBM(random_state=9)
# Xt holds whatever fit_transform returns; it is not referenced again in the
# visible cells (the analysis below reads model.groups_ instead).
Xt = model.fit_transform(X)

## Analyze

In [None]:
"""
document-nodes are on the left
word-nodes are on the right
different colors correspond to the different groups
-- https://topsbm.readthedocs.io/en/latest/examples/example.html#Setup:-Load-a-corpus
"""

model.plot_graph(n_edges=1000)

In [None]:
# Index into model.groups_ selecting which level of the fitted model to inspect.
# NOTE(review): level 0 is presumably the finest level of the hierarchy — confirm.
granularity = 0
# M is the per-level result mapping; the cells below read M['p_w_tw'] and M['p_td_d'].
M = model.groups_[granularity]
# Display the available keys plus the "Bd" and "Bw" entries
# (presumably the number of document groups and word groups — confirm).
M.keys(), M["Bd"], M["Bw"]

In [None]:
# Topics: by words
# For every word group, print its highest-weighted words as one comma-separated line.
MIN_WORD_PROB = 0.01  # words whose p_w_tw value is not above this are left out
TOP_WORDS = 10        # at most this many words are listed per group


def _escape_token(token):
  """Backslash-escape a leading '#' and every '_' in a token.

  NOTE(review): presumably so the printed list can be pasted into LaTeX — confirm.
  """
  if token.startswith("#"):
    token = "\\" + token
  return token.replace("_", "\\_")


# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever one this installation provides.
if hasattr(vec, "get_feature_names_out"):
  feature_names = vec.get_feature_names_out()
else:
  feature_names = vec.get_feature_names()

# One row per vocabulary word, one column per word group.
topics = pd.DataFrame(M['p_w_tw'], index=feature_names)
for topic in topics.columns:
  print("Group {}: ".format(topic))
  word_probs = topics[topic]
  top_words = word_probs[word_probs > MIN_WORD_PROB].nlargest(TOP_WORDS)
  print(", ".join(_escape_token(word) for word in top_words.index))
  print()

In [None]:
# Topics: by docs
# Assign each document to its highest-scoring document group, then print a few
# representative documents per group (the ones nearest to k-means centroids).
import warnings

from gensim import corpora
from sklearn.cluster import KMeans
from sklearn.exceptions import ConvergenceWarning

MAX_REPS = 10  # upper bound on representative documents printed per group


def _bow_matrix(token_lists):
  """Return a dense (n_docs, vocab_size) term-count matrix for tokenised documents."""
  dictionary = corpora.Dictionary(token_lists)
  counts = np.zeros((len(token_lists), len(dictionary)))
  for row, tokens in enumerate(token_lists):
    for word_id, freq in dictionary.doc2bow(tokens):
      counts[row, word_id] = freq
  return counts


def _closest_to_centroids(data, n_clusters):
  """Cluster the rows of `data` and return, for each non-empty cluster,
  the row index of the member nearest to that cluster's centroid."""
  # The original cell silenced ConvergenceWarning around KMeans via
  # sklearn.utils.testing.ignore_warnings, a module removed in scikit-learn
  # 0.24; the standard-library context manager has the same effect.
  with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=ConvergenceWarning)
    km = KMeans(n_clusters=n_clusters, random_state=0).fit(data)

  reps = []
  for i_centroid, centroid in enumerate(km.cluster_centers_):
    # np.where returns a tuple; element 0 is the 1-D array of member rows.
    members = np.where(km.labels_ == i_centroid)[0]
    if members.size == 0:
      continue
    distances = np.linalg.norm(data[members] - centroid, axis=1)
    reps.append(members[np.argmin(distances)])
  return reps


doc_cluster_labels = pd.DataFrame(M['p_td_d']).idxmax(axis=0)
used_pd["topsbm_doc_cluster"] = doc_cluster_labels

for doc_topic in used_pd["topsbm_doc_cluster"].unique():
  df = used_pd[used_pd["topsbm_doc_cluster"] == doc_topic]

  print("Topic {}".format(doc_topic))

  n_docs = df.shape[0]
  if n_docs > 0:
    # Missing bodies are NaN (floats) and have no .split(); treat them as empty text.
    group_docs = [d.split(' ') for d in df[body_key].fillna("")]
    data = _bow_matrix(group_docs)
    # Small groups use one cluster fewer than they have documents, but never
    # fewer than one: KMeans rejects n_clusters=0 for a single-document group.
    n_reps = MAX_REPS if n_docs > MAX_REPS else max(n_docs - 1, 1)
    reps = _closest_to_centroids(data, n_reps)
  else:
    reps = []

  # Representative indices are positional, so look them up on a 0..n-1 index.
  df_ = df.reset_index(drop=True)
  # NOTE(review): the "body" column is printed rather than body_key; presumably
  # it holds the unfiltered text — confirm the column exists in the input table.
  if reps:
    for rep in reps:
      print(df_.iloc[rep]["body"])
  else:
    for _, r in df.iterrows():
      print(r["body"])
  print()
  print()
  print()