<a href="https://colab.research.google.com/github/slz4025/twitter_latent_scams/blob/master/cluster_scorer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab runtime; the data paths used below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np

## Data 
Load the scored tweets and select two groups: (1) likely scam-related and (2) likely not scam-related.

In [None]:
# Directory on the mounted Drive that holds the scored tweets; outputs are written here too.
analyzed_dir = "/content/drive/My Drive/latent_scams/analyzed/"#@param {type: "string"}
# Tab-separated input file read by get_data (joined onto analyzed_dir by plain concatenation).
file_name = "processing_all_with_score_filtered_covid_20200602.tsv"#@param {type: "string"}
# Column whose text is vectorized in get_data.
body_key = "corpus_filtered"

In [None]:
def get_data(threshold):
  """Load the scored tweets, keep those above `threshold`, and vectorize them.

  Reads the tab-separated file at `analyzed_dir + file_name`, keeps the rows
  whose "score" is strictly greater than `threshold`, and builds a word-count
  matrix over the `body_key` column.

  Returns a tuple (used_pd, docs, vec, X):
    used_pd -- the filtered rows after reset_index()
    docs    -- list of documents from the `body_key` column, NaN replaced by ""
    vec     -- the fitted CountVectorizer
    X       -- the matrix returned by vec.fit_transform(docs)
  """
  all_tweets = pd.read_csv(analyzed_dir + file_name, sep="\t")
  above_threshold = all_tweets["score"] > threshold
  selected = all_tweets[above_threshold].reset_index()

  # NaN is the only value that differs from itself; replace it with an empty document.
  texts = ["" if text != text else text for text in selected[body_key]]

  # Vectorizer setup follows the TopSBM example:
  # https://topsbm.readthedocs.io/en/latest/examples/example.html#Setup:-Load-a-corpus
  from sklearn.feature_extraction.text import CountVectorizer
  # Every maximal run of non-whitespace characters is one token.
  vectorizer = CountVectorizer(token_pattern=r'\S+')
  counts = vectorizer.fit_transform(texts)

  return selected, texts, vectorizer, counts

In [None]:
threshold = 1 # choose first-pass scorer threshold to examine
# Load the tweets scoring above the threshold, plus the fitted vectorizer and count matrix X.
used_pd, docs, vec, X = get_data(threshold)

## TopSBM

### Perform

In [None]:
# Add the apt repository and signing key that provide python3-graph-tool, then install it
# together with the cairo and matplotlib packages.
# NOTE: `>>` appends, so every re-run of this cell adds the same line to sources.list again.
!echo "deb http://downloads.skewed.de/apt bionic main" >> /etc/apt/sources.list
!apt-key adv --keyserver keys.openpgp.org --recv-key 612DEFB798507F25
!apt-get update
!apt-get install python3-graph-tool python3-cairo python3-matplotlib
# TopSBM: https://topsbm.github.io/
!pip install topsbm

In [None]:
# Imported here rather than at the top: topsbm is only installed by the pip cell of this section.
from topsbm import TopSBM

# Fixed random_state so repeated fits start from the same seed.
model = TopSBM(random_state=9)
# Fit on the count matrix X from get_data. Xt is not used by any later cell shown in this notebook.
Xt = model.fit_transform(X)

In [None]:
"""
document-nodes are on the left
word-nodes are on the right
different colors correspond to the different groups
-- https://topsbm.readthedocs.io/en/latest/examples/example.html#Setup:-Load-a-corpus
"""

# Plot the fitted model; n_edges=1000 presumably caps how many edges are drawn -- confirm in the TopSBM docs.
model.plot_graph(n_edges=1000)

In [None]:
""" https://topsbm.readthedocs.io/en/stable/api.html
B_d : int
number of doc-groups

B_w : int
number of word-groups

p_tw_d : array of shape (B_w, d)
doc-topic mixtures: prob of word-group tw in doc d P(tw | d)

p_td_d : array of shape (B_d, n_samples)
doc-group membership: prob that doc-node d belongs to doc-group td: P(td | d)

p_tw_w : array of shape (B_w, n_features)
word-group-membership: prob that word-node w belongs to word-group tw: P(tw | w)

p_w_tw : array of shape (n_features, B_w)
topic distribution: prob of word w given topic tw P(w | tw)
"""

# Index into model.groups_ (presumably a level of the fitted hierarchy -- confirm in the TopSBM docs).
granularity = 0
# Group statistics for the chosen entry; the rest of the notebook reads its "Bd", "Bw",
# "p_tw_d", "p_td_d" and "p_w_tw" items.
M = model.groups_[granularity]
# Display the available keys and the doc-group / word-group counts.
M.keys(), M["Bd"], M["Bw"]

In [None]:
# Persist the group statistics M so the Analyze section can reload them without refitting.
# The file name is keyed by `threshold`, the value this section's data was loaded with.
# `ver` is not assigned until the Analyze section, so using it here raised NameError on a
# fresh Restart & Run All (or silently reused a stale value from an earlier run).
# np.save pickles M, so it must be read back with np.load(..., allow_pickle=True).
np.save(analyzed_dir + "topsbm_model_results/topsbm_model_results_lim{}.npy".format(threshold), M)

### Analyze

In [None]:
ver = 1 #@param {type: "integer"}
# `ver` is used as the score threshold: rebuild the matching data and vectorizer for it.
used_pd, docs, vec, X = get_data(ver)
# CAUTION: allow_pickle=True unpickles the file, which can execute arbitrary code.
# Only load result files that this notebook wrote.
M = np.load(analyzed_dir + "topsbm_model_results/topsbm_model_results_lim{}.npy".format(ver), allow_pickle=True)
# Indexing with an empty tuple unwraps the single saved object from the loaded array.
M = M[()]

In [None]:
# Quick check: size of the selected data next to the doc-group and word-group counts.
used_pd.shape, M["Bd"], M["Bw"]

In [None]:
# identified scam-relevant clusters for the following thresholds for June 2, 2020
# Keys are `ver` values; each list holds word-group indices (used below to index a length-Bw vector).
potential = {
    0: [26,27,28,40,45,48,50,61,107,109,130,181],
    1: [9,11,14,16,18,28,29,36,44,52,53,56,65,76,79,84],
    2: [3,6,8,9,16,17,18,23,29],
    3: [2,3,7,8,9,10,12,13,14,15,16,17,18,19,20,21,22,28,29,30,32,34,35],
}
# Rebind the name to the list for the selected `ver` (KeyError if `ver` is not listed above).
potential = potential[ver]
# vector of relevant scam-clusters
# Indicator over the Bw word-groups: 1.0 at the flagged indices, 0.0 elsewhere.
pot_vec = np.zeros(M["Bw"])
pot_vec[potential] = 1.0

In [None]:
# Topics: by words
# For every word-group, print its id followed by up to ten of its most probable words,
# formatted as LaTeX table rows: a leading "#" and every "_" are backslash-escaped and each
# row ends with "\\".
# Only words whose probability given the word-group, P(w | tw), exceeds this value are printed.
relevance_threshold = 0.01
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back to the old method on older installations.
if hasattr(vec, "get_feature_names_out"):
  feature_names = vec.get_feature_names_out()
else:
  feature_names = vec.get_feature_names()
# Rows are vocabulary words, columns are word-groups.
topics = pd.DataFrame(M['p_w_tw'], index=feature_names)
for topic in topics.columns:
  # Uncomment to restrict the printout to the flagged word-groups:
  #if topic not in potential: continue
  print(topic)
  top_words = topics[topic].nlargest(10)
  top_words = top_words[top_words > relevance_threshold]
  for word in top_words.index:
    if word.startswith("#"):
      word = "\\" + word
    word = word.replace("_", "\\_")
    print(word, "\\\\")
  print()

In [None]:
# One score per document: the dot product of that document's column of M['p_tw_d'] with
# pot_vec, i.e. the summed mixture weight over the flagged word-groups.
relevancy_scores = [np.dot(m, pot_vec) for m in M['p_tw_d'].T]
# Sanity check: a partial sum of probabilities should never exceed 1.0, so this should show False.
any(r > 1.0 for r in relevancy_scores) # check probability is reasonable

False

In [None]:
# Clusters for documents based on word-groupings
# idxmax(axis=0) picks, for each column (document), the row (word-group) with the largest value.
word_cluster_labels = pd.DataFrame(M['p_tw_d']).idxmax(axis=0)
word_cluster_labels.head()

In [None]:
# Clusters for documents based on document-groupings
# idxmax(axis=0) picks, for each column (document), the row (doc-group) with the largest value.
doc_cluster_labels = pd.DataFrame(M['p_td_d']).idxmax(axis=0)
doc_cluster_labels.head()

In [None]:
# Attach the per-document results as new columns. The two label Series are aligned on the
# index, so this relies on used_pd keeping the index produced by reset_index() in get_data.
used_pd["topsbm_word_cluster"] = word_cluster_labels
used_pd["topsbm_doc_cluster"] = doc_cluster_labels
used_pd["scam_relevancy_score"] = relevancy_scores
# Display the distinct cluster ids that were actually assigned.
used_pd["topsbm_word_cluster"].unique(), used_pd["topsbm_doc_cluster"].unique()

In [None]:
# Topics: by docs
# For each document cluster, in order of first appearance, print its id, its size and its documents.
for topic in used_pd["topsbm_doc_cluster"].unique():
  cluster_docs = used_pd[used_pd["topsbm_doc_cluster"] == topic]
  print(topic, cluster_docs.shape[0])
  # Read the text through body_key, the same column get_data vectorizes, rather than a
  # hard-coded column name, so the printout stays in step with the configured column.
  print(cluster_docs[body_key])
  print()

In [None]:
# Output name encodes the threshold: "clustered_<ver>_" followed by the input file name.
cluster_file_name = "clustered_{}_".format(ver) + file_name
# Write used_pd, including the columns added above, as a tab-separated file in analyzed_dir.
used_pd.to_csv(sep='\t', path_or_buf=analyzed_dir + cluster_file_name)

In [None]:
import matplotlib.pyplot as plt
# Compare the first-pass keyword score (x) with the cluster-based relevancy score (y), one point per tweet.
fig, ax = plt.subplots()
ax.scatter(used_pd["score"], used_pd["scam_relevancy_score"])
ax.set_xlabel("keyword_score")
ax.set_ylabel("cluster_score")