In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os
# Add the grandparent directory of sys.path[0] to the module search path
# (presumably the project root, so that the `src` package below resolves — TODO confirm).
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))

import matplotlib.pyplot as plt
import numpy as np
import umap
import pandas as pd
from datasets import Dataset
import seaborn as sns

from src.mnn_umap import prepare_umap_graph, full_umap
from src.parsing import read_post_data, get_html_parser
from src.paths import get_project_root, abs_path

# Make the scripts directory importable; the `graph_clustering` import right below
# presumably resolves from there — TODO confirm.
sys.path.append(abs_path("TextClusterVisualization/scripts"))

from graph_clustering import get_igraph_from_umap_graph, two_level_clustering, leiden_clustering

sns.set_style("whitegrid")

from tqdm.notebook import tqdm
# Register the tqdm-backed `progress_*` methods on pandas objects
# (`progress_map` is used later in this notebook).
tqdm.pandas()

# %env WANDB_NOTEBOOK_NAME=prepare_data_clean
import wandb
from dotenv import load_dotenv
# Load variables from a local .env file into the environment
# (presumably the source of WANDB_API_KEY used by `wandb.login()` — TODO confirm).
load_dotenv()

True

In [2]:
# Authenticate with Weights & Biases, then open the run that tracks this notebook.
wandb.login()  # relies on WANDB_API_KEY env var

wandb_init_kwargs = dict(
    project="ea-forum-analysis",
    job_type="processing",
    dir=get_project_root(),
)
run = wandb.init(**wandb_init_kwargs)

[34m[1mwandb[0m: Currently logged in as: [33mvpetukhov[0m. Use [1m`wandb login --relogin`[0m to force relogin


Ideas:
- Timeline with key posts on a topic
  - Add time dimension to my search engine?
- Convert the dendrogram into an actual table of contents
- Propagate post tags to users, show most active users per dendrogram branch
- Improve color schemes
- Add time selection

Improving visualization:
- Try Poincaré embeddings
- Try sentence transformers instead of word2vec

In [3]:
# Pull the encoded-posts table from its W&B artifact and read it with the first
# CSV column as the index.
# Alternative artifact: "posts_encoded:all-mpnet-base-v2-baseline"
art = run.use_artifact("posts_encoded:all-mpnet-base-v2")
art.download()
encoded_csv_path = art.file()
posts_encoded = pd.read_csv(encoded_csv_path, index_col=0)

# Pull the raw posts dataset and convert it to a DataFrame.
posts_raw_artifact = run.use_artifact("posts_raw:latest")
posts_raw_dir = posts_raw_artifact.download()
posts = Dataset.load_from_disk(posts_raw_dir).to_pandas()

[34m[1mwandb[0m: Downloading large artifact posts_encoded:all-mpnet-base-v2, 98.61MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:0.0
[34m[1mwandb[0m: Downloading large artifact posts_raw:latest, 272.30MB. 3 files... 
[34m[1mwandb[0m:   3 of 3 files downloaded.  
Done. 0:0:0.0


In [4]:
# Keep only posts that have an encoding, in the encoding table's row order.
posts = posts.loc[posts_encoded.index]

# Drop everything posted in 2009 or earlier.
posted_after_2009 = posts["postedAt"].dt.year > 2009
posts = posts.loc[posted_after_2009]

# Re-align the encodings with the surviving posts.
posts_encoded = posts_encoded.loc[posts.index]

posts_encoded.shape

(10827, 768)

In [5]:
# Full document text: the title, a blank line, then the body.
title_with_gap = posts['title'] + "\n\n"
posts['text'] = title_with_gap + posts['body']

## Prepare data

In [6]:
from graph_clustering import get_igraph_from_umap_graph, two_level_clustering
from preprocessing import normalize_text_doc, create_text_corp
from keyword_extraction import get_keywords_for_hierarchy, convert_keywords_to_cluster_names

Estimate embedding:

In [7]:
from umap.parametric_umap import ParametricUMAP

# Fit a ParametricUMAP model with default settings on the post encodings and keep
# the embedding it returns.
# NOTE(review): no later cell shown here reads `par_embedding`; later cells use
# `umap_graph` and `mnn_emb` instead, and no visible cell defines those — TODO confirm
# where they come from (hidden kernel state?).
pumap = ParametricUMAP()
par_embedding = pumap.fit_transform(posts_encoded)

  warn(


: 

: 

Find clusters:

In [37]:
# Build the graph object that the clustering step consumes (presumably an igraph
# graph, going by the helper's name — TODO confirm).
# NOTE(review): `umap_graph` is not defined in any visible cell, so this line fails
# on a fresh kernel unless it is created elsewhere.
i_graph = get_igraph_from_umap_graph(None, graph=umap_graph)

In [41]:
# Run the two-level clustering on the graph, passing the raw encodings alongside it.
clustering_info = two_level_clustering(
    i_graph,
    posts_encoded.values,
    top_level_resolution=0.0005,
    second_level_resolution=0.003,
    min_size_level1=10,
    min_size_level2=10,
)

# Show how many distinct cluster ids exist at each level.
n_clusters_level1 = len(set(clustering_info['clusters_1_level']))
n_clusters_level2 = len(set(clustering_info['global_numbering_clusters_2_level']))
n_clusters_level1, n_clusters_level2

(11, 53)

In [42]:
# Normalize every post's text (with a progress bar), then glue each normalized
# document back into one space-separated string.
documents_norm = posts['text'].progress_map(normalize_text_doc).values
documents = list(map(' '.join, documents_norm))

  0%|          | 0/10763 [00:00<?, ?it/s]

In [43]:
# Rebuild `documents` by joining each normalized document with spaces.
# NOTE(review): the preceding cell already computes this same list from
# `documents_norm`, so this cell looks redundant.
documents = [' '.join(doc) for doc in documents_norm]

In [48]:
from scripts.keyword_extraction import get_top_keywords_for_cluster

In [79]:
# Build the text corpus and its vectorizer before they are used. Previously this
# cell relied on `text_corp` / `vectorizer` lingering in the kernel (the only call
# that creates them was commented out), so it raised NameError on a fresh run.
text_corp, vectorizer = create_text_corp(documents)

# Top 5 keywords per second-level cluster. Pass clustering_info['clusters_1_level']
# instead to get keywords for the top-level clusters.
kws = get_top_keywords_for_cluster(
    text_corp,
    clustering_info['global_numbering_clusters_2_level'],
    vectorizer.get_feature_names_out(),
    n_terms=5,
    tag_types=None,
)

In [80]:
# Extract keywords for the cluster hierarchy described by `clustering_info`.
# `text_corp` and `vectorizer` must already exist in the kernel when this cell runs;
# the disabled line below shows the call that builds them.
# text_corp, vectorizer = create_text_corp(documents)
res_kw = get_keywords_for_hierarchy(clustering_info, text_corp, feature_names=vectorizer.get_feature_names_out())

0
1


In [81]:
# Turn the extracted keywords into cluster names for the two hierarchy levels.
# Presumably each result holds one label per post, since both are later assigned
# as `art_df` columns — TODO confirm.
clust_labels, clust_labels2 = convert_keywords_to_cluster_names(res_kw, clustering_info)

In [82]:
# Peek at the first two rows to check which columns are available.
posts.head(2)

Unnamed: 0,title,htmlBody,pageUrl,postedAt,baseScore,voteCount,commentCount,meta,question,url,tags,user,coauthors,userId,body,text
2258qMLTjTu4L77Fu,Can you suggest guidelines for setting / renew...,"<p>Hi EA community,</p><p>This is my first pos...",https://forum.effectivealtruism.org/posts/2258...,2020-05-14 18:24:32.423000+00:00,10,4,,False,True,,[Career choice],Barth,[],2fpSGdf4ofpkhvKjt,"Hi EA community,\n\nThis is my first post here...",Can you suggest guidelines for setting / renew...
225Aq4P4jFPoWBrb5,Cause prioritization for downside-focused valu...,<p><em>Last updated: July 8th 2021. </em> </p>...,https://forum.effectivealtruism.org/posts/225A...,2018-01-31 14:47:11.961000+00:00,72,51,10.0,False,False,,"[Cause prioritization, Philosophy of effective...",Lukas_Gloor,[],2tRAtc3DtRKjL8hsS,Last updated: July 8th 2021. \n\nThis post out...,Cause prioritization for downside-focused valu...


In [83]:
# Assemble the per-post table that is saved for the visualization.
# The index is reset here, so every column taken from `posts` below has to be
# assigned positionally (as a plain array), never as an index-aligned Series.
art_df = posts[['title', 'pageUrl', 'baseScore', 'commentCount']].reset_index(drop=True).copy()

date = pd.to_datetime(posts['postedAt']).dt
art_df['date'] = date.date.values
art_df['year'] = date.year.values

# Preview text: the first 150 space-separated tokens of the body followed by '...'.
# `.values` is essential: `posts` still carries its original index while `art_df`
# has a fresh RangeIndex, so assigning the Series directly would align on labels
# and leave the whole column NaN.
art_df['text'] = posts['body'].map(lambda x: ' '.join(x.split(' ')[:150]) + '...').values
art_df['url'] = posts.pageUrl.values

# Signed log10 of the score magnitude, so negative scores stay negative.
art_df['log_score'] = np.log10(np.abs(art_df.baseScore) + 1) * np.sign(art_df.baseScore)
art_df['log_n_comments'] = np.log10(art_df.commentCount + 1)

art_df['clust1'] = clust_labels
art_df['clust2'] = clust_labels2

# NOTE(review): `mnn_emb` is not defined in any visible cell — TODO confirm which
# embedding is meant to be stored here.
res_arch = dict(
    keyword_info=res_kw,
    embedding=mnn_emb,
    art_df=art_df,
    clusters_columns=['clust1', 'clust2', 'year', 'log_score', 'log_n_comments'],
    metadata_columns=['title', 'baseScore', 'commentCount'],
    scatter_params=dict(annotation_col='clust1', ms=3.5),
    metadata=None,
)

# Create the cache directory first so pickling does not fail on a fresh checkout.
cache_path = "./cache/sbert_emb_data.pkl"
os.makedirs(os.path.dirname(cache_path), exist_ok=True)
pd.to_pickle(res_arch, cache_path)