In [1]:
# Enable IPython's autoreload extension so edited project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys
import os
# Add the directory two levels above sys.path[0] to the import path.
# NOTE(review): presumably this is the repository root that makes `src` importable — confirm.
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))

import matplotlib.pyplot as plt
import numpy as np
import umap
import pandas as pd
from datasets import Dataset

from src.paths import get_project_root, abs_path, datap

# The scripts directory must be on sys.path before the three script-module imports below;
# the directory returned by datap() is created up front because the pickle is written there later.
sys.path.append(abs_path("TextClusterVisualization/scripts"))
os.makedirs(datap(), exist_ok=True)

from graph_clustering import get_igraph_from_umap_graph, two_level_clustering
from preprocessing import create_text_corp
from keyword_extraction import get_keywords_for_hierarchy, convert_keywords_to_cluster_names

# %env WANDB_NOTEBOOK_NAME=prepare_data_clean
import wandb
from dotenv import load_dotenv
# Load environment variables from a .env file (wandb.login() below relies on WANDB_API_KEY).
load_dotenv()

True

In [2]:
# Name of the sentence encoder. It selects the `posts_encoded` artifact alias on load
# and is used as config, metadata and alias for the artifact logged at the end.
model_name = "all-mpnet-base-v2"

In [3]:
# Authenticate with Weights & Biases; this relies on the WANDB_API_KEY env var.
wandb.login()

# Start the processing run used for every artifact call in this notebook.
# The encoder name is stored in the run config.
run = wandb.init(
    project="ea-forum-analysis",
    job_type="processing",
    dir=get_project_root(),
    config=dict(model_name=model_name),
)

[34m[1mwandb[0m: Currently logged in as: [33mvpetukhov[0m. Use [1m`wandb login --relogin`[0m to force relogin


Ideas:
- Timeline with key posts on a topic
  - Add time dimension to my search engine?
- Convert dendrogram into an actual table of contents
- Propagate post tags to users, show most active users per dendrogram branch
- Improve color schemes
- Add time selection

Improving visualization:
- Try Poincaré embeddings
- Try sentence transformers instead of word2vec

## Load data

In [4]:
# Declare both input artifacts on the run: the embeddings for the chosen encoder
# and the latest raw posts.
enc_art = run.use_artifact(f"posts_encoded:{model_name}")
enc_art.download()
posts_art = run.use_artifact("posts_raw:latest")

# Embeddings: a CSV whose first column is used as the row index.
encoded_csv = enc_art.file()
posts_encoded = pd.read_csv(encoded_csv, index_col=0)

# Raw posts: a dataset directory loaded with Dataset.load_from_disk and converted to pandas.
posts_dir = posts_art.download()
posts = Dataset.load_from_disk(posts_dir).to_pandas()

# Record which artifact versions this run consumed.
run.config.update(dict(encoding_version=enc_art.version, data_version=posts_art.version))

[34m[1mwandb[0m: Downloading large artifact posts_encoded:all-mpnet-base-v2, 98.61MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:0.0
[34m[1mwandb[0m: Downloading large artifact posts_raw:latest, 272.30MB. 3 files... 
[34m[1mwandb[0m:   3 of 3 files downloaded.  
Done. 0:0:0.0


In [5]:
# Keep only posts that have an embedding, then only those posted in 2010 or later.
posts = (
    posts
    .loc[posts_encoded.index]
    .loc[lambda df: df.postedAt.dt.year > 2009]
)
# Re-select the embeddings so both frames carry the same index in the same order.
posts_encoded = posts_encoded.loc[posts.index]

posts_encoded.shape

(10827, 768)

In [6]:
# Full document per post: title, a blank line, then the body.
# This column is the input to create_text_corp in the keyword step.
posts['text'] = posts['title'] + "\n\n" + posts['body']

## Prepare data

### Estimate embedding

In [7]:
# Kept as a cell-local import, as in the original notebook.
from umap.parametric_umap import ParametricUMAP

# Fixed seed so that UMAP's own random steps are repeatable between runs; the fitted
# model is also the source of the graph used for clustering below.
# NOTE(review): this seeds UMAP only. Training of the parametric encoder may still vary
# between runs unless the deep-learning backend is seeded as well — confirm if exact
# reproducibility of `par_embedding` is required.
UMAP_RANDOM_STATE = 42

pumap = ParametricUMAP(random_state=UMAP_RANDOM_STATE)
# Low-dimensional embedding of the post vectors; stored in the output pickle as `embedding`.
par_embedding = pumap.fit_transform(posts_encoded)

  warn(


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Find clusters

In [8]:
# Convert the graph held by the fitted UMAP model into the form expected by the clustering helper.
i_graph = get_igraph_from_umap_graph(pumap)

# Resolution and minimum-size settings for the two clustering levels.
clustering_params = dict(
    top_level_resolution=0.0005,
    second_level_resolution=0.003,
    min_size_level1=10,
    min_size_level2=10,
)
clustering_info = two_level_clustering(i_graph, posts_encoded.values, **clustering_params)

# Number of distinct labels on each level, displayed as the cell output.
n_level1 = len(set(clustering_info['clusters_1_level']))
n_level2 = len(set(clustering_info['global_numbering_clusters_2_level']))
n_level1, n_level2

(15, 64)

### Find keywords

In [9]:
# Build the text corpus and its vectorizer from the combined title+body text.
# Timed because this step is slow (the recorded run took about 1min 44s).
%time text_corp, vectorizer = create_text_corp(posts.text.values)

CPU times: user 1min 44s, sys: 112 ms, total: 1min 44s
Wall time: 1min 44s


In [10]:
# Feature names reported by the vectorizer, passed to the keyword extraction together
# with the corpus and the cluster hierarchy.
vocabulary = vectorizer.get_feature_names_out()
res_kw = get_keywords_for_hierarchy(clustering_info, text_corp, feature_names=vocabulary)

In [11]:
# Derive cluster names from the keywords; the two results are stored below as the
# `clust1` and `clust2` columns of the output table.
clust_labels, clust_labels2 = convert_keywords_to_cluster_names(res_kw, clustering_info)

## Save data

In [12]:
# Assemble the per-post table that is pickled below.
# The index is reset, so rows are labelled 0..n-1 in the same order as `posts`. Every
# column derived from `posts` must therefore be assigned positionally (as a plain array):
# a Series still carrying the `posts` index would be aligned on labels instead.
art_df = posts[['title', 'pageUrl', 'baseScore', 'commentCount']].reset_index(drop=True).copy()
assert len(art_df) == len(par_embedding), "post table and embedding have different numbers of rows"

date = pd.to_datetime(posts['postedAt']).dt
art_df['date'] = date.date.values
art_df['year'] = date.year.values

# Preview text: up to five tags wrapped in asterisks, a blank line, then the first
# 150 space-separated tokens of the body followed by '...'.
tag_string = posts['tags'].map(lambda x: "*" + "; ".join(x[:5]) + "*")
body_preview = posts['body'].map(lambda x: ' '.join(x.split(' ')[:150]) + '...')
# `.values` drops the `posts` index before assignment. Assigning the Series directly
# matched it by label against art_df's fresh 0..n-1 index, which put text on the wrong
# rows or left NaN wherever the filtered `posts` index differed from the row position.
art_df['text'] = (tag_string + "\n\n" + body_preview).values
art_df['url'] = posts.pageUrl.values

# Signed log10 scale: sign(score) * log10(|score| + 1), so negative scores stay negative.
art_df['log_score'] = np.log10(np.abs(art_df.baseScore) + 1) * np.sign(art_df.baseScore)
art_df['log_n_comments'] = np.log10(art_df.commentCount + 1)

art_df['clust1'] = clust_labels
art_df['clust2'] = clust_labels2

# Bundle keywords, embedding, post table and display settings into one dict.
res_arch = dict(
    keyword_info=res_kw,
    embedding=par_embedding,
    art_df=art_df,
    clusters_columns=['clust1', 'clust2', 'year', 'log_score', 'log_n_comments'],
    metadata_columns=['title', 'baseScore', 'commentCount'],
    scatter_params=dict(annotation_col='clust1', ms=3.5),
    metadata=None,
)

# Written into the datap() directory; the next cell uploads this file as a W&B artifact.
pd.to_pickle(res_arch, datap("sbert_emb_data.pkl"))

In [13]:
# Wrap the pickle written above in a dataset artifact tagged with the encoder name.
pickle_path = datap("sbert_emb_data.pkl")
art = wandb.Artifact("sbert_emb_data", type="dataset", metadata=dict(model_name=model_name))
art.add_file(pickle_path)

# Log it under an alias equal to the encoder name; the returned artifact is the cell output.
run.log_artifact(art, aliases=[model_name])

<wandb.sdk.wandb_artifacts.Artifact at 0x7f19c8c1c100>

In [14]:
# Close the W&B run opened at the top of the notebook.
run.finish()