In [12]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
import project_path

In [14]:
import random

import numpy as np
import optuna
import pandas as pd
import torch
from torch.utils.data import DataLoader
from torch.nn.functional import softmax
from datasets import load_from_disk
from sentence_transformers import SentenceTransformer

from src.data_utils import load_data
from src.paths import datap, outputp
from src.training import Trainer
from src.embed_posts import get_paragraph_split
from constants import RAW_DATA_PATH, SBERT_PATH, LABEL_MAP

# Reproducibility
seed = 42
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)

In [4]:
model_name = "eaembd_post_classifier:2023-04-04" # "baseline"
filename = "labeled_posts.csv"

In [5]:
db_path = outputp("db.sqlite3")
loaded_study = optuna.load_study(study_name=model_name,
                                 storage=f"sqlite:///{db_path}")

dataset_size = len(pd.read_csv(datap(filename)))
trainer = Trainer(epochs=10)

lr, batch_size = loaded_study.best_trial.params["lr"], loaded_study.best_trial.params["batch_size"]
dataset = load_data()

train_loader = DataLoader(dataset, batch_size=batch_size)
train_metrics, _ = trainer.train(lr, train_loader, train_loader, logging=True)


Epoch 1
-------------------------------
train FPR 0.192 | train recall 0.558
test FPR 0.043 | test recall 0.904

Epoch 2
-------------------------------
train FPR 0.099 | train recall 0.764
test FPR 0.024 | test recall 0.952

Epoch 3
-------------------------------
train FPR 0.049 | train recall 0.880
test FPR 0.008 | test recall 0.981

Epoch 4
-------------------------------
train FPR 0.022 | train recall 0.944
test FPR 0.007 | test recall 0.992

Epoch 5
-------------------------------
train FPR 0.016 | train recall 0.972
test FPR 0.005 | test recall 0.995

Epoch 6
-------------------------------
train FPR 0.016 | train recall 0.972
test FPR 0.004 | test recall 0.996

Epoch 7
-------------------------------
train FPR 0.008 | train recall 0.989
test FPR 0.002 | test recall 0.997

Epoch 8
-------------------------------
train FPR 0.006 | train recall 0.991
test FPR 0.002 | test recall 0.997

Epoch 9
-------------------------------
train FPR 0.008 | train recall 0.989
test FPR 0.002 | t

In [73]:
training_ids = set(dataset.ids)
data = load_from_disk(datap("posts")).to_pandas()
all_ids = set(data._id)
discover_ids = all_ids - training_ids

In [75]:
sample = data[data._id.isin(discover_ids)]
sample = get_paragraph_split(sample)
paragraph_split = pd.concat([
    pd.DataFrame({"postId": r._id, "text": r.paragraphs_split.text.values})
    for pid, r in sample.iterrows()], ignore_index=True)

model = SentenceTransformer.load(SBERT_PATH)
paragraphs_embeddings = model.encode(paragraph_split.text, batch_size=batch_size, show_progress_bar=True)
paragraph_split = pd.concat((paragraph_split, pd.DataFrame(paragraphs_embeddings)), axis=1)
embedded_sample = paragraph_split.groupby("postId").mean(numeric_only=True)

  0%|          | 0/10674 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['paragraphs'] = data.body.progress_map(extract_paragraphs)


  0%|          | 0/10674 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['paragraphs'] = data.paragraphs.progress_map(lambda p: split_long_paragraphs(p, max_n_words=MAX_TOKENS))


  0%|          | 0/10507 [00:00<?, ?it/s]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['paragraphs_split'] = data.paragraphs.progress_map(


Batches:   0%|          | 0/70643 [00:00<?, ?it/s]

In [76]:
X = torch.tensor(embedded_sample.values, dtype=torch.float32)
preds = torch.argmax(softmax(trainer.model(X), dim=1), dim=1)

In [85]:
LABEL_MAP

{'organization': 0, 'project': 1, 'other': 2}

In [86]:
orgs = torch.where(preds == LABEL_MAP['project'])

In [87]:
orgs_ids = embedded_sample.iloc[orgs].index

In [88]:
identified_orgs = data[data._id.isin(orgs_ids)]

In [89]:
pd.set_option('display.max_colwidth', None)

In [90]:
identified_orgs["title"]

39             What should my research lab focus on in the first week of 2023?
55         The Rational Utilitarian Love Movement (A Historical Retrospective)
58        A new place to discuss cognitive science, ethics and human alignment
110                                  Fund biosecurity officers at universities
112      Announcing vodle, a web app for consensus-aiming collective decisions
                                         ...                                  
10560                        How best to aggregate judgements about donations?
10650                                       $5 billion in moral trade surplus?
10663                                             TLYCS Pamphleting Pilot Plan
10711                                                 Gratipay for Funding EAs
10739                                           Anti Publication Bias Registry
Name: title, Length: 322, dtype: object

In [91]:
len(identified_orgs)

322

In [92]:
identified_orgs.to_csv("identified_projects.csv")

In [25]:
pd.set_option("display.max_colwidth", None)
df = pd.read_csv("identified_projects.csv")

In [44]:
df[df["title"].str.contains("A new place to discuss")][["_id", "title"]]

Unnamed: 0,_id,title
2,2bfYxTt2FsGXnwDyt,"A new place to discuss cognitive science, ethics and human alignment"


In [39]:
"""
projects:
nj9FLkifyb3s6Eijx -  Announcing Squigglepy, a Python package for Squiggle
tfjLzxMZYhLD9Qx2M -  Announcing vodle, a web app for consensus-aiming collective decisions
qPEmQtgnbNgmLTmi4 -  Biosecurity Dual Use Screening - Project Proposal
s3iQKqoYyXNiRDWKj -  Ballot transparency project
cMvxw4ehHJy2vYJDA -  Student project for engaging with AI alignment
cndhBdHZAGopypbso -  Wicked Problems, Understood Together
pW7w5mcbKaWGi9vez -  Victim Coordination Website
cjHGS3jYiezLcz6dh -  Accessible EA Projects
HGjMPBpAEwEZJswXK -  A list of technical EA projects
"""

'\nprojects:\nnj9FLkifyb3s6Eijx -  Announcing Squigglepy, a Python package for Squiggle\ntfjLzxMZYhLD9Qx2M -  Announcing vodle, a web app for consensus-aiming collective decisions\nqPEmQtgnbNgmLTmi4 -  Biosecurity Dual Use Screening - Project Proposal\ns3iQKqoYyXNiRDWKj -  Ballot transparency project\ncMvxw4ehHJy2vYJDA -  Student project for engaging with AI alignment\ncndhBdHZAGopypbso -  Wicked Problems, Understood Together\npW7w5mcbKaWGi9vez -  Victim Coordination Website\ncjHGS3jYiezLcz6dh -  Accessible EA Projects\nHGjMPBpAEwEZJswXK -  A list of technical EA projects\n'

In [45]:
"""
orgs:
kffXpdBgevBzK3cKB - Unjournal: Call for participants and research\n
ei3JSiYqF44Lazsbj - [Opportunity] Synthetic Biology Forecasters
me6xDoDzruPPuemQr - Centre for Exploratory Altruism Research (CEARCH)
2bfYxTt2FsGXnwDyt - A new place to discuss cognitive science, ethics and human alignment
"""

'\norgs:\nkffXpdBgevBzK3cKB - Unjournal: Call for participants and research\n\nei3JSiYqF44Lazsbj - [Opportunity] Synthetic Biology Forecasters\nme6xDoDzruPPuemQr - Centre for Exploratory Altruism Research (CEARCH)\n2bfYxTt2FsGXnwDyt - A new place to discuss cognitive science, ethics and human alignment\n'

In [None]:
# identified_orgs['tags']

In [82]:
# print(identified_orgs.iloc[0]['body'])