In [54]:
import project_path

%load_ext autoreload
%autoreload 2

import pandas as pd
from pandas import DataFrame
from tqdm.auto import tqdm

import nltk
nltk.download('punkt')

from datasets import load_from_disk, Dataset
from sentence_transformers import SentenceTransformer

from src.text_split import extract_paragraphs, split_long_paragraphs, collapse_paragraphs_iteratively
from src.paths import datap

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


[nltk_data] Downloading package punkt to /home/justy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [55]:
tqdm.pandas()

In [112]:
path = "/home/justy/workspace/ea/ea-forum-announcements/data/labeled_posts"
data = load_from_disk(path).to_pandas()
data.shape

(320, 19)

### Split by paragraph

In [114]:
max_n_words = 400
data['paragraphs'] = data.body.progress_map(extract_paragraphs)
data['paragraphs'] = data.paragraphs.progress_map(lambda p: split_long_paragraphs(p, max_n_words=max_n_words))
data = data[~data.apply(lambda x: x.paragraphs.empty, axis=1)]
data['paragraphs_split'] = data.paragraphs.progress_map(lambda x: collapse_paragraphs_iteratively(x, max_n_words=max_n_words))

  0%|          | 0/320 [00:00<?, ?it/s]

  0%|          | 0/320 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['paragraphs_split'] = data.paragraphs.progress_map(lambda x: collapse_paragraphs_iteratively(x, max_n_words=max_n_words))


In [116]:
paragraph_split = pd.concat([
    DataFrame({"postId": r._id, "text": r.paragraphs_split.text.values, "label": r.label})
    for pid,r in data.iterrows()
], ignore_index=True)
paragraph_split.shape

(2051, 3)

In [119]:
model_path = "/home/justy/workspace/ea/ea-forum-announcements/models/sbert_v1/sbert:v1"
model = SentenceTransformer.load(model_path)

In [120]:
paragraphs_embeddings = model.encode(paragraph_split.text, batch_size=1, show_progress_bar=True)

Batches:   0%|          | 0/2051 [00:00<?, ?it/s]

In [None]:
paragraph_split = pd.concat((paragraph_split, pd.DataFrame(paragraphs_embeddings)), axis=1)

In [122]:
paragraph_split.shape

(2051, 771)

In [124]:
dataset = paragraph_split.groupby("postId").mean(numeric_only=True)
dataset.shape

(312, 769)

In [126]:
dataset.label = dataset.label.astype("int8")

In [132]:
dataset = dataset.sample(frac=1)

In [133]:
dataset.to_csv(datap("labeled_posts_embedded.csv"))

### No preprocessing

In [19]:
dataset = data[["body", "label"]]
dataset.head()

Unnamed: 0,body,label
0,\n\nOur programs exist to have a positive impa...,2.0
1,\n\n## The most important century is the one w...,2.0
2,\n\nMeet us at the Karbach Biergarten for an I...,2.0
3,"\n\nDisclaimer: We (Sam Nolan, Hannah Rokebran...",2.0
4,"\n\nAt Founders Pledge, we just launched a new...",0.0


In [13]:
dataset = dataset.rename({"body": "text"}, axis=1)

In [15]:
dataset = Dataset.from_pandas(dataset).train_test_split(test_size=0.5)

In [16]:
dataset['train'].save_to_disk(datap("all_paragraphs_labeled_only/train"))

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

In [17]:
dataset['test'].save_to_disk(datap("all_paragraphs_labeled_only/test"))

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

### Choose only the first paragraph

In [82]:
first_paragraphs = paragraph_split.drop_duplicates(subset="postId", keep="first")
first_paragraphs.head()

Unnamed: 0,postId,text,label
0,70,Meet us at the Karbach Biergarten for an Intro...,2.0
1,182,"At Founders Pledge, we just launched a new add...",0.0
4,255,"Today we are both launching our organization, ...",0.0
5,274,Summary\nWe’re excited to announce VIVID - a n...,0.0
11,317,We are pleased to introduce Cause Innovation B...,1.0


In [83]:
first_paragraphs.label.value_counts()

0.0    41
1.0    32
2.0    22
Name: label, dtype: int64

In [84]:
first_paragraphs.loc[:, "label"] = first_paragraphs.label.astype("int8")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  first_paragraphs.loc[:, "label"] = first_paragraphs.label.astype("int8")
  first_paragraphs.loc[:, "label"] = first_paragraphs.label.astype("int8")


In [85]:
first_paragraphs.shape

(95, 3)

In [86]:
first_paragraphs = Dataset.from_pandas(first_paragraphs).train_test_split(test_size=0.5)

In [87]:
first_paragraphs

DatasetDict({
    train: Dataset({
        features: ['postId', 'text', 'label', '__index_level_0__'],
        num_rows: 47
    })
    test: Dataset({
        features: ['postId', 'text', 'label', '__index_level_0__'],
        num_rows: 48
    })
})

In [88]:
first_paragraphs['train'].save_to_disk(datap("first_paragraphs_labeled_only/train"))

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

In [89]:
first_paragraphs['test'].save_to_disk(datap("first_paragraphs_labeled_only/test"))

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

In [67]:
# Dataset.from_pandas(first_paragraphs).save_to_disk(datap("first_paragraphs_labeled_only"))

In [10]:
# pars_encoded = model.encode(par_split_df.text, show_progress_bar=True)

In [None]:
# DataFrame(pars_encoded).groupby(par_split_df.postId.values).mean().to_csv(datap("posts_encoded.csv"))