In [1]:
!pip install sentence_transformers

# Importing necessary libraries
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm
import pandas as pd
import polars as pl
from rich import print as pp
import os
from pandas import DataFrame




In [2]:
path = "../data/sample/"
# os.listdir returns entries in arbitrary order. Sort them so that the row order
# of the concatenated frame (and therefore its index) is reproducible.
files = sorted(os.listdir(path))

dfs = []

for file_name in files:
    file_path = os.path.join(path, file_name)
    # Skip sub-directories and hidden entries (e.g. .DS_Store, .ipynb_checkpoints)
    # instead of handing them to the CSV reader.
    if file_name.startswith(".") or not os.path.isfile(file_path):
        continue
    pp(f"[blue bold]> Processing {file_name}...[/blue bold]")
    # Read with polars, then convert to pandas for the rest of the notebook.
    df_tmp = pl.read_csv(file_path).to_pandas()
    dfs.append(df_tmp)

pp(f"[green bold]> Concatenating {len(dfs)} dataframes...[/green bold]")
df: DataFrame = pd.concat(
    dfs,
    ignore_index=True,
)

# Drop duplicate rows first and renumber afterwards, so the resulting index is a
# gap-free 0..n-1 range (resetting before dropping would leave holes in it).
df = df.drop_duplicates().reset_index(drop=True)

df.head()

  df: DataFrame = pd.concat(


Unnamed: 0,text,retweet,reply,like,quote,impression,is_reply,id,created_at,author_id,lang,text_raw,media,date,t
0,mom,41.0,6.0,95.0,0.0,0.0,False,7.282756e+17,2016-05-05 17:30:05+00:00,807095.0,en,When your mom takes over your wedding https://...,nytimes,2016-05-05 17:30:05,17
1,donald trump jan brewer,142.0,38.0,307.0,0.0,0.0,False,6.987265e+17,2016-02-14 04:32:40+00:00,807095.0,en,"Before Donald Trump, there was Jan Brewer http...",nytimes,2016-02-14 04:32:40,6
2,american soccer jordan morri ...,14.0,0.0,0.0,0.0,0.0,False,7.703848e+17,2016-08-29 22:17:03+00:00,807095.0,en,RT @NYTSports: The next big thing in American ...,nytimes,2016-08-29 22:17:03,34
3,famili win million powerb jackpot ...,105.0,0.0,0.0,0.0,0.0,False,7.313451e+17,2016-05-14 04:47:03+00:00,807095.0,en,RT @NYTMetro: A family wins a $429 million Pow...,nytimes,2016-05-14 04:47:03,19
4,mormon distast donald trump utah grab...,165.0,37.0,313.0,27.0,0.0,False,7.631597e+17,2016-08-09 23:47:02+00:00,807095.0,en,Mormons’ distaste for Donald Trump puts Utah u...,nytimes,2016-08-09 23:47:02,31


In [3]:
# Fill missing values with "" *before* casting to str: astype(str) would
# otherwise turn None/NaN into the literal strings "None"/"nan", after which
# fillna has nothing left to fill.
df["text_raw"] = df["text_raw"].fillna("").astype(str)
# Copy the current row index into an explicit identifier column.
df["unique_id"] = df.index

In [4]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists first (no-op when it is already there).
os.makedirs("../data/process", exist_ok=True)
df.to_csv("../data/process/sample.agg.csv", index=False)

In [5]:
def grab_text(t=1, df=None):
    """Return the `text_raw` values of all rows whose `t` column equals `t`.

    Parameters
    ----------
    t : value compared against the `t` column (default 1).
    df : DataFrame with `t` and `text_raw` columns. When omitted, the
        notebook-level `df` is looked up at *call* time. (A `df=df` default
        would be frozen at definition time and go stale whenever the loading
        cell is re-run without re-running this one.)

    Returns
    -------
    list
        The matching `text_raw` values in row order; empty if nothing matches.

    Raises
    ------
    NameError
        If `df` is not passed and no global `df` exists.
    """
    if df is None:
        # `df` is also the parameter name, so the global has to be fetched
        # through globals() rather than by plain name lookup.
        try:
            df = globals()["df"]
        except KeyError:
            raise NameError(
                "grab_text: no `df` argument given and no global `df` is defined"
            ) from None
    return df.loc[df["t"] == t, "text_raw"].tolist()


# grab_text(0)

In [6]:
# Load the sentence-embedding model once at notebook level; it is used as the
# default `model` argument of cal_sim_score and cal_sim_score_single.
# NOTE(review): confirm on the model card that this checkpoint is still the
# recommended choice before reusing the scores elsewhere.
model = SentenceTransformer("bert-base-nli-mean-tokens")


def cal_sim_score(
    sentences: list = [
        "Three years later, the coffin was still full of Jello.",
        "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
        "The person box was packed with jelly many dozens of months later.",
        "He found a leprechaun in his walnut shell.",
    ],
    model: SentenceTransformer = model,
) -> float:
    """Return one aggregate similarity score (in percent) for `sentences`.

    The sentences are embedded with `model.encode`, the full pairwise
    cosine-similarity matrix is computed, and the mean of *all* its entries
    (self-similarities on the diagonal included) is scaled by 100 and rounded
    to three decimals.

    The default `sentences` list is a shared object; it is only read here.
    """
    pairwise = cosine_similarity(model.encode(sentences))
    return round(np.mean(pairwise) * 100, 3)


def cal_sim_score_single(
    sentences: list = [
        "Three years later, the coffin was still full of Jello.",
        "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
        "The person box was packed with jelly many dozens of months later.",
        "He found a leprechaun in his walnut shell.",
    ],
    model: SentenceTransformer = model,
) -> list:
    """Return one similarity score (in percent) per sentence.

    Each sentence is embedded with `model.encode`; the score of sentence *i* is
    the mean of row *i* of the pairwise cosine-similarity matrix, scaled by 100
    and rounded to six decimals. The row mean includes the sentence's
    similarity with itself.

    Parameters
    ----------
    sentences : list of texts. Items are passed to `model.encode` unchanged;
        non-string items are not filtered out here.
    model : object exposing `encode(sentences)`; defaults to the notebook-level
        model.

    Returns
    -------
    list
        Scores in the same order as `sentences`; `[]` for empty input.
    """
    # Nothing to compare: return early instead of letting cosine_similarity
    # fail on an empty embedding array.
    if len(sentences) == 0:
        return []

    # Encoding the sentences
    sentence_embeddings = model.encode(sentences)

    # Pairwise cosine similarity, then one mean per row (i.e. per sentence)
    sim_matrix = cosine_similarity(sentence_embeddings)
    return [round(np.mean(row) * 100, 6) for row in sim_matrix]


# Quick check: per-sentence similarity scores for the first five `text_raw`
# entries with t == 1. (cal_sim_score() would instead return a single
# aggregate score for its input.)
cal_sim_score_single(grab_text(1)[:5])

[53.099751, 55.060023, 51.603186, 45.022029, 48.017597]

In [7]:
# Distinct values of the `t` column as a plain Python list, in ascending order.
ts = pd.unique(df["t"]).tolist()
ts.sort()
# Peek at the three smallest values.
ts[:3]

[0, 1, 2]

# Run

In [8]:
sim_dfs = []
save_dir = "../data/process/sim/"
for t in tqdm(ts, position=0, leave=True):
    pp(f"--- Processing t={t}...")
    # Filter once per `t` and read ids and texts from the same slice, so the
    # two lists are guaranteed to stay row-aligned.
    df_tmp = df.loc[df["t"] == t]
    if df_tmp.empty:
        # e.g. a NaN `t` never compares equal; there is nothing to score.
        continue
    ids = df_tmp["unique_id"].tolist()
    sentences = df_tmp["text_raw"].tolist()
    sim_scores = cal_sim_score_single(sentences)
    sim_df = pd.DataFrame(
        {
            "unique_id": ids,
            "t": [t] * len(sentences),
            "text": sentences,
            "similarity_score": sim_scores,
        }
    )
    sim_dfs.append(sim_df)

# ignore_index gives the combined frame a unique 0..n-1 index instead of
# repeating each per-`t` frame's own 0..k-1 labels.
sim_dfs_agg = pd.concat(sim_dfs, ignore_index=True)

# Create the output folder if needed and build the file path with os.path.join
# (plain concatenation produced ".../sim//similarity_scores.csv").
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, "similarity_scores.csv")
sim_dfs_agg.to_csv(save_path, index=False)

  0%|          | 0/53 [00:00<?, ?it/s]

  2%|▏         | 1/53 [00:08<07:20,  8.47s/it]

  4%|▍         | 2/53 [00:18<07:59,  9.40s/it]

  6%|▌         | 3/53 [00:28<08:03,  9.66s/it]

  8%|▊         | 4/53 [00:38<07:50,  9.61s/it]

  9%|▉         | 5/53 [00:48<08:02, 10.05s/it]

 11%|█▏        | 6/53 [00:59<08:06, 10.35s/it]

 13%|█▎        | 7/53 [01:09<07:50, 10.23s/it]

 15%|█▌        | 8/53 [01:19<07:37, 10.16s/it]

 17%|█▋        | 9/53 [01:31<07:47, 10.62s/it]

 19%|█▉        | 10/53 [01:42<07:42, 10.75s/it]

 21%|██        | 11/53 [01:53<07:30, 10.73s/it]

 23%|██▎       | 12/53 [02:03<07:09, 10.48s/it]

 25%|██▍       | 13/53 [02:12<06:44, 10.11s/it]

 26%|██▋       | 14/53 [02:21<06:26,  9.92s/it]

 28%|██▊       | 15/53 [02:31<06:12,  9.80s/it]

 30%|███       | 16/53 [02:40<06:00,  9.75s/it]

 32%|███▏      | 17/53 [02:50<05:50,  9.74s/it]

 34%|███▍      | 18/53 [03:00<05:44,  9.85s/it]

 36%|███▌      | 19/53 [03:10<05:34,  9.84s/it]

 38%|███▊      | 20/53 [03:20<05:23,  9.79s/it]

 40%|███▉      | 21/53 [03:29<05:10,  9.69s/it]

 42%|████▏     | 22/53 [03:38<04:51,  9.41s/it]

 43%|████▎     | 23/53 [03:48<04:46,  9.56s/it]

 45%|████▌     | 24/53 [03:58<04:43,  9.78s/it]

 47%|████▋     | 25/53 [04:08<04:37,  9.91s/it]

 49%|████▉     | 26/53 [04:18<04:27,  9.90s/it]

 51%|█████     | 27/53 [04:28<04:15,  9.81s/it]

 53%|█████▎    | 28/53 [04:39<04:12, 10.11s/it]

 55%|█████▍    | 29/53 [04:51<04:19, 10.83s/it]

 57%|█████▋    | 30/53 [05:05<04:26, 11.59s/it]

 58%|█████▊    | 31/53 [05:15<04:05, 11.18s/it]

 60%|██████    | 32/53 [05:25<03:46, 10.79s/it]

 62%|██████▏   | 33/53 [05:35<03:32, 10.61s/it]

 64%|██████▍   | 34/53 [05:45<03:17, 10.40s/it]

 66%|██████▌   | 35/53 [05:55<03:04, 10.26s/it]

 68%|██████▊   | 36/53 [06:04<02:50, 10.03s/it]

 70%|██████▉   | 37/53 [06:15<02:42, 10.14s/it]

 72%|███████▏  | 38/53 [06:25<02:33, 10.26s/it]

 74%|███████▎  | 39/53 [06:37<02:28, 10.61s/it]

 75%|███████▌  | 40/53 [06:47<02:19, 10.71s/it]

 77%|███████▋  | 41/53 [07:00<02:15, 11.28s/it]

 79%|███████▉  | 42/53 [07:11<02:03, 11.22s/it]

 81%|████████  | 43/53 [07:21<01:48, 10.90s/it]

 83%|████████▎ | 44/53 [07:32<01:36, 10.76s/it]

 85%|████████▍ | 45/53 [07:45<01:32, 11.60s/it]

 87%|████████▋ | 46/53 [07:56<01:19, 11.33s/it]

 89%|████████▊ | 47/53 [08:05<01:04, 10.78s/it]

 91%|█████████ | 48/53 [08:15<00:52, 10.51s/it]

 92%|█████████▏| 49/53 [08:26<00:42, 10.50s/it]

 94%|█████████▍| 50/53 [08:36<00:31, 10.44s/it]

 96%|█████████▌| 51/53 [08:46<00:20, 10.23s/it]

 98%|█████████▊| 52/53 [08:54<00:09,  9.59s/it]

100%|██████████| 53/53 [08:56<00:00, 10.13s/it]
