In [23]:
from datasets import load_dataset
import pandas as pd
import os
from dotenv import load_dotenv
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import pandas
import openai
from openai import APIError
import os
import json
import re
import numpy as np
from sklearn.cluster import KMeans
from pprint import pprint
from pathlib import Path
import tiktoken
from sentence_transformers import SentenceTransformer


# Load the .env file so the variables it defines become visible through os.getenv
load_dotenv()

# Read the OpenAI key from the environment; os.getenv returns None when the variable is unset
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
openai.api_key = OPENAI_API_KEY
# Display options: keep long sentences fully visible when frames are rendered
pd.set_option('display.max_colwidth', None)  # None means no limit on column width
pd.set_option('display.width', 300)  # allow up to 300 characters per displayed line

In [4]:
# Read the STS sentence-pair data from the local Parquet file into a DataFrame
df = pd.read_parquet('datasets/STS.parquet')

In [5]:
# Preview the loaded frame (the rendered table elides the middle rows)
df

Unnamed: 0,split,genre,dataset,year,sid,score,sentence1,sentence2
0,test,main-captions,MSRvid,2012test,0024,2.5,A girl is styling her hair.,Devojčica češlja kosu.
1,test,main-captions,MSRvid,2012test,0033,3.6,A group of men play soccer on the beach.,Grupa dečaka igra fudbal na plaži.
2,test,main-captions,MSRvid,2012test,0045,5.0,One woman is measuring another woman's ankle.,Jedna žena meri gležanj druge žene.
3,test,main-captions,MSRvid,2012test,0063,4.2,A man is cutting up a cucumber.,Čovek secka krastavac.
4,test,main-captions,MSRvid,2012test,0066,1.5,A man is playing a harp.,Čovek svira klavijaturu.
...,...,...,...,...,...,...,...,...
1374,test,main-news,headlines,2016,1354,0.0,"Philippines, Canada pledge to further boost relations",Philippines saves 100 after ferry sinks
1375,test,main-news,headlines,2016,1360,1.0,Israel bars Palestinians from Jerusalem's Old City,"Two-state solution between Palestinians, Israel pie in sky"
1376,test,main-news,headlines,2016,1368,1.0,How much do you know about Secret Service?,Lawmakers from both sides express outrage at Secret Service
1377,test,main-news,headlines,2016,1420,0.0,Obama Struggles to Soothe Saudi Fears As Iran Talks Resume,Myanmar Struggles to Finalize Voter Lists for Sunday Polls


In [6]:
# (row count, column count) of the loaded frame
df.shape

(1379, 8)

In [7]:
from datasets import Dataset
def get_sts_dataset(path="datasets/STS.parquet"):
    """Load the STS sentence pairs and return them as a Hugging Face ``Dataset``.

    The ``score`` column is divided by 5.0 before being stored. The sample
    rows displayed in this notebook have scores between 0.0 and 5.0, so this
    presumably maps the scores into [0, 1] -- confirm against the full data.

    Args:
        path: Location of the Parquet file to read. Defaults to the file this
            notebook uses everywhere else.

    Returns:
        A ``Dataset`` with the columns ``sentence1``, ``sentence2`` and
        ``score``, in the same row order as the Parquet file.
    """
    frame = pd.read_parquet(path)
    # Extract whole columns at once rather than appending row by row via
    # ``iterrows``, which builds a Series per row just to read three fields.
    return Dataset.from_dict(
        {
            "sentence1": frame["sentence1"].tolist(),
            "sentence2": frame["sentence2"].tolist(),
            "score": (frame["score"] / 5.0).tolist(),
        }
    )

In [8]:
# Build the Hugging Face Dataset of sentence pairs with score / 5.0
sts = get_sts_dataset()

In [9]:
# Show the Dataset summary (feature names and row count)
sts

Dataset({
    features: ['sentence1', 'sentence2', 'score'],
    num_rows: 1379
})

In [11]:
# Render the Dataset as a DataFrame to eyeball the sentence pairs and divided scores
pd.DataFrame(sts)

Unnamed: 0,sentence1,sentence2,score
0,A girl is styling her hair.,Devojčica češlja kosu.,0.50
1,A group of men play soccer on the beach.,Grupa dečaka igra fudbal na plaži.,0.72
2,One woman is measuring another woman's ankle.,Jedna žena meri gležanj druge žene.,1.00
3,A man is cutting up a cucumber.,Čovek secka krastavac.,0.84
4,A man is playing a harp.,Čovek svira klavijaturu.,0.30
...,...,...,...
1374,"Philippines, Canada pledge to further boost relations",Philippines saves 100 after ferry sinks,0.00
1375,Israel bars Palestinians from Jerusalem's Old City,"Two-state solution between Palestinians, Israel pie in sky",0.20
1376,How much do you know about Secret Service?,Lawmakers from both sides express outrage at Secret Service,0.20
1377,Obama Struggles to Soothe Saudi Fears As Iran Talks Resume,Myanmar Struggles to Finalize Voter Lists for Sunday Polls,0.00


In [26]:
# Model name handed to SentenceTransformer; prepare_dataset uses this model's
# encodings of the English sentences as the "label" column.
teacher_model_name = "paraphrase-distilroberta-base-v2"
teacher_model = SentenceTransformer(teacher_model_name)
# Passed as batch_size to teacher_model.encode inside prepare_dataset
inference_batch_size = 64 

In [14]:
def convert_to_hf_dataset(dataframe: pandas.DataFrame) -> Dataset:
    """Build a two-column Hugging Face ``Dataset`` from a sentence-pair frame.

    Args:
        dataframe: Frame holding ``sentence1`` and ``sentence2`` columns; any
            other columns are ignored.

    Returns:
        A ``Dataset`` whose ``english`` column holds the ``sentence1`` values
        and whose ``non_english`` column holds the ``sentence2`` values, in
        the frame's row order.

    Raises:
        KeyError: If ``sentence1`` or ``sentence2`` is missing from the frame
            (this now also applies to an empty frame, which the previous
            row loop silently accepted).
    """
    # Extract each column in one step instead of appending row by row via
    # ``iterrows``; the values and their order are the same.
    return Dataset.from_dict(
        {
            "english": dataframe["sentence1"].tolist(),
            "non_english": dataframe["sentence2"].tolist(),
        }
    )

In [17]:
def load_pandas_df(file: Path) -> pandas.DataFrame:
    """Read the Parquet file at ``file`` with pyarrow and return it as a pandas DataFrame."""
    return pq.read_table(file).to_pandas()

In [25]:
from typing import Tuple
from sklearn.model_selection import train_test_split


def get_train_and_eval_datasets(
    dataset_name: Path, test_size: float = 0.2, random_state: int = 42
) -> Tuple[Dataset, Dataset]:
    """Load a sentence-pair Parquet file and split it into train/eval Datasets.

    Args:
        dataset_name: Path of the Parquet file to load (forwarded to
            ``load_pandas_df``; this notebook passes a plain string).
        test_size: Forwarded to ``train_test_split`` as the evaluation share.
            Defaults to 0.2, the value previously hard-coded here.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible. Defaults to 42, the value previously hard-coded.

    Returns:
        ``(train_dataset, eval_dataset)``, each produced by
        ``convert_to_hf_dataset`` from its share of the rows.
    """
    frame = load_pandas_df(file=dataset_name)
    train_df, eval_df = train_test_split(
        frame, test_size=test_size, random_state=random_state
    )
    # Convert each DataFrame split into a Hugging Face Dataset
    return convert_to_hf_dataset(train_df), convert_to_hf_dataset(eval_df)

In [27]:
def prepare_dataset(batch):
    """Attach teacher-model encodings of the English sentences to a batch.

    The ``english`` and ``non_english`` entries of ``batch`` are passed through
    unchanged; ``label`` holds what ``teacher_model.encode`` returns for the
    English sentences, using ``inference_batch_size`` and no progress bar.
    """
    english_sentences = batch["english"]
    teacher_embeddings = teacher_model.encode(
        english_sentences,
        batch_size=inference_batch_size,
        show_progress_bar=False,
    )
    return dict(
        english=english_sentences,
        non_english=batch["non_english"],
        label=teacher_embeddings,
    )


# Split the STS pairs into train/eval Datasets with `english` / `non_english` columns
train_dataset, eval_dataset = get_train_and_eval_datasets(dataset_name="datasets/STS.parquet")
# Captured before mapping: these original columns are dropped by remove_columns
# and replaced by whatever prepare_dataset returns.
column_names = train_dataset.column_names

# NOTE(review): despite the name, the result is a Dataset (see its repr in the
# next cell), not a dict. batch_size=30000 exceeds both split sizes shown in the
# progress output (1103 and 276 rows), so each split is handled in a single batch.
train_dataset_dict = train_dataset.map(
    prepare_dataset, batched=True, batch_size=30000, remove_columns=column_names
)
# eval_dataset is rebound to its mapped version; the column names captured from
# the train split are reused here.
eval_dataset = eval_dataset.map(
    prepare_dataset, batched=True, batch_size=30000, remove_columns=column_names
)

Map:   0%|          | 0/1103 [00:00<?, ? examples/s]

Map:   0%|          | 0/276 [00:00<?, ? examples/s]

In [28]:
# Show the mapped training Dataset summary (features and row count)
train_dataset_dict

Dataset({
    features: ['english', 'non_english', 'label'],
    num_rows: 1103
})

In [32]:
# Sanity check: report any training row whose teacher label is missing.
df = pd.DataFrame(train_dataset_dict)
# Use an identity test against None. The earlier `type(i) == None` compared a
# type object with None, which is never true, so a missing label could never
# have been reported.
for row_index, label in df['label'].items():
    if label is None:
        print(row_index, label)

In [29]:
# Show the mapped evaluation Dataset summary (features and row count)
eval_dataset

Dataset({
    features: ['english', 'non_english', 'label'],
    num_rows: 276
})