In [None]:
import os

import numpy as np
import pandas as pd

In [None]:
# Locations of the pickled train / test frames.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files
# that come from a trusted source.
train_path, test_path = 'final_data/train_df.pkl', 'final_data/test_df.pkl'

train_df = pd.read_pickle(train_path)
test_df = pd.read_pickle(test_path)

# Display the (rows, columns) shape of both frames as the cell output.
train_df.shape, test_df.shape

In [None]:
# Read the key from the GOOGLE_API_KEY environment variable so the secret is
# never stored in the saved notebook. The original placeholder is kept as the
# fallback when the variable is unset or empty, so replacing it by hand still works.
API_KEY = os.environ.get('GOOGLE_API_KEY') or 'INSERT_API_KEY'

In [None]:
import random

# Collapse the training tweets to one row per (MatchID, PeriodID, ID) group:
#   EventType   -> first EventType value seen in the group
#   Tweets      -> all tweets of the group joined with single spaces
#   Tweet_Count -> number of non-null tweets in the group
grouped_train = (
    train_df
    .groupby(["MatchID", "PeriodID", "ID"])
    .agg(
        EventType=pd.NamedAgg(column="EventType", aggfunc="first"),
        Tweets=pd.NamedAgg(column="Tweet", aggfunc=" ".join),
        Tweet_Count=pd.NamedAgg(column="Tweet", aggfunc="count"),
    )
    .reset_index()
)



In [None]:
# Collapse the test tweets to one row per (MatchID, PeriodID, ID) group.
# No EventType column is aggregated here (presumably the test set is
# unlabeled -- TODO confirm):
#   Tweets      -> all tweets of the group joined with single spaces
#   Tweet_Count -> number of non-null tweets in the group
grouped_test = (
    test_df
    .groupby(["MatchID", "PeriodID", "ID"])
    .agg(
        Tweets=pd.NamedAgg(column="Tweet", aggfunc=" ".join),
        Tweet_Count=pd.NamedAgg(column="Tweet", aggfunc="count"),
    )
    .reset_index()
)


In [None]:
from sklearn.model_selection import train_test_split

# Split at match level: every row of a given MatchID ends up on the same side,
# so no match is shared between train and validation.
unique_match_ids = grouped_train["MatchID"].unique()
train_ids, valid_ids = train_test_split(unique_match_ids, random_state=666, test_size=0.1)

# NOTE(review): train_df / test_df are rebound here to the *grouped* frames, so
# the grouping cells above cannot be re-run without reloading the raw data.
train_df = grouped_train.loc[grouped_train["MatchID"].isin(train_ids)].reset_index(drop=True)
val_df = grouped_train.loc[grouped_train["MatchID"].isin(valid_ids)].reset_index(drop=True)
test_df = grouped_test

# Cell output: shapes of the three splits.
train_df.shape, val_df.shape, test_df.shape

In [None]:
import google.generativeai as genai

# Hand the API key to the google.generativeai client; the genai calls in the
# cells below are made after this configuration step.
genai.configure(api_key=API_KEY)

In [None]:
# Print the name of every available model that supports 'embedContent'.
for model_info in genai.list_models():
  if 'embedContent' not in model_info.supported_generation_methods:
    continue
  print(model_info.name)

In [None]:

from tqdm.auto import tqdm
# Register tqdm with pandas; create_embeddings below relies on the resulting
# Series.progress_apply method.
tqdm.pandas()

# Retry decorator used to wrap the embedding API call.
from google.api_core import retry

def make_embed_text_fn(model, task_type="classification"):
  """Build a function that embeds a single text with the given model.

  Args:
    model: Model name forwarded to ``genai.embed_content``
      (e.g. ``'models/embedding-001'``).
    task_type: Task type forwarded to ``genai.embed_content``. Defaults to
      ``"classification"``, the value that used to be hard-coded, so existing
      one-argument callers behave exactly as before.

  Returns:
    A callable ``embed_fn(text)`` that returns the ``'embedding'`` entry of the
    ``genai.embed_content`` response. The callable is wrapped in
    ``retry.Retry(timeout=300.0)``.
  """

  # Which errors are retried is decided by Retry's default predicate.
  @retry.Retry(timeout=300.0)
  def embed_fn(text: str) -> list[float]:
    """Embed ``text`` and return its embedding vector."""
    response = genai.embed_content(model=model,
                                   content=text,
                                   task_type=task_type)
    return response['embedding']

  return embed_fn

def create_embeddings(model, df):
  """Add an 'Embeddings' column to ``df`` computed from its 'Tweets' column.

  Each value of ``df['Tweets']`` is passed through the embedding function built
  by ``make_embed_text_fn(model)``, with a progress bar via ``progress_apply``.

  Note: ``df`` is modified in place; the returned object is the same frame.
  """
  embed_text = make_embed_text_fn(model)
  embedded = df['Tweets'].progress_apply(embed_text)
  df['Embeddings'] = embedded
  return df

In [None]:
# Embedding model used for all three splits.
model = 'models/embedding-001'

# create_embeddings adds the 'Embeddings' column in place and returns the same
# frame, so each df_* name refers to the same object as its *_df input.
df_train = create_embeddings(model=model, df=train_df)
df_val = create_embeddings(model=model, df=val_df)
df_test = create_embeddings(model=model, df=test_df)

In [None]:
# DataFrame.to_csv does not create missing parent directories. Make sure the
# output folder exists first, otherwise the embeddings computed above would be
# lost to an error at the very last step.
os.makedirs('gemini_embeds', exist_ok=True)

# NOTE(review): CSV is a text format, so the per-row embedding lists are
# presumably written as their string representation and must be parsed back
# into lists when these files are reloaded -- confirm in the consumer.
df_train.to_csv('gemini_embeds/train_embeds.csv', index=False)
df_val.to_csv('gemini_embeds/val_embeds.csv', index=False)
df_test.to_csv('gemini_embeds/test_embeds.csv', index=False)