In [1]:
from google.cloud import storage
import pandas as pd
import json
import hashlib

In [2]:
def read_gcs_file(bucket_name, blob_name):
    """Download a Cloud Storage object and return its contents as text.

    Args:
        bucket_name: Name of the bucket that holds the object.
        blob_name: Name of the object inside the bucket.

    Returns:
        Whatever ``Blob.download_as_text()`` returns for that object.
    """
    # A fresh client is built on every call, as in the rest of this notebook.
    gcs = storage.Client()
    return gcs.bucket(bucket_name).blob(blob_name).download_as_text()

In [3]:
def check_blob_exists(bucket_name, blob_name):
    """Report whether an object is present in a Cloud Storage bucket.

    Args:
        bucket_name: Name of the bucket to look in.
        blob_name: Name of the object to look for.

    Returns:
        The result of ``Blob.exists()`` for that object.
    """
    # NOTE(review): this uses `get_bucket`, whereas `read_gcs_file` uses
    # `bucket`; presumably `get_bucket` contacts the service to look the
    # bucket up first -- confirm before unifying the two.
    bucket = storage.Client().get_bucket(bucket_name)
    return bucket.blob(blob_name).exists()

In [4]:
def _parse_jsonl(text):
    """Decode newline-delimited JSON text into a list of records, skipping blank lines."""
    # Split on '\n' only: str.splitlines() would also break on characters such
    # as U+2028, which may legally appear unescaped inside a JSON string.
    return [json.loads(line) for line in text.split('\n') if line.strip()]


def get_user_interactions(user_id, bucket_name="run-sources-long-term-interaction-us-east5"):
    """Load one user's interactions from Cloud Storage, minus redacted rows.

    Reads ``interactions_{user_id}.json`` (one JSON object per line) from
    ``bucket_name``. If ``redactions_{user_id}.json`` also exists, each of its
    ``message_idx`` values is converted to an interaction row label with
    ``int((message_idx - 1) / 2)`` and those rows are dropped, after which the
    index is renumbered from 0.

    Args:
        user_id: Identifier used to build both blob names and to fill the
            ``user_id`` column of the result.
        bucket_name: Bucket holding the per-user files. Defaults to the
            bucket this notebook was originally written against.

    Returns:
        A DataFrame with one row per remaining interaction, whose ``input``
        and ``output`` columns are whitespace-stripped, plus a ``user_id``
        column. When the interactions file contains no records, an empty
        DataFrame with the columns ``input``, ``output`` and ``user_id``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a redacted row label is not present in the interactions,
            or an expected column is missing.
    """
    interactions_text = read_gcs_file(bucket_name, f"interactions_{user_id}.json")
    records = _parse_jsonl(interactions_text)
    if not records:
        # Nothing to redact or clean; still expose the columns callers expect.
        return pd.DataFrame(columns=["input", "output", "user_id"])
    interactions = pd.DataFrame(records)

    redactions_file = f"redactions_{user_id}.json"
    if check_blob_exists(bucket_name, redactions_file):
        redaction_records = _parse_jsonl(read_gcs_file(bucket_name, redactions_file))
        # An existing but empty redactions file simply redacts nothing.
        if redaction_records:
            message_idx = pd.DataFrame(redaction_records)["message_idx"]
            # Presumably each interaction spans two messages, so a message
            # index maps onto an interaction row -- verify against the writer
            # of the redactions file. astype(int) truncates toward zero.
            redacted_rows = ((message_idx - 1) / 2).astype(int).to_list()
            interactions = interactions.drop(redacted_rows).reset_index(drop=True)

    interactions["input"] = interactions["input"].str.strip()
    interactions["output"] = interactions["output"].str.strip()
    interactions["user_id"] = user_id
    return interactions

In [5]:
def get_user_id(email):
    """Derive a short identifier from an email address.

    Args:
        email: The address to hash; it is UTF-8 encoded exactly as given,
            with no case or whitespace normalisation.

    Returns:
        The first 10 hexadecimal characters of the SHA-256 digest.
    """
    digest = hashlib.sha256()
    digest.update(email.encode("utf-8"))
    return digest.hexdigest()[:10]

In [6]:
# Load the participant table and derive each row's user id from its
# `google_email` value.
df = pd.read_csv("../data/participants.csv")
df["user_id"] = list(map(get_user_id, df["google_email"].tolist()))

In [7]:
# Fetch every participant's interactions, stack them into one frame, and
# write the result to CSV without the index column.
participant_ids = df["user_id"]
per_user_frames = []
for user_id in participant_ids:
    user_frame = get_user_interactions(user_id)
    per_user_frames.append(user_frame)

interactions = pd.concat(per_user_frames)
interactions.to_csv("../data/interactions.csv", index=False)