<a href="https://colab.research.google.com/github/s2t2/openai-embeddings-2023/blob/main/notebooks/2-embeddings/OpenAI_User_Embeddings_20230702.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook we load a previously-pulled sample of users and their tweets from a CSV file on Google Drive, request embeddings for them from OpenAI, and save the embeddings to a new CSV file on Google Drive.

## Setup

### Google Drive

In [1]:
import os
from google.colab import drive

# Mount Google Drive at /content/drive so the data files can be read and written.
drive.mount('/content/drive')
# Show the working directory and its contents to confirm the mount is visible.
print(os.getcwd(), os.listdir(os.getcwd())) #> 'content', ['.config', 'drive', 'sample_data']

Mounted at /content/drive
/content ['.config', 'drive', 'sample_data']


In [2]:
# you might need to create a google drive SHORTCUT that has this same path
# ... or update the path to use your own google drive organization
DATA_DIR = '/content/drive/MyDrive/Research/DS Research Shared 2023/data/impeachment_2020'
print(DATA_DIR)
# fail fast if the mounted drive does not expose this directory
assert os.path.isdir(DATA_DIR)

/content/drive/MyDrive/Research/DS Research Shared 2023/data/impeachment_2020


In [3]:
# using larger sample:
# (this is the input CSV that gets loaded in the "Users Sample" section)
users_sample_csv_filepath = os.path.join(DATA_DIR, "users_sample_by_account_type_20230702_3000_min_10_max_30_tweets.csv")
# fail fast if the sample file is not present at this location
assert os.path.isfile(users_sample_csv_filepath)

### OpenAI API Service

In [4]:
%%capture

!pip install openai

In [5]:
from getpass import getpass

# Prompt for the key interactively so it is never stored in the notebook source.
# Surrounding whitespace (e.g. picked up by copy/paste) is stripped.
OPENAI_API_KEY = getpass("Please provide your OpenAI API Key: ").strip()
assert OPENAI_API_KEY, "An OpenAI API key is required to request embeddings."
# Confirm receipt without echoing any part of the key: cell output is saved with
# the notebook, so printing even the last few characters leaks key material.
print("... key received")

Please provide your OpenAI API Key: ··········
... AysE



  + https://github.com/openai/openai-python
  + https://platform.openai.com/account/api-keys
  + https://platform.openai.com/docs/introduction/key-concepts
  + https://platform.openai.com/docs/models/overview
  + https://platform.openai.com/docs/guides/embeddings/what-are-embeddings
  + https://platform.openai.com/docs/guides/embeddings/embedding-models

> We recommend using `text-embedding-ada-002` for nearly all (Embedding) use cases. It's better, cheaper, and simpler to use. Read the blog post announcement.


In [6]:
import openai
from openai import Model, Embedding
from pandas import DataFrame

# configure the openai library with the key that was entered at the prompt
openai.api_key = OPENAI_API_KEY

MODEL_ID = "text-embedding-ada-002"

class OpenAIService():
    """Small wrapper around the OpenAI API for listing models and requesting embeddings.

    Params:
        model_id (str): identifier of the model used by `get_embeddings`.
            Defaults to the notebook-level MODEL_ID.
    """

    def __init__(self, model_id=MODEL_ID):
        self.model_id = model_id

    def get_models(self):
        """Fetches the models available from the API.

        Returns a DataFrame with one row per model, ordered by model id.
        """
        models = Model.list()
        #print(type(models)) #> openai.openai_object.OpenAIObject

        records = []
        for model in sorted(models.data, key=lambda m: m.id):
            model_info = model.to_dict()
            # drop the nested "permission" list (it does not fit a flat table);
            # tolerate records that do not include it
            model_info.pop("permission", None)
            records.append(model_info)

        return DataFrame(records)

    def get_embeddings(self, texts):
        """Pass in a list of strings. Returns a list of embeddings for each.

        The request is made with this instance's `model_id`, so a service
        constructed with a non-default model actually uses that model.
        """
        result = Embedding.create(input=texts, model=self.model_id)
        #print(len(result["data"]))
        return [d["embedding"] for d in result["data"]]


# service instance used for the embedding requests; constructed with the default MODEL_ID
ai = OpenAIService()

In [None]:
#models_df = ai.get_models()
#models_df.head()

In [7]:
# smoke test: request embeddings for a few short example texts
texts = [
    "I like apples, but bananas are gross.",
    "This is a tweet about bananas",
    "Drink apple juice!",
]
embeddings = ai.get_embeddings(texts) # API CALL
print(len(embeddings)) # number of embeddings returned
print(len(embeddings[0])) # length of the first embedding

3
1536


In [8]:
from pandas import Series

# display each example embedding next to the text it was requested for
Series(embeddings, index=texts)

I like apples, but bananas are gross.    [-0.00504452595487237, -0.01719384267926216, 0...
This is a tweet about bananas            [-0.014693296514451504, -0.019133443012833595,...
Drink apple juice!                       [0.004055480472743511, -0.01426590234041214, 0...
dtype: object

## Users Sample

Fetch a sample of users. Use the balanced sample we already prepared.



In [9]:
from pandas import read_csv

def remove_delimeters(txt):
    """Replaces each " || " delimiter in the text with a single space.

    Params:
        txt: the text to clean. Non-string values (e.g. NaN or None for a
            missing cell) are returned unchanged instead of raising an error.

    Returns the cleaned string, or the original value if it is not a string.
    """
    if not isinstance(txt, str):
        return txt
    return txt.replace(" || ", " ")

# Load the sample, discarding the unnamed index column if the export wrote one.
users_df = read_csv(users_sample_csv_filepath).drop(columns=["Unnamed: 0"], errors="ignore")
# Index rows by user id (the "user_id" column itself is kept as well).
users_df.index = users_df["user_id"]
# remove delimeters inserted during the data export process:
users_df["tweet_texts"] = [remove_delimeters(txt) for txt in users_df["tweet_texts"]]

print(len(users_df))
print(users_df.columns)
users_df.iloc[0]

3000
Index(['user_id', 'created_on', 'screen_name_count', 'screen_names',
       'status_count', 'rt_count', 'rt_pct', 'opinion_community', 'is_bot',
       'is_q', 'profile_descriptions', 'tweet_texts'],
      dtype='object')


user_id                                                824346638391136271
created_on                                                     2017-01-25
screen_name_count                                                       1
screen_names                                                 ROGUEBARKEEP
status_count                                                           25
rt_count                                                                5
rt_pct                                                                0.2
opinion_community                                                       0
is_bot                                                              False
is_q                                                                False
profile_descriptions    TRAINER OF CIRCUS SHRIMP - LOVER OF @PIZZAPUFF...
tweet_texts             @realDonaldTrump THIS IS WHY WE NEEDED IMPEACH...
Name: 824346638391136271, dtype: object

In [None]:
#users_df["profile_descriptions"].tolist()[0:5]

In [None]:
#users_df["tweet_texts"].tolist()[0:5]

## Embeddings

NOTE: Series is not JSON serializable, so we will request the embeddings for a list of strings instead.

We will also need to mind the rate limits, so we request the embeddings in batches.

In [16]:
def split_into_batches(my_list, batch_size=10_000):
    """Yields consecutive slices of a list, each holding at most `batch_size` items.

    Every batch has exactly `batch_size` items except the last one, which holds
    whatever remains. An empty list yields no batches.

    Params:
        my_list: the list (or other sliceable sequence) to split.
        batch_size (int): maximum number of items per batch; must be positive.

    Raises ValueError if `batch_size` is not positive. Because this is a
    generator, the error is raised when iteration starts.
    """
    # h/t: https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
    if batch_size < 1:
        # a negative step would otherwise silently produce no batches at all
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    for i in range(0, len(my_list), batch_size):
        yield my_list[i : i + batch_size]


### Tweet Embeddings

In [13]:
# Keep only users that have tweet text. Selecting rows and columns in a single
# .loc call and taking an explicit copy gives an independent frame, so adding a
# column to it later does not raise SettingWithCopyWarning.
tweets_df = users_df.loc[users_df["tweet_texts"].notnull(), ["user_id", "tweet_texts"]].copy()

In [14]:
# plain python list of the texts (the API is given a list rather than a Series),
# in the same order as the rows of tweets_df
tweet_texts = tweets_df["tweet_texts"].tolist()
print("TWEET TEXTS:", len(tweet_texts))
print(tweet_texts[0:5]) # preview the first five texts

TWEET TEXTS: 3000


In [33]:
#tweet_embeddings = ai.get_embeddings(tweet_texts) # API CALL
#print("TWEET EMBEDDINGS:", len(tweet_embeddings))

# RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-...
# on tokens per min. Limit: 1_000_000 / min.
# Current: 1 / min.
# Contact us through our help center at help.openai.com if you continue to have issues.

from time import sleep

# we were able to process around 300 users previously, so let's use size around there
# 250 per batch makes 12 batches, with a one minute pause between consecutive batches
tweet_batches = list(split_into_batches(tweet_texts, batch_size=250))
tweet_embeddings = []
for batch_number, tweets_batch in enumerate(tweet_batches, start=1):
    print(len(tweets_batch))
    embeds_batch = ai.get_embeddings(tweets_batch) # API CALL
    tweet_embeddings += embeds_batch
    if batch_number < len(tweet_batches):
        sleep(60) # wait for a minute before requesting the next batch (no wait after the last one)

# every text must have received exactly one embedding before they are paired up by position
assert len(tweet_embeddings) == len(tweet_texts)

250
250
250
250
250
250
250
250
250
250
250
250


In [38]:
# The embeddings were requested in tweet_texts order, which is the row order of
# tweets_df, so the list is assigned positionally.
# NOTE(review): assumes the API returns embeddings in the same order as the inputs — confirm
tweets_df["embeddings"] = tweet_embeddings

### Profile Embeddings

FYI - Not all users have profile texts.


We will need to remove the NaN values before processing, and then outer join the profile embeddings back together with the tweet embeddings at the end.

In [None]:
#users_df["profile_descriptions"].isna()
#users_df["profile_descriptions"].notnull()

In [39]:
# Keep only users that have a profile description. Selecting rows and columns in
# a single .loc call and taking an explicit copy gives an independent frame, so
# adding a column to it later does not raise SettingWithCopyWarning.
profiles_df = users_df.loc[users_df["profile_descriptions"].notnull(), ["user_id", "profile_descriptions"]].copy()
#profiles_df.head()

In [40]:
# plain python list of the profile texts (the API is given a list rather than a Series),
# in the same order as the rows of profiles_df
profile_texts = profiles_df["profile_descriptions"].tolist()
print("PROFILE TEXTS:", len(profile_texts))
print(profile_texts[0:5]) # preview the first five texts

PROFILE TEXTS: 2379
['TRAINER OF CIRCUS SHRIMP - LOVER OF @PIZZAPUFFS | JUST HERE FOR THE GOP THOUGHTS AND PRAYERS!', 'RETIRED PHYSICIAN, COMMITTED LIFELONG PROGRESSIVE, THINKS OUTSIDE THE BOX. DAD TO GIRLS, POPPY, THE BIRDMAN. MOVED NORTH FROM FLORIDA.', '"WHO THE HELL CAN SEE FOREVER?"', 'JUST A GUY. BASEBALL AND VIDEO GAMES ARE MY THINGS. #FEARINOCULUM IS CURRENTLY ON REPEAT IN MY HOUSEHOLD.', 'JUST LIB']


In [41]:
#profile_embeddings = ai.get_embeddings(profile_texts) # API CALL
#print("PROFILE EMBEDDINGS:", len(profile_embeddings))
#print(len(profile_embeddings[0]))
#print(profile_embeddings[0])

# request the profile embeddings in batches of 250, pausing between batches
# to stay under the tokens-per-minute rate limit
profile_batches = list(split_into_batches(profile_texts, batch_size=250))
profile_embeddings = []
for batch_number, profiles_batch in enumerate(profile_batches, start=1):
    print(len(profiles_batch))
    embeds_batch = ai.get_embeddings(profiles_batch) # API CALL
    profile_embeddings += embeds_batch
    if batch_number < len(profile_batches):
        sleep(60) # wait for a minute before requesting the next batch (no wait after the last one)

# every text must have received exactly one embedding before they are paired up by position
assert len(profile_embeddings) == len(profile_texts)

250
250
250
250
250
250
250
250
250
129


In [42]:
# The embeddings were requested in profile_texts order, which is the row order of
# profiles_df, so the list is assigned positionally.
# NOTE(review): assumes the API returns embeddings in the same order as the inputs — confirm
profiles_df["embeddings"] = profile_embeddings

In [None]:
#profiles_df["embeddings"].iloc[0]

## Save Embeddings

In [None]:
# Join the embeddings back onto the full users table by index (user id).
# how="left" keeps every user: with the default inner join, users that have no
# profile description (and so no profile embedding) would be dropped from the
# saved file. Their profile_embeddings value is NaN instead.
embeds_df = users_df.merge(
    profiles_df["embeddings"].rename("profile_embeddings"),
    how="left", left_index=True, right_index=True,
)

# Same for the tweet embeddings: users without tweet text keep their row.
embeds_df = embeds_df.merge(
    tweets_df["embeddings"].rename("tweet_embeddings"),
    how="left", left_index=True, right_index=True,
)

#embeds_df.head()

In [45]:
# the embeddings are saved in a sub-directory of DATA_DIR named after MODEL_ID
model_dirpath = os.path.join(DATA_DIR, MODEL_ID)
os.makedirs(model_dirpath, exist_ok=True) # no error if the directory already exists

embeddings_csv_filepath = os.path.join(model_dirpath, "users_sample_openai_embeddings_20230702_3000_min_10_max_30.csv")
print(embeddings_csv_filepath)
# to_csv writes the index as well by default, so the index (user id) is the first column of the file
embeds_df.to_csv(embeddings_csv_filepath)

/content/drive/MyDrive/Research/DS Research Shared 2023/data/impeachment_2020/text-embedding-ada-002/users_sample_openai_embeddings_20230702_3000_min_10_max_30.csv
