In [2]:
import pandas as pd
import numpy as np
import openai
import json

# # our groupings
# seed_hashtags = pd.read_csv("community-grouping_20240809.csv")
# # all hashtags double hit
# dat = pd.read_csv('double_hits_edges_no_dupes.csv')

# Load credentials from the local secrets file instead of hardcoding them
with open('../data/secrets.json') as secrets_file:
    secrets = json.load(secrets_file)

# Build the OpenAI client that the cells below pass around
api_key = secrets['OPENAI_API_KEY_SR']
client = openai.Client(api_key=api_key)

In [1]:
# check rate limits
def check_rate_limits(api_key):
    """Send one small chat completion and print the rate-limit headers of the reply.

    Args:
        api_key: OpenAI API key used to authenticate the probe request.

    Returns:
        None. The rate-limit headers and the model's reply are printed; any
        exception raised along the way is caught and printed instead.
    """
    # Build the client from the key that was passed in. The previous version
    # assigned `openai.api_key` and then called the notebook-level `client`,
    # which was constructed with its own key, so the argument never reached
    # the request.
    probe_client = openai.Client(api_key=api_key)

    try:
        # Make a chat completion API call with raw response so the HTTP
        # headers are accessible alongside the parsed body
        api_response = probe_client.chat.completions.with_raw_response.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": "Hello, this is a test message to check rate limits.",
                }
            ],
        )

        # Access the headers
        headers = api_response.headers

        # Print rate limit information
        print("Rate Limit Information:")
        print(f"Remaining Requests: {headers.get('x-ratelimit-remaining-requests')}")
        print(f"Remaining Tokens: {headers.get('x-ratelimit-remaining-tokens')}")
        print(f"Reset Tokens: {headers.get('x-ratelimit-reset-tokens')}")
        print(f"Reset Requests: {headers.get('x-ratelimit-reset-requests')}")

        # Print the response content
        response = api_response.parse()
        print(f"\nResponse content: {response.choices[0].message.content}")

    except Exception as e:
        # Best-effort diagnostic: report the failure instead of stopping the notebook
        print(f"An error occurred: {str(e)}")

In [20]:
# Probe the API once with the loaded key and print the rate-limit headers
check_rate_limits(api_key)

Rate Limit Information:
Remaining Requests: 9999
Remaining Tokens: 1999970
Reset Tokens: 0s
Reset Requests: 6ms

Response content: Hello! It seems like your message was received clearly. Is there anything specific you need help with today? If you're testing rate limits, feel free to send additional messages, and I will do my best to respond promptly.


---
## Inter-rater reliability (IRR) with human annotation

In [4]:
def convert_to_tags(input_string):
    """Flatten a whitespace-delimited block of text into a list of tags.

    The text is trimmed and cut into lines; every whitespace-separated token
    on each line becomes one tag, kept in reading order. Repeated tokens are
    not removed.

    Returns:
        A ``(tags, tag_count)`` tuple: the list of tokens and its length.
    """
    tags = [
        token
        for row in input_string.strip().split('\n')
        for token in row.split()
    ]
    return tags, len(tags)

# Example input
input_string = """funny	love	depressionandanxiety	alcohol	weed	cognitiveenhancers	opioids	smoking	harmreductionsaveslives	vitamins	fyp	stoned	vaping	kensingtonphilly	medstudent	lgbtqia
addictionhumor	hope	addiction	beers	pot	modafinil	opiates	drinking	soberlife	creatine	greenscreen	high	nicotine	vancouverbc	nurselife	transgender
meme	struggle	opioidaddiction	alchohol	cannabis	nootropics	heroin	drink	druglaws	magnesium	tiktok	wasted	ecig	bayarea	exprostitute	lgbtqtiktok
	kindness	substanceabuse	vodka	greens	nootrostatic	fent	shmokesumthin	drugpolicy	magnesiumglycinate	viral	blackedout	ecigs	toronto	medicalstudent	queergirl
	lovemyself	substanceusedisorder	whiskey	stonersoftiktok	smartdrugs	fentfriday	shmoke	pwud	vitamink	trending	highasfuck	juul	philly	healthcareworkers	queertiktok
	happytobealive	mentalhealth	cocktails	stonervibes	smartpills	xans	shmoketok	pwuds	tylenol	capcut	haf	cigarettes	midwest	socialworkersoftiktok	feralqueer
	trauma	chronicpain	wine	stoner	cerebrolysin	xanny	keepsmokingtheherb	opioidawareness	disprin	duet	shitfaced	vaper	boston	socialworker	whitegirl
	loveyall	chronicpainsufferers	liquor	laganjaestranja		xantok	drinkingtools	opioidcrisis	ayurveda	stitch	tipsy	cigarette	sanfrancisco	frontlineworkers	transtok
	vibes	headache	tequila	420vibes		xandemic	injection	overdoseawareness	tylenoltuesday	fyp„Ç∑„Çöviral	drunk	vape	usa	matnurse	indian
	happy	migraine	booze	dabs		m0lly	injectinguser	overdosecrisis	ibuprofen	fyp„Ç∑	hammered	cig	uk	streetnurse	gaygirl
	gratitude	addicted	chugbeer	wax		molly	inhaler	harmreduction	suplements	foryou	blackout	cigs	canada	nursetiktok	
	fear	bpd	beerfunnel	shmeeds		percs	inhaling	harmreductionworks	coughsyrup	foryoupage	hightimes	tobacco		therapist	
	makesomeonesmile	bpdtiktok	drinkinggames			percoset	keepsmoking	harmreductiontips	overthecounter	foryourpage	gurning	nicotinefein			
		addictionisreal	wildmandrinking			perc30s	inhalation	narcan	ibuprofeno	xyzbca	gurner	nicotinegum			
		ptsd				lean		narcansaveslives	collagen	trend	drunkaf	nicorettepatch			
		depression				leantok		naloxone							
		anxiety				acid		naloxonesaveslives							
		chronicpainwarrior				mescaline		safesupply							
		allergy				pinger		endoverdose							
		infection				pingertok		overdoseresponse							
		allergies				pingtok		stopsmoking							
		seizures				pingerzzzz		smokingkills	addictionhumor
drughumor
recoveryhumor						
		epilespy				emma		quitnicotine							
		insomnia				drank		stopvapping							
		fever				thatdrank		quitvapping							
		cough				bars		sobriety							
		adtiction				30s		sober							
		asthma				40s		soberliving							
		sorethroat				512s		cleanandsober							
		flu				yellows		recovery							
		fluseason				biak		onedayatatime							
		bronchitis				epills		odaat							
		pain				psychedelictok		na							
		inhalantsaddiction				lsdart		detox							
						magicmushroomsadventures		sobrietybirthday							
						libcaps		soberhouse							
						libertycapss		wedorecover							
						thom		relapsehappens							
						nosebeers		methadoneclinic							
						c0deine		naltrexone							
						inhalants		drugrehab							
						whippets		dropthosedates							
						whipit		matsaves							
						balloons		stopthestigma							
						adderral		alcoholfreejourney							
						addy		alcoholic							
								alcoholism																								"""

# Flatten the pasted sheet into one tag list, then show the list and its size
all_human_tags, count = convert_to_tags(input_string)
print("Converted tags:", all_human_tags, sep="\n")
print(f"\nTotal number of tags: {count}")

Converted tags:
['funny', 'love', 'depressionandanxiety', 'alcohol', 'weed', 'cognitiveenhancers', 'opioids', 'smoking', 'harmreductionsaveslives', 'vitamins', 'fyp', 'stoned', 'vaping', 'kensingtonphilly', 'medstudent', 'lgbtqia', 'addictionhumor', 'hope', 'addiction', 'beers', 'pot', 'modafinil', 'opiates', 'drinking', 'soberlife', 'creatine', 'greenscreen', 'high', 'nicotine', 'vancouverbc', 'nurselife', 'transgender', 'meme', 'struggle', 'opioidaddiction', 'alchohol', 'cannabis', 'nootropics', 'heroin', 'drink', 'druglaws', 'magnesium', 'tiktok', 'wasted', 'ecig', 'bayarea', 'exprostitute', 'lgbtqtiktok', 'kindness', 'substanceabuse', 'vodka', 'greens', 'nootrostatic', 'fent', 'shmokesumthin', 'drugpolicy', 'magnesiumglycinate', 'viral', 'blackedout', 'ecigs', 'toronto', 'medicalstudent', 'queergirl', 'lovemyself', 'substanceusedisorder', 'whiskey', 'stonersoftiktok', 'smartdrugs', 'fentfriday', 'shmoke', 'pwud', 'vitamink', 'trending', 'highasfuck', 'juul', 'philly', 'healthcarewo

In [5]:
import random
import ast

# Read the CSV file
df = pd.read_csv('../data/double_hits_filtered_upper75_nodes.csv')

# Extract the 'node' column (missing values dropped) and get the count
node_list = df['node'].dropna().tolist()
node_count = len(node_list)
print(f"Total count of nodes: {node_count}")

# Remove occurrences of all_human_tags from node_list. Membership is tested
# against a set so each lookup is constant-time instead of a scan of the
# whole tag list for every node; the surviving nodes and their order are
# unchanged.
human_tag_set = set(all_human_tags)
node_list = [node for node in node_list if node not in human_tag_set]

# Get the count of remaining nodes
node_count = len(node_list)
print(f"Total count of nodes after removal: {node_count}")

# Use all remaining nodes instead of random sampling
print(f"Remaining nodes: {node_list}")

Total count of nodes: 14569
Total count of nodes after removal: 14323


In [4]:
# test new function
# from get_themes import get_theme
# import random
# hashtag_sample = random.sample(node_list, 100)
# hashtag_sample_str = ", ".join(hashtag_sample)

In [6]:
# hashtag_sample_df = pd.DataFrame(hashtag_sample)
# hashtag_sample_df.to_csv("../data/sample500_batch_4o.csv")

In [14]:
# Get themes for the sampled hashtags
# gpt4o : gpt-4o-2024-05-13
# gpt4o-mini: gpt-4o-mini-2024-07-18
# themes = get_theme(hashtag_sample_str, model= "gpt-4o-2024-05-13", client= client)

In [15]:
# print(themes)

{
            "emotions and feelings": ["sorrytomyself", "lifeafterloss", "friendsforever", "prayersneeded", "spiritualawakening", "mydad", "happythursday", "teenagelife", "dailymotivation", "bestfriend", "again", "lossofaparent", "overcomingobstacles"],
            "health conditions": ["hashimotos", "hepatitisc", "addicitonisreal", "menshealth"],
            "alcohol": ["🍻", "smashed"],
            "cannabis": ["zaza"],
            "cognitive enhancement": [],
            "commonly-misused substances": ["druggin"],
            "consumption method": ["nicotinepouche"],
            "awareness and advocacy": ["opiodcrisisawareness", "preventabledeath", "addictionrecoverycoach", "decriminalizenature", "childdeath", "drugcounselor"],
            "other substances": ["birthcontrol", "medicinal", "organic"],
            "platform": ["messytiktok", "duet", "trendingsounds", "tiktokusa"],
            "substance effects": ["euphoriaseason2"],
            "tobacco_nicotine": ["nicotinepouche", 

In [None]:
# def process_themes(themes_string):
#     try:
#         themes_dict = json.loads(themes_string)
#     except json.JSONDecodeError as e:
#         print(f"Error decoding JSON: {e}")
#         return pd.DataFrame(columns=['Category', 'Hashtag'])

#     # Step 3: Process the dictionary to create a pandas DataFrame
#     data = []
#     for category, hashtags in themes_dict.items():
#         for hashtag in hashtags:
#             data.append({'Category': category, 'Hashtag': hashtag})
    
#     df = pd.DataFrame(data)
#     return df


In [None]:
# themes_df = process_themes(themes)
# themes_df.to_csv("../data/themes100_gpt4o.csv")

---
## Experiment with parallelization

In [16]:
from pararallelize import process_posts_in_parallel
from get_themes import get_theme
import random

def chunk_list_to_df(input_list, chunk_size):
    """Split a list into consecutive chunks and return them as a DataFrame.

    Every chunk holds exactly ``chunk_size`` items except the last one, which
    holds whatever remains.

    Args:
        input_list: The list to be chunked.
        chunk_size: Maximum number of items per chunk; must be positive.

    Returns:
        A DataFrame with one row per chunk and two columns: ``hashtags`` (the
        chunk as a list) and ``hashtags_str`` (its items joined with ", ").

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # A negative step would make range() yield nothing and silently return an
    # empty frame; fail loudly instead (zero already raised ValueError).
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    chunks = [
        input_list[start:start + chunk_size]
        for start in range(0, len(input_list), chunk_size)
    ]

    df = pd.DataFrame({'hashtags': chunks})
    # str() each item so a non-string value in the list cannot break the join
    df["hashtags_str"] = df["hashtags"].apply(lambda chunk: ', '.join(map(str, chunk)))

    return df

# Draw 100 hashtags without replacement and split them into two chunks of 50.
# NOTE(review): no random seed is set in the visible cells, so this sample
# changes on every run — confirm that is intended.
hashtag_sample = random.sample(node_list, 100)
hashtag_chunks_df = chunk_list_to_df(hashtag_sample, 50)


In [17]:
# Run get_theme over every chunk string through the parallel helper (max_workers=2)
hashtag_chunks_df["themes"] = process_posts_in_parallel(
    hashtag_chunks_df["hashtags_str"].to_list(),
    max_workers=2,
    model="gpt-4o-2024-05-13",
    task=get_theme,
    client=client,
)

In [18]:
# Export the chunks together with their themes (the index is written too,
# since to_csv is called with its defaults)
hashtag_chunks_df.to_csv("../data/themes_100_parallel.csv")

In [19]:
# Experiment with 500 hashtags, split into ten chunks of 50
hashtag_sample2 = random.sample(node_list, 500)
hashtag_chunks_df2 = chunk_list_to_df(hashtag_sample2, 50)

In [21]:
# Same parallel run on the 500-hashtag sample (max_workers=10), then export
hashtag_chunks_df2["themes"] = process_posts_in_parallel(
    hashtag_chunks_df2["hashtags_str"].to_list(),
    max_workers=10,
    model="gpt-4o-2024-05-13",
    task=get_theme,
    client=client,
)
hashtag_chunks_df2.to_csv("../data/themes_500_parallel.csv")

---
## Experiment with chunking

In [12]:
from get_themes import get_theme
import random
import csv
import json

def chunk_list(input_list, chunk_size):
    """Cut a sequence into consecutive slices of at most ``chunk_size`` items.

    Args:
        input_list: The list to be chunked.
        chunk_size: The desired size of each chunk.

    Returns:
        A list of slices in original order; every slice has ``chunk_size``
        items except possibly the last, which holds the remainder.
    """
    return [
        input_list[start:start + chunk_size]
        for start in range(0, len(input_list), chunk_size)
    ]

def _parse_themes(themes_str):
    """Decode the model's reply into the mapping of theme -> list of hashtags."""
    import ast

    # Valid JSON is accepted as-is, so apostrophes inside strings survive.
    try:
        return json.loads(themes_str)
    except json.JSONDecodeError:
        pass

    # Replies written as a Python literal (single-quoted keys/strings).
    try:
        return ast.literal_eval(themes_str)
    except (ValueError, SyntaxError):
        pass

    # Last resort: the legacy quote swap, kept so every reply that used to
    # parse still does. Raises json.JSONDecodeError if this fails as well.
    return json.loads(themes_str.replace("'", '"'))


def chunk_and_analyze_hashtags(hashtag_list, chunks, model, client, chunk_size=50, output_file='../data/hashtags_themes.csv'):
    """Get themes for each chunk of hashtags and write (theme, hashtag) rows to a CSV.

    Args:
        hashtag_list: The full list of hashtags. Only used when ``chunks`` is
            None; it is never modified.
        chunks: Pre-built list of hashtag lists to send, one request per
            chunk. Pass None to have the function shuffle a copy of
            ``hashtag_list`` and cut it into chunks of ``chunk_size``.
        model: Model name forwarded to ``get_theme``.
        client: API client forwarded to ``get_theme``.
        chunk_size: Chunk length used when ``chunks`` is None.
        output_file: Path of the CSV to (over)write. It gets a
            ``Theme,Hashtag`` header followed by one row per returned hashtag.

    Returns:
        The total number of hashtag rows written.

    Raises:
        json.JSONDecodeError: If a reply from ``get_theme`` cannot be parsed.
    """
    if chunks is None:
        # Shuffle a copy so the caller's list keeps its order.
        shuffled = list(hashtag_list)
        random.shuffle(shuffled)
        chunks = [shuffled[i:i + chunk_size] for i in range(0, len(shuffled), chunk_size)]

    total_hashtags = 0

    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['Theme', 'Hashtag'])  # Write header

        for i, chunk in enumerate(chunks):
            # Convert chunk to comma-separated string
            print("length of chunk: ",len(chunk))
            chunk_str = ", ".join(chunk)

            # Get themes for this chunk; the reply is consumed below as a
            # mapping of theme -> iterable of hashtags
            themes_str = get_theme(chunk_str, model=model, client=client)
            print(themes_str)
            themes = _parse_themes(themes_str)

            # Write results to CSV and count hashtags
            chunk_hashtag_count = 0
            for theme, hashtags in themes.items():
                for hashtag in hashtags:
                    csvwriter.writerow([theme, hashtag])
                    chunk_hashtag_count += 1

            total_hashtags += chunk_hashtag_count
            print(f"Processed chunk {i+1}/{len(chunks)}. Hashtags in this chunk: {chunk_hashtag_count}")

    print(f"Total hashtags processed: {total_hashtags}")
    print(f"Results have been written to {output_file}")
    return total_hashtags

In [11]:
# Sample 100 hashtags, pre-chunk them into lists of 50, and analyze each chunk
hashtag_sample = random.sample(node_list, 100)
chunks = chunk_list(hashtag_sample, 50)
# `chunks` is a required positional parameter of chunk_and_analyze_hashtags;
# leaving it out raises TypeError before any request is sent.
results = chunk_and_analyze_hashtags(hashtag_sample, chunks, model="gpt-4o-2024-05-13", client=client)

50


In [None]:
# for i, themes in enumerate(results):
#     print(f"Themes for chunk {i+1}:")
#     print(themes)
#     print("---")