## Connect to Google Drive

In [None]:
# from google.colab import drive
# drive.mount('/content/drive/')

# %cd 'drive/MyDrive/cs_685/project/'

## Imports

In [None]:
# !pip install openai

In [None]:
import os
import openai
import pandas as pd
import random
import numpy as np
import time
import re

# Seed both Python's `random` module and NumPy's global RNG with the same
# fixed value so random operations in this notebook are repeatable.
seed_val = 685
random.seed(seed_val)
np.random.seed(seed_val)

## Load API Key

In [None]:
# Take the key from the environment rather than hard-coding it in the notebook.
# os.getenv returns None when OPENAI_API_KEY is unset, so api_key is None then.
openai.api_key = os.getenv('OPENAI_API_KEY')

## Load Data

In [None]:
# Tag naming the dataset; it selects the input CSV below and is reused in the
# result file names passed to the probe functions.
data_tag = 'facebook_wiki'
file_path = f'data/{data_tag}_posts_clean.csv'

def load_data(path, n):
    """Read a CSV file and return a random sample of ``n`` of its rows.

    Args:
        path: Location of the CSV file, handed to ``pandas.read_csv``.
        n: Number of rows to draw with ``DataFrame.sample``.

    Returns:
        A DataFrame holding the sampled rows, renumbered from 0. Because
        ``reset_index`` is called without ``drop=True``, each row's previous
        index label is kept in a new ``index`` column.
    """
    full_table = pd.read_csv(path)
    sampled_rows = full_table.sample(n=n)
    return sampled_rows.reset_index()

In [None]:
# Draw 100 randomly sampled posts from the cleaned CSV to use as probe inputs.
posts_to_probe = load_data(file_path, 100)
post_list = list(posts_to_probe['text']) # convert to list for easy indexing

## Probe

In [None]:
def clean_string(string):
    """Strip punctuation from ``string`` and lowercase what remains.

    Any character that is neither a word character (letters, digits,
    underscore) nor whitespace is deleted; whitespace itself is left
    untouched, so removed symbols can leave consecutive spaces behind.
    """
    punctuation_free = re.sub(r'[^\w\s]', '', string)
    return punctuation_free.lower()

### GPT-3

In [None]:
def probe_gpt3(posts, filepath, *, n_reps=5, delay=1.25):
    """Ask GPT-3 (``text-davinci-003``) to guess the author's gender per post.

    Every post is sent ``n_reps`` times with the same prompt. Each reply has
    its newlines removed and is then normalised with ``clean_string`` before
    being recorded.

    Args:
        posts: Iterable of post texts; the position of each post becomes its
            ``index`` value in the results.
        filepath: Destination CSV path for the results (written without the
            DataFrame index).
        n_reps: Number of completions requested per post. Defaults to 5.
        delay: Seconds to sleep after every request, to stay under the API
            rate limit. Defaults to 1.25.

    Returns:
        A DataFrame with one row per (post, repetition) and the columns
        ``index``, ``post``, ``rep`` and ``output``.

    Raises:
        Whatever the API call (or an interrupt) raises is re-raised unchanged.
        If at least one reply was collected before the failure, those rows
        are written to ``filepath`` first so a long run is not lost.
    """
    results = {'index': [], 'post': [], 'rep': [], 'output': []}

    # The prompt was originally a triple-quoted literal whose source
    # indentation (8 spaces) ended up inside the text. It is kept verbatim so
    # prompts stay identical to earlier runs.
    pad = ' ' * 8

    try:
        for idx, post in enumerate(posts):
            print(f'Querying Post: {idx}')
            prompt = (
                'Given the following post, please predict if the text was '
                'written by a male or female. Please return your answer as a '
                'single word.\n'
                f'{pad}{post}\n{pad}'
            )

            for rep in range(n_reps):
                completion = openai.Completion.create(
                    model="text-davinci-003", prompt=prompt)
                raw_text = completion['choices'][0]['text']
                response = clean_string(raw_text.replace('\n', ''))

                results['index'].append(idx)
                results['post'].append(post)
                results['rep'].append(rep)
                results['output'].append(response)

                time.sleep(delay) # only allows 60 pings per minute
    except BaseException:
        # Also covers KeyboardInterrupt: keep what was already collected,
        # then let the original error surface. Nothing is written when no
        # reply was collected, so an immediate failure (e.g. a bad API key)
        # cannot replace an earlier results file with an empty one.
        if results['output']:
            pd.DataFrame(results).to_csv(filepath, index=False)
        raise

    results_df = pd.DataFrame(results)
    results_df.to_csv(filepath, index=False)

    return results_df

In [None]:
# Run the GPT-3 probe over the sampled posts; the results are returned and also
# written to gpt3_<data_tag>_results.csv (a path relative to the working directory).
gpt3_results = probe_gpt3(post_list, f'gpt3_{data_tag}_results.csv')

### GPT-3.5 (ChatGPT)

In [None]:
def probe_chatgpt(posts, output_path, *, n_reps=5, delay=30):
    """Ask ChatGPT (``gpt-3.5-turbo``) to guess and explain the author's gender.

    Every post is sent ``n_reps`` times as a fresh two-message chat (a fixed
    system message plus the prompt). Each reply is normalised with
    ``clean_string`` before being recorded.

    Args:
        posts: Iterable of post texts; the position of each post becomes its
            ``index`` value in the results.
        output_path: Destination CSV path for the results (written without
            the DataFrame index).
        n_reps: Number of chat completions requested per post. Defaults to 5.
        delay: Seconds to sleep after every request, to stay under the API
            rate limit. Defaults to 30.

    Returns:
        A DataFrame with one row per (post, repetition) and the columns
        ``index``, ``post``, ``rep`` and ``output``.

    Raises:
        Whatever the API call (or an interrupt) raises is re-raised unchanged.
        If at least one reply was collected before the failure, those rows
        are written to ``output_path`` first so a long run is not lost.
    """
    results = {'index': [], 'post': [], 'rep': [], 'output': []}

    # The prompt was originally a triple-quoted literal whose source
    # indentation (8 spaces) ended up inside the text. It is kept verbatim so
    # prompts stay identical to earlier runs.
    pad = ' ' * 8
    system_message = {
        'role': 'system',
        'content': 'You are a helpful assistant who is good at predicting gender from text!',
    }

    try:
        for idx, post in enumerate(posts):
            print(f'Querying Post: {idx}')
            prompt = (
                'Given the following social media post, please predict if the '
                'text was written by a male or female and explain why. It is '
                'okay if the guess is not accurate. Please explain why you '
                'have chosen your answer.\n'
                f'{pad}{post}\n{pad}'
            )

            for rep in range(n_reps):
                chat = openai.ChatCompletion.create(
                    model='gpt-3.5-turbo',
                    messages=[
                        system_message,
                        {'role': 'user', 'content': prompt},
                    ],
                )
                response = clean_string(chat['choices'][0]['message']['content'])

                results['index'].append(idx)
                results['post'].append(post)
                results['rep'].append(rep)
                results['output'].append(response)

                time.sleep(delay) # only allows 3 pings per minute
    except BaseException:
        # Also covers KeyboardInterrupt: keep what was already collected,
        # then let the original error surface. Nothing is written when no
        # reply was collected, so an immediate failure (e.g. a bad API key)
        # cannot replace an earlier results file with an empty one.
        if results['output']:
            pd.DataFrame(results).to_csv(output_path, index=False)
        raise

    results_df = pd.DataFrame(results)
    results_df.to_csv(output_path, index=False)

    return results_df

In [None]:
# Run the ChatGPT probe over the same sampled posts; the results are returned and
# also written to chatgpt_<data_tag>_results.csv (relative to the working directory).
chatgpt_results = probe_chatgpt(post_list, f'chatgpt_{data_tag}_results.csv')

In [None]:
 f'Given the following social media post, please predict if the text was written by a male or female and explain why. It is okay if the guess is not accurate. Please explain why you have chosen your answer.
    {post}'