## **Import Packages & Load Data** 

In [None]:
!pip install openai
!pip install tqdm
!pip install inflect
import os
import openai

import pandas as pd
import numpy as np
from numpy.random import rand
from tqdm.notebook import tqdm
from random import randint
import inflect
### Get an API key from your OpenAI account page (API Keys section) and paste it below.
### Leave it empty when sharing the notebook so the key is not published.
openai.api_key = ""

**Load Dataset**
*   Option A: find nytcrosswords.csv in the shared drive
*   Option B: download the data from Kaggle and upload nytcrosswords.csv to your own drive

In [None]:
# Mount Google Drive so the dataset and result files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
### Point this at your own copy of nytcrosswords.csv; the file is decoded as cp1252.
crosswords=pd.read_csv("/content/drive/MyDrive/Crosswords/nytcrosswords.csv", encoding='cp1252')


## **Data Prep**
Preparing all the prompting types

In [None]:
# Draw a reproducible (random_state=2022) 10,000-row sample without replacement.
# reset_index() keeps each row's original label in an "index" column, which is used
# later to exclude these rows from the few-shot example pool.
data=crosswords.sample(n=10000, replace=False, random_state=2022).reset_index()

In [None]:
def add_punctuation(s):
    """Return *s* with a trailing period unless it already ends a sentence.

    Falsy values (empty string, None) are returned unchanged, as are values
    whose last character is '.', '!' or '?'.
    """
    if not s:
        return s
    if s[-1] in ('.', '!', '?'):
        return s
    return s + '.'

### Initial Cleanup: drop duplicates, get answer length, fill in NULL answers as string NULL
def initial_cleanup(df):
    """Return a cleaned copy of a raw crossword frame.

    Steps:
      * rename ``Date``/``Word``/``Clue`` to ``date``/``answer``/``clue``;
      * restore missing answers as the word "NULL" and lower-case all answers;
      * drop duplicate rows and parse ``date``;
      * add ``answer_length`` and drop rows whose answer has one character or fewer;
      * fill every remaining missing value with the string 'NULL';
      * make sure every clue ends with sentence punctuation.

    The input frame is not modified; callers must use the returned frame.
    """
    # rename() returns a new frame, so the caller's (possibly sliced) frame is untouched.
    df = df.rename(columns={'Date': 'date', 'Word': 'answer', 'Clue': 'clue'})

    # read_csv parses the literal text "NULL" as a missing value by default.  Restore
    # it *before* lower-casing and measuring, otherwise the row ends up with
    # answer_length 3 (len("nan")) and an upper-case answer unlike every other row.
    df['answer'] = df['answer'].fillna('NULL').str.lower()

    df = df.drop_duplicates().copy()
    df['date'] = pd.to_datetime(df['date'])
    df['answer_length'] = df['answer'].apply(lambda x: len(str(x)))

    # One boolean filter instead of dropping rows one at a time.
    df = df[df['answer_length'] > 1].copy()

    df = df.fillna('NULL')
    df['clue'] = df['clue'].apply(add_punctuation)
    return df
### data contains the cleaned 10k sample
data=initial_cleanup(data)

## crosswords has all the cleaned crosswords outside of the 10k data sample
## (.copy() so the cleanup works on an independent frame, not on a filtered slice)
crosswords=crosswords[~crosswords.index.isin(data["index"].tolist())].copy()
crosswords=initial_cleanup(crosswords)


**Baseline Prompt**:
"Solve this crossword puzzle by providing a 4 letter response to the clue. Clue: ________. Answer:"

In [None]:

def baseline_prompt(df):
    """Add the zero-shot baseline prompt for every row of *df*.

    Reads the ``answer_length`` and ``clue`` columns of *df* and writes the
    ``baseline_prompt`` column.  The frame is modified in place and returned.
    """
    # Use df's own clues; the previous version read the global ``data`` frame,
    # which is wrong (or fails) for any other frame passed in.
    df["baseline_prompt"] = (
        "Solve this crossword puzzle by providing a "
        + df['answer_length'].astype(str)
        + " letter response to the clue.\n\nClue: "
        + df['clue'].astype(str)
        + "\n\nAnswer:"
    )
    return df

def get_samples(n):
    """Return *n* solved baseline examples drawn at random from ``crosswords``.

    Each example is a full baseline prompt followed by its answer and a blank
    line; the examples are concatenated into one string.
    """
    drawn = crosswords.sample(n).reset_index()
    columns = zip(drawn['answer_length'], drawn['clue'], drawn['answer'])
    examples = [
        "Solve this crossword puzzle by providing a " + str(length)
        + " letter response to the clue.\n\nClue: " + clue
        + "\n\nAnswer: " + answer + "\n\n"
        for length, clue, answer in columns
    ]
    return "".join(examples)
def few_shot_baseline_1(df,n):
    """Add the ``few_shot_baseline_1`` prompt column to *df*.

    For every row, *n* freshly sampled solved examples (from ``get_samples``)
    are placed in front of that row's own baseline prompt.  The output column
    is always ``few_shot_baseline_1`` regardless of *n*.  The frame is
    modified in place and returned.
    """
    prompts = []
    # Walk the rows positionally: the frame's index labels are not guaranteed
    # to be 0..len(df)-1 once the cleanup step has dropped rows.
    rows = zip(df['answer_length'], df['clue'])
    for length, clue in tqdm(rows, total=len(df)):
        samples = get_samples(n)
        prompts.append(
            samples + "Solve this crossword puzzle by providing a " + str(length)
            + " letter response to the clue.\n\nClue: " + clue + "\n\nAnswer:"
        )
    df["few_shot_baseline_1"] = prompts
    return df

def few_shot_baseline_3(df,n):
    """Add the ``few_shot_baseline_3`` prompt column to *df*.

    For every row, *n* freshly sampled solved examples (from ``get_samples``)
    are placed in front of that row's own baseline prompt.  The output column
    is always ``few_shot_baseline_3`` regardless of *n*.  The frame is
    modified in place and returned.
    """
    prompts = []
    # Walk the rows positionally: the frame's index labels are not guaranteed
    # to be 0..len(df)-1 once the cleanup step has dropped rows.
    rows = zip(df['answer_length'], df['clue'])
    for length, clue in tqdm(rows, total=len(df)):
        samples = get_samples(n)
        prompts.append(
            samples + "Solve this crossword puzzle by providing a " + str(length)
            + " letter response to the clue.\n\nClue: " + clue + "\n\nAnswer:"
        )
    df["few_shot_baseline_3"] = prompts
    return df


In [None]:
### Formatting the prompts; this can take a while
data=baseline_prompt(data)
data=few_shot_baseline_1(data,1)
# The three-shot prompts go in their own column; calling few_shot_baseline_1 here
# would overwrite the one-shot column and leave "few_shot_baseline_3" undefined.
data=few_shot_baseline_3(data,3)

**Constrained Prompt** "Solve this crossword puzzle by providing a 4 letter response to the clue.  The Nth letter is X. Clue: _________. Answer:"

In [None]:
# Shared inflect engine; its ordinal() method is used below to turn a letter
# position into the ordinal text placed in the constrained prompts.
p = inflect.engine()

def constrained_prompt_pp(df):
    """Pick the hint letter for the zero-shot constrained prompt.

    Adds three columns to *df* (in place) and returns it:
      * ``random``          - a random 1-based position within the answer,
      * ``random_ordinal``  - that position as produced by ``p.ordinal``,
      * ``random_letter``   - the answer's letter at that position.
    """
    positions = []
    for length in df.answer_length:
        positions.append(np.random.randint(1, length + 1))
    df["random"] = positions

    df["random_ordinal"] = [p.ordinal(position) for position in df["random"]]

    def _letter_for(row):
        return get_letter(row['answer'], row['random'])

    df["random_letter"] = df.apply(_letter_for, axis=1)
    return df

def constrained_prompt(df):
    """Add the zero-shot constrained prompt for every row of *df*.

    Example of a generated prompt:
    "Solve this crossword puzzle by providing a 9 letter response to the clue.
    The 6th letter is p. Clue: __. Answer:"

    Reads ``answer_length``, ``random_ordinal``, ``random_letter`` and ``clue``
    from *df*, so the hint columns must already exist (they are created by
    ``constrained_prompt_pp``).  Writes ``constrained_prompt`` in place and
    returns the frame.
    """
    # Use df's own clues; the previous version read the global ``data`` frame,
    # which is wrong (or fails) for any other frame passed in.
    df["constrained_prompt"] = (
        "Solve this crossword puzzle by providing a "
        + df['answer_length'].astype(str)
        + " letter response to the clue. The "
        + df["random_ordinal"].astype(str)
        + " letter is "
        + df["random_letter"]
        + ".\n\nClue: "
        + df['clue'].astype(str)
        + "\n\nAnswer:"
    )
    return df
def get_letter(ans, position):
    """Return the character of *ans* at the 1-based *position*."""
    zero_based = position - 1
    return ans[zero_based]

def fs_constrained_prompt_pp(df):
    """Pick the hint letter used by the few-shot constrained prompts.

    Adds three columns to *df* (in place) and returns it:
      * ``fs_random``          - a random 1-based position within the answer,
      * ``fs_random_ordinal``  - that position as produced by ``p.ordinal``,
      * ``fs_random_letter``   - the answer's letter at that position.
    """
    positions = []
    for length in df.answer_length:
        positions.append(np.random.randint(1, length + 1))
    df["fs_random"] = positions

    df["fs_random_ordinal"] = [p.ordinal(position) for position in df["fs_random"]]

    def _letter_for(row):
        return get_letter(row['answer'], row['fs_random'])

    df["fs_random_letter"] = df.apply(_letter_for, axis=1)
    return df

def get_fs_samples(n):
    """Return *n* solved constrained examples drawn at random from ``crosswords``.

    Each example is a full constrained prompt (with a random hint letter chosen
    by ``fs_constrained_prompt_pp``) followed by its answer and a blank line;
    the examples are concatenated into one string.
    """
    drawn = fs_constrained_prompt_pp(crosswords.sample(n).reset_index())
    columns = zip(
        drawn['answer_length'],
        drawn["fs_random_ordinal"],
        drawn["fs_random_letter"],
        drawn['clue'],
        drawn["answer"],
    )
    examples = []
    for length, ordinal, letter, clue, answer in columns:
        # "Answer: " carries a space before the answer so the solved examples use
        # the same layout as the baseline examples built by get_samples.
        examples.append(
            "Solve this crossword puzzle by providing a " + str(length)
            + " letter response to the clue. The " + ordinal + " letter is " + letter
            + ".\n\nClue: " + clue + "\n\nAnswer: " + answer + "\n\n"
        )
    return "".join(examples)

def few_shot_constrained_1(df,n):
    """Add the ``few_shot_constrained_1`` prompt column to *df*.

    First (re)generates the ``fs_random*`` hint columns through
    ``fs_constrained_prompt_pp``; then, for every row, places *n* freshly
    sampled solved examples (from ``get_fs_samples``) in front of that row's
    own constrained prompt.  The output column is always
    ``few_shot_constrained_1`` regardless of *n*.  Returns the frame.
    """
    df = fs_constrained_prompt_pp(df)
    prompts = []
    # Walk the rows positionally: the frame's index labels are not guaranteed
    # to be 0..len(df)-1 once the cleanup step has dropped rows.
    rows = zip(df['answer_length'], df["fs_random_ordinal"], df["fs_random_letter"], df['clue'])
    for length, ordinal, letter, clue in tqdm(rows, total=len(df)):
        samples = get_fs_samples(n)
        prompts.append(
            samples + "Solve this crossword puzzle by providing a " + str(length)
            + " letter response to the clue. The " + ordinal + " letter is " + letter
            + ".\n\nClue: " + clue + "\n\nAnswer:"
        )
    df["few_shot_constrained_1"] = prompts
    # Return only after every row is processed; the previous version returned from
    # inside the loop, so only the first row ever received a prompt.
    return df

def few_shot_constrained_3(df,n):
    """Add the ``few_shot_constrained_3`` prompt column to *df*.

    First (re)generates the ``fs_random*`` hint columns through
    ``fs_constrained_prompt_pp`` (any values already stored in those columns
    are overwritten); then, for every row, places *n* freshly sampled solved
    examples (from ``get_fs_samples``) in front of that row's own constrained
    prompt.  The output column is always ``few_shot_constrained_3`` regardless
    of *n*.  Returns the frame.
    """
    df = fs_constrained_prompt_pp(df)
    prompts = []
    # Walk the rows positionally: the frame's index labels are not guaranteed
    # to be 0..len(df)-1 once the cleanup step has dropped rows.
    rows = zip(df['answer_length'], df["fs_random_ordinal"], df["fs_random_letter"], df['clue'])
    for length, ordinal, letter, clue in tqdm(rows, total=len(df)):
        samples = get_fs_samples(n)
        prompts.append(
            samples + "Solve this crossword puzzle by providing a " + str(length)
            + " letter response to the clue. The " + ordinal + " letter is " + letter
            + ".\n\nClue: " + clue + "\n\nAnswer:"
        )
    df["few_shot_constrained_3"] = prompts
    return df

In [None]:
# The hint columns (random, random_ordinal, random_letter) must exist before the
# zero-shot constrained prompt can be assembled.
data=constrained_prompt_pp(data)
data=constrained_prompt(data)
data=few_shot_constrained_1(data,1)
# The three-shot prompts go in their own column; calling few_shot_constrained_1 here
# would overwrite the one-shot column and leave "few_shot_constrained_3" undefined.
data=few_shot_constrained_3(data,3)

In [None]:
# Preview the first rows to check the generated prompt columns.
data.head()

# InstructGPT 
Feeding prompts to InstructGPT and saving results.



### **Batch Prompting Functions**

In [None]:
def run_batch_deterministic(df,prompts):
    """Send every prompt in the given columns to the completion endpoint.

    df      -- frame holding one prompt per row in each column named in *prompts*.
    prompts -- iterable of prompt column names.

    For each column ``c`` a ``c_output`` column is filled with the first
    completion choice, stripped and lower-cased.  Requests use temperature 0.
    The frame's index is reset first (the old index is kept as a column), and
    a copy of the resulting frame is returned.
    """
    df = df.reset_index()
    n_rows = len(df)
    for column in prompts:
        print('running for:', column)
        output_column = f"{column}_output"
        df[output_column] = " "
        for row in tqdm(range(n_rows)):
            completion = openai.Completion.create(
                engine="text-davinci-002",
                prompt=df.at[row, column],
                temperature=0,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0,
            )
            raw_text = completion["choices"][0]["text"]
            df.at[row, output_column] = raw_text.strip().lower()
    return df.copy()


In [None]:
### Split the sample into batches of 1000 rows so that a runtime crash or an
### OpenAI server outage only costs the batch in progress.
(sample1, sample2, sample3,
 sample4, sample5, sample6,
 sample7, sample8, sample9) = (
    data.iloc[start:start + 1000, :] for start in range(0, 9000, 1000)
)
# The last batch takes whatever rows remain after position 9000.
sample10 = data.iloc[9000:, :]

## **Zero Shot**

In [None]:
# Zero-shot baseline: run each 1000-row batch and write its results to its own
# CSV immediately, so completed batches survive a later failure.
batch_results_1=run_batch_deterministic(sample1,['baseline_prompt'])
batch_results_1.to_csv("/content/drive/MyDrive/Crosswords/baseline_results_1.csv", encoding='cp1252')
batch_results_2=run_batch_deterministic(sample2,['baseline_prompt'])
batch_results_2.to_csv("/content/drive/MyDrive/Crosswords/baseline_results_2.csv", encoding='cp1252')
batch_results_3=run_batch_deterministic(sample3,['baseline_prompt'])
batch_results_3.to_csv("/content/drive/MyDrive/Crosswords/baseline_results_3.csv", encoding='cp1252')
batch_results_4=run_batch_deterministic(sample4,['baseline_prompt'])
batch_results_4.to_csv("/content/drive/MyDrive/Crosswords/baseline_results_4.csv", encoding='cp1252')
batch_results_5=run_batch_deterministic(sample5,['baseline_prompt'])
batch_results_5.to_csv("/content/drive/MyDrive/Crosswords/baseline_results_5.csv", encoding='cp1252')
batch_results_6=run_batch_deterministic(sample6,['baseline_prompt'])
batch_results_6.to_csv("/content/drive/MyDrive/Crosswords/baseline_results_6.csv", encoding='cp1252')
batch_results_7=run_batch_deterministic(sample7,['baseline_prompt'])
batch_results_7.to_csv("/content/drive/MyDrive/Crosswords/baseline_results_7.csv", encoding='cp1252')
batch_results_8=run_batch_deterministic(sample8,['baseline_prompt'])
batch_results_8.to_csv("/content/drive/MyDrive/Crosswords/baseline_results_8.csv", encoding='cp1252')
batch_results_9=run_batch_deterministic(sample9,['baseline_prompt'])
batch_results_9.to_csv("/content/drive/MyDrive/Crosswords/baseline_results_9.csv", encoding='cp1252')
batch_results_10=run_batch_deterministic(sample10,['baseline_prompt'])
batch_results_10.to_csv("/content/drive/MyDrive/Crosswords/baseline_results_10.csv", encoding='cp1252')

In [None]:
# Zero-shot constrained: run each 1000-row batch and write its results to its own
# CSV immediately.  Every batch needs a distinct file name; writing them all to
# constrained_results_1.csv would keep only the last batch.
batch_results_1=run_batch_deterministic(sample1,['constrained_prompt'])
batch_results_1.to_csv("/content/drive/MyDrive/Crosswords/constrained_results_1.csv", encoding='cp1252')
batch_results_2=run_batch_deterministic(sample2,['constrained_prompt'])
batch_results_2.to_csv("/content/drive/MyDrive/Crosswords/constrained_results_2.csv", encoding='cp1252')
batch_results_3=run_batch_deterministic(sample3,['constrained_prompt'])
batch_results_3.to_csv("/content/drive/MyDrive/Crosswords/constrained_results_3.csv", encoding='cp1252')
batch_results_4=run_batch_deterministic(sample4,['constrained_prompt'])
batch_results_4.to_csv("/content/drive/MyDrive/Crosswords/constrained_results_4.csv", encoding='cp1252')
batch_results_5=run_batch_deterministic(sample5,['constrained_prompt'])
batch_results_5.to_csv("/content/drive/MyDrive/Crosswords/constrained_results_5.csv", encoding='cp1252')
batch_results_6=run_batch_deterministic(sample6,['constrained_prompt'])
batch_results_6.to_csv("/content/drive/MyDrive/Crosswords/constrained_results_6.csv", encoding='cp1252')
batch_results_7=run_batch_deterministic(sample7,['constrained_prompt'])
batch_results_7.to_csv("/content/drive/MyDrive/Crosswords/constrained_results_7.csv", encoding='cp1252')
batch_results_8=run_batch_deterministic(sample8,['constrained_prompt'])
batch_results_8.to_csv("/content/drive/MyDrive/Crosswords/constrained_results_8.csv", encoding='cp1252')
batch_results_9=run_batch_deterministic(sample9,['constrained_prompt'])
batch_results_9.to_csv("/content/drive/MyDrive/Crosswords/constrained_results_9.csv", encoding='cp1252')
batch_results_10=run_batch_deterministic(sample10,['constrained_prompt'])
batch_results_10.to_csv("/content/drive/MyDrive/Crosswords/constrained_results_10.csv", encoding='cp1252')

## **One Shot**

In [None]:
# One-shot baseline: run each 1000-row batch and write its results to its own
# CSV immediately, so completed batches survive a later failure.
batch_results_1=run_batch_deterministic(sample1,["few_shot_baseline_1"])
batch_results_1.to_csv("/content/drive/MyDrive/Crosswords/few_shot_baseline_1_results_1.csv", encoding='cp1252')
batch_results_2=run_batch_deterministic(sample2,["few_shot_baseline_1"])
batch_results_2.to_csv("/content/drive/MyDrive/Crosswords/few_shot_baseline_1_results_2.csv", encoding='cp1252')
batch_results_3=run_batch_deterministic(sample3,["few_shot_baseline_1"])
batch_results_3.to_csv("/content/drive/MyDrive/Crosswords/few_shot_baseline_1_results_3.csv", encoding='cp1252')
batch_results_4=run_batch_deterministic(sample4,["few_shot_baseline_1"])
batch_results_4.to_csv("/content/drive/MyDrive/Crosswords/few_shot_baseline_1_results_4.csv", encoding='cp1252')
batch_results_5=run_batch_deterministic(sample5,["few_shot_baseline_1"])
batch_results_5.to_csv("/content/drive/MyDrive/Crosswords/few_shot_baseline_1_results_5.csv", encoding='cp1252')
batch_results_6=run_batch_deterministic(sample6,["few_shot_baseline_1"])
batch_results_6.to_csv("/content/drive/MyDrive/Crosswords/few_shot_baseline_1_results_6.csv", encoding='cp1252')
batch_results_7=run_batch_deterministic(sample7,["few_shot_baseline_1"])
batch_results_7.to_csv("/content/drive/MyDrive/Crosswords/few_shot_baseline_1_results_7.csv", encoding='cp1252')
batch_results_8=run_batch_deterministic(sample8,["few_shot_baseline_1"])
batch_results_8.to_csv("/content/drive/MyDrive/Crosswords/few_shot_baseline_1_results_8.csv", encoding='cp1252')
batch_results_9=run_batch_deterministic(sample9,["few_shot_baseline_1"])
batch_results_9.to_csv("/content/drive/MyDrive/Crosswords/few_shot_baseline_1_results_9.csv", encoding='cp1252')
batch_results_10=run_batch_deterministic(sample10,["few_shot_baseline_1"])
batch_results_10.to_csv("/content/drive/MyDrive/Crosswords/few_shot_baseline_1_results_10.csv", encoding='cp1252')


In [None]:
# One-shot constrained: run each 1000-row batch and write its results to its own
# CSV immediately, so completed batches survive a later failure.
batch_results_1=run_batch_deterministic(sample1,["few_shot_constrained_1"])
batch_results_1.to_csv("/content/drive/MyDrive/Crosswords/few_shot_constrained_1_results_1.csv", encoding='cp1252')
batch_results_2=run_batch_deterministic(sample2,["few_shot_constrained_1"])
batch_results_2.to_csv("/content/drive/MyDrive/Crosswords/few_shot_constrained_1_results_2.csv", encoding='cp1252')
batch_results_3=run_batch_deterministic(sample3,["few_shot_constrained_1"])
batch_results_3.to_csv("/content/drive/MyDrive/Crosswords/few_shot_constrained_1_results_3.csv", encoding='cp1252')
batch_results_4=run_batch_deterministic(sample4,["few_shot_constrained_1"])
batch_results_4.to_csv("/content/drive/MyDrive/Crosswords/few_shot_constrained_1_results_4.csv", encoding='cp1252')
batch_results_5=run_batch_deterministic(sample5,["few_shot_constrained_1"])
batch_results_5.to_csv("/content/drive/MyDrive/Crosswords/few_shot_constrained_1_results_5.csv", encoding='cp1252')
batch_results_6=run_batch_deterministic(sample6,["few_shot_constrained_1"])
batch_results_6.to_csv("/content/drive/MyDrive/Crosswords/few_shot_constrained_1_results_6.csv", encoding='cp1252')
batch_results_7=run_batch_deterministic(sample7,["few_shot_constrained_1"])
batch_results_7.to_csv("/content/drive/MyDrive/Crosswords/few_shot_constrained_1_results_7.csv", encoding='cp1252')
batch_results_8=run_batch_deterministic(sample8,["few_shot_constrained_1"])
batch_results_8.to_csv("/content/drive/MyDrive/Crosswords/few_shot_constrained_1_results_8.csv", encoding='cp1252')
batch_results_9=run_batch_deterministic(sample9,["few_shot_constrained_1"])
batch_results_9.to_csv("/content/drive/MyDrive/Crosswords/few_shot_constrained_1_results_9.csv", encoding='cp1252')
batch_results_10=run_batch_deterministic(sample10,["few_shot_constrained_1"])
batch_results_10.to_csv("/content/drive/MyDrive/Crosswords/few_shot_constrained_1_results_10.csv", encoding='cp1252')

## **Three Shot**

In [None]:
# Three-shot baseline on the first five batches.  File names must follow the
# few_shot_baseline_3_results_<k>.csv pattern that the master-file cell reads back.
batch_results_1=run_batch_deterministic(sample1,["few_shot_baseline_3"])
batch_results_1.to_csv("/content/drive/MyDrive/Crosswords/few_shot_baseline_3_results_1.csv", encoding='cp1252')
batch_results_2=run_batch_deterministic(sample2,["few_shot_baseline_3"])
batch_results_2.to_csv("/content/drive/MyDrive/Crosswords/few_shot_baseline_3_results_2.csv", encoding='cp1252')
batch_results_3=run_batch_deterministic(sample3,["few_shot_baseline_3"])
batch_results_3.to_csv("/content/drive/MyDrive/Crosswords/few_shot_baseline_3_results_3.csv", encoding='cp1252')
batch_results_4=run_batch_deterministic(sample4,["few_shot_baseline_3"])
batch_results_4.to_csv("/content/drive/MyDrive/Crosswords/few_shot_baseline_3_results_4.csv", encoding='cp1252')
batch_results_5=run_batch_deterministic(sample5,["few_shot_baseline_3"])
batch_results_5.to_csv("/content/drive/MyDrive/Crosswords/few_shot_baseline_3_results_5.csv", encoding='cp1252')

In [None]:
# Three-shot constrained on the first five batches; each batch is written to its
# own CSV immediately, so completed batches survive a later failure.
batch_results_1=run_batch_deterministic(sample1,["few_shot_constrained_3"])
batch_results_1.to_csv("/content/drive/MyDrive/Crosswords/few_shot_constrained_3_results_1.csv", encoding='cp1252')
batch_results_2=run_batch_deterministic(sample2,["few_shot_constrained_3"])
batch_results_2.to_csv("/content/drive/MyDrive/Crosswords/few_shot_constrained_3_results_2.csv", encoding='cp1252')
batch_results_3=run_batch_deterministic(sample3,["few_shot_constrained_3"])
batch_results_3.to_csv("/content/drive/MyDrive/Crosswords/few_shot_constrained_3_results_3.csv", encoding='cp1252')
batch_results_4=run_batch_deterministic(sample4,["few_shot_constrained_3"])
batch_results_4.to_csv("/content/drive/MyDrive/Crosswords/few_shot_constrained_3_results_4.csv", encoding='cp1252')
batch_results_5=run_batch_deterministic(sample5,["few_shot_constrained_3"])
batch_results_5.to_csv("/content/drive/MyDrive/Crosswords/few_shot_constrained_3_results_5.csv", encoding='cp1252')

## **Creating Master Result File**
Sample Code

In [None]:
### Example code of how to concat all results back into one file
part1=pd.read_csv("/content/drive/MyDrive/Crosswords/few_shot_baseline_3_results_1.csv", encoding='cp1252',index_col=0)
part2=pd.read_csv("/content/drive/MyDrive/Crosswords/few_shot_baseline_3_results_2.csv", encoding='cp1252',index_col=0)
part3=pd.read_csv("/content/drive/MyDrive/Crosswords/few_shot_baseline_3_results_3.csv", encoding='cp1252',index_col=0)
part4=pd.read_csv("/content/drive/MyDrive/Crosswords/few_shot_baseline_3_results_4.csv", encoding='cp1252',index_col=0)
part5=pd.read_csv("/content/drive/MyDrive/Crosswords/few_shot_baseline_3_results_5.csv", encoding='cp1252',index_col=0)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the parts in the
# same order and renumbers the rows 0..n-1.
fs_baseline3=pd.concat([part1,part2,part3,part4,part5],ignore_index=True)

# 'level_0' is the extra index column left behind by reset_index() in the batch run.
fs_baseline3=fs_baseline3.drop(columns=['level_0'])
fs_baseline3.fillna('nan', inplace=True)
fs_baseline3.to_csv("/content/drive/MyDrive/Crosswords/full_5k_sample_fs_baseline_3.csv", encoding='cp1252')


# Same procedure for the three-shot constrained results.
part1=pd.read_csv("/content/drive/MyDrive/Crosswords/few_shot_constrained_3_results_1.csv", encoding='cp1252',index_col=0)
part2=pd.read_csv("/content/drive/MyDrive/Crosswords/few_shot_constrained_3_results_2.csv", encoding='cp1252',index_col=0)
part3=pd.read_csv("/content/drive/MyDrive/Crosswords/few_shot_constrained_3_results_3.csv", encoding='cp1252',index_col=0)
part4=pd.read_csv("/content/drive/MyDrive/Crosswords/few_shot_constrained_3_results_4.csv", encoding='cp1252',index_col=0)
part5=pd.read_csv("/content/drive/MyDrive/Crosswords/few_shot_constrained_3_results_5.csv", encoding='cp1252',index_col=0)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the parts in the
# same order and renumbers the rows 0..n-1.
fs_constrained3=pd.concat([part1,part2,part3,part4,part5],ignore_index=True)

# 'level_0' is the extra index column left behind by reset_index() in the batch run.
fs_constrained3=fs_constrained3.drop(columns=['level_0'])
fs_constrained3.fillna('nan', inplace=True)
fs_constrained3.to_csv("/content/drive/MyDrive/Crosswords/full_5k_sample_fs_constrained_3.csv", encoding='cp1252')


In [None]:
### The 6 master files can then be merged into ONE MEGA FILE with all results

### example of merging two files (rows are matched on their index)
fs_3=pd.merge(fs_baseline3, fs_constrained3, left_index=True, right_index=True)
fs_3.to_csv("/content/drive/MyDrive/Crosswords/full_5k_sample.csv", encoding='cp1252')