In [1]:
import pandas as pd
import numpy as np
from numpy.random import rand
from tqdm.notebook import tqdm
from random import randint
import inflect

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
### 10k samples, only ran for zero shot and one shot
master10=pd.read_csv("/content/drive/MyDrive/Crosswords/full_10k_sample_fs_1.csv", encoding='cp1252',index_col=0) #change_path
### 5k samples, ran for zero shot, one shot and three shot
master5=pd.read_csv("/content/drive/MyDrive/Crosswords/full_5k_sample_fs.csv", encoding='cp1252',index_col=0) #change path

In [5]:
master10.columns

Index(['index', 'date', 'answer', 'clue', 'answer_length', 'baseline_prompt',
       'baseline_prompt_output', 'random', 'random_ordinal', 'random_letter',
       'constrained_prompt', 'constrained_prompt_output', 'fs_random',
       'fs_random_ordinal', 'fs_random_letter', 'few_shot_constrained_1',
       'few_shot_constrained_1_output', 'few_shot_baseline_1',
       'few_shot_baseline_1_output'],
      dtype='object')

In [6]:
##EVAL FUNCTIONS
def acc_eval(df,batch_output_column_name):
    acc=np.sum(df['answer']==df[batch_output_column_name])/len(df)
    return acc

def percent_punct(df,batch_output_column_name):
    df['no_punct'] = df[batch_output_column_name].apply(lambda x:str(x).isalpha())
    punt_percent=1-np.sum(df['no_punct'])/len(df)
    return punt_percent

def acc_norm_eval(df,batch_output_column_name):
    #df['cleaned_output']=df[batch_output_column_name].str.replace(r'[\W\s]+', '')
    df['cleaned_output']=df[batch_output_column_name].apply(lambda x: strip_punct(str(x)))
    acc_norm=np.sum(df['answer']==df['cleaned_output'])/len(df)
    return acc_norm

def letter_match(df,batch_output_column_name):
    df['output_length'] = df[batch_output_column_name].apply(lambda x:len(str(x)))
    match=np.sum(df["output_length"]==df['answer_length'])/len(df)
    return match

def norm_letter_match(df,batch_output_column_name):
    df['cleaned_output']=df[batch_output_column_name].apply(lambda x: strip_punct(str(x)))
    df['output_length'] = df['cleaned_output'].apply(lambda x:len(str(x)))
    match=np.sum(df["output_length"]==df['answer_length'])/len(df)
    return match


def strip_punct(s):
    s = ''.join(filter(str.isalnum, s)).lower()
    return s
  
def constraint_match(df,batch_output_column_name, is_fs=None):
    ### matches in letter and matches with the letter at the right spot
    if is_fs is None:
      is_fs = 'few_shot' in batch_output_column_name
    if is_fs:
      random_col = 'fs_random'
    else:
      random_col = 'random'
      
    df['constrained_output_letter'] = [str(row[batch_output_column_name])[row[random_col]-1] if row[random_col] <= len(str(row[batch_output_column_name])) else None for _, row in df.iterrows()]
    df['is_match'] = df['{}_letter'.format(random_col)].eq(df['constrained_output_letter'])
    match = np.sum(df['is_match']) / len(df)
    return match

def constraint_norm_match(df,batch_output_column_name):
    ### matches in letter and matches with the letter at the right spot
    ## TO-DO
    df['cleaned_output'] = df[batch_output_column_name].apply(lambda x: strip_punct(str(x)))
    is_fs = 'few_shot' in batch_output_column_name
    return constraint_match(df, 'cleaned_output', is_fs)

In [7]:
print("EM ACCURACY")
### Running accurcy for 10k samples
prompt_type_10=['baseline_prompt_output','constrained_prompt_output', 'few_shot_baseline_1_output','few_shot_constrained_1_output']
for prompt in prompt_type_10:
  print(prompt)
  print(acc_eval(master10,prompt))
print("-------")
### running accuracy for 5k samples: 
prompt_type_5=['baseline_prompt_output','constrained_prompt_output', 'few_shot_baseline_1_output','few_shot_constrained_1_output','few_shot_baseline_3_output','few_shot_constrained_3_output']
for prompt in prompt_type_5:
  print(prompt)
  print(acc_eval(master5,prompt))

EM ACCURACY
baseline_prompt_output
0.2135
constrained_prompt_output
0.2706
few_shot_baseline_1_output
0.3304
few_shot_constrained_1_output
0.3594
-------
baseline_prompt_output
0.2144
constrained_prompt_output
0.2774
few_shot_baseline_1_output
0.3374
few_shot_constrained_1_output
0.362
few_shot_baseline_3_output
0.3582
few_shot_constrained_3_output
0.3756


In [8]:
print("EM NORM") #Accuracy after striping punctuation and spaces
prompt_type_10=['baseline_prompt_output','constrained_prompt_output', 'few_shot_baseline_1_output','few_shot_constrained_1_output']
for prompt in prompt_type_10:
  print(prompt)
  print(acc_norm_eval(master10,prompt))
print("-------")
### running accuracy for 5k samples: 
prompt_type_5=['baseline_prompt_output','constrained_prompt_output', 'few_shot_baseline_1_output','few_shot_constrained_1_output','few_shot_baseline_3_output','few_shot_constrained_3_output']
for prompt in prompt_type_5:
  print(prompt)
  print(acc_norm_eval(master5,prompt))

EM NORM
baseline_prompt_output
0.2867
constrained_prompt_output
0.3325
few_shot_baseline_1_output
0.3499
few_shot_constrained_1_output
0.3768
-------
baseline_prompt_output
0.2904
constrained_prompt_output
0.3386
few_shot_baseline_1_output
0.3554
few_shot_constrained_1_output
0.3762
few_shot_baseline_3_output
0.3756
few_shot_constrained_3_output
0.3892


In [9]:
print("PERCENT PUNCTUATION") ## 1 shot was effective in showing gpt that answers do not contain punctuation
prompt_type_10=['baseline_prompt_output','constrained_prompt_output', 'few_shot_baseline_1_output','few_shot_constrained_1_output']
for prompt in prompt_type_10:
  print(prompt)
  print(percent_punct(master10,prompt))
print("-------")
### running accuracy for 5k samples: 
prompt_type_5=['baseline_prompt_output','constrained_prompt_output', 'few_shot_baseline_1_output','few_shot_constrained_1_output','few_shot_baseline_3_output','few_shot_constrained_3_output']
for prompt in prompt_type_5:
  print(prompt)
  print(percent_punct(master5,prompt))

PERCENT PUNCTUATION
baseline_prompt_output
0.39349999999999996
constrained_prompt_output
0.3116
few_shot_baseline_1_output
0.06879999999999997
few_shot_constrained_1_output
0.06669999999999998
-------
baseline_prompt_output
0.39239999999999997
constrained_prompt_output
0.3014
few_shot_baseline_1_output
0.06340000000000001
few_shot_constrained_1_output
0.059599999999999986
few_shot_baseline_3_output
0.07120000000000004
few_shot_constrained_3_output
0.06920000000000004


In [10]:
print("LETTER MATCH") ## more examples allowed model to learn importance of letter size clue
prompt_type_10=['baseline_prompt_output','constrained_prompt_output', 'few_shot_baseline_1_output','few_shot_constrained_1_output']
for prompt in prompt_type_10:
  print(prompt)
  print(letter_match(master10,prompt))
print("-------")
### running accuracy for 5k samples: 
prompt_type_5=['baseline_prompt_output','constrained_prompt_output', 'few_shot_baseline_1_output','few_shot_constrained_1_output','few_shot_baseline_3_output','few_shot_constrained_3_output']
for prompt in prompt_type_5:
  print(prompt)
  print(letter_match(master5,prompt))

LETTER MATCH
baseline_prompt_output
0.4958
constrained_prompt_output
0.5317
few_shot_baseline_1_output
0.6742
few_shot_constrained_1_output
0.6811
-------
baseline_prompt_output
0.4956
constrained_prompt_output
0.5408
few_shot_baseline_1_output
0.681
few_shot_constrained_1_output
0.6926
few_shot_baseline_3_output
0.66
few_shot_constrained_3_output
0.6792


In [11]:
print("Norm LETTER MATCH") ## more examples allowed model to learn importance of letter size clue
prompt_type_10=['baseline_prompt_output','constrained_prompt_output', 'few_shot_baseline_1_output','few_shot_constrained_1_output']
for prompt in prompt_type_10:
  print(prompt)
  print(norm_letter_match(master10,prompt))
print("-------")
### running accuracy for 5k samples: 
prompt_type_5=['baseline_prompt_output','constrained_prompt_output', 'few_shot_baseline_1_output','few_shot_constrained_1_output','few_shot_baseline_3_output','few_shot_constrained_3_output']
for prompt in prompt_type_5:
  print(prompt)
  print(norm_letter_match(master5,prompt))

Norm LETTER MATCH
baseline_prompt_output
0.6321
constrained_prompt_output
0.6366
few_shot_baseline_1_output
0.6956
few_shot_constrained_1_output
0.698
-------
baseline_prompt_output
0.6354
constrained_prompt_output
0.6456
few_shot_baseline_1_output
0.6984
few_shot_constrained_1_output
0.707
few_shot_baseline_3_output
0.6778
few_shot_constrained_3_output
0.6942


In [12]:
# calculate constraint match for constrained prompts
print("CONSTRAINT NORM MATCH") 
prompt_type_10=['constrained_prompt_output', 'few_shot_constrained_1_output']
for prompt in prompt_type_10:
  print(prompt)
  print(constraint_norm_match(master10,prompt))
print("-------")
### running accuracy for 5k samples: 
prompt_type_5=['constrained_prompt_output', 'few_shot_constrained_3_output']
for prompt in prompt_type_5:
  print(prompt)
  print(constraint_norm_match(master5,prompt))

CONSTRAINT NORM MATCH
constrained_prompt_output
0.5298
few_shot_constrained_1_output
0.5749
-------
constrained_prompt_output
0.535
few_shot_constrained_3_output
0.5708


In [13]:
# calculate constraint match for constrained prompts
print("CONSTRAINT MATCH") 
prompt_type_10=['constrained_prompt_output', 'few_shot_constrained_1_output']
for prompt in prompt_type_10:
  print(prompt)
  print(constraint_match(master10,prompt))
print("-------")
### running accuracy for 5k samples: 
prompt_type_5=['constrained_prompt_output', 'few_shot_constrained_3_output']
for prompt in prompt_type_5:
  print(prompt)
  print(constraint_match(master5,prompt))

CONSTRAINT MATCH
constrained_prompt_output
0.5147
few_shot_constrained_1_output
0.5667
-------
constrained_prompt_output
0.5226
few_shot_constrained_3_output
0.5642
