# 1. Generate Prompt from Opinions QA dataset

In [1]:
from process_opinions_qa import generate_prompt
from helper import generate_responses_logprobs,get_reasoning_response
import random
import yaml
import os

# Build the run configuration, then turn the selected survey into prompts.
# Change survey_type and context (here or in config.yaml) for the different
# prompt types described in the paper.
manual=True # use config file if False, set manually if True
if manual:
    config={
        'prompt': {
            'context': 'default', # values: default, steer-qa, steer-bio, steer-portray
            'survey_type': 'Pew_American_Trends_Panel_W26', # values: Pew_American_Trends_Panel_disagreement_500,Pew_American_Trends_Panel_W26, replace 26 with the [26,27,29,32..] etc.
            'output_path': './',
            'include_output': False,
            'reference_index': None,
        },
        'do_reasoning': False,
        'n_trials': 1,
    }
else:
    # Only touch config.yaml when it is actually the source of truth, so that
    # manual mode does not fail on a missing or malformed file.
    with open('config.yaml') as file:
        config = yaml.safe_load(file)

print(f"Context: {config['prompt']['context']} \nSurvey file: {config['prompt']['survey_type']}")

train, evals, question_dict=generate_prompt(config)

# When several context strings are returned, each one is prepended to the
# evaluation prompt at the same position; otherwise the evaluation prompts are
# used unchanged.
# NOTE(review): this guard is `> 1`, while the dataframe cell stores `train`
# whenever `len(train) > 0` - a single-element `train` is handled differently
# by the two cells; confirm which is intended.
if (len(train)>1):
    prompts=[x+'\n'+y for x,y in zip(train, evals)]
else:
    prompts=evals

len(prompts)

Context: default 
Survey file: Pew_American_Trends_Panel_W26


78

# 2. Load DeepSeek

In [2]:
# ref: https://gist.github.com/vgel/8a2497dc45b1ded33287fa7bb6cc1adc
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache
from tqdm.notebook import tqdm
import torch

# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The tokenizer is loaded via `model_name`, while the weights come from a
# local checkpoint directory.
model_name='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B'
tokenizer=AutoTokenizer.from_pretrained(model_name)

# bfloat16 weights, with placement delegated to `device_map='auto'`.
load_kwargs=dict(torch_dtype=torch.bfloat16, device_map='auto')
model=AutoModelForCausalLM.from_pretrained(
    '/scratch/ss4yd/huggingfacemodels/DeepSeek-R1-Distill-Qwen-7B/',
    **load_kwargs,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# 3. Generate logprobs

In [3]:
# Collect, for every prompt, the top-k log-probabilities returned by the helper
# and (optionally) a full reasoning response.
reasoning_responses=list()
reasoning_logprobs=list()
input_ids=list()  # row identifiers of the form id_trial{i}_n{j}

# The prompt list is cut into `n_trials` equal, contiguous chunks. Check the
# split up front: an uneven split would silently skip the trailing prompts and
# only show up as a length mismatch after the model has already been run.
n_trials=config['n_trials']
if n_trials<1 or len(prompts)%n_trials!=0:
    raise ValueError(
        f"Cannot split {len(prompts)} prompts evenly into n_trials={n_trials} chunks"
    )
prompts_per_trial=len(prompts)//n_trials

for i in tqdm(range(n_trials), desc='Trial'):
    in_prompts=prompts[i*prompts_per_trial:(i+1)*prompts_per_trial]
    # get logprobs and reasoning responses
    # Note: reasoning responses take longer to run, toggle to True in config if required.
    for j,prompt in enumerate(tqdm(in_prompts)):
        input_ids.append(f'id_trial{i}_n{j}')
        reasoning_logprobs.append(generate_responses_logprobs(prompt, model, tokenizer)['top_k_logprobs'])
        if config['do_reasoning']:
            reasoning_responses.append(get_reasoning_response(prompt, model, tokenizer, if_print=False))

Trial:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.


# 4. Generate dataframe

In [4]:
import pandas as pd
# Assemble one row per processed prompt: its identifier, the question metadata
# taken from question_dict['eval'], the model's top-k logprobs and the prompt.
df=pd.DataFrame()
df['input_ids']=input_ids
for column in ('question', 'question_raw', 'references', 'mapping'):
    df[column]=question_dict['eval'][column]
df['top_k_logprobs']=reasoning_logprobs
# Attach reasoning text only when exactly one response was collected per row
# (the list stays shorter when reasoning generation is switched off).
if len(reasoning_responses)==len(input_ids):
    df['reasoning_responses']=reasoning_responses
# Store the context strings when any were generated.
if len(train)>0:
    df['context']=train
df['prompt']=prompts

# 5. Save as SQLite3 Database

In [5]:
import sqlite3

# Persist the results table to <model_output_dir>/model_output.sqlite.
model_output_dir=f"./model_outs/opinions_qa:survey={config['prompt']['survey_type']},context={config['prompt']['context']}"
os.makedirs(model_output_dir, exist_ok=True)
db_path=os.path.join(model_output_dir, 'model_output.sqlite')

# Every column is cast to str before writing; `df` is rebound on purpose so
# that later cells see the same stringified frame.
df=df.astype(str)

conn = sqlite3.connect(db_path)
try:
    # Store the DataFrame in the SQLite database, replacing any previous table.
    df.to_sql('results', conn, if_exists='replace', index=False)
finally:
    # Always release the database handle, even if the write fails.
    conn.close()

# 6. (Optional) Save as CSV file

In [6]:
# Write the results next to the SQLite output directory. The parent directory
# is created here so this cell also works when the SQLite cell was not run.
csv_path=f"./model_outs/opinions_qa:survey={config['prompt']['survey_type']},context={config['prompt']['context']}.csv"
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
df.to_csv(csv_path, index=False)