# 1. LOAD DeepSeek

In [1]:
# ref: https://gist.github.com/vgel/8a2497dc45b1ded33287fa7bb6cc1adc
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache
import torch

# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The tokenizer is resolved by its Hugging Face model id.
model_name = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-32B'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The weights are read from a local directory, loaded in bfloat16, with
# parameter placement delegated to device_map='auto'.
model = AutoModelForCausalLM.from_pretrained(
    '/scratch/ss4yd/huggingfacemodels/DeepSeek-R1-Distill-Qwen-32B/',
    torch_dtype=torch.bfloat16,
    device_map='auto',
)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [3]:
import pandas as pd
# Preview the survey file; it is parsed as tab-separated text.
# The frame is not bound to a name: the bare expression only renders it as the
# cell's output.
pd.read_csv('./data/model_input/UVA_Data.csv', sep='\t')

Unnamed: 0,question,options
0,Please think about what things will be like in...,"['Very optimistic', 'Somewhat optimistic', \r\..."
1,"Over the next 30 years, do you think that the ...","['Get better', 'Stay about the same', \r\n'Get..."
2,Thinking about how Chinese society sees men th...,['Mostpeoplelook uptomenwhoaremanlyormasculine...
3,"Over the next 30 years, do you think that crim...","['Get better', 'Stay about the same', \r\n'Get..."
4,"Over the next 30 years, do you think that yout...","['Get better', 'Stay about the same', \r\n'Get..."
5,"Over the next 30 years, do you think that Chin...","['Get better', 'Stay about the same', \r\n'Get..."
6,"Over the next 30 years, do you think that Chin...","['Get better', 'Stay about the same', \r\n'Get..."
7,"Over the next 30 years, do you think that soci...","['Get better', 'Stay about the same', \r\n'Get..."
8,Do you think China's current economic conditio...,"['Helping a lot','Helping a little', 'Hurting ..."
9,Do you think China's current economic conditio...,"['Helping a lot','Helping a little', 'Hurting ..."


# 2. Generate Prompt from Opinions QA dataset

In [4]:
from process_opinions_qa import generate_prompt
import random
import yaml

# Build the prompt-generation config.
# Change survey_type and context (here, or in config.yaml) for the different
# types of prompts described in the paper.
manual=True # use config file if False, set manually if True
if manual:
    config = {
        'prompt': {
            'context': 'default',  # values: default, steer-qa, steer-bio, steer-portray
            'survey_type': 'UVA_Data',  # values: Pew_American_Trends_Panel_disagreement_500,Pew_American_Trends_Panel_W26, replace 26 with the [26,27,29,32..] etc.
            'output_path': './',
            'include_output': False,
            'reference_index': None,
        }
    }
else:
    # config.yaml is only read when the manual override is off, so a missing
    # file no longer breaks the manual path.
    with open('config.yaml') as file:
        config = yaml.safe_load(file)

print(f"Context: {config['prompt']['context']} \nSurvey file: {config['prompt']['survey_type']}")

# generate_prompt returns two collections of prompt strings.
# NOTE(review): `train` looks like optional context/few-shot text and `evals`
# the question prompts -- confirm against process_opinions_qa.
train, evals=generate_prompt(config)

# Show one randomly drawn evaluation prompt, prefixed by one randomly drawn
# train entry when any exist.
if len(train)>0:
    prompt=random.choice(train)+'\n'+random.choice(evals)
else:
    prompt=random.choice(evals)

print('\nGenerated prompt:')
print(prompt)

Context: default 
Survey file: UVA_Data

Generated prompt:
Question: Over the next 30 years, do you think that social stability in China will
A. Get better
B. Stay about the same
C. Get worse
D. Refused
Answer:


# 3. Prompt model with chain-of-thought prompting

Taken from: https://gist.github.com/vgel/8a2497dc45b1ded33287fa7bb6cc1adc

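The code forces the model to generate at least 128 tokens before it is allowed to stop. Whenever the model samples the end-of-thinking (`</think>`) or end-of-sequence token before that budget is reached, the token is discarded and a randomly chosen replacement phrase (`Wait, but`, `Hmm`, or `So`) is fed to the model instead, so that it keeps reasoning.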

In [6]:
# ref: https://gist.github.com/vgel/8a2497dc45b1ded33287fa7bb6cc1adc

# The three-way unpack relies on encode() adding one leading special token
# (BOS) in front of the ids for "<think>" and "</think>".
_, _start_think_token, end_think_token = tokenizer.encode("<think></think>")


# Continuation phrases fed to the model when it tries to stop too early.
replacements=["\nWait, but", "\nHmm", "\nSo"]
@torch.inference_mode
def reasoning_effort(question: str, min_thinking_tokens: int, max_new_tokens=None):
    """Stream a sampled answer to `question`, enforcing a minimum length.

    The chat template is applied with an assistant turn pre-filled with
    "<think>\\n", and tokens are then sampled one at a time from the softmax
    of the model's last-position logits. If the model samples the
    end-of-thinking token or EOS before `min_thinking_tokens` tokens have been
    produced, that token is dropped and a random phrase from `replacements`
    is fed to the model instead.

    Args:
        question: User message placed in the chat template.
        min_thinking_tokens: Number of tokens (sampled plus injected) that
            must be produced before "</think>" / EOS is accepted.
        max_new_tokens: Optional upper bound on that same counter. `None`
            (the default) keeps the original unbounded behaviour.

    Yields:
        Decoded text pieces in generation order: each sampled token and each
        injected replacement phrase, so that joining the chunks reproduces
        the text the model actually conditioned on.

    Side effects:
        Prints the fully templated prompt once before generation starts.
    """
    tokens = tokenizer.apply_chat_template(
        [
            {"role": "user", "content": question},
            {"role": "assistant", "content": "<think>\n" + ""},
        ],
        continue_final_message=True,
        return_tensors="pt",
    )
    tokens = tokens.to(model.device)
    kv = DynamicCache()
    n_thinking_tokens = 0

    print(tokenizer.decode(list(tokens[0])))
    while max_new_tokens is None or n_thinking_tokens < max_new_tokens:
        # Only the new ids are fed each step; earlier context lives in `kv`.
        out = model(input_ids=tokens, past_key_values=kv, use_cache=True)
        next_token = torch.multinomial(
            torch.softmax(out.logits[0, -1, :], dim=-1), 1
        ).item()
        kv = out.past_key_values

        if (
            next_token in (end_think_token, model.config.eos_token_id)
            and n_thinking_tokens < min_thinking_tokens
        ):
            # Too early to stop: discard the sampled token and splice in a
            # continuation phrase instead.
            replacement = random.choice(replacements)
            # Yield (rather than print) so the phrase is part of the returned
            # text, not only of the console output.
            yield replacement
            # add_special_tokens=False keeps a BOS id from being inserted in
            # the middle of the running sequence.
            replacement_tokens = tokenizer.encode(replacement, add_special_tokens=False)
            n_thinking_tokens += len(replacement_tokens)
            tokens = torch.tensor([replacement_tokens], device=tokens.device)
        elif next_token == model.config.eos_token_id:
            break
        else:
            yield tokenizer.decode([next_token])
            n_thinking_tokens += 1
            tokens = torch.tensor([[next_token]], device=tokens.device)


## 3a. Run model with CoT prompting

In [21]:
def get_reasoning_response(prompt, if_print=True, min_thinking_tokens=128):
    """Run `reasoning_effort` on `prompt` and return its output as one string.

    Args:
        prompt: Question text forwarded to `reasoning_effort`.
        if_print: When True, echo each chunk to stdout as it arrives
            (no newline added, flushed immediately).
        min_thinking_tokens: Minimum-length budget forwarded to
            `reasoning_effort`. Defaults to 128, the value that used to be
            hard-coded here.

    Returns:
        The concatenation of every chunk yielded by `reasoning_effort`.
    """
    think_responses = []
    for chunk in reasoning_effort(prompt, min_thinking_tokens):
        if if_print:
            print(chunk, end="", flush=True)
        think_responses.append(chunk)
    return ''.join(think_responses)

## 3b. Run model without CoT prompting

This code was adapted from https://github.com/stanford-crfm/helm cited in the opinions-qa github repo to reproduce their results. It forces the model to generate only one token, which is usually one of the options presented in the prompt.

Ref: https://github.com/stanford-crfm/helm/blob/main/src/helm/clients/huggingface_client.py

In [12]:
def generate_logprob_dictionary(logprobs, tokenizer, k=10):
    """Summarise the `k` highest entries of a log-probability vector.

    Args:
        logprobs: 1-D tensor of per-token-id log-probabilities (the code
            indexes the top-k result with a single integer).
        tokenizer: Object whose `decode([token_id])` returns the token text.
        k: How many top entries to report. Clamped to the vector length so a
            short vector does not make `topk` raise.

    Returns:
        dict with
          - 'text': decoded text of the highest-scoring token id,
          - 'logprob': its log-probability as a Python float,
          - 'top_logprobs': {decoded text: log-probability} for the top `k`
            ids in descending order. If several ids decode to the same text,
            the highest-ranked value is kept.
    """
    k = min(k, logprobs.shape[-1])
    top = logprobs.topk(k=k)
    # One transfer to Python lists instead of a .item() call per element.
    top_vals = top.values.tolist()
    top_ids = top.indices.tolist()

    res_dict = {
        'text': tokenizer.decode([top_ids[0]]),
        'logprob': top_vals[0],
        'top_logprobs': {},
    }
    for token_id, value in zip(top_ids, top_vals):
        # setdefault: a lower-ranked id with identical decoded text must not
        # overwrite the better score recorded earlier.
        res_dict['top_logprobs'].setdefault(tokenizer.decode([token_id]), value)

    return res_dict

def generate_responses_logprobs(prompt, model, tokenizer):
    """Greedily generate one token for `prompt` and report its top-k log-probs.

    The raw prompt is tokenized as-is (no chat template is applied here) and
    `model.generate` is run with `do_sample=False` and `max_new_tokens=1`.

    Args:
        prompt: Text to complete.
        model: Model exposing `generate` and `device`.
        tokenizer: Tokenizer used for encoding the prompt and decoding ids.

    Returns:
        The dictionary built by `generate_logprob_dictionary` from the
        log-softmax of the scores of the last generation step.

    Side effects:
        Prints the decoded full sequence, then either the log-probability of
        each generated token or 'Log Probs not calculated' when the first
        step's scores contain an infinite value.
    """
    stopping_criteria = None
    raw_request = {
        "prompt": prompt,
        "stop_sequences": [],
        "temperature": 1e-7,  # kept for parity with the request schema; not passed to generate
        "max_new_tokens": 1,
        "top_p": 1,
        "num_return_sequences": 1,
        "echo_prompt": False,
    }

    # Send the inputs to the device the model reports instead of relying on a
    # notebook-level global.
    encoded_input = tokenizer(
        raw_request["prompt"], return_tensors="pt", return_token_type_ids=False
    ).to(model.device)

    with torch.no_grad():
        output = model.generate(
            **encoded_input,
            num_return_sequences=raw_request["num_return_sequences"],
            max_new_tokens=raw_request["max_new_tokens"],
            top_p=raw_request["top_p"],
            do_sample=False,
            return_dict_in_generate=True,
            output_scores=True,
            stopping_criteria=stopping_criteria,
        )
    sequences = output.sequences
    scores = output.scores

    print(tokenizer.decode(sequences[0]))

    prompt_len = len(encoded_input.input_ids[0])

    # Per-token log-probs are only reported when the first step's scores are
    # all finite.
    if torch.isinf(scores[0]).sum().item()==0:
        all_generated_tokens_logprobs = []
        for completion_id in range(raw_request["num_return_sequences"]):
            generated_tokens_logprobs = []
            for i in range(len(sequences[completion_id]) - prompt_len):
                step_logprobs = torch.nn.functional.log_softmax(scores[i][completion_id], dim=0)
                # Log-probability of the token actually emitted at step i.
                chosen_id = sequences[completion_id][prompt_len + i]
                generated_tokens_logprobs.append(step_logprobs[chosen_id].item())
            all_generated_tokens_logprobs.append(generated_tokens_logprobs)

        print(f'Log Probs: {all_generated_tokens_logprobs}')
    else:
        print('Log Probs not calculated')

    # Compute the distribution for the top-k summary unconditionally. If it
    # were bound only inside the finite-scores branch, the 'not calculated'
    # path would end in an UnboundLocalError.
    # scores[-1][-1] is the last step of the last returned sequence.
    # NOTE(review): entries that are -inf stay -inf after log_softmax; a +inf
    # score would produce NaNs -- confirm whether that case needs handling.
    logprobs = torch.nn.functional.log_softmax(scores[-1][-1], dim=0)

    res_dict=generate_logprob_dictionary(logprobs, tokenizer)
    return res_dict

In [29]:
from tqdm.notebook import tqdm

# For every evaluation prompt, collect (in this order) the sampled
# chain-of-thought text and the single-token log-prob summary. The two result
# lists stay index-aligned with `evals`.
reasoning_responses = []
reasoning_logprobs = []
for prompt in tqdm(evals):
    cot_text = get_reasoning_response(prompt, if_print=False)
    reasoning_responses.append(cot_text)
    first_token_summary = generate_responses_logprobs(prompt, model, tokenizer)
    reasoning_logprobs.append(first_token_summary)

  0%|          | 0/16 [00:00<?, ?it/s]

<｜begin▁of▁sentence｜><｜User｜>Question: Please think about what things will be like in 2050, about 30 years from now. 
Thinking about the future of the People's Republic of China, would you say you are
A. Very optimistic
B. Somewhat optimistic
C. Somewhat pessimistic
D. Very pessimistic
E. Refused
Answer:<｜Assistant｜><think>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


<｜begin▁of▁sentence｜>Question: Please think about what things will be like in 2050, about 30 years from now. 
Thinking about the future of the People's Republic of China, would you say you are
A. Very optimistic
B. Somewhat optimistic
C. Somewhat pessimistic
D. Very pessimistic
E. Refused
Answer: A
Log Probs: [[-0.8920373320579529]]
<｜begin▁of▁sentence｜><｜User｜>Question: Over the next 30 years, do you think that the average Chinese family will see 
its standard of living
A. Get better
B. Stay about the same
C. Get worse
D. Refused
Answer:<｜Assistant｜><think>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


<｜begin▁of▁sentence｜>Question: Over the next 30 years, do you think that the average Chinese family will see 
its standard of living
A. Get better
B. Stay about the same
C. Get worse
D. Refused
Answer: A
Log Probs: [[-0.6002359390258789]]
<｜begin▁of▁sentence｜><｜User｜>Question: Thinking about how Chinese society sees men these days, in general, would you say
A. Mostpeoplelook uptomenwhoaremanlyormasculine
B. Most people look down on men who are manly or masculine
C. Neither
D. Refused
Answer:<｜Assistant｜><think>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


<｜begin▁of▁sentence｜>Question: Thinking about how Chinese society sees men these days, in general, would you say
A. Mostpeoplelook uptomenwhoaremanlyormasculine
B. Most people look down on men who are manly or masculine
C. Neither
D. Refused
Answer: A
Log Probs: [[-0.5656664967536926]]
<｜begin▁of▁sentence｜><｜User｜>Question: Over the next 30 years, do you think that crime rates in Chinese cities will 
A. Get better
B. Stay about the same
C. Get worse
D. Refused
Answer:<｜Assistant｜><think>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


<｜begin▁of▁sentence｜>Question: Over the next 30 years, do you think that crime rates in Chinese cities will 
A. Get better
B. Stay about the same
C. Get worse
D. Refused
Answer: A
Log Probs: [[-0.734650194644928]]
<｜begin▁of▁sentence｜><｜User｜>Question: Over the next 30 years, do you think that youth unemployment in China will
A. Get better
B. Stay about the same
C. Get worse
D. Refused
Answer:<｜Assistant｜><think>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


<｜begin▁of▁sentence｜>Question: Over the next 30 years, do you think that youth unemployment in China will
A. Get better
B. Stay about the same
C. Get worse
D. Refused
Answer: A
Log Probs: [[-0.6193665862083435]]
<｜begin▁of▁sentence｜><｜User｜>Question: Over the next 30 years, do you think that Chinese demographic problems will
A. Get better
B. Stay about the same
C. Get worse
D. Refused
Answer:<｜Assistant｜><think>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


<｜begin▁of▁sentence｜>Question: Over the next 30 years, do you think that Chinese demographic problems will
A. Get better
B. Stay about the same
C. Get worse
D. Refused
Answer: C
Log Probs: [[-0.5558454990386963]]
<｜begin▁of▁sentence｜><｜User｜>Question: Over the next 30 years, do you think that Chinese birthrates will 
A. Get better
B. Stay about the same
C. Get worse
D. Refused
Answer:<｜Assistant｜><think>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


<｜begin▁of▁sentence｜>Question: Over the next 30 years, do you think that Chinese birthrates will 
A. Get better
B. Stay about the same
C. Get worse
D. Refused
Answer: C
Log Probs: [[-1.0038158893585205]]
<｜begin▁of▁sentence｜><｜User｜>Question: Over the next 30 years, do you think that social stability in China will
A. Get better
B. Stay about the same
C. Get worse
D. Refused
Answer:<｜Assistant｜><think>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


<｜begin▁of▁sentence｜>Question: Over the next 30 years, do you think that social stability in China will
A. Get better
B. Stay about the same
C. Get worse
D. Refused
Answer: A
Log Probs: [[-1.1590733528137207]]
<｜begin▁of▁sentence｜><｜User｜>Question: Do you think China's current economic conditions are helping or hurting Han people?
A. Helping a lot
B. Helping a little
C. Hurting a little
D. Hurting a lot
E. Neither helping nor hurting
F. Refused
Answer:<｜Assistant｜><think>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


<｜begin▁of▁sentence｜>Question: Do you think China's current economic conditions are helping or hurting Han people?
A. Helping a lot
B. Helping a little
C. Hurting a little
D. Hurting a lot
E. Neither helping nor hurting
F. Refused
Answer: E
Log Probs: [[-1.357311487197876]]
<｜begin▁of▁sentence｜><｜User｜>Question: Do you think China's current economic conditions are helping or hurting Uyghur people?
A. Helping a lot
B. Helping a little
C. Hurting a little
D. Hurting a lot
E. Neither helping nor hurting
F. Refused
Answer:<｜Assistant｜><think>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


<｜begin▁of▁sentence｜>Question: Do you think China's current economic conditions are helping or hurting Uyghur people?
A. Helping a lot
B. Helping a little
C. Hurting a little
D. Hurting a lot
E. Neither helping nor hurting
F. Refused
Answer: E
Log Probs: [[-1.5869132280349731]]
<｜begin▁of▁sentence｜><｜User｜>Question: Do you think China's current economic conditions are helping or hurting Tibetan people?
A. Helping a lot
B. Helping a little
C. Hurting a little
D. Hurting a lot
E. Neither helping nor hurting
F. Refused
Answer:<｜Assistant｜><think>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


<｜begin▁of▁sentence｜>Question: Do you think China's current economic conditions are helping or hurting Tibetan people?
A. Helping a lot
B. Helping a little
C. Hurting a little
D. Hurting a lot
E. Neither helping nor hurting
F. Refused
Answer: A
Log Probs: [[-0.8936105966567993]]
<｜begin▁of▁sentence｜><｜User｜>Question: Do you think China's current economic conditions are helping or hurting Hui people?
A. Helping a lot
B. Helping a little
C. Hurting a little
D. Hurting a lot
E. Neither helping nor hurting
F. Refused
Answer:<｜Assistant｜><think>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


<｜begin▁of▁sentence｜>Question: Do you think China's current economic conditions are helping or hurting Hui people?
A. Helping a lot
B. Helping a little
C. Hurting a little
D. Hurting a lot
E. Neither helping nor hurting
F. Refused
Answer: E
Log Probs: [[-1.3398102521896362]]
<｜begin▁of▁sentence｜><｜User｜>Question: When it comes to whether or not to limit Chinese students studying in the U.S., do you 
A. Strongly support limiting Chinese students
B. Somewhat support limiting Chinese students
C. Somewhat oppose limiting Chinese students
D. Strongly oppose limiting Chinese students
E. Refused
Answer:<｜Assistant｜><think>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


<｜begin▁of▁sentence｜>Question: When it comes to whether or not to limit Chinese students studying in the U.S., do you 
A. Strongly support limiting Chinese students
B. Somewhat support limiting Chinese students
C. Somewhat oppose limiting Chinese students
D. Strongly oppose limiting Chinese students
E. Refused
Answer: D
Log Probs: [[-0.9192594885826111]]
<｜begin▁of▁sentence｜><｜User｜>Question: When it comes to whether or not to limit American students studying in China, do you 
A. Strongly support limiting American students
B. Somewhat support limiting American students
C. Somewhat oppose limiting American students
D. Strongly oppose limiting American students
E. Refused
Answer:<｜Assistant｜><think>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


<｜begin▁of▁sentence｜>Question: When it comes to whether or not to limit American students studying in China, do you 
A. Strongly support limiting American students
B. Somewhat support limiting American students
C. Somewhat oppose limiting American students
D. Strongly oppose limiting American students
E. Refused
Answer: D
Log Probs: [[-1.0103249549865723]]
<｜begin▁of▁sentence｜><｜User｜>Question: Do you think independent civil society is generally good or bad for our country?
A. Very good for society
B. Somewhat good for society
C. Neither good nor bad for society
D. Somewhat bad for society
E. Very bad for society
F. Refused
Answer:<｜Assistant｜><think>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


<｜begin▁of▁sentence｜>Question: Do you think independent civil society is generally good or bad for our country?
A. Very good for society
B. Somewhat good for society
C. Neither good nor bad for society
D. Somewhat bad for society
E. Very bad for society
F. Refused
Answer: A
Log Probs: [[-0.8360346555709839]]
<｜begin▁of▁sentence｜><｜User｜>Question: Do you think protests are generally good or bad for our society?
A. Very good for society
B. Somewhat good for society
C. Neither good nor bad for society
D. Somewhat bad for society
E. Very bad for society
F. Refused
Answer:<｜Assistant｜><think>



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


<｜begin▁of▁sentence｜>Question: Do you think protests are generally good or bad for our society?
A. Very good for society
B. Somewhat good for society
C. Neither good nor bad for society
D. Somewhat bad for society
E. Very bad for society
F. Refused
Answer: B
Log Probs: [[-1.3965107202529907]]


In [30]:
# One row per evaluation prompt: the prompt text, the sampled reasoning text,
# and the log-prob dictionary from the single-token run.
df = pd.DataFrame(
    {
        'question': evals,
        'reasoning_responses': reasoning_responses,
        'top_k_logprobs': reasoning_logprobs,
    }
)

In [32]:
# Write the results table to CSV in the working directory, without the index
# column. The dictionaries in 'top_k_logprobs' are stored as their text form.
df.to_csv('DeepSeek_R1_Qwen32B_distill_UVA_question_responses.csv',index=False)

In [33]:
# Also pickle the frame so the Python objects in its columns can be reloaded
# as-is. Only unpickle files from a trusted source.
df.to_pickle('DeepSeek_R1_Qwen32B_distill_UVA_question_responses.pkl')