In [40]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch, json
import random
import numpy as np
import tqdm
from utils_batch import InfillingModel
from torch.nn.functional import log_softmax
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc



device = "cuda:1" # for GPU usage or "cpu" for CPU usage

In [41]:
# setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Using device: cuda

NVIDIA GeForce GTX 1050 Ti with Max-Q Design
Memory Usage:
Allocated: 0.7 GB
Cached:    2.5 GB


In [42]:
def plot_roc_curve(human_scores, gpt_scores):
    # Data
    A, B = human_scores, gpt_scores
    # Combine scores and true labels
    scores = A + B
    labels = [0] * len(A) + [1] * len(B)
    # check amount of NaNs in scores
    if np.isnan(human_scores).any():
        print("Warning: NaNs in scores")
        print(np.argwhere(np.isnan(scores)))
    if np.isnan(gpt_scores).any():
        print("Warning: NaNs in scores")
        print(np.argwhere(np.isnan(scores)))
    # Calculate ROC curve
    fpr, tpr, thresholds = roc_curve(labels, scores)
    # Calculate AUC (Area Under Curve)
    roc_auc = auc(fpr, tpr)
    # Plot ROC curve
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.4f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve: Open-gen w/ GPT3.5-Reddit w prompts' )
    plt.legend(loc="lower right")
    plt.show()
    # what is the TPR for FPR = 0.1?
    for idx, fpr_ in enumerate(fpr):
        if fpr_ > 0.1:
            print(f"TPR at 1% FPR: {tpr[idx]:.4f}")
            break
    return roc_auc, tpr[idx]

In [43]:
model_name = "NinedayWang/PolyCoder-160M"
# NinedayWang/PolyCoder-0.4B
# NinedayWang/PolyCoder-2.7B
# model_name = 'codeparrot/codeparrot'
PyCodeGPT = AutoModelForCausalLM.from_pretrained( model_name ).to(device)
PyCodeGPT_tokenizer = AutoTokenizer.from_pretrained( model_name )

In [44]:
# from transformers import LlamaForCausalLM, LlamaTokenizer
# model_name = "/data/xianjun/project/llama/7B_hf/"
# model = LlamaForCausalLM.from_pretrained( model_name ).half().to(device) #.half() to use FP16
# model.eval() 
# PyCodeGPT = model
# PyCodeGPT_tokenizer = LlamaTokenizer.from_pretrained( model_name ) #.half() to use FP16

In [45]:
from transformers import AutoConfig
# Load the model's configuration
config = AutoConfig.from_pretrained(model_name)
# Get the default max_length
max_length = config.max_position_embeddings
max_length

2048

In [46]:
# give an input, return the logits of input tokens
inputs = 'this is a test'
truncate_ratio=0.9
def get_logprob1(inputs):
    input_ids = PyCodeGPT_tokenizer.encode(inputs, return_tensors='pt').to(device)
    input_ids = input_ids[:, :max_length]
    with torch.no_grad():
        output = PyCodeGPT(input_ids)
    logits = output[0]
    # Assuming the `logits` tensor contains the output from the model
    log_probs = log_softmax(logits, dim=-1)
    # Select the log probabilities for the specific tokens in the input
    input_log_probs = log_probs[0, torch.arange(log_probs.size(1)), input_ids[0]]
    # Multiply by -1 to get the negative log probabilities
    neg_log_probs = -input_log_probs
    neg_log_probs = neg_log_probs.cpu().numpy().tolist()
    return np.average( neg_log_probs[int( truncate_ratio*len(neg_log_probs)): ] ), neg_log_probs

In [47]:
#### merge the datasets
with open('FIM/codebert_interview_test_FIM_human_combined_line_8_per_56.jsonl', 'r') as f:
    data1  = [json.loads(line) for line in f]
len(data1)

119

In [48]:
# sample couple of lines and json dump them
machine_text = data1[2]
print(json.dumps(machine_text, indent=4))

{
    "code": "n = int(input())\na = list(map(int,input().split()))\nx = 1\ny = 10**9\nif n == 1:\n    print('YES')\n    print(y,x)\nelse:\n    t = 0\n    for i in range(1,n):\n        s = a[i]-a[i-1]\n        if s != 1 and s != -1:\n            s = max(s,-s)\n            if (x != 1 and x != s) or s == 0:\n                print('NO')\n                t = 1\n                break\n            x = s\n    if t == 0 and x > 1:\n        for i in range(1,n):\n            if (a[i] % x == 0 and a[i-1] == a[i]+1) or (a[i-1] % x == 0 and a[i] == a[i-1]+1):\n                print('NO')\n                t = 1\n                break\n    if t == 0:\n        print('YES')\n        print(y,x)",
    "label": 0,
    "label_name": "human_written",
    "difficulty": "interview",
    "original_source": "results\\gemma-7b-it-apps_interview_207.jsonl",
    "problem_id": "166",
    "question": "Provide me the Python3 codes for solving the question: There is a matrix A of size x \u00d7 y filled with integers. 

In [49]:
human_sample = data1[6]
human_text = human_sample['code']
print(json.dumps(human_sample, indent=4))

{
    "code": "from collections import Counter\nbits = (10**18).bit_length()\nn, k = map(int, input().split())\nnum = Counter(i for i in range(bits) if (n >> i) & 1)\nk -= len(num)\nif k >= 0:\n\tprint('Yes')\n\tfor i in range(bits, -bits, -1):\n\t\tif num[i] > k: break\n\t\tnum[i-1] += num[i] * 2\n\t\tk -= num.pop(i, 0)\n\ti = next(filter(num.get, range(-bits, bits)))\n\tfor k in range(k):\n\t\tnum[i] -= 1\n\t\tnum[i-1] += 2\n\t\ti -= 1\n\ts = sorted(num.elements(), reverse=True)\n\tprint(' '.join(map(str, s)))\nelse:\n\tprint('No')",
    "label": 0,
    "label_name": "human_written",
    "difficulty": "interview",
    "original_source": "results\\gemma-7b-it-apps_interview_207.jsonl",
    "problem_id": "55",
    "question": "Provide me the Python3 codes for solving the question: Jamie is preparing a Codeforces round. He has got an idea for a problem, but does not know how to solve it. Help him write a solution to the following problem:\n\nFind k integers such that the sum of two to t

In [50]:
len(data1[0]['FIM_code'])

56

In [51]:
avg_neg_log_probs, neg_log_probs = get_logprob1(human_text)
neg_log_probs[-500:]

[7.7765421867370605,
 11.395298957824707,
 10.865435600280762,
 13.264237403869629,
 1.3811554908752441,
 12.703011512756348,
 11.06181812286377,
 7.094317436218262,
 9.777111053466797,
 14.705423355102539,
 11.376411437988281,
 15.717781066894531,
 10.995134353637695,
 19.216289520263672,
 20.137516021728516,
 12.427679061889648,
 1.5436408519744873,
 13.915596961975098,
 10.90930461883545,
 17.735219955444336,
 12.18374252319336,
 16.599407196044922,
 12.371841430664062,
 12.969045639038086,
 14.490755081176758,
 14.261247634887695,
 14.562934875488281,
 12.88753604888916,
 16.015947341918945,
 1.8765432834625244,
 12.981420516967773,
 11.490097999572754,
 17.503171920776367,
 12.983759880065918,
 15.465143203735352,
 13.319483757019043,
 13.573202133178711,
 7.395870685577393,
 13.796298027038574,
 12.160466194152832,
 16.943201065063477,
 16.694782257080078,
 12.390918731689453,
 8.827608108520508,
 16.526962280273438,
 15.833083152770996,
 14.20888614654541,
 14.790316581726074,
 

In [52]:
# give an input, return the logits of input tokens
truncate_ratio=0.99
def get_logprob(inputs):
    input_ids = PyCodeGPT_tokenizer.encode(inputs, return_tensors='pt').to(device)
    input_ids = input_ids[:, :max_length]
    with torch.no_grad():
        output = PyCodeGPT(input_ids)
    logits = output[0]
    # Assuming the `logits` tensor contains the output from the model
    log_probs = log_softmax(logits, dim=-1)
    # Select the log probabilities for the specific tokens in the input
    input_log_probs = log_probs[0, torch.arange(log_probs.size(1)), input_ids[0]]
    # Multiply by -1 to get the negative log probabilities
    neg_log_probs = -input_log_probs
    neg_log_probs = neg_log_probs.cpu().numpy().tolist()
    return np.average( neg_log_probs[int( truncate_ratio*len(neg_log_probs)): ] )

In [64]:
prob_all = []
for id, ins in tqdm.tqdm(enumerate(data1), total=len(data1)):
    temp = []
    if len( ins['FIM_code']) > 1:
        original_score = get_logprob(ins['code'])
        miu_scores = []
        for i in range( len(ins['FIM_code'] ) ):
            if not  ins['FIM_code'][i]['completed']:
                continue
            one_regen = ins['FIM_code'][i]['text']
            miu_scores.append(get_logprob( one_regen ))
        miu_scores_average_score = np.ma.average( miu_scores )
        dx = original_score - miu_scores_average_score
        prob_all.append(dx)

 39%|███▉      | 47/119 [01:08<01:45,  1.47s/it]


RuntimeError: CUDA error: an illegal instruction was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [54]:

    # gold_prob_all = []
    # for id, ins in tqdm.tqdm(enumerate(data1), total=len(data1)):
    #     temp = []
    #     if len( ins['human_gen_text']['choices'] ) > 1:
    #         original_score = get_logprob( ins['gold_completion'] ) - get_logprob( ins['human_prefix_prompt'] )
    #         miu_scores = []
    #         for i in range( len(ins['human_gen_text']['choices'] ) ):
    #             one_regen = ins['human_prefix_prompt'] + ins['human_gen_text']['choices'][i]['message']['content']
    #             miu_scores.append( get_logprob( one_regen ) - get_logprob( ins['human_prefix_prompt'] ) )
    #         miu_scores_average_score = np.average( miu_scores )
    #         dx = original_score - miu_scores_average_score
    #         gold_prob_all.append( dx )
   

In [61]:
prob_all

[3.533696227981931,
 -2.400277553392308,
 2.690814860066851,
 2.525752275640315,
 -2.1271651669272353,
 6.038428385059039,
 1.2674132795079984,
 1.3017442576739242,
 3.3313146517094623,
 4.996229716709681,
 -0.24602565765380824,
 3.4676328305178483,
 0.7867422621883815,
 -0.219368015016828,
 3.7415604659746773,
 2.5951541572146937,
 1.5688024376377925,
 4.05504690806071,
 3.0599955972635513,
 0.6212443422388141,
 3.363723801977841,
 2.65057686033607,
 0.399252839720976,
 -2.293603616546501,
 -1.8913909878049573,
 3.623161866809383,
 4.461413056055704,
 3.934613739450773,
 0.22344718660627194,
 4.237811704476677,
 2.0145763057146873,
 3.2908351761954187,
 4.780990141017691,
 5.847730254789568,
 1.534677803516388,
 1.1072322492246265,
 1.3789558894193936,
 3.8242157760419335,
 2.081522485356274,
 -2.3047242571289317,
 2.7981544367472324,
 2.7345598674955838,
 1.9969412277142204,
 -0.20657241821288963,
 -4.42890865361249,
 5.037077218836004,
 2.1914856737835056,
 0.244255052546702,
 -0.93

In [63]:
print(len(prob_all))

115


In [62]:
human_scores = []
machine_scores = []
for ind, ins in tqdm.tqdm(enumerate(data1), total=len(data1)):
    #check for nan
    if np.isnan(prob_all[ind]):
        continue
    if ins['label'] == 0:
        human_scores.append(prob_all[ind])
    else:
        machine_scores.append(prob_all[ind])
        

 97%|█████████▋| 115/119 [00:00<00:00, 701082.79it/s]


IndexError: list index out of range

In [None]:
print(len(human_scores))
print(human_scores)

In [None]:
print(len(machine_scores))
print(machine_scores)

In [None]:
# plot and give different colors
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 5))
plt.plot(sorted(human_scores), label='human')
plt.plot(sorted(machine_scores), label='gpt')
plt.legend()
plt.show()

In [None]:


plot_roc_curve(human_scores, machine_scores)

In [None]:
# count negative values
print( len( [i for i in human_scores if i < 0] ) )
print( len( [i for i in machine_scores if i < 0] ) )

In [None]:
print(gold_prob_all)