In [4]:
import torch
from transformers import AutoModelForCausalLM, LlamaTokenizer

model_name = './llama-2-7b-hf'

tokenizer = LlamaTokenizer.from_pretrained(model_name)

import subprocess
import sys
import pandas as pd
import io

def get_free_gpu():
    gpu_stats = subprocess.check_output(["nvidia-smi", "--format=csv", "--query-gpu=memory.used,memory.free"])
    gpu_stats = gpu_stats.decode('utf-8')
    gpu_df = pd.read_csv(io.StringIO(gpu_stats))
    gpu_df["memory.free"] = gpu_df[' memory.free [MiB]']
    gpu_df['memory.free'] = gpu_df['memory.free'].map(lambda x: x.rstrip(' [MiB]')).astype('float32')
    idx = gpu_df['memory.free'].idxmax()
    print('Returning GPU{} with {} free MiB'.format(idx, gpu_df.iloc[idx]['memory.free']))
    return idx
device = torch.device("cuda:" + str(get_free_gpu()) if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

Returning GPU2 with 16063.0 free MiB


Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.45s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB. GPU 2 has a total capacity of 31.74 GiB of which 119.38 MiB is free. Process 1834150 has 8.00 GiB memory in use. Process 1852644 has 7.98 GiB memory in use. Including non-PyTorch memory, this process has 15.57 GiB memory in use. Of the allocated memory 15.27 GiB is allocated by PyTorch, and 1.40 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [2]:
def get_perplexity(document , file):
    p = []
    count = 0
    for text in document:
        encodings = tokenizer("".join(text), return_tensors="pt")
        max_length = model.config.max_length
        stride = 1024
        seq_len = encodings.input_ids.size(1)
        nlls = []
        prev_end_loc = 0
        for begin_loc in (range(0, seq_len, stride)):
            end_loc = min(begin_loc + max_length, seq_len)
            trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
            input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
            target_ids = input_ids.clone()
            target_ids[:, :-trg_len] = -100

            with torch.no_grad():
                outputs = model(input_ids, labels=target_ids)
                neg_log_likelihood = outputs.loss

            nlls.append(neg_log_likelihood)

            prev_end_loc = end_loc
            if end_loc == seq_len:
                break

        ppl = torch.exp(torch.stack(nlls , dim = 0))
        p.append({f"per_of_{file}": ppl.item() , f"seq_of_{file}" : seq_len})
    return p


In [1]:

import os 
import pickle
import traceback
directory = "Dataset/Attacked/"

directories = ['Dataset/Attacked/NewData/']
recursive_para = {}
for directory in directories:
        for folder in os.listdir(directory):
            print(folder)
            for subfolder in os.listdir(directory + folder):
                print("-->" , subfolder)
                try :
                    for i , files in enumerate(os.listdir(directory  + folder + "/" + subfolder)):
                        file = f"{directory}/{folder}/{subfolder}/{files}"
                        recursive_para[file] = recursive_para.get(file , [])
                    
                        with open(file , "rb") as f:
                                data = pickle.load(f)
                        recursive_para[file].append(get_perplexity(data , f"{folder}_{subfolder}_{i}"))
                except Exception as e:
                    print(traceback.format_exc())
        

Paraphraed_PivotTranslation
--> semantics
----> Dataset/Attacked/NewData//Paraphraed_PivotTranslation/semantics/llm_watermarked_semantics_pivot_translated.pkl
Traceback (most recent call last):
  File "/tmp/ipykernel_2277690/4008941631.py", line 21, in <module>
    recursive_para[file].append(get_perplexity(data , f"{folder}_{subfolder}_{i}"))
NameError: name 'get_perplexity' is not defined

--> sir
----> Dataset/Attacked/NewData//Paraphraed_PivotTranslation/sir/llm_watermarked_sir_pivot_translated.pkl
Traceback (most recent call last):
  File "/tmp/ipykernel_2277690/4008941631.py", line 21, in <module>
    recursive_para[file].append(get_perplexity(data , f"{folder}_{subfolder}_{i}"))
NameError: name 'get_perplexity' is not defined

--> kwg
----> Dataset/Attacked/NewData//Paraphraed_PivotTranslation/kwg/llm_watermarked_kwg_pivot_translated.pkl
Traceback (most recent call last):
  File "/tmp/ipykernel_2277690/4008941631.py", line 21, in <module>
    recursive_para[file].append(get_perp

In [22]:
dfs = []
for key in recursive_para.keys():
    dfs.append(pd.DataFrame(recursive_para[key][0]))
df = pd.concat(dfs , axis = 1)
# pd.set_option('display.max_rows', 10)
df

Unnamed: 0,per_of_RERERETE_semantics_0,seq_of_RERERETE_semantics_0,per_of_RERERETE_semantics_1,seq_of_RERERETE_semantics_1,per_of_RERERETE_semantics_2,seq_of_RERERETE_semantics_2,per_of_RERERETE_semantics_3,seq_of_RERERETE_semantics_3,per_of_RERERETE_semantics_4,seq_of_RERERETE_semantics_4,...,per_of_RNormalTranslation_kwg_0,seq_of_RNormalTranslation_kwg_0,per_of_RNormalTranslation_sir_0,seq_of_RNormalTranslation_sir_0,per_of_RPivotTranslation_semantics_0,seq_of_RPivotTranslation_semantics_0,per_of_RPivotTranslation_kwg_0,seq_of_RPivotTranslation_kwg_0,per_of_RPivotTranslation_sir_0,seq_of_RPivotTranslation_sir_0
0,49.683483,179,97.792519,146,94.723030,195,12.597280,288,15.162140,213,...,40.119129,303,40.119129,197,33.094795,227,33.094795,224,33.094795,201
1,35.493668,72,48.557602,72,29.088301,81,61.883507,122,38.476810,94,...,69.377960,239,287.683105,340,74.275902,161,74.686523,230,66.842415,137
2,18.961502,152,42.149338,100,32.718262,162,13.673713,171,17.271112,166,...,11.553406,330,11.553406,495,43.565704,250,44.815575,243,11.553406,198
3,24.140652,67,29.432497,46,28.021360,68,24.140652,93,26.303492,63,...,34.435089,231,50.323181,84,60.475807,93,85.276672,166,21.317049,242
4,51.526081,76,45.434662,78,60.044567,82,48.849869,231,31.012699,213,...,71.921051,308,70.718254,117,170.488129,237,62.984909,223,71.921051,219
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,71.033676,136,109.698463,140,71.033676,153,71.033676,208,79.333473,182,...,310.886993,272,179.795166,108,179.795166,217,179.795166,188,179.795166,246
96,14.939868,160,12.893393,153,26.084930,162,55.187366,207,38.400303,181,...,109.062309,308,53.918785,232,53.918785,208,89.439484,216,142.187988,201
97,88.797806,190,35.992794,139,21.363985,226,16.375364,311,16.375364,275,...,48.431953,173,48.885605,56,87.150803,304,48.431953,254,48.431953,237
98,215.114822,79,66.307434,82,124.288757,157,40.337471,222,40.977825,192,...,24.024097,170,63.339069,168,24.014111,233,63.339069,217,63.339069,217


In [19]:
df.to_csv('Per_ult1_ultimate.csv', index = False)

In [20]:
df.describe()

Unnamed: 0,per_of_RERERETE_semantics_0,seq_of_RERERETE_semantics_0,per_of_RERERETE_semantics_1,seq_of_RERERETE_semantics_1,per_of_RERERETE_semantics_2,seq_of_RERERETE_semantics_2,per_of_RERERETE_semantics_3,seq_of_RERERETE_semantics_3,per_of_RERERETE_semantics_4,seq_of_RERERETE_semantics_4,...,per_of_RNormalTranslation_kwg_0,seq_of_RNormalTranslation_kwg_0,per_of_RNormalTranslation_sir_0,seq_of_RNormalTranslation_sir_0,per_of_RPivotTranslation_semantics_0,seq_of_RPivotTranslation_semantics_0,per_of_RPivotTranslation_kwg_0,seq_of_RPivotTranslation_kwg_0,per_of_RPivotTranslation_sir_0,seq_of_RPivotTranslation_sir_0
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,45.053815,147.51,44.709146,135.64,40.937385,164.5,35.445877,242.25,37.459511,194.05,...,54.307177,283.83,53.122877,209.97,54.131676,229.4,53.320258,225.54,54.874228,223.76
std,39.004678,50.460405,35.830768,48.218886,34.417608,52.422237,25.036865,60.046678,28.224457,58.325519,...,48.773153,47.352761,46.072697,145.872606,47.450341,36.569845,41.06685,35.083859,47.350531,40.652288
min,5.695521,35.0,7.05897,35.0,6.69065,36.0,6.296433,93.0,6.296433,58.0,...,8.242137,92.0,8.007092,45.0,12.752916,93.0,9.323277,120.0,8.242326,126.0
25%,20.923317,119.75,23.944107,109.75,20.815385,140.75,16.91251,207.0,17.288947,153.5,...,25.597847,266.5,24.731002,90.75,28.824824,210.0,26.221711,203.75,22.732635,201.0
50%,33.428822,150.0,33.584003,134.5,30.664355,170.5,28.321187,251.0,28.518443,201.5,...,39.577257,295.0,39.840649,153.0,41.295092,227.5,41.261217,223.5,36.082155,225.5
75%,52.144745,182.25,50.783721,161.75,44.900572,198.25,46.55951,288.75,46.053268,238.25,...,62.597472,310.0,62.290608,329.75,58.352207,245.0,60.635207,243.0,64.751614,245.25
max,227.854034,341.0,234.310791,342.0,176.275238,345.0,123.838623,351.0,154.057709,368.0,...,310.886993,371.0,287.683105,528.0,361.002045,340.0,194.025314,349.0,203.917236,318.0


In [25]:
a = [i for i in df.columns if 'en' in i ]
df[a].describe()

Unnamed: 0,per_of_en_semantics_0,seq_of_en_semantics_0,per_of_en_kwg_0,seq_of_en_kwg_0,per_of_en_sir_0,seq_of_en_sir_0,per_of_inputs_en_0,seq_of_inputs_en_0
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,53.570123,316.77,25.124003,337.05,25.147391,239.39,25.124003,323.81
std,44.00011,66.014516,18.821298,50.669931,18.885883,170.50945,18.821298,67.598891
min,3.292984,106.0,3.292984,123.0,3.292984,45.0,3.292984,110.0
25%,25.920447,317.75,13.706877,333.75,13.442638,97.0,13.706877,319.75
50%,42.264744,342.5,19.033783,354.0,19.629847,164.5,19.033783,353.0
75%,64.192065,356.0,29.942201,365.0,30.351958,369.5,29.942201,363.0
max,286.005646,401.0,129.825012,407.0,129.825012,550.0,129.825012,407.0


per_of_RERERETE_semantics_0
seq_of_RERERETE_semantics_0
per_of_RERERETE_semantics_1
seq_of_RERERETE_semantics_1
per_of_RERERETE_semantics_2
seq_of_RERERETE_semantics_2
per_of_RERERETE_semantics_3
seq_of_RERERETE_semantics_3
per_of_RERERETE_semantics_4
seq_of_RERERETE_semantics_4
per_of_RERERETE_kwg_0
seq_of_RERERETE_kwg_0
per_of_RERERETE_kwg_1
seq_of_RERERETE_kwg_1
per_of_RERERETE_kwg_2
seq_of_RERERETE_kwg_2
per_of_RERERETE_kwg_3
seq_of_RERERETE_kwg_3
per_of_RERERETE_kwg_4
seq_of_RERERETE_kwg_4
per_of_RERERETE_SIR_0
seq_of_RERERETE_SIR_0
per_of_RERERETE_SIR_1
seq_of_RERERETE_SIR_1
per_of_RERERETE_SIR_2
seq_of_RERERETE_SIR_2
per_of_RERERETE_SIR_3
seq_of_RERERETE_SIR_3
per_of_RERERETE_SIR_4
seq_of_RERERETE_SIR_4
per_of_RNormalTranslation_semantics_0
seq_of_RNormalTranslation_semantics_0
per_of_RNormalTranslation_kwg_0
seq_of_RNormalTranslation_kwg_0
per_of_RNormalTranslation_sir_0
seq_of_RNormalTranslation_sir_0
per_of_ReTranslatedRecusrive_semantics_0
seq_of_ReTranslatedRecusrive_semant