In [1]:
from code_new.dataset.probe import complexity_function_datast
import torch
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
from code_new.repe import repe_pipeline_registry
from code_new.inferencer.decision_maker import DecisionMaker
from sklearn.metrics import classification_report
# Presumably registers the custom "rep-reading" task that is later requested
# by name through transformers.pipeline — confirm in code_new.repe.
repe_pipeline_registry()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Auxiliary imports
import numpy as np
import pandas as pd
import json
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import mode
import random

In [3]:
# Checkpoint to load; weights are requested in float16 and device placement is
# delegated to device_map="auto".
model_name_or_path = "mistralai/Mistral-7B-Instruct-v0.1"

model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.float16,
    device_map="auto",
)

# The fast tokenizer is skipped only when the model reports a
# LlamaForCausalLM architecture.
use_fast_tokenizer = "LlamaForCausalLM" not in model.config.architectures
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    use_fast=use_fast_tokenizer,
    padding_side="left",
    legacy=False,
)
# Use token id 0 for padding (inputs are left-padded, as configured above).
tokenizer.pad_token_id = 0

Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.57s/it]


In [4]:
# Token index passed to the rep-reading calls (-1 for now; presumably the
# last token of each prompt — confirm against the pipeline implementation).
rep_token = -1
# Negative layer indices: -1, -2, ..., -(num_hidden_layers - 1).
hidden_layers = [-depth for depth in range(1, model.config.num_hidden_layers)]
n_difference = 1
direction_method = 'pca'  # principal component analysis
# Instantiate the rep-reading pipeline around the loaded model and tokenizer.
rep_reading_pipeline = pipeline(
    "rep-reading",
    model=model,
    tokenizer=tokenizer,
)

Device set to use cuda:0


In [5]:
sample_size = 300
random_state = 0  # fixed seed handed to the dataset builder
# Alternative: draw a fresh seed and print it for later reference.
# random_state = random.randint(0, 100)
# print(random_state)
dataset = complexity_function_datast(
    data_path="data/quantemp_retrieved_evidence/generated using gpt/train_retrieved_with_gpt.json",
    sample_size=sample_size,
    seed=random_state,
)

In [8]:
# Fit one rep reader per complexity class on the training data and keep only
# its `directions`.
direction_kwargs = dict(
    rep_token=rep_token,
    hidden_layers=hidden_layers,
    n_difference=n_difference,
    direction_method=direction_method,
    batch_size=32,
    mean_pool="mean_pooling",
)

feature_embedding = {}
for complexity in ("simple_claims", "intermediate_claims", "complex_claims"):
    rep_reader = rep_reading_pipeline.get_directions(
        dataset[complexity],
        **direction_kwargs,
    )
    feature_embedding[complexity] = rep_reader.directions

#### Compute complexity of test dataset

In [9]:
# Build the decision model from the per-complexity directions stored in
# `feature_embedding`.
decision_model = DecisionMaker(feature_embedding=feature_embedding)

In [10]:
# Load the test claims and extract their hidden states for the selected layers.
test_dataset = complexity_function_datast(
    data_path="data/quantemp_gold/test_claims_quantemp.json",
    test=True,
)
hiddens_by_layer = rep_reading_pipeline._batched_string_to_hiddens(
    test_dataset["claims"],
    rep_token,
    hidden_layers,
    batch_size=32,
)

# Stack the per-layer entries into one array, then swap the first two axes so
# the layer axis moves from position 0 to position 1.
layer_list = [hiddens_by_layer[layer] for layer in hiddens_by_layer.keys()]
test_embedding = np.array(layer_list).transpose(1, 0, 2)
test_embedding.shape

(2495, 31, 4096)

In [11]:
# Run the decision model over the stacked test embeddings; the result is
# tallied and written to disk in the following cells.
final_decision_list = decision_model.make_decision(test_embedding=test_embedding)

100%|██████████| 2495/2495 [00:13<00:00, 183.90it/s]


In [12]:
# Tally how often each distinct decision value occurs.
unique_elements, counts = np.unique(final_decision_list, return_counts=True)
element_counts = {value: count for value, count in zip(unique_elements, counts)}

In [13]:
# Display the decision value -> count mapping.
element_counts

{0: 2072, 1: 191, 2: 232}

In [14]:
# Persist the decisions, one value per line. The `with` block closes the file
# on exit, so no explicit close() call is needed; the path has no placeholders,
# so it is a plain string rather than an f-string.
with open("output/gpt_decomp_decision.txt", "w") as fp:
    fp.writelines(f"{item}\n" for item in final_decision_list)

#### Some EDA with the assigned complexity

In [15]:
# with open("output/decomposition_decision_adjusted.txt", "r") as fp:
#     decision = [int(item) for item in fp]
# fp.close()

In [16]:
# Load the gold test claims into a DataFrame. The `with` block already closes
# the file when it exits, so no trailing close() call is needed.
with open("data/quantemp_gold/test_claims_quantemp.json", "r") as fp:
    data = pd.DataFrame(json.load(fp))
# Attach the predicted decision to each claim (assumes the rows are in the same
# order as the claims that were embedded — TODO confirm).
data["complexity"] = final_decision_list

In [33]:
label = "True"
# Count the claims assigned complexity 1 that carry the chosen label.
matches_label = (data["complexity"] == 1) & (data["label"] == label)
len(data.loc[matches_label])

13

In [None]:
# with open("data/quantemp_retrieved_evidence/train_new_adjusted.json", "r") as fp:
#     train_data = pd.DataFrame(json.load(fp))
# fp.close()

In [None]:
# label = "True"
# len(train_data[(train_data["complexity"] == 2) & (train_data["label"] == label)])

#### Data visualization and preparation for training. Ignore/remove later

In [None]:
# with open("data/quantemp_retrieved_evidence/generated using gpt/train_retrieved_with_gpt.json", "r") as fp:
#     data = json.load(fp)
# fp.close()
# for idx, sample in enumerate(data):
#     data[idx]['complexity'] = min(sample['complexity'], 2)

In [None]:
# with open("data/quantemp_retrieved_evidence/generated using gpt/train_retrieved_with_gpt.json", "w") as fp:
    # json.dump(data, fp, indent=4)

In [None]:
# complexity = 2
# label = "False"
# len(data[(data["complexity"] == complexity) & (data["label"] == label)])