In [None]:
# Load dataset
# Load transformer models

# Get same splits to use with LR classifier
# Get predictions from transformer models + LR

In [1]:
import os
import json
import pandas as pd
import numpy as np
import pprint as pp

import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
from datasets import load_from_disk



In [2]:
PROJECT_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
SPLITS_DIR = os.path.join(PROJECT_DIR, "classification/split_datasets/coqa")
MODELS_DIR = os.path.join(PROJECT_DIR, "classification/models")

models_paths_map = {
    'distilbert': os.path.join(MODELS_DIR, "distilbert-base-uncased_13091207"),
    'bert': os.path.join(MODELS_DIR, "bert_14102004"),
    'bert-large': os.path.join(MODELS_DIR, "bert-large_14101938"),
    'roberta': os.path.join(MODELS_DIR, "roberta_14102014"),
    'deberta': os.path.join(MODELS_DIR, "deberta_14102242")
}

raw_dataset = load_from_disk(SPLITS_DIR)

device = torch.device('cuda:2' if torch.cuda.is_available() else 'cpu')

In [3]:
# get gold labels
gold_labels = {
    'validation': [0 if l==False else 1 for l in raw_dataset['validation']['label']],
    'test': [0 if l==False else 1 for l in raw_dataset['test']['label']]
}

In [3]:
def init_pipe_and_predict(path_to_existing_model):
    
    tokenizer = AutoTokenizer.from_pretrained(path_to_existing_model)
    model = AutoModelForSequenceClassification.from_pretrained(path_to_existing_model)
    model.to(device)

    pipe = pipeline("text-classification",
    tokenizer=tokenizer,
    model=model,
    #top_k=None, # get confidence scores for predictions
    # `return_all_scores` is now deprecated,  if want a similar funcionality use `top_k=None` instead of `return_all_scores=True` or `top_k=1` instead of `return_all_scores=False`.
    )
    pipe.device = device

    val_preds = pipe(raw_dataset['validation']['text'])
    test_preds = pipe(raw_dataset['test']['text'])
    
    preds = {
        'validation': [
            int(pred['label'].split('_')[-1]) for pred in val_preds
        ],
        'test': [
            int(pred['label'].split('_')[-1]) for pred in test_preds
        ]
    }

    return preds

In [4]:
distilbert_preds = init_pipe_and_predict(models_paths_map['distilbert'])
bert_preds = init_pipe_and_predict(models_paths_map['bert'])
bert_large_preds = init_pipe_and_predict(models_paths_map['bert-large'])
roberta_preds = init_pipe_and_predict(models_paths_map['roberta'])
deberta_preds = init_pipe_and_predict(models_paths_map['deberta'])

In [42]:
pp.pprint(distilbert_preds, compact=True, width=200)

{'test': [1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1,
          1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
          1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
          1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0,
     

In [17]:
''' Sanity check: pandas_idx matches with order in responses-fe (and features file) '''
for i, instance in enumerate(raw_dataset['train']):
    if raw_dataset['train'][i]['pandas_idx'] <= 5:
        print(i)
        pp.pprint(instance, compact=True)
for i, instance in enumerate(raw_dataset['test']):
    if raw_dataset['test'][i]['pandas_idx'] <= 5:
        print(i)
        pp.pprint(instance, compact=True)

3404
{'label': False,
 'pandas_idx': 2,
 'text': 'First, if the choker is not in a jewelry box or boutique, it must be '
         'somewhere else. So, we need to consider other possible locations. \n'
         '\n'
         'Some options could be:\n'
         '\n'
         '- Clothing stores: Some clothing stores may sell accessories like '
         "chokers, so it's worth checking out.\n"
         '- Online marketplaces: You can search for chokers on online '
         'marketplaces like Amazon, Etsy, or eBay.\n'
         '- Second-hand stores: You may be able to find a unique choker at a '
         'second-hand store or thrift shop.\n'
         "- Craft stores: If you're feeling creative, you can make your own "
         'choker by purchasing supplies at a craft store.\n'
         '\n'
         'So, the answer would be none of the above options listed in the '
         'question.'}
6521
{'label': True,
 'pandas_idx': 1,
 'text': 'Sammy wants to go where the people are. This means he i

In [None]:
responses_path = '/mount/studenten-temp1/users/dpgo/xai-thesis/responses/12091031_parsed_turbo_10000_eval.jsonl'
features_all_path = '/mount/studenten-temp1/users/dpgo/xai-thesis/feature_extraction/12091031_all_features.csv.gz'

In [8]:
# make dummy df - data_df in lr_classifier.py
df = pd.DataFrame(columns=['f1', 'f2', 'f3', 'outcome'])
df.loc[0] = [0.1, 0.2, 0.3, False]
df.loc[1] = [0.4, 0.5, 0.6, True]
df.loc[2] = [0.7, 0.8, 0.9, False]
df.loc[3] = [0.7, 0.8, 0.9, False]
df.loc[4] = [0.7, 0.8, 0.9, False]
df.loc[5] = [0.7, 0.8, 0.9, False]
df

Unnamed: 0,f1,f2,f3,outcome
0,0.1,0.2,0.3,False
1,0.4,0.5,0.6,True
2,0.7,0.8,0.9,False
3,0.7,0.8,0.9,False
4,0.7,0.8,0.9,False
5,0.7,0.8,0.9,False


In [17]:
# dummy raw_dataset with only 2 instances per split
from datasets import Dataset
dummy_raw_dataset = Dataset.from_dict({
    'train': [
        {'pandas_idx': 0, 'label': False},
        {'pandas_idx': 1, 'label': True},
    ],
    'validation': [
        {'pandas_idx': 2, 'label': False},
        {'pandas_idx': 3, 'label': False},
    ],
    'test': [
        {'pandas_idx': 4, 'label': False},
        {'pandas_idx': 5, 'label': False},
    ]
})
dummy_raw_dataset

Dataset({
    features: ['train', 'validation', 'test'],
    num_rows: 2
})

In [18]:
dummy_raw_dataset['train'][0]['pandas_idx']

0

In [21]:
# make train_df
# it is a subset of data_df which only has the instances that are in the train split of dummy_raw_dataset

# make list of pandas_idx in train split
train_pandas_idx = [i['pandas_idx'] for i in dummy_raw_dataset['train']]

# make train_df
train_df = df[df.index.isin(train_pandas_idx)]
train_df

Unnamed: 0,f1,f2,f3,outcome
0,0.1,0.2,0.3,False
1,0.4,0.5,0.6,True


In [5]:
import sys

sys.path.append('../')

In [11]:
raw_dataset['test'].to_pandas

<bound method Dataset.to_pandas of Dataset({
    features: ['text', 'label', 'pandas_idx'],
    num_rows: 1000
})>

In [12]:
raw_dataset['test'].to_pandas()

Unnamed: 0,text,label,pandas_idx
0,"First, we need to identify the geographic feat...",True,4343
1,"The question states that ""Everyone is ordinary...",True,1794
2,"First, we know that Sam didn't like the people...",False,708
3,"After falling, the person may feel pain or dis...",False,3383
4,"When you travel with instruments, you need to ...",True,4534
...,...,...,...
995,The juror was quite bored and zoning out but w...,False,9920
996,"First, the person was ""well ahead"" at the casi...",False,6123
997,"The sentence mentions that ""they were trying t...",True,7516
998,The windshield is a part of a vehicle that pro...,True,6746
