In [1]:
import os
# Move the working directory one level up so the relative "./data/..." paths
# used below resolve. Not idempotent: run this cell once per kernel session.
os.chdir(os.pardir)

In [9]:
import ast
import json
import os

import pandas as pd
import torch
from torch.cuda import is_available
from tqdm import tqdm
from transformers import pipeline

# Get NLI scores

In [17]:
# One entry per experiment. Each entry names the data folder, a short
# identifier, the column holding the input text, and the column that will
# receive the NLI scores.
config_list = [
    dict(
        PATH="./data/climate_change/",
        NAME="CCC",
        SOURCE_COLUMN="text",
        TARGET_COLUMN="FSL_BART",
    ),
    dict(
        PATH="./data/topic_stance/",
        NAME="TS_topic",
        SOURCE_COLUMN="Tweet",
        TARGET_COLUMN="FSL_BART_topic",
    ),
    dict(
        PATH="./data/topic_stance/",
        NAME="TS_stance",
        SOURCE_COLUMN="Tweet",
        TARGET_COLUMN="FSL_BART_stance",
    ),
    dict(
        PATH="./data/depression/",
        NAME="D_BART",
        SOURCE_COLUMN="Sentence",
        TARGET_COLUMN="FSL_BART",
    ),
]

In [18]:
# Print every available configuration next to its index in config_list.
# The loop variable is deliberately NOT called `config`: that name holds the
# selected configuration for the rest of the notebook, and reusing it here
# would silently switch the selection to the last entry whenever this cell
# is re-run.
for i, candidate in enumerate(config_list):
    print('\t*', i, ':\t', candidate['NAME'])

	* 0 :	 CCC
	* 1 :	 TS_topic
	* 2 :	 TS_stance
	* 3 :	 D_BART


In [19]:
# Select the configuration the remaining cells operate on; config_index is a
# position in config_list.
config_index = 0
config = config_list[config_index]
print(config["NAME"])

CCC


## Helper functions

In [15]:
class NLI_Classifier:
    """Score texts against a set of claims with a zero-shot NLI pipeline.

    The claims ("traits") are used as candidate labels of a Hugging Face
    ``zero-shot-classification`` pipeline run with ``multi_label=True``. For
    every row of a DataFrame the text in ``source_column`` is scored against
    all claims and the resulting ``{claim text: score}`` dict is stored in
    ``target_column``.
    """

    def __init__(self, traits, model_name, source_column='text', target_column='ZSL'):
        """Load the pipeline and prepare the claim groups.

        Args:
            traits: dict mapping a claim key to the claim text. Only the texts
                are sent to the model; the keys are used for grouping.
            model_name: model identifier handed to ``pipeline``; ``None``
                selects ``'facebook/bart-large-mnli'``.
            source_column: column holding the text to score.
            target_column: column that receives the score dict.
        """
        if model_name is None:
            model_name = 'facebook/bart-large-mnli'
        if is_available():
            print('Using GPU')
            self.classifier = pipeline("zero-shot-classification", model=model_name, device=0)
        else:
            self.classifier = pipeline("zero-shot-classification", model=model_name)
        self.source_column = source_column
        self.target_column = target_column
        self.traits = self._group_traits(traits)

    @staticmethod
    def _group_traits(traits, max_group_size=60, prefix_length=3):
        """Split a large claim dict into one dict per key prefix.

        With at most ``max_group_size`` claims a single group is returned.
        Otherwise claims are grouped by the first ``prefix_length`` characters
        of their key, in sorted prefix order.
        NOTE(review): presumably this keeps the number of candidate labels per
        pipeline call manageable -- confirm the intent of the threshold.
        """
        if len(traits) <= max_group_size:
            return [traits]
        prefixes = sorted({key[:prefix_length] for key in traits})
        return [
            {key: traits[key] for key in traits if key[:prefix_length] == prefix}
            for prefix in prefixes
        ]

    def zsl_multi_classifier(self, sequence, context):
        """Return ``{label: score}`` for ``sequence`` against the labels in ``context``.

        The empty hypothesis template ``"{}"`` means each label is used
        verbatim as the NLI hypothesis.
        """
        results = self.classifier(sequence, context, hypothesis_template="{}", multi_label=True)
        return dict(zip(results['labels'], results['scores']))

    def df_apply_ZSL(self, row):
        """Score one row and store the merged score dict in ``target_column``.

        The row is modified in place and returned, which is the shape
        ``DataFrame.apply(..., axis=1)`` expects.
        """
        results = dict()
        for traits in self.traits:
            results.update(self.zsl_multi_classifier(row[self.source_column], list(traits.values())))

        row[self.target_column] = results
        return row

    def run(self, dataframe, out_file, chunksize=12):
        """Score ``dataframe`` chunk by chunk, optionally checkpointing to CSV.

        Args:
            dataframe: frame containing ``source_column``. Resuming assumes a
                single-level index.
            out_file: ``None`` to keep results in memory only, or a path to a
                CSV checkpoint. Chunks are appended without a header row and
                with the index as first column. If the file already exists its
                rows are taken to be the first rows of ``dataframe`` and
                scoring resumes after them.
            chunksize: number of rows scored between two checkpoint writes.

        Returns:
            DataFrame with all rows and the added ``target_column``. Rows
            loaded from an existing checkpoint carry the scores as the string
            that was written to the CSV, not as a dict.

        Raises:
            TypeError: if ``out_file`` is neither ``None`` nor a path.
            ValueError: if ``chunksize`` is smaller than 1, or the checkpoint
                does not have the expected number of columns.
        """
        if out_file is None:
            out_file_valid = False
        elif isinstance(out_file, (str, os.PathLike)):
            out_file_valid = True
        else:
            raise TypeError('"out_file" is of the wrong type, expected str')

        if chunksize < 1:
            raise ValueError('"chunksize" must be at least 1')

        # Column layout of a scored chunk: the input columns plus the target.
        out_columns = list(dataframe.columns)
        if self.target_column not in out_columns:
            out_columns.append(self.target_column)

        chunks = []
        start_line = 0
        if out_file_valid and os.path.isfile(out_file) and os.path.getsize(out_file) > 0:
            # The checkpoint has no header; column 0 is the saved index.
            already_done = pd.read_csv(out_file, header=None, index_col=0)
            already_done.columns = out_columns
            already_done.index.name = dataframe.index.name
            chunks.append(already_done)
            start_line = len(already_done)

        for i in tqdm(range(start_line, len(dataframe), chunksize)):
            sub_df = dataframe.iloc[i: i + chunksize].apply(self.df_apply_ZSL, axis=1)
            chunks.append(sub_df)

            if out_file_valid:
                sub_df.to_csv(out_file, mode="a", header=False)

        if not chunks:
            return pd.DataFrame(columns=out_columns)
        # Concatenate once instead of growing a frame inside the loop.
        return pd.concat(chunks)

## Load data

In [12]:
# Load the train and test splits of the selected dataset from its folder.
train_df, test_df = (
    pd.read_pickle(os.path.join(config['PATH'], split_file))
    for split_file in ('training.pkl', 'testing.pkl')
)

In [10]:
# The two topic/stance configurations have their own claims file; every other
# configuration uses the generic 'claims.json' in its data folder.
claims_filename = {
    "TS_topic": 'claims_topic.json',
    "TS_stance": 'claims_stance.json',
}.get(config['NAME'], 'claims.json')

with open(os.path.join(config['PATH'], claims_filename)) as file:
    claims = json.load(file)

# Split the class descriptions off so that `claims` only holds the claims.
class_descr = claims.pop("class_descr")

In [20]:
# Build the scorer for the selected dataset.
classifier = NLI_Classifier(
    claims,
    None,  # no explicit model name: the class falls back to its default
    source_column=config['SOURCE_COLUMN'],
    target_column=config['TARGET_COLUMN'],
)

Downloading model.safetensors: 100%|███████████████████████████████████████████| 1.63G/1.63G [01:36<00:00, 16.8MB/s]


## Run NLI model

In [None]:
# Score the training split, checkpointing to training.csv in the data folder.
# The frame loaded earlier is called train_df, and the path is built from
# config["PATH"] plus the file name (indexing config with a tuple raises
# KeyError).
df_train_proc = classifier.run(train_df, os.path.join(config["PATH"], "training.csv"))

In [None]:
# Score the test split, checkpointing to testing.csv in the data folder.
# The frame loaded earlier is called test_df, and the path is built from
# config["PATH"] plus the file name (indexing config with a tuple raises
# KeyError).
df_test_proc = classifier.run(test_df, os.path.join(config["PATH"], "testing.csv"))

## Convert file to pickle

In [None]:
# The checkpoint is appended with header=False and with the index as first
# column, so read it without header inference (which would swallow the first
# data row) and restore the column names from the input frame.
df_train_proc = pd.read_csv(
    os.path.join(config["PATH"], "training.csv"),
    header=None,
    index_col=0,
)
df_train_proc.columns = (
    [col for col in train_df.columns if col != config["TARGET_COLUMN"]]
    + [config["TARGET_COLUMN"]]
)

In [None]:
# The CSV stores each score dict as its string representation. Parse it back
# with ast.literal_eval, which only accepts Python literals, instead of eval,
# which would execute whatever the file contains. Values that are already
# parsed are left untouched so the cell can be re-run safely.
df_train_proc[config['TARGET_COLUMN']] = df_train_proc[config['TARGET_COLUMN']].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [None]:
# Persist the scored training split; note this overwrites the input pickle.
# The path is config["PATH"] joined with the file name (indexing config with
# a tuple raises KeyError).
df_train_proc.to_pickle(os.path.join(config["PATH"], "training.pkl"))

In [None]:
# Remove the intermediate CSV checkpoint now that the pickle has been written.
# The path is config["PATH"] joined with the file name (indexing config with
# a tuple raises KeyError).
os.remove(os.path.join(config["PATH"], "training.csv"))

In [None]:
# The checkpoint is appended with header=False and with the index as first
# column, so read it without header inference (which would swallow the first
# data row) and restore the column names from the input frame.
df_test_proc = pd.read_csv(
    os.path.join(config["PATH"], "testing.csv"),
    header=None,
    index_col=0,
)
df_test_proc.columns = (
    [col for col in test_df.columns if col != config["TARGET_COLUMN"]]
    + [config["TARGET_COLUMN"]]
)

In [None]:
# The CSV stores each score dict as its string representation. Parse it back
# with ast.literal_eval, which only accepts Python literals, instead of eval,
# which would execute whatever the file contains. Values that are already
# parsed are left untouched so the cell can be re-run safely.
df_test_proc[config['TARGET_COLUMN']] = df_test_proc[config['TARGET_COLUMN']].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [None]:
# Persist the scored test split; note this overwrites the input pickle.
# The path is config["PATH"] joined with the file name (indexing config with
# a tuple raises KeyError).
df_test_proc.to_pickle(os.path.join(config["PATH"], "testing.pkl"))

In [None]:
# Remove the intermediate CSV checkpoint now that the pickle has been written.
# The path is config["PATH"] joined with the file name (indexing config with
# a tuple raises KeyError).
os.remove(os.path.join(config["PATH"], "testing.csv"))