In [1]:
import os

# Step up to the parent directory (presumably the project root, so that the
# relative "./data/..." paths used later resolve -- confirm for your checkout).
# NOTE: each execution of this cell climbs one more level; run it once per kernel.
os.chdir(os.pardir)
os.getcwd()

'/Users/sandrinechausson/Documents/easyclaimsdetection'

In [13]:
import os
import json
import pandas as pd
from tqdm import tqdm

from sentence_transformers import SentenceTransformer
import torch
from torch.cuda import is_available
from numpy import dot
from numpy.linalg import norm

# Get SBERT cosine-similarity scores

In [14]:
# One entry per dataset:
#   PATH          -- directory containing the dataset's files
#   NAME          -- short identifier used to select the configuration
#   SOURCE_COLUMN -- column holding the raw text
#   TARGET_COLUMN -- column that receives the similarity scores
config_list = [
    dict(
        PATH="./data/climate_change/",
        NAME="CCC",
        SOURCE_COLUMN="text",
        TARGET_COLUMN='sbert_cosine',
    ),
    dict(
        PATH="./data/topic_stance/",
        NAME="TS_topic",
        SOURCE_COLUMN="Tweet",
        TARGET_COLUMN='sbert_cosine',
    ),
    dict(
        PATH="./data/topic_stance/",
        NAME="TS_stance",
        SOURCE_COLUMN="Tweet",
        TARGET_COLUMN='sbert_cosine',
    ),
    dict(
        PATH="./data/depression/",
        NAME="D_BART",
        SOURCE_COLUMN="Sentence",
        TARGET_COLUMN='sbert_cosine',
    ),
]

In [15]:
# Show every available configuration next to the index used to select it.
for idx, cfg in enumerate(config_list):
    print(f"\t* {idx} :\t {cfg['NAME']}")

	* 0 :	 CCC
	* 1 :	 TS_topic
	* 2 :	 TS_stance
	* 3 :	 D_BART


In [16]:
# Choose which entry of config_list the rest of the notebook processes;
# config_index is a position in that list (0 is the first entry).
config_index = 0
config = config_list[config_index]
# Echo the selected configuration's name as a sanity check.
print(config['NAME'])

CCC


## Helper functions

In [17]:
class SBERT_Classifier:
    """Attach sentence-transformer embeddings to the rows of a DataFrame.

    Wraps a ``SentenceTransformer`` model, encodes the texts found in
    ``source_column`` and stores one embedding per row in ``target_column``.
    """

    def __init__(self, model_name, source_column, column_name='sbert_embedding'):
        """Load the sentence-transformer model.

        Args:
            model_name: Model identifier handed to ``SentenceTransformer``;
                ``None`` selects 'all-mpnet-base-v2'.
            source_column: Name of the column holding the texts to encode.
            column_name: Name of the column that receives the embeddings.
        """
        if model_name is None:
            model_name = 'all-mpnet-base-v2'
        use_cuda = is_available()
        if use_cuda:
            print('Using GPU')
            self.classifier = SentenceTransformer(model_name, device='cuda')
        else:
            print("Using CPU")
            self.classifier = SentenceTransformer(model_name)
        self.target_column = column_name
        self.source_column = source_column

    def df_apply_sbert(self, sub_df):
        """Encode the texts of ``sub_df`` and store the embeddings on it.

        The embeddings are written to ``sub_df[self.target_column]`` in place,
        so callers that want to keep their frame untouched must pass a copy.

        Args:
            sub_df: DataFrame containing ``self.source_column``.

        Returns:
            The same ``sub_df`` object, now carrying the embedding column.
        """
        texts = sub_df[self.source_column].to_list()
        # list() splits the encoder output into one entry per row so that each
        # cell of the new column holds a single embedding.
        embeddings = list(self.classifier.encode(texts))
        sub_df[self.target_column] = embeddings
        return sub_df

    def run(self, dataframe, *, chunksize=16):
        """Embed every row of ``dataframe`` chunk by chunk, with a progress bar.

        Args:
            dataframe: DataFrame containing ``self.source_column``. It is not
                modified; each chunk is copied before the column is added.
            chunksize: Number of rows encoded per step (keyword-only).

        Returns:
            A new DataFrame with the rows and index of ``dataframe`` plus the
            ``self.target_column`` column.

        Raises:
            ValueError: If ``chunksize`` is smaller than 1.
        """
        if chunksize < 1:
            raise ValueError("chunksize must be a positive integer")

        number_lines = len(dataframe)
        if number_lines == 0:
            # Nothing to encode: return an empty frame that still exposes the
            # embedding column so downstream code can rely on it.
            empty = dataframe.copy()
            empty[self.target_column] = pd.Series(dtype=object)
            return empty

        processed_chunks = []
        for start in tqdm(range(0, number_lines, chunksize)):
            # Copy the slice so the column assignment never targets a view of
            # the caller's frame (avoids SettingWithCopyWarning).
            chunk = dataframe.iloc[start:start + chunksize].copy()
            processed_chunks.append(self.df_apply_sbert(chunk))

        # A single concat replaces the repeated DataFrame.append calls, which
        # were removed in pandas 2.0 and rebuilt the frame on every iteration.
        return pd.concat(processed_chunks)

## Load data

In [18]:
# Read the training and testing splits of the selected dataset from its directory.
train_df, test_df = (
    pd.read_pickle(os.path.join(config['PATH'], filename))
    for filename in ('training.pkl', 'testing.pkl')
)

In [19]:
# Two configurations read a dedicated claims file; every other one uses claims.json.
claims_filename = {
    "TS_topic": 'claims_topic.json',
    "TS_stance": 'claims_stance.json',
}.get(config['NAME'], 'claims.json')

with open(os.path.join(config['PATH'], claims_filename)) as file:
    claims = json.load(file)

# "class_descr" lives in the same JSON object as the claims: move it into its
# own variable so that `claims` holds nothing but the claims themselves.
class_descr = claims.pop("class_descr")

In [20]:
# model_name=None makes SBERT_Classifier fall back to its default model.
classifier = SBERT_Classifier(
    model_name=None,
    source_column=config['SOURCE_COLUMN'],
)

Using CPU


## Run SBERT model

In [None]:
# Embed the training texts. The training frame is loaded as `train_df`, and
# run() accepts only the DataFrame (it neither takes nor writes an output path).
df_train_proc = classifier.run(train_df)

In [None]:
# Embed the testing texts. The testing frame is loaded as `test_df`, and
# run() accepts only the DataFrame (it neither takes nor writes an output path).
df_test_proc = classifier.run(test_df)

## Get claims' embeddings

In [21]:
# Encode the claim texts (the values of `claims`) with the same model used for
# the dataset rows; list() yields one embedding per claim, in `claims` order.
claim_texts = list(claims.values())
embeddings = list(classifier.classifier.encode(claim_texts))

In [24]:
# Pair each claim key with its embedding; `embeddings` was computed from
# claims.values(), so both sequences share the same order.
claims_embs = dict(zip(claims, embeddings))

## Calculate cosine similarity

In [None]:
def get_cosine_sim(embedding, claims, claims_embs):
    """Score one embedding against every claim.

    Args:
        embedding: Vector to compare with each claim embedding.
        claims: Mapping from claim key to claim text.
        claims_embs: Mapping from claim key to that claim's embedding.

    Returns:
        dict mapping each claim *text* (the values of ``claims``) to the
        cosine similarity between that claim's embedding and ``embedding``.
    """
    return {
        claims[key]: cosine_similarity(claims_embs[key], embedding)
        for key in claims
    }

def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Computed as the dot product divided by the product of the two norms.
    """
    numerator = dot(a, b)
    denominator = norm(a) * norm(b)
    return numerator / denominator

In [None]:
# For every training row, map each claim text to its cosine similarity with the
# row's embedding. The helper defined in this notebook is `get_cosine_sim`.
df_train_proc[config['TARGET_COLUMN']] = df_train_proc["sbert_embedding"].apply(
    lambda embedding: get_cosine_sim(embedding, claims, claims_embs)
)

In [None]:
# For every testing row, map each claim text to its cosine similarity with the
# row's embedding. The helper defined in this notebook is `get_cosine_sim`.
df_test_proc[config['TARGET_COLUMN']] = df_test_proc["sbert_embedding"].apply(
    lambda embedding: get_cosine_sim(embedding, claims, claims_embs)
)

## Save files

In [None]:
# config["PATH"] is the dataset directory; the file name is a separate
# os.path.join component, not part of the dictionary key.
# NOTE: this is the same path training.pkl was loaded from, so the input file is replaced.
df_train_proc.to_pickle(os.path.join(config["PATH"], "training.pkl"))

In [None]:
# config["PATH"] is the dataset directory; the file name is a separate
# os.path.join component, not part of the dictionary key.
# NOTE: this is the same path testing.pkl was loaded from, so the input file is replaced.
df_test_proc.to_pickle(os.path.join(config["PATH"], "testing.pkl"))