In [None]:
"""
ISEAR_demo.ipynb

Created on Fri Mar 24 2023

@author: Lukas

This file contains experiments with the ISEAR dataset
"""

'\nISEAR_demo.ipynb\n\nCreated on Fri Mar 24 2023\n\n@author: Lukas\n\nThis file contains experiments with the ISEAR dataset\n'

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m52.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4


In [None]:
# import packages

import pandas as pd
import numpy as np
import os
import torch
import random

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup

from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from sklearn.metrics import mutual_info_score
from scipy.stats import pearsonr

In [None]:
! git clone # [link repo to repo]

%cd "/content/topic_and_content_classification"

import batch_functions as bf
import discriminative_active_learning as dal
import evaluate_retrieval as er
import h_divergence as hd
import keyword_retrieval as kr
import neural_retrieval as nr

from isear import *

Cloning into 'Predoc'...
remote: Enumerating objects: 445, done.[K
remote: Counting objects: 100% (175/175), done.[K
remote: Compressing objects: 100% (134/134), done.[K
remote: Total 445 (delta 70), reused 138 (delta 36), pack-reused 270[K
Receiving objects: 100% (445/445), 35.43 MiB | 18.05 MiB/s, done.
Resolving deltas: 100% (231/231), done.
/content/Predoc/topic_and_content_classification


In [None]:
# load the dataset
isear = load_isear()

Mounted at /content/drive


In [None]:
# run and evaluate keyword-based retrieval
kr_documents = kr.keyword_retrieval(['fear'], isear['SIT'])

true_positives = get_isear_positives(isear, 2)

print('Precision: ', er.compute_precision(true_positives, kr_documents))
print('Recall: ', er.compute_recall(true_positives, kr_documents))
print('F1: ', er.compute_f1_score(true_positives, kr_documents))

Precision:  0.9166666666666666
Recall:  0.12054794520547946
F1:  0.21307506053268765


In [None]:
# select texts and labels for training
sentences = load_isear_texts(isear, 0, 640)

labels = torch.tensor(load_binary_isear_labels(isear, 0, 640, 2))

In [None]:
# tokenize the input and create dataloaders
input_ids, attention_masks = nr.tokenize_dataset(sentences, 'bert')

train_dataloader, validation_dataloader = nr.create_dataloaders(nr.create_dataset(input_ids, attention_masks, labels), 16)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# create and train the model
model = nr.create_model('bert', 2)

trained_model = nr.train_bert(model, train_dataloader, validation_dataloader, 15)

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b


Training...

  Average training loss: 0.37

Training...

  Average training loss: 0.24

Training...

  Average training loss: 0.11

Training...

  Average training loss: 0.05

Training...

  Average training loss: 0.03

Training...

  Average training loss: 0.01

Training...

  Average training loss: 0.03

Training...

  Average training loss: 0.01

Training...

  Average training loss: 0.01

Training...

  Average training loss: 0.01

Training...

  Average training loss: 0.01

Training...

  Average training loss: 0.01

Training...

  Average training loss: 0.01

Training...

  Average training loss: 0.01

Training...

  Average training loss: 0.01
Training complete!


In [None]:
# save the model an clear the gpu
trained_model.cpu()

torch.save(trained_model.state_dict(), 'trained_model.pt')

In [None]:
# clear the GPU memory
torch.cuda.empty_cache()

In [None]:
# run inference on new texts
new_sentences = load_isear_texts(isear, 320, len(isear))

test_input_ids, test_attention_masks = nr.tokenize_dataset(new_sentences, 'bert')

model_output = nr.run_inference(trained_model, input_ids, attention_masks)

documents = nr.neural_retrieval(model_output)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# get the ground truth and evaluate the model output
true_positives = [index - 320 for index in get_isear_positives(isear, 2) if index > 320]

print('Precision: ', er.compute_precision(true_positives, documents))
print('Recall: ', er.compute_recall(true_positives, documents))
print('F1: ', er.compute_f1_score(true_positives, documents))

Precision:  0.7382978723404255
Recall:  0.660952380952381
F1:  0.6974874371859296


In [None]:
vars = ['SEX', 'AGE', 'COUN', 'RELI', 'PRAC', 'FIEL', 'WHEN', 'LONG', 'INTS']

for var in vars:
    kr_correlation = get_correlation(isear, var, 'false_positive', true_positives, kr_documents)
    nr_correlation = get_correlation(isear, var, 'false_positive', true_positives, documents)

    print("Correlation with " + str(var) + " using Keywords: ", kr_correlation)
    print("Correlation with " + str(var) + " using DNN:      ", nr_correlation)
    print("")

Correlation with SEX using Keywords:  PearsonRResult(statistic=0.0003871889438100025, pvalue=0.9729608238798139)
Correlation with SEX using DNN:       PearsonRResult(statistic=0.0035787051478939325, pvalue=0.7540634391682184)

Correlation with AGE using Keywords:  PearsonRResult(statistic=0.007403250276754253, pvalue=0.5169203296665537)
Correlation with AGE using DNN:       PearsonRResult(statistic=-0.007487016744364623, pvalue=0.5121888469061097)

Correlation with COUN using Keywords:  PearsonRResult(statistic=0.025946735344118652, pvalue=0.023098806279983956)
Correlation with COUN using DNN:       PearsonRResult(statistic=-0.03078753969494761, pvalue=0.007021477286163827)

Correlation with RELI using Keywords:  PearsonRResult(statistic=-0.00025427699840219187, pvalue=0.9822407401451533)
Correlation with RELI using DNN:       PearsonRResult(statistic=0.005988751983901959, pvalue=0.6000917703709396)

Correlation with PRAC using Keywords:  PearsonRResult(statistic=0.028343804010469785, 