In [38]:
# Basic imports.
from collections import defaultdict, Counter
from datasets import load_dataset, concatenate_datasets
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import transformers
from transformers import AutoModel, AutoTokenizer

# Evaluation.
from sklearn.metrics import classification_report, accuracy_score

# Classifiers.
from torch_shallow_neural_classifier import TorchShallowNeuralClassifier
import sentiment_bert_classifier
from sentiment_bert_classifier import BertClassifier

# Data selection methods.
from data_selection import HashedNgramDSIR

## Model setup

In [5]:
# Instantiate the classifier with a fixed set of hyperparameters.
bert_finetune = BertClassifier(
    # Pre-trained checkpoint and the activation passed as `hidden_activation`.
    weights_name="prajjwal1/bert-mini",
    hidden_activation=nn.ReLU(),
    # Optimisation: low learning rate for fine-tuning, at most 100 iterations.
    eta=5e-5,
    max_iter=100,
    # Batching: gradient accumulation is intended to raise the effective
    # batch size to batch_size * gradient_accumulation_steps.
    batch_size=128,
    gradient_accumulation_steps=4,
    # Early stopping; `n_iter_no_change` is the convergence criterion
    # (exact semantics live in sentiment_bert_classifier -- TODO confirm).
    early_stopping=True,
    n_iter_no_change=5,
)

## Baseline data

In [39]:
# Load both DynaSent rounds. No split is requested, so each object is indexed
# by split name below.
dynasent_r1 = load_dataset("dynabench/dynasent", "dynabench.dynasent.r1.all")
dynasent_r2 = load_dataset("dynabench/dynasent", "dynabench.dynasent.r2.all")

# Stack round 1 and round 2 (in that order) for the train and validation splits.
combined_train, combined_validation = [
    concatenate_datasets([dynasent_r1[split_name], dynasent_r2[split_name]])
    for split_name in ("train", "validation")
]

Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.


In [None]:
# Fine-tune on the combined round-1 + round-2 training sentences and labels.
baseline_model = bert_finetune.fit(
    combined_train['sentence'], combined_train['gold_label'])

# Evaluate on the combined validation split and print a per-class report.
validation_gold = combined_validation['gold_label']
preds = baseline_model.predict(combined_validation['sentence'])
baseline_report = classification_report(validation_gold, preds, digits=3)
print(baseline_report)

## Add new training data

### Importance sampling

#### Hashed N-gram

In [None]:
from data_selection import HashedNgramDSIR

# DynaSent configuration names handed to HashedNgramDSIR below:
# round 1 is passed as `raw_datasets` and round 2 as `target_datasets`
# (presumably: select round-1 examples that resemble round 2 -- confirm
# against the data_selection documentation).
raw_datasets = ["dynabench.dynasent.r1.all"]
target_datasets = ["dynabench.dynasent.r2.all"]


def load_dataset_fn(dataset):
    """Load the train split of one DynaSent configuration.

    Args:
        dataset: Configuration name forwarded to ``load_dataset`` for the
            "dynabench/dynasent" repository.

    Returns:
        Whatever ``load_dataset`` returns for ``split="train"``.
    """
    return load_dataset("dynabench/dynasent", dataset, split="train")

def parse_example_fn(ex):
    """Return the text of one example, i.e. its ``"sentence"`` field.

    Args:
        ex: Mapping-like example that has a ``"sentence"`` key.

    Returns:
        The value stored under ``"sentence"``.
    """
    sentence = ex["sentence"]
    return sentence

In [41]:
# Download the 'punkt_tab' NLTK resource. Presumably this is the tokenizer data
# needed by `word_tokenize`, which is used below -- confirm against the
# installed NLTK version.
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\mosss\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [49]:
# Configure hashed n-gram DSIR. The raw and target sides share the same loader
# and parser; they differ only in which DynaSent configuration they read.
dsir = HashedNgramDSIR(
    # Raw side.
    raw_datasets=raw_datasets,
    raw_load_dataset_fn=load_dataset_fn,
    raw_parse_example_fn=parse_example_fn,
    # Target side.
    target_datasets=target_datasets,
    target_load_dataset_fn=load_dataset_fn,
    target_parse_example_fn=parse_example_fn,
    separate_targets=True,
    # Featurisation: word-tokenised n-grams (ngrams=2) hashed into 10,000 buckets.
    tokenizer='word_tokenize',
    ngrams=2,
    num_buckets=10_000,
    min_example_length=0,
    # Working directory for cached intermediate results.
    cache_dir="dsir_ngram/dsir_cache",
)

In [42]:
# Fit the DSIR importance estimator; "all" presumably means no token budget
# is applied when fitting -- confirm in the data_selection documentation.
dsir.fit_importance_estimator(num_tokens_to_fit="all")

In [43]:
# Compute importance weights with the estimator fitted in the previous step
# (must run after `fit_importance_estimator`).
dsir.compute_importance_weights()

In [44]:
# Resample 100 examples and write them under dsir_ngram/out_dir.
# NOTE(review): no random seed is set anywhere in this notebook, so the selected
# subset may differ between runs -- confirm whether resampling is stochastic.
dsir.resample(
    out_dir="dsir_ngram/out_dir",
    num_to_sample=100,
    cache_dir="dsir_ngram/resample_cache",
)

In [68]:
# `dynasent_r1` was loaded without a split, so it is keyed by split name;
# select the train split before indexing a row.
dynasent_r1['train'][0]['sentence']

'Roto-Rooter is always good when you need someone right away.'

In [69]:
# Show how NLTK tokenises the first training sentence. `dynasent_r1` is keyed
# by split name, so select the train split before indexing a row.
nltk.word_tokenize(dynasent_r1['train'][0]['sentence'])

['Roto-Rooter',
 'is',
 'always',
 'good',
 'when',
 'you',
 'need',
 'someone',
 'right',
 'away',
 '.']

In [52]:
# Inspect the DSIR feature vector for the first training sentence.
# `dynasent_r1` is keyed by split name, so select the train split first.
dsir.featurizer(dynasent_r1['train'][0]['sentence'])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

#### Similarity with sentence embedding
Sentence embedding: 
* Use the `all-MiniLM-L6-v2` model, a small, general-purpose sentence-embedding model. [Model card](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
* Output dim size: 384
* https://www.sbert.net/docs/sentence_transformer/pretrained_models.html 

In [54]:
from sentence_transformers import SentenceTransformer

In [63]:
# Sentence encoder shared by every embedding cell below (downloaded on first use).
model = SentenceTransformer("all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [64]:
# Sanity check of the encoder: embed one query and four candidate passages,
# then score the query against each passage.
query_embedding = model.encode("How big is London")
passage_embeddings = model.encode([
    "London is known for its financial district",
    "London has 9,787,426 inhabitants at the 2011 census",
    "The United Kingdom is the fourth largest exporter of goods in the world",
    "Is United Kingdom a large country?"
])

similarity = model.similarity(query_embedding, passage_embeddings)
# => one score per passage, i.e. a tensor of shape (1, 4)

similarity

tensor([[0.5517, 0.5627, 0.3164, 0.5740]])

In [111]:
# Dimensionality of a single sentence embedding.
query_embedding.shape

(384,)

In [None]:
def get_sentence_embedding(sentence, model):
    """
    Generate the embedding for a given sentence using a pre-trained model.

    Args:
      sentence (str): The input sentence for which the embedding is to be generated.
      model: Encoder object exposing an ``encode(sentence)`` method, such as the
        ``SentenceTransformer`` instance created in this notebook.
    Returns:
      The value returned by ``model.encode(sentence)``; for the notebook's
      encoder this is a ``numpy.ndarray`` embedding vector.
    """
    return model.encode(sentence)


# Backward-compatible alias for the original (misspelled) function name.
get_setence_embedding = get_sentence_embedding

In [136]:
# Embed the round-2 training sentences. `dynasent_r2` is keyed by split name,
# so the split has to be selected before reading the 'sentence' column.
set_embeds = model.encode(dynasent_r2['train']['sentence'])

# Average over sentences (axis 0) to get one mean embedding vector.
set_embed_mean = set_embeds.mean(axis=0)

In [137]:
# Convert the mean embedding to a tensor. `torch.as_tensor` accepts both a NumPy
# array and an existing tensor, so re-running this cell does not fail the way
# `torch.from_numpy` does once the name already holds a tensor.
set_embed_mean = torch.as_tensor(set_embed_mean)
set_embed_mean

tensor([-7.1989e-04,  1.5781e-02,  2.2635e-02,  2.1970e-02, -1.8755e-02,
        -1.8372e-02,  3.6520e-03, -2.2772e-02, -4.3399e-03, -2.4890e-02,
         1.7853e-02,  1.2314e-02,  2.0931e-03, -6.6473e-03, -4.3531e-04,
        -4.1037e-02,  6.3474e-02, -4.9652e-02, -5.0404e-03, -2.1696e-02,
        -2.8775e-02,  5.7046e-03,  1.6188e-02,  1.3961e-02, -8.6401e-03,
         1.5888e-02, -5.0427e-03,  1.2517e-02, -4.5056e-03, -2.5078e-02,
        -1.9521e-02,  2.7582e-02, -1.1030e-03, -1.0307e-02, -1.6306e-03,
         5.2132e-03,  3.3032e-02, -2.9796e-02,  1.8148e-02,  5.7711e-03,
         7.7031e-03,  2.6415e-03,  2.3816e-02,  1.8991e-04, -4.8403e-03,
        -1.0763e-02, -5.5890e-03, -2.4115e-02,  4.6051e-02, -3.8644e-03,
         3.3592e-03, -6.7814e-03, -1.5127e-03, -3.7333e-02,  5.2451e-03,
         1.8591e-02, -1.3446e-02, -1.9741e-02,  4.1771e-03,  7.3700e-03,
        -8.9878e-03, -5.6139e-03, -1.6278e-02,  1.8739e-02,  1.2679e-02,
        -2.4410e-02, -3.7272e-02,  4.5294e-03,  1.9

In [139]:
# Tensor view of the target-set embeddings produced by `model.encode`
# (one row per sentence).
target_set_embeds = torch.from_numpy(set_embeds)
target_set_embeds.shape

torch.Size([13065, 384])

In [144]:
# Collapse the sentence dimension (dim 0) to get the target set's mean embedding.
target_set_embed_mean = torch.mean(target_set_embeds, dim=0)
target_set_embed_mean.shape

torch.Size([384])

In [146]:
# Embed the round-1 training sentences. `dynasent_r1` is keyed by split name,
# so the split has to be selected before reading the 'sentence' column.
raw_embeds = model.encode(dynasent_r1['train']['sentence'])

# Tensor view of the raw-set embeddings (one row per sentence).
raw_set_embeds = torch.from_numpy(raw_embeds)
raw_set_embeds.shape

torch.Size([80488, 384])

In [147]:
# Score every raw-set sentence embedding against the target set's mean
# embedding, using the encoder's similarity function (presumably cosine
# similarity for this model -- confirm in the model configuration).
raw_set_similarities = model.similarity(target_set_embed_mean, raw_set_embeds)
raw_set_similarities.shape

torch.Size([1, 80488])

In [154]:
# Lowest similarity in the raw set. The tensor reduction avoids iterating over
# every element in Python, which is what the builtin `min` does.
raw_set_similarities.min()

tensor(-0.1609)

In [155]:
# Highest similarity in the raw set. The tensor reduction avoids iterating over
# every element in Python, which is what the builtin `max` does.
raw_set_similarities.max()

tensor(0.7423)

In [158]:
# `dynasent_r1` is keyed by split name, so rows are looked up on the train split.
raw_train = dynasent_r1['train']

# Print the extremes: sentences very similar to the target mean (> 0.6) and
# sentences with negative similarity (< 0).
for idx, score in enumerate(raw_set_similarities.squeeze(0)):
    if score > 0.6 or score < 0:
        print(f"Similarity: {score.item()}")
        print(f"Sentence: {raw_train[idx]['sentence']}")
        print("\n")

Similarity: 0.6217310428619385
Sentence: We found some fabulous restaurants to explore.


Similarity: 0.655872106552124
Sentence: I have eaten at this place many times over the years and always been happy with the food and the service.


Similarity: 0.6449460983276367
Sentence: Really high end stuff here and I couldn't wait for my meal with some great company.


Similarity: -0.031184349209070206
Sentence: Quand on a reçu nos assiettes, rien d'épatant qui justifie la facture.


Similarity: 0.6536982655525208
Sentence: Food and service were great.


Similarity: 0.6127055287361145
Sentence: It used to be our go to AYCE sushi buffet and I recommended  all the times to my friends.


Similarity: 0.6032223105430603
Sentence: The wife and I ate here looking for a place to sit and not break the bank.


Similarity: 0.6201233863830566
Sentence: I was hesitant to eat here because the shopping center is kind of sketchy.


Similarity: 0.615456223487854
Sentence: The food took a long time to be serve

#### Embedding based importance sampling
Parallel computing and sigmoid transformation

In [114]:
from joblib import Parallel, delayed

def process_item(item):
    """Encode a single sentence with the notebook-level `model`.

    Args:
        item: Sentence text to embed.

    Returns:
        The value returned by ``model.encode(item)``.
    """
    return model.encode(item)

# `dynasent_r2` is keyed by split name: take the first ten rows of the train
# split, then read their sentences.
items = dynasent_r2['train'][:10]['sentence']

# Process items in parallel
results = Parallel(n_jobs=2)(delayed(process_item)(item) for item in items)

print(results)




[array([ 8.79946873e-02, -3.59393470e-02,  9.62829869e-03,  7.78169259e-02,
       -7.18533993e-02,  3.11196297e-02, -7.28027225e-02, -6.48345128e-02,
        2.43592495e-03, -3.32571454e-02, -4.52866405e-02,  2.50906833e-02,
       -7.95948431e-02,  9.76001192e-03,  3.81068178e-02, -1.20317109e-01,
        1.22215971e-01, -3.22167482e-03,  8.59877318e-02, -2.27262657e-02,
        8.78906809e-03, -1.35771800e-02,  3.25045027e-02,  2.00600419e-02,
        2.93880142e-02, -8.11468344e-03,  4.27715946e-03,  3.55039202e-02,
       -2.12435238e-02, -9.47645400e-04, -2.10594498e-02,  2.10465044e-02,
       -8.44213646e-03,  3.88210863e-02,  1.86329149e-03,  5.44616319e-02,
        5.56366742e-02, -5.01381904e-02,  7.98278898e-02, -7.50422701e-02,
        3.79864499e-02,  1.20067243e-02,  1.79776158e-02,  4.96977381e-02,
       -3.29697621e-04, -1.62510965e-02, -8.57561305e-02,  3.33585180e-02,
        8.16994011e-02,  9.34792235e-02, -1.91893186e-02,  3.46838571e-02,
        7.61014782e-03, 

In [127]:
# Element-wise mean of the embeddings returned by the parallel run above.
sum(results)/len(results)

array([ 1.30297896e-02,  2.46527232e-02,  2.85544246e-02,  2.25724913e-02,
       -2.02132650e-02, -2.43135672e-02,  5.56408148e-03, -2.05359869e-02,
        8.31974950e-03, -4.79243472e-02,  7.39282602e-03,  5.03015099e-03,
        1.16501618e-02,  5.13426121e-03,  1.81533806e-02, -3.89571860e-02,
        5.84641695e-02, -3.76566574e-02,  1.16396192e-02, -2.32361965e-02,
       -5.02877720e-02,  4.11069021e-03,  2.66564004e-02,  2.93134358e-02,
       -6.24376908e-03,  8.74639861e-03, -1.57859027e-02,  1.17127467e-02,
        1.43881720e-02, -2.56802384e-02, -1.25235151e-02,  5.74199185e-02,
       -1.90570336e-02, -1.17452461e-02,  5.45374397e-03,  1.69938747e-02,
        2.91131735e-02, -2.61995606e-02,  2.74380855e-02, -8.97063129e-03,
        1.19538754e-02,  2.98299603e-02,  2.68071201e-02,  1.27433855e-02,
        1.56280845e-02,  5.55316545e-03,  1.42108591e-03, -2.56914319e-03,
        5.02023585e-02,  1.16505744e-02,  3.65746990e-02,  1.99946333e-02,
        3.26200319e-03, -

In [132]:
def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    The exponential is only ever taken of a non-positive number, so inputs of
    large magnitude do not overflow (the naive form overflows in ``exp(-x)``
    for very negative ``x``).

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        Element-wise sigmoid of ``x``: a NumPy scalar for scalar input,
        otherwise an array with the same shape as ``x``. Floating-point input
        dtypes are preserved.
    """
    x = np.asarray(x)
    # exp(-|x|) lies in (0, 1], so neither branch below can overflow.
    decay = np.exp(-np.abs(x))
    stable = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result into a scalar; arrays are unchanged.
    return stable[()]

# Squash the first target embedding through the sigmoid once, then normalise
# the result so its entries sum to 1.
first_embed_activations = sigmoid(set_embeds[0])
probs = first_embed_activations / sum(first_embed_activations)

probs

array([0.00271894, 0.00255763, 0.00261696, 0.00270571, 0.0025109 ,
       0.00264495, 0.00250966, 0.00252003, 0.0026076 , 0.00256112,
       0.00254546, 0.0026371 , 0.00250083, 0.00261714, 0.00265404,
       0.00244794, 0.00276338, 0.00260023, 0.00271633, 0.00257483,
       0.00261587, 0.00258675, 0.00264675, 0.00263055, 0.00264269,
       0.00259386, 0.00261   , 0.00265066, 0.00257676, 0.00260319,
       0.002577  , 0.00263183, 0.00259343, 0.00265497, 0.00260685,
       0.00267533, 0.00267686, 0.00253915, 0.00270832, 0.00250675,
       0.00265389, 0.00262006, 0.00262784, 0.00266913, 0.002604  ,
       0.00258326, 0.00249282, 0.00264786, 0.00271076, 0.00272607,
       0.00257944, 0.00264959, 0.00261434, 0.00247971, 0.00266805,
       0.00270435, 0.00257195, 0.00250521, 0.00263379, 0.00263614,
       0.00254407, 0.00258893, 0.00254779, 0.00263572, 0.00262096,
       0.00249332, 0.00255312, 0.00274701, 0.0026364 , 0.00258873,
       0.00254276, 0.00257683, 0.0026485 , 0.00258216, 0.00250

In [134]:
# Reference value: the probability each of the 384 embedding dimensions would
# get under a uniform distribution, for comparison with `probs` above.
1/384

0.0026041666666666665