In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
!pip install transformers[torch] datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Installing collect

In [3]:
from datasets import load_dataset

dataset = load_dataset("yelp_review_full")

Downloading builder script:   0%|          | 0.00/4.41k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.55k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/196M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/650000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [4]:
from datasets import concatenate_datasets
from datasets import DatasetDict

def contains_price(example):
    return ('price' in example['text'].lower() and 'high' in example['text'].lower()) or ('price' in example['text'].lower() and 'low' in example['text'].lower())

def contains_pricehighnegative(example):
    return 'price' in example['text'].lower() and 'high' in example['text'].lower() and example['label'] == 0

def contains_pricehighpositive(example):
    return 'price' in example['text'].lower() and 'high' in example['text'].lower() and example['label'] == 1

def contains_pricelownegative(example):
    return 'price' in example['text'].lower() and 'low' in example['text'].lower() and example['label'] == 0

def contains_pricelowpositive(example):
    return 'price' in example['text'].lower() and 'low' in example['text'].lower() and example['label'] == 1

filtered_dataset = dataset.filter(contains_price)

def change_label(example):
    if example['label'] > 2:
        example['label'] = 1
    else:
        example['label'] = 0
    return example

filtered_dataset = filtered_dataset.map(change_label)

pricehighnegative_train_unfilter = filtered_dataset.filter(contains_price)['train'].shuffle(seed=42).select([i for i in list(range(27000))])

pricehighnegative_train_balance = filtered_dataset.filter(contains_pricehighnegative)['train'].shuffle(seed=42).select([i for i in list(range(2900))])
pricehighpositive_train_balance = filtered_dataset.filter(contains_pricehighpositive)['train'].shuffle(seed=42).select([i for i in list(range(2900))])
pricelownegative_train_balance = filtered_dataset.filter(contains_pricelownegative)['train'].shuffle(seed=42).select([i for i in list(range(2900))])
pricelowpositive_train_balance = filtered_dataset.filter(contains_pricelowpositive)['train'].shuffle(seed=42).select([i for i in list(range(2900))])

pricehighnegative_validation_balance = filtered_dataset.filter(contains_pricehighnegative)['train'].shuffle(seed=42).select([i for i in list(range(2900,3100))])
pricehighpositive_validation_balance = filtered_dataset.filter(contains_pricehighpositive)['train'].shuffle(seed=42).select([i for i in list(range(2900,3100))])
pricelownegative_validation_balance = filtered_dataset.filter(contains_pricelownegative)['train'].shuffle(seed=42).select([i for i in list(range(2900,3100))])
pricelowpositive_validation_balance = filtered_dataset.filter(contains_pricelowpositive)['train'].shuffle(seed=42).select([i for i in list(range(2900,3100))])

pricehighnegative_train_imbalance = filtered_dataset.filter(contains_pricehighnegative)['train'].shuffle(seed=42).select([i for i in list(range(2900))])
pricehighpositive_train_imbalance = filtered_dataset.filter(contains_pricehighpositive)['train'].shuffle(seed=42).select([i for i in list(range(500))])
pricelownegative_train_imbalance = filtered_dataset.filter(contains_pricelownegative)['train'].shuffle(seed=42).select([i for i in list(range(500))])
pricelowpositive_train_imbalance = filtered_dataset.filter(contains_pricelowpositive)['train'].shuffle(seed=42).select([i for i in list(range(2900))])

newdataset_train_unfilter = pricehighnegative_train_unfilter
newdataset_train_balance = concatenate_datasets([pricehighnegative_train_balance, pricehighpositive_train_balance, pricelownegative_train_balance, pricelowpositive_train_balance])
newdataset_train_imbalance = concatenate_datasets([pricehighnegative_train_imbalance, pricehighpositive_train_imbalance, pricelownegative_train_imbalance, pricelowpositive_train_imbalance])
newdataset_validation = concatenate_datasets([pricehighnegative_validation_balance, pricehighpositive_validation_balance, pricelownegative_validation_balance, pricelowpositive_validation_balance])

'''
Test tests for four groups
'''
pricehighnegative_test = filtered_dataset.filter(contains_pricehighnegative)['test'].shuffle(seed=42).select([i for i in list(range(230))]) #group 1
pricehighpositive_test = filtered_dataset.filter(contains_pricehighpositive)['test'].shuffle(seed=42).select([i for i in list(range(230))]) #group 2
pricelownegative_test = filtered_dataset.filter(contains_pricelownegative)['test'].shuffle(seed=42).select([i for i in list(range(230))]) #group 3
pricelowpositive_test = filtered_dataset.filter(contains_pricelowpositive)['test'].shuffle(seed=42).select([i for i in list(range(230))]) #group 4

'''
Final Dataset
'''
final_dataset_unfilter = DatasetDict({
    'train': newdataset_train_unfilter,
    'validation': newdataset_validation,
    'test_group_1': pricehighnegative_test,
    'test_group_2': pricehighpositive_test,
    'test_group_3': pricelownegative_test,
    'test_group_4': pricelowpositive_test,
})

final_dataset_balance = DatasetDict({
    'train': newdataset_train_balance,
    'validation': newdataset_validation,
    'test_group_1': pricehighnegative_test,
    'test_group_2': pricehighpositive_test,
    'test_group_3': pricelownegative_test,
    'test_group_4': pricelowpositive_test,
})

final_dataset_imbalance = DatasetDict({
    'train': newdataset_train_imbalance,
    'validation': newdataset_validation,
    'test_group_1': pricehighnegative_test,
    'test_group_2': pricehighpositive_test,
    'test_group_3': pricelownegative_test,
    'test_group_4': pricelowpositive_test,
})
print(final_dataset_unfilter)
print(final_dataset_balance)
print(final_dataset_imbalance)


Filter:   0%|          | 0/650000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/40779 [00:00<?, ? examples/s]

Map:   0%|          | 0/3089 [00:00<?, ? examples/s]

Filter:   0%|          | 0/40779 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3089 [00:00<?, ? examples/s]

Filter:   0%|          | 0/40779 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3089 [00:00<?, ? examples/s]

Filter:   0%|          | 0/40779 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3089 [00:00<?, ? examples/s]

Filter:   0%|          | 0/40779 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3089 [00:00<?, ? examples/s]

Filter:   0%|          | 0/40779 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3089 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 27000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 800
    })
    test_group_1: Dataset({
        features: ['label', 'text'],
        num_rows: 230
    })
    test_group_2: Dataset({
        features: ['label', 'text'],
        num_rows: 230
    })
    test_group_3: Dataset({
        features: ['label', 'text'],
        num_rows: 230
    })
    test_group_4: Dataset({
        features: ['label', 'text'],
        num_rows: 230
    })
})
DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 11600
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 800
    })
    test_group_1: Dataset({
        features: ['label', 'text'],
        num_rows: 230
    })
    test_group_2: Dataset({
        features: ['label', 'text'],
        num_rows: 230
    })
    test_group_3: Dataset({
        feature

#pretrained

In [5]:
from transformers import BertTokenizer, BertForSequenceClassification, pipeline


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')


classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, truncation=True, padding=True)
results = []
for i in range(1, 5):
  results.append(classifier(final_dataset_unfilter['test_group_'+str(i)]["text"]))


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
for i in range(len(results)):
  predicted_labels = [int(pred['label'].split('_')[-1]) for pred in results[i]]

  true_labels = final_dataset_unfilter['test_group_'+str(i+1)]['label']
  accuracy = sum([pred == true for pred, true in zip(predicted_labels, true_labels)]) / len(true_labels)
  print(f"Group {i+1} Accuracy: {accuracy:.4f}")

Group 1 Accuracy: 0.1348
Group 2 Accuracy: 0.8870
Group 3 Accuracy: 0.0739
Group 4 Accuracy: 0.9000


#train

In [5]:
!pip install accelerate
!pip install evaluate
!pip install transformers[torch]

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


##Unfilter

In [10]:
from transformers import BertTokenizer, BertForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments("test-trainer")

def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)

tokenized_datasets_unfilter = final_dataset_unfilter.map(tokenize_function, batched=True)

trainer_unfilter = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets_unfilter["train"],
    eval_dataset=tokenized_datasets_unfilter["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer_unfilter.train()

predictions_test_group_1 = trainer_unfilter.predict(tokenized_datasets_unfilter["test_group_1"])
predictions_test_group_2 = trainer_unfilter.predict(tokenized_datasets_unfilter["test_group_2"])
predictions_test_group_3 = trainer_unfilter.predict(tokenized_datasets_unfilter["test_group_3"])
predictions_test_group_4 = trainer_unfilter.predict(tokenized_datasets_unfilter["test_group_4"])

preds_test_group_1 = predictions_test_group_1.predictions.argmax(axis=-1)
preds_test_group_2 = predictions_test_group_2.predictions.argmax(axis=-1)
preds_test_group_3 = predictions_test_group_3.predictions.argmax(axis=-1)
preds_test_group_4 = predictions_test_group_4.predictions.argmax(axis=-1)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/27000 [00:00<?, ? examples/s]

Step,Training Loss
500,0.4541
1000,0.3806
1500,0.39
2000,0.3678
2500,0.357
3000,0.3611
3500,0.3234
4000,0.2853
4500,0.264
5000,0.2989


In [None]:
import evaluate
metric = evaluate.load("glue", "mrpc")

print("Group 1")
metric.compute(predictions=preds_test_group_1, references=predictions_test_group_1.label_ids)

Group 1


{'accuracy': 0.9043478260869565, 'f1': 0.0}

In [None]:
print("Group 2")
metric.compute(predictions=preds_test_group_2, references=predictions_test_group_2.label_ids)

Group 2


{'accuracy': 0.808695652173913, 'f1': 0.8942307692307693}

In [None]:
print("Group 3")
metric.compute(predictions=preds_test_group_3, references=predictions_test_group_3.label_ids)

Group 3


{'accuracy': 0.7782608695652173, 'f1': 0.0}

In [None]:
print("Group 4")
metric.compute(predictions=preds_test_group_4, references=predictions_test_group_4.label_ids)

Group 4


{'accuracy': 0.9260869565217391, 'f1': 0.9616252821670429}

##Balance

In [6]:
from transformers import BertTokenizer, BertForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments("test-trainer")

def tokenize_function(example):
    return tokenizer(example["text"],truncation=True)

tokenized_datasets_balance = final_dataset_balance.map(tokenize_function, batched=True)

trainer_balance = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets_balance["train"],
    eval_dataset=tokenized_datasets_balance["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer_balance.train()

predictions_test_group_1 = trainer_balance.predict(tokenized_datasets_balance["test_group_1"])
predictions_test_group_2 = trainer_balance.predict(tokenized_datasets_balance["test_group_2"])
predictions_test_group_3 = trainer_balance.predict(tokenized_datasets_balance["test_group_3"])
predictions_test_group_4 = trainer_balance.predict(tokenized_datasets_balance["test_group_4"])

preds_test_group_1 = np.argmax(predictions_test_group_1.predictions, axis=-1)
preds_test_group_2 = np.argmax(predictions_test_group_2.predictions, axis=-1)
preds_test_group_3 = np.argmax(predictions_test_group_3.predictions, axis=-1)
preds_test_group_4 = np.argmax(predictions_test_group_4.predictions, axis=-1)




tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/11600 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/230 [00:00<?, ? examples/s]

Map:   0%|          | 0/230 [00:00<?, ? examples/s]

Map:   0%|          | 0/230 [00:00<?, ? examples/s]

Map:   0%|          | 0/230 [00:00<?, ? examples/s]

Step,Training Loss
500,0.4931
1000,0.3825
1500,0.3613
2000,0.27


Step,Training Loss
500,0.4931
1000,0.3825
1500,0.3613
2000,0.27
2500,0.2653
3000,0.2361
3500,0.1093
4000,0.1201


In [8]:
import evaluate
metric = evaluate.load("glue", "mrpc")
print("group 1")
metric.compute(predictions=preds_test_group_1, references=predictions_test_group_1.label_ids)

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

group 1


{'accuracy': 0.8826086956521739, 'f1': 0.0}

In [9]:
print("Group 2")
metric.compute(predictions=preds_test_group_2, references=predictions_test_group_2.label_ids)

Group 2


{'accuracy': 0.8956521739130435, 'f1': 0.944954128440367}

In [11]:
print("Group 3")
metric.compute(predictions=preds_test_group_3, references=predictions_test_group_3.label_ids)

Group 3


{'accuracy': 0.8608695652173913, 'f1': 0.0}

In [12]:
print("Group 4")
metric.compute(predictions=preds_test_group_4, references=predictions_test_group_4.label_ids)

Group 4


{'accuracy': 0.8956521739130435, 'f1': 0.944954128440367}

#Imbalance

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments("test-trainer")

def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)

tokenized_datasets_imbalance = final_dataset_imbalance.map(tokenize_function, batched=True)

trainer_imbalance = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets_imbalance["train"],
    eval_dataset=tokenized_datasets_imbalance["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer_imbalance.train()

predictions_test_group_1 = trainer_imbalance.predict(tokenized_datasets_imbalance["test_group_1"])
predictions_test_group_2 = trainer_imbalance.predict(tokenized_datasets_imbalance["test_group_2"])
predictions_test_group_3 = trainer_imbalance.predict(tokenized_datasets_imbalance["test_group_3"])
predictions_test_group_4 = trainer_imbalance.predict(tokenized_datasets_imbalance["test_group_4"])

preds_test_group_1 = predictions_test_group_1.predictions.argmax(axis=-1)
preds_test_group_2 = predictions_test_group_2.predictions.argmax(axis=-1)
preds_test_group_3 = predictions_test_group_3.predictions.argmax(axis=-1)
preds_test_group_4 = predictions_test_group_4.predictions.argmax(axis=-1)




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/6800 [00:00<?, ? examples/s]

Step,Training Loss


Step,Training Loss
500,0.5082
1000,0.3657
1500,0.2615
2000,0.1845
2500,0.1385


In [None]:
metric = evaluate.load("glue", "mrpc")
print("group 1")
metric.compute(predictions=preds_test_group_1, references=predictions_test_group_1.label_ids)

group 1


{'accuracy': 0.9043478260869565, 'f1': 0.0}

In [None]:
print("Group 2")
metric.compute(predictions=preds_test_group_2, references=predictions_test_group_2.label_ids)

Group 2


{'accuracy': 0.808695652173913, 'f1': 0.8942307692307693}

In [None]:
print("Group 3")
metric.compute(predictions=preds_test_group_3, references=predictions_test_group_3.label_ids)

Group 3


{'accuracy': 0.7782608695652173, 'f1': 0.0}

In [None]:
print("Group 4")
metric.compute(predictions=preds_test_group_4, references=predictions_test_group_4.label_ids)

Group 4


{'accuracy': 0.9260869565217391, 'f1': 0.9616252821670429}