In [9]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
!pip install transformers[torch] datasets



In [11]:
from datasets import load_dataset

dataset = load_dataset("yelp_review_full")

In [12]:
from datasets import concatenate_datasets
from datasets import DatasetDict

def contains_price(example):
    return ('price' in example['text'].lower() and 'high' in example['text'].lower()) or ('price' in example['text'].lower() and 'low' in example['text'].lower())

def contains_pricehighnegative(example):
    return 'price' in example['text'].lower() and 'high' in example['text'].lower() and example['label'] == 0

def contains_pricehighpositive(example):
    return 'price' in example['text'].lower() and 'high' in example['text'].lower() and example['label'] == 1

def contains_pricelownegative(example):
    return 'price' in example['text'].lower() and 'low' in example['text'].lower() and example['label'] == 0

def contains_pricelowpositive(example):
    return 'price' in example['text'].lower() and 'low' in example['text'].lower() and example['label'] == 1

filtered_dataset = dataset.filter(contains_price)

def change_label(example):

    if example['label'] > 2:
        example['label'] = 1
    else:
        example['label'] = 0
    return example

filtered_dataset = filtered_dataset.map(change_label)

pricehighnegative_train_unfilter = filtered_dataset.filter(contains_price)['train'].shuffle(seed=42).select([i for i in list(range(27000))])

pricehighnegative_train_balance = filtered_dataset.filter(contains_pricehighnegative)['train'].shuffle(seed=42).select([i for i in list(range(2900))])
pricehighpositive_train_balance = filtered_dataset.filter(contains_pricehighpositive)['train'].shuffle(seed=42).select([i for i in list(range(2900))])
pricelownegative_train_balance = filtered_dataset.filter(contains_pricelownegative)['train'].shuffle(seed=42).select([i for i in list(range(2900))])
pricelowpositive_train_balance = filtered_dataset.filter(contains_pricelowpositive)['train'].shuffle(seed=42).select([i for i in list(range(2900))])

pricehighnegative_validation_balance = filtered_dataset.filter(contains_pricehighnegative)['train'].shuffle(seed=42).select([i for i in list(range(2900,3100))])
pricehighpositive_validation_balance = filtered_dataset.filter(contains_pricehighpositive)['train'].shuffle(seed=42).select([i for i in list(range(2900,3100))])
pricelownegative_validation_balance = filtered_dataset.filter(contains_pricelownegative)['train'].shuffle(seed=42).select([i for i in list(range(2900,3100))])
pricelowpositive_validation_balance = filtered_dataset.filter(contains_pricelowpositive)['train'].shuffle(seed=42).select([i for i in list(range(2900,3100))])

pricehighnegative_train_imbalance = filtered_dataset.filter(contains_pricehighnegative)['train'].shuffle(seed=42).select([i for i in list(range(2900))])
pricehighpositive_train_imbalance = filtered_dataset.filter(contains_pricehighpositive)['train'].shuffle(seed=42).select([i for i in list(range(500))])
pricelownegative_train_imbalance = filtered_dataset.filter(contains_pricelownegative)['train'].shuffle(seed=42).select([i for i in list(range(500))])
pricelowpositive_train_imbalance = filtered_dataset.filter(contains_pricelowpositive)['train'].shuffle(seed=42).select([i for i in list(range(2900))])

newdataset_train_unfilter = pricehighnegative_train_unfilter
newdataset_train_balance = concatenate_datasets([pricehighnegative_train_balance, pricehighpositive_train_balance, pricelownegative_train_balance, pricelowpositive_train_balance])
newdataset_train_imbalance = concatenate_datasets([pricehighnegative_train_imbalance, pricehighpositive_train_imbalance, pricelownegative_train_imbalance, pricelowpositive_train_imbalance])
newdataset_validation = concatenate_datasets([pricehighnegative_validation_balance, pricehighpositive_validation_balance, pricelownegative_validation_balance, pricelowpositive_validation_balance])

'''
Test tests for four groups
'''
pricehighnegative_test = filtered_dataset.filter(contains_pricehighnegative)['test'].shuffle(seed=42).select([i for i in list(range(230))]) #group 1
pricehighpositive_test = filtered_dataset.filter(contains_pricehighpositive)['test'].shuffle(seed=42).select([i for i in list(range(230))]) #group 2
pricelownegative_test = filtered_dataset.filter(contains_pricelownegative)['test'].shuffle(seed=42).select([i for i in list(range(230))]) #group 3
pricelowpositive_test = filtered_dataset.filter(contains_pricelowpositive)['test'].shuffle(seed=42).select([i for i in list(range(230))]) #group 4

'''
Final Dataset
'''
final_dataset_unfilter = DatasetDict({
    'train': newdataset_train_unfilter,
    'validation': newdataset_validation,
    'test_group_1': pricehighnegative_test,
    'test_group_2': pricehighpositive_test,
    'test_group_3': pricelownegative_test,
    'test_group_4': pricelowpositive_test,
})

final_dataset_balance = DatasetDict({
    'train': newdataset_train_balance,
    'validation': newdataset_validation,
    'test_group_1': pricehighnegative_test,
    'test_group_2': pricehighpositive_test,
    'test_group_3': pricelownegative_test,
    'test_group_4': pricelowpositive_test,
})

final_dataset_imbalance = DatasetDict({
    'train': newdataset_train_imbalance,
    'validation': newdataset_validation,
    'test_group_1': pricehighnegative_test,
    'test_group_2': pricehighpositive_test,
    'test_group_3': pricelownegative_test,
    'test_group_4': pricelowpositive_test,
})
print(final_dataset_unfilter)
print(final_dataset_balance)
print(final_dataset_imbalance)


DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 27000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 800
    })
    test_group_1: Dataset({
        features: ['label', 'text'],
        num_rows: 230
    })
    test_group_2: Dataset({
        features: ['label', 'text'],
        num_rows: 230
    })
    test_group_3: Dataset({
        features: ['label', 'text'],
        num_rows: 230
    })
    test_group_4: Dataset({
        features: ['label', 'text'],
        num_rows: 230
    })
})
DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 11600
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 800
    })
    test_group_1: Dataset({
        features: ['label', 'text'],
        num_rows: 230
    })
    test_group_2: Dataset({
        features: ['label', 'text'],
        num_rows: 230
    })
    test_group_3: Dataset({
        feature

#pretrained

In [14]:
from transformers import MobileBertTokenizer, MobileBertModel, MobileBertConfig, AutoModel, MobileBertForSequenceClassification, pipeline

configuration = MobileBertConfig.from_pretrained('google/mobilebert-uncased')
tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
model = MobileBertForSequenceClassification(configuration)

classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, truncation=True, padding=True, max_length = 512)
results = []
for i in range(1, 5):
    results.append(classifier(final_dataset_unfilter['test_group_'+str(i)]["text"]))

In [15]:
for i in range(len(results)):
  predicted_labels = [int(pred['label'].split('_')[-1]) for pred in results[i]]

  true_labels = final_dataset_unfilter['test_group_'+str(i+1)]['label']
  accuracy = sum([pred == true for pred, true in zip(predicted_labels, true_labels)]) / len(true_labels)
  print(f"Group {i+1} Accuracy: {accuracy:.4f}")

Group 1 Accuracy: 0.1217
Group 2 Accuracy: 0.8783
Group 3 Accuracy: 0.1087
Group 4 Accuracy: 0.8435


#train

In [None]:
!pip install accelerate
!pip install evaluate
!pip install transformers[torch]



##Unfilter

In [None]:
from transformers import MobileBertTokenizer, MobileBertForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer

tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased", max_length = 512)
model = MobileBertForSequenceClassification.from_pretrained("google/mobilebert-uncased")

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments("test-trainer")

def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, max_length = 512)

tokenized_datasets_unfilter = final_dataset_unfilter.map(tokenize_function, batched=True)

trainer_unfilter = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets_unfilter["train"],
    eval_dataset=tokenized_datasets_unfilter["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer_unfilter.train()

predictions_test_group_1 = trainer_unfilter.predict(tokenized_datasets_unfilter["test_group_1"])
predictions_test_group_2 = trainer_unfilter.predict(tokenized_datasets_unfilter["test_group_2"])
predictions_test_group_3 = trainer_unfilter.predict(tokenized_datasets_unfilter["test_group_3"])
predictions_test_group_4 = trainer_unfilter.predict(tokenized_datasets_unfilter["test_group_4"])

preds_test_group_1 = predictions_test_group_1.predictions.argmax(axis=-1)
preds_test_group_2 = predictions_test_group_2.predictions.argmax(axis=-1)
preds_test_group_3 = predictions_test_group_3.predictions.argmax(axis=-1)
preds_test_group_4 = predictions_test_group_4.predictions.argmax(axis=-1)


Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,6675.5415
1000,2.75
1500,0.874
2000,0.3274
2500,0.3104
3000,0.3125
3500,0.334
4000,12.6178
4500,0.2494
5000,0.2662


Step,Training Loss
500,6675.5415
1000,2.75
1500,0.874
2000,0.3274
2500,0.3104
3000,0.3125
3500,0.334
4000,12.6178
4500,0.2494
5000,0.2662


In [None]:
import evaluate
metric = evaluate.load("glue", "mrpc")

print("Group 1")
metric.compute(predictions=preds_test_group_1, references=predictions_test_group_1.label_ids)

Group 1


{'accuracy': 0.9608695652173913, 'f1': 0.0}

In [None]:
print("Group 2")
metric.compute(predictions=preds_test_group_2, references=predictions_test_group_2.label_ids)

Group 2


{'accuracy': 0.8304347826086956, 'f1': 0.9073634204275535}

In [None]:
print("Group 3")
metric.compute(predictions=preds_test_group_3, references=predictions_test_group_3.label_ids)

Group 3


{'accuracy': 0.9130434782608695, 'f1': 0.0}

In [None]:
print("Group 4")
metric.compute(predictions=preds_test_group_4, references=predictions_test_group_4.label_ids)

Group 4


{'accuracy': 0.8391304347826087, 'f1': 0.9125295508274232}

##Balance

In [None]:
from transformers import MobileBertTokenizer, MobileBertForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer

tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased", max_length = 512)
model = MobileBertForSequenceClassification.from_pretrained("google/mobilebert-uncased")

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments("test-trainer")

def tokenize_function(example):
    return tokenizer(example["text"],truncation=True, max_length = 512)

tokenized_datasets_balance = final_dataset_balance.map(tokenize_function, batched=True)

trainer_balance = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets_balance["train"],
    eval_dataset=tokenized_datasets_balance["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer_balance.train()

predictions_test_group_1 = trainer_balance.predict(tokenized_datasets_balance["test_group_1"])
predictions_test_group_2 = trainer_balance.predict(tokenized_datasets_balance["test_group_2"])
predictions_test_group_3 = trainer_balance.predict(tokenized_datasets_balance["test_group_3"])
predictions_test_group_4 = trainer_balance.predict(tokenized_datasets_balance["test_group_4"])

preds_test_group_1 = np.argmax(predictions_test_group_1.predictions, axis=-1)
preds_test_group_2 = np.argmax(predictions_test_group_2.predictions, axis=-1)
preds_test_group_3 = np.argmax(predictions_test_group_3.predictions, axis=-1)
preds_test_group_4 = np.argmax(predictions_test_group_4.predictions, axis=-1)

metric = evaluate.load("glue", "mrpc")
print("Group 1")
metric.compute(predictions=preds_test_group_1, references=predictions_test_group_1.label_ids)


Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/11600 [00:00<?, ? examples/s]

Step,Training Loss
500,87501.536
1000,0.3546
1500,45.6864
2000,0.2826
2500,0.2581
3000,0.2481
3500,0.1713
4000,0.1665


Group 1


{'accuracy': 0.8826086956521739, 'f1': 0.0}

In [None]:
print("Group 2")
metric.compute(predictions=preds_test_group_2, references=predictions_test_group_2.label_ids)

Group 2


{'accuracy': 0.9304347826086956, 'f1': 0.9639639639639639}

In [None]:
print("Group 3")
metric.compute(predictions=preds_test_group_3, references=predictions_test_group_3.label_ids)

Group 3


{'accuracy': 0.8521739130434782, 'f1': 0.0}

In [None]:
print("Group 4")
metric.compute(predictions=preds_test_group_4, references=predictions_test_group_4.label_ids)

Group 4


{'accuracy': 0.9217391304347826, 'f1': 0.9592760180995475}

#Imbalance

In [None]:
from transformers import MobileBertTokenizer, MobileBertForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer

tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased", max_length = 512)
model = MobileBertForSequenceClassification.from_pretrained("google/mobilebert-uncased")

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments("test-trainer")

def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, max_length =512)

tokenized_datasets_imbalance = final_dataset_imbalance.map(tokenize_function, batched=True)

trainer_imbalance = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets_imbalance["train"],
    eval_dataset=tokenized_datasets_imbalance["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer_imbalance.train()

predictions_test_group_1 = trainer_imbalance.predict(tokenized_datasets_imbalance["test_group_1"])
predictions_test_group_2 = trainer_imbalance.predict(tokenized_datasets_imbalance["test_group_2"])
predictions_test_group_3 = trainer_imbalance.predict(tokenized_datasets_imbalance["test_group_3"])
predictions_test_group_4 = trainer_imbalance.predict(tokenized_datasets_imbalance["test_group_4"])

preds_test_group_1 = predictions_test_group_1.predictions.argmax(axis=-1)
preds_test_group_2 = predictions_test_group_2.predictions.argmax(axis=-1)
preds_test_group_3 = predictions_test_group_3.predictions.argmax(axis=-1)
preds_test_group_4 = predictions_test_group_4.predictions.argmax(axis=-1)




Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/6800 [00:00<?, ? examples/s]

Step,Training Loss
500,35954.98
1000,0.36
1500,0.2714
2000,0.2006
2500,0.1568


In [None]:
metric = evaluate.load("glue", "mrpc")

print("Group 1")
metric.compute(predictions=preds_test_group_1, references=predictions_test_group_1.label_ids)


Group 1


{'accuracy': 0.9260869565217391, 'f1': 0.0}

In [None]:
print("Group 2")
metric.compute(predictions=preds_test_group_2, references=predictions_test_group_2.label_ids)

Group 2


{'accuracy': 0.8565217391304348, 'f1': 0.9227166276346604}

In [None]:
print("Group 3")
metric.compute(predictions=preds_test_group_3, references=predictions_test_group_3.label_ids)

Group 3


{'accuracy': 0.8130434782608695, 'f1': 0.0}

In [None]:
print("Group 4")
metric.compute(predictions=preds_test_group_4, references=predictions_test_group_4.label_ids)

Group 4


{'accuracy': 0.9217391304347826, 'f1': 0.9592760180995475}