In [None]:
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu

In [None]:
!pip install pandas seaborn xlsxwriter openpyxl transformers sentencepiece alive-progress

## Import libraries
We use `numpy` and `pandas` for data handling and PyTorch (`torch`) to run the model. The model itself comes from the Hugging Face `transformers` library, which is imported in a later cell.

In [None]:
import numpy as np
import pandas as pd
import torch


You can find new models at: https://huggingface.co/models?pipeline_tag=text-classification

Click the "Use in Transformers" button near the top-right, and copy-paste the top box into the cell below:

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Hugging Face Hub identifier of the checkpoint. Both the tokenizer and the
# model are loaded from this single identifier; to try another model, change
# only this line.
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

In [None]:
# Switch on the `output_scores` flag in the loaded model's configuration.
# NOTE(review): score_prob reads `output.logits` directly, so this flag may
# not be needed for the scores computed in this notebook — confirm.
model.config.output_scores = True

In [None]:
def score_prob(input_text, model):
    """Return the probability the model assigns to class index 0 for one text.

    The text is encoded with the module-level ``tokenizer`` (it is not a
    parameter of this function), passed through ``model``, and the logits are
    turned into probabilities with a softmax over the class dimension.

    Args:
        input_text: The text to score.
        model: A callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``logits`` tensor.

    Returns:
        float: The softmax probability of the first class for the first
        (only) item in the batch. The scoring loop in this notebook names it
        ``prob_negative``; presumably index 0 is the model's "negative"
        label — confirm against the model card.
    """
    encoded = tokenizer(input_text, return_tensors="pt")

    # Inference only, so gradient tracking is disabled for the forward pass.
    with torch.no_grad():
        logits = model(**encoded).logits

    class_probs = torch.softmax(logits, dim=1)
    return float(class_probs[0][0])
    

In [None]:
# Sanity check on a sad sentence; the cell displays the float returned by
# score_prob (the probability of class index 0).
score_prob("I'm really sad", model)

In [None]:
# Sanity check on a happy sentence; compare the displayed value with the one
# from the sad sentence to see how the class-0 probability moves.
score_prob("I'm really happy", model)

In [None]:
# Load the labelled sentences from the Excel workbook (path is relative to the
# notebook's working directory).
# The scoring loop later in the notebook reads the 'input' and 'output'
# columns of this frame.
data = pd.read_excel("COMM106E_happysad.xlsx")
data

# For each sentence in our original dataset, have the model score it

This takes 5-15 seconds per score, so it takes a long time! We can use the `alive_progress` library to display a progress bar while the loop runs.

In [None]:
from alive_progress import alive_bar

In [None]:
# Score every row of `data` and keep one record per sentence: the text, its
# label from the 'output' column, and the class-0 probability from score_prob.
results_list = []

with alive_bar(len(data), force_tty=True) as bar:
    for row_index, row in data.iterrows():
        sentence = row['input']
        class0_prob = score_prob(sentence, model)

        record = dict(input=sentence,
                      true_label=row['output'],
                      score=class0_prob)
        results_list.append(record)

        # Echo each record as it is produced, then advance the progress bar.
        print(record)
        bar()


# Auditing for country bias

In [None]:
# Load the list of countries used for the bias audit (path is relative to the
# notebook's working directory).
# The audit loop later in the notebook iterates over the 'country' column.
countries = pd.read_csv("countries.csv")
countries

In [None]:
# Audit: score the same vacation sentence once per country, so the country
# name is the only thing that varies between inputs.
# Note that this rebinds `results_list`, discarding any earlier contents.
results_list = []

with alive_bar(len(countries), force_tty=True) as bar:
    for country_name in countries['country']:
        sentence = "Yay! I won a vacation to " + country_name + "!"
        class0_prob = score_prob(sentence, model)

        record = dict(country=country_name, prediction=class0_prob)

        # Echo the record, store it, then advance the progress bar.
        print(record)
        results_list.append(record)
        bar()

In [None]:
# Turn the list of per-country records into a DataFrame. When `results_list`
# was filled by the country audit loop, the columns are 'country' and
# 'prediction'.
data_audit = pd.DataFrame(results_list)
data_audit

In [None]:
# Display the audit table ordered by the 'prediction' column. The sorted
# result is only displayed; `data_audit` itself is not reassigned here.
data_audit.sort_values('prediction')