
# 출처:
```
@misc{hartmann2022emotionenglish,
  author={Hartmann, Jochen},
  title={Emotion English DistilRoBERTa-base},
  year={2022},
  howpublished = {\url{https://huggingface.co/j-hartmann/emotion-english-distilroberta-base/}},
}
```



# Run Emotion-English-DistilRoBERTa-base on multiple text documents

In [None]:
# Install the transformers library into the notebook runtime (no version is pinned).
!pip install transformers

In [None]:
# import required packages
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer

# Create class for data preparation
class SimpleDataset:
    """Index-based view over a tokenizer's batched output.

    ``tokenized_texts`` is a mapping from field name to a per-example
    sequence. It must contain an ``"input_ids"`` entry, which is used to
    determine the number of examples.
    """

    def __init__(self, tokenized_texts):
        # Store the mapping untouched; examples are assembled on demand.
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        # The example count is read from the "input_ids" field.
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        # Collect the idx-th entry of every field into a single example dict.
        example = {}
        for field, values in self.tokenized_texts.items():
            example[field] = values[idx]
        return example

In [None]:
# Restart the runtime after running this cell.
# Installs transformers with its torch extra and upgrades accelerate.
!pip install transformers[torch]
!pip install accelerate -U

In [None]:
# Load the tokenizer and the sequence-classification model from the same
# pretrained checkpoint, then wrap the model in a Trainer. The Trainer is
# created with the model only (no training arguments or datasets); later in
# this notebook it is used just to call predict().
model_name = "j-hartmann/emotion-english-distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
trainer = Trainer(model=model)

# Connect to Google Drive and select file

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read.
# NOTE: google.colab is imported here, so this cell only works in a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Change into the project directory; the working directory is printed before
# and after the change to confirm it took effect.
!pwd
%cd /content/drive/MyDrive/Colab Notebooks/p23_BML
!pwd

/content
/content/drive/.shortcut-targets-by-id/1KwfkXpOMj0nJ1Xgddg3hRS9xKcwgXuV2/p23_BML
/content/drive/.shortcut-targets-by-id/1KwfkXpOMj0nJ1Xgddg3hRS9xKcwgXuV2/p23_BML


In [None]:
# Input CSV path and the name of the column holding the text to classify.
# Tip: you can right-click on your file and copy-paste its path here.
file_name = "final/data/lyrics_1519.csv"
text_column = "lyrics"

# Load the CSV, then collect the non-missing entries of the text column
# as a plain list of strings.
df_pred = pd.read_csv(file_name)
pred_texts = (
    df_pred[text_column]
    .dropna()
    .astype('str')
    .tolist()
)

# Classify texts with model

In [None]:
# Tokenize all texts in a single call with truncation and padding enabled,
# then wrap the result in SimpleDataset so it can be indexed example by example.
tokenized_texts = tokenizer(pred_texts,truncation=True,padding=True)
pred_dataset = SimpleDataset(tokenized_texts)

In [None]:
# Run the model over the whole prediction dataset. The returned object's
# `predictions` field (also read as predictions[0]) is turned into labels and
# probabilities in the following cells.
predictions = trainer.predict(pred_dataset)

In [None]:
# Transform predictions to labels.
# `preds` is the arg-max class id per text and `labels` its name looked up in
# the model config.
preds = predictions.predictions.argmax(-1)
labels = pd.Series(preds).map(model.config.id2label)

# `scores` is the softmax probability of the winning class.
# The row maximum is subtracted before exponentiating: this leaves the softmax
# value unchanged mathematically but keeps np.exp from overflowing on large
# model outputs (which would otherwise produce inf/NaN scores).
probs = np.exp(
    predictions.predictions - predictions.predictions.max(-1, keepdims=True)
)
probs /= probs.sum(-1, keepdims=True)
scores = probs.max(1)

In [None]:
# Full per-class probability matrix: softmax over the model outputs, one row
# per text and one column per class.
# The row maximum is subtracted before np.exp so large outputs cannot overflow
# to inf/NaN; the resulting probabilities are mathematically the same.
temp = np.exp(predictions[0] - predictions[0].max(-1, keepdims=True))
temp /= temp.sum(-1, keepdims=True)

In [None]:
# Split the probability matrix into one list per emotion, with one entry per
# text in pred_texts.
# Whole columns are sliced at once instead of appending row by row in a Python
# loop; each name still ends up bound to a list of per-text probabilities.
# NOTE(review): the column positions (0=anger, 1=disgust, 2=fear, 3=joy,
# 4=neutral, 5=sadness, 6=surprise) are hard-coded and assumed to follow the
# model's label ids — confirm against model.config.id2label if the model changes.
n_texts = len(pred_texts)
anger, disgust, fear, joy, neutral, sadness, surprise = (
    list(temp[:n_texts, col]) for col in range(7)
)

In [None]:
# Assemble one row per text: the text itself, the predicted class id and label,
# the winning-class score, and the probability of each of the seven emotions.
column_names = [
    'text', 'pred', 'label', 'score',
    'anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise',
]
rows = list(zip(
    pred_texts, preds, labels, scores,
    anger, disgust, fear, joy, neutral, sadness, surprise,
))
df = pd.DataFrame(rows, columns=column_names)
df.head()

Unnamed: 0,text,pred,label,score,anger,disgust,fear,joy,neutral,sadness,surprise
0,Ayy,4,neutral,0.607754,0.035741,0.226343,0.020135,0.040036,0.607754,0.041423,0.028568
1,"I'm tryna put you in the worst mood, ah",1,disgust,0.861025,0.05104,0.861025,0.025521,0.004185,0.019736,0.0356,0.002893
2,"P1 cleaner than your church shoes, ah",4,neutral,0.763157,0.021455,0.101304,0.040185,0.007978,0.763157,0.030372,0.035549
3,"Milli point two just to hurt you, ah",2,fear,0.283817,0.083188,0.042239,0.283817,0.009367,0.224749,0.278465,0.078177
4,"All red Lamb' just to tease you, ah",4,neutral,0.481796,0.077467,0.023043,0.073048,0.229155,0.481796,0.02724,0.088251


In [None]:
# Check the (rows, columns) size of the result frame.
df.shape

(30506, 11)

# Export results

In [None]:
# Write the predictions frame to CSV without the index column.
# The path is relative to the current working directory.
YOUR_FILENAME = "final/data/EMOTIONS_1519.csv"  # name your output file
df.to_csv(YOUR_FILENAME, index=False)

In [None]:
# Display the result frame (the notebook renders a truncated view of it).
df

Unnamed: 0,text,pred,label,score,anger,disgust,fear,joy,neutral,sadness,surprise
0,Ayy,4,neutral,0.607754,0.035741,0.226343,0.020135,0.040036,0.607754,0.041423,0.028568
1,"I'm tryna put you in the worst mood, ah",1,disgust,0.861025,0.051040,0.861025,0.025521,0.004185,0.019736,0.035600,0.002893
2,"P1 cleaner than your church shoes, ah",4,neutral,0.763157,0.021455,0.101304,0.040185,0.007978,0.763157,0.030372,0.035549
3,"Milli point two just to hurt you, ah",2,fear,0.283817,0.083188,0.042239,0.283817,0.009367,0.224749,0.278465,0.078177
4,"All red Lamb' just to tease you, ah",4,neutral,0.481796,0.077467,0.023043,0.073048,0.229155,0.481796,0.027240,0.088251
...,...,...,...,...,...,...,...,...,...,...,...
30501,Drunk as fuck,1,disgust,0.768785,0.205887,0.768785,0.005426,0.000872,0.007323,0.009832,0.001875
30502,"Bitch, you dumb as fuck",0,anger,0.767692,0.767692,0.202131,0.005570,0.001584,0.009185,0.010844,0.002994
30503,"Running up the bucks (Damn, ayy)",1,disgust,0.531523,0.176005,0.531523,0.010102,0.014386,0.039877,0.091943,0.136163
30504,Leave me alone,5,sadness,0.561761,0.036431,0.022527,0.017346,0.007591,0.350916,0.561761,0.003429


In [None]:
# Attach the predictions to the original metadata columns.
# pred_texts (and therefore df) was built only from rows whose text column is
# not missing, and df carries a fresh 0..n-1 index. Drop the same missing rows
# from df_pred and renumber it first; otherwise the side-by-side concat would
# pair predictions with the wrong source rows as soon as one text is missing.
lyrics_rows = df_pred.dropna(subset=[text_column]).reset_index(drop=True)
# 'text' duplicates the source text column, so it is removed after joining.
EMO_df = pd.concat([lyrics_rows, df], axis=1).drop(columns=['text'])
EMO_df

Unnamed: 0,lyrics,track_name,artist_name,year,pred,label,score,anger,disgust,fear,joy,neutral,sadness,surprise
0,Ayy,Starboy,The Weeknd,2016,4,neutral,0.607754,0.035741,0.226343,0.020135,0.040036,0.607754,0.041423,0.028568
1,"I'm tryna put you in the worst mood, ah",Starboy,The Weeknd,2016,1,disgust,0.861025,0.051040,0.861025,0.025521,0.004185,0.019736,0.035600,0.002893
2,"P1 cleaner than your church shoes, ah",Starboy,The Weeknd,2016,4,neutral,0.763157,0.021455,0.101304,0.040185,0.007978,0.763157,0.030372,0.035549
3,"Milli point two just to hurt you, ah",Starboy,The Weeknd,2016,2,fear,0.283817,0.083188,0.042239,0.283817,0.009367,0.224749,0.278465,0.078177
4,"All red Lamb' just to tease you, ah",Starboy,The Weeknd,2016,4,neutral,0.481796,0.077467,0.023043,0.073048,0.229155,0.481796,0.027240,0.088251
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30501,Drunk as fuck,Leave Me Alone,Flipp Dinero,2019,1,disgust,0.768785,0.205887,0.768785,0.005426,0.000872,0.007323,0.009832,0.001875
30502,"Bitch, you dumb as fuck",Leave Me Alone,Flipp Dinero,2019,0,anger,0.767692,0.767692,0.202131,0.005570,0.001584,0.009185,0.010844,0.002994
30503,"Running up the bucks (Damn, ayy)",Leave Me Alone,Flipp Dinero,2019,1,disgust,0.531523,0.176005,0.531523,0.010102,0.014386,0.039877,0.091943,0.136163
30504,Leave me alone,Leave Me Alone,Flipp Dinero,2019,5,sadness,0.561761,0.036431,0.022527,0.017346,0.007591,0.350916,0.561761,0.003429


In [None]:
# Save the merged frame (metadata + predictions) without the index column.
# NOTE: this is the same path the earlier df export wrote to, so that file is overwritten.
EMO_df.to_csv("final/data/EMOTIONS_1519.csv", index=False)