In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
pip install emoji

Collecting emoji
  Downloading emoji-2.11.1-py2.py3-none-any.whl (433 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/433.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.1/433.8 kB[0m [31m4.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m433.8/433.8 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.11.1


# Imports

In [3]:
import numpy as np
import emoji
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

In [4]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

# Global Variables

In [5]:
training_data_dir = "/content/drive/MyDrive/Year3Project/Data/EmojifyData/train.txt"
test_data_dir = "/content/drive/MyDrive/Year3Project/Data/EmojifyData/test.txt"
TRAIN_SIZE = 50000
TEST_SIZE = 5000
TRAIN_BATCH_SIZE = 8
TEST_BATCH_SIZE = 1
LEARNING_RATE = 1e-05

# Get Mapping

In [6]:
def simple_tweet_reader(dir):
  text = []
  labels = []
  for line in open(dir):
    if len(line) > 1:
      words = line.split()
      text.append(''.join(words[0:-1]))
      labels.append(words[-1])
      if words[0] == "<STOP>":
        yield np.column_stack((text, labels))
        text = []
        labels = []

In [7]:
def get_class_mapping(tweets):
  mapping = {}
  count = 0
  for tweet in tweets:
    for label in tweet[:,1]:
      if label != 'O':
        if label not in mapping:
          mapping[label] = count
          count = count + 1

  return mapping

Following code is ran once and values hard coded

In [8]:
"""

tweets = simple_tweet_reader(training_data_dir)

mapping = get_class_mapping(tweets)

mapping

"""

'\n\ntweets = simple_tweet_reader(training_data_dir)\n\nmapping = get_class_mapping(tweets)\n\nmapping\n\n'

In [9]:
mapping = {':face_with_tears_of_joy:': 0,
 ':weary_face:': 1,
 ':purple_heart:': 2,
 ':party_popper:': 3,
 ':speaking_head:': 4,
 ':sparkles:': 5,
 ':clapping_hands:': 6,
 ':loudly_crying_face:': 7,
 ':smiling_face_with_heart-eyes:': 8,
 ':person_shrugging:': 9,
 ':female_sign:': 10,
 ':fire:': 11,
 ':person_facepalming:': 12,
 ':male_sign:': 13,
 ':red_heart:': 14,
 ':hundred_points:': 15,
 ':raising_hands:': 16,
 ':trophy:': 17,
 ':beaming_face_with_smiling_eyes:': 18,
 ':backhand_index_pointing_down:': 19,
 ':two_hearts:': 20,
 ':heart_suit:': 21,
 ':skull:': 22,
 ':thumbs_up:': 23,
 ':folded_hands:': 24,
 ':flexed_biceps:': 25,
 ':face_blowing_a_kiss:': 26,
 ':smiling_face:': 27,
 ':face_with_rolling_eyes:': 28,
 ':crying_face:': 29,
 ':police_car_light:': 30,
 ':OK_hand:': 31,
 ':blue_heart:': 32,
 ':thinking_face:': 33,
 ':winking_face:': 34,
 ':flushed_face:': 35,
 ':white_heavy_check_mark:': 36,
 ':smiling_face_with_sunglasses:': 37,
 ':double_exclamation_mark:': 38,
 ':smiling_face_with_smiling_eyes:': 39,
 ':backhand_index_pointing_right:': 40,
 ':collision:': 41,
 ':rolling_on_the_floor_laughing:': 42,
 ':yellow_heart:': 43,
 ':glowing_star:': 44,
 ':right_arrow:': 45,
 ':heavy_check_mark:': 46,
 ':eyes:': 47,
 ':sparkling_heart:': 48}

# Get Data

In [10]:
class Tweet():
  def __init__(self, text, labels):
    self.text = text
    self.labels = labels

  def duplicate_tweet(self):
    labels = set(self.labels)
    labels.discard(-1)
    phrases = ' '.join(self.text)
    phrases = np.repeat(phrases, len(labels))
    return np.column_stack((phrases, np.array(list(labels))))

  def split_on_emoji(self):
    phrases = []
    labels = []
    text = []
    for word, label in zip(self.text, self.labels):
      text.append(word)
      if label != -1:
        if len(text) > 2:
          phrases.append(' '.join(text))
          labels.append(label)
        text = []
    return np.column_stack((phrases, labels))

  def test_tweet(self):
    labels = set(self.labels)
    labels.discard(-1)
    return (' '.join(self.text), np.array(list(labels)))


In [11]:
def tweet_reader(dir, mapping):
  text = []
  labels = []
  for line in open(dir):
    if len(line) > 1:
      words = line.split()
      if words[0] == "<START>" or words[0] == "<STOP>":
        text.append('')
      else:
        text.append(''.join(words[0:-1]))
      label = words[-1]
      if label == 'O':
        label = -1;
      else:
        label = mapping[label]
      labels.append(label)
      if words[0] == "<STOP>":
        yield Tweet(np.array(text), np.array(labels))
        text = []
        labels = []

def duplicate_tweet_reader(tweets):
  for tweet in tweets[0]:
    vals = tweet.duplicate_tweet()
    for val in vals:
      yield val

def split_on_emoji_reader(tweets):
  for tweet in tweets[0]:
    vals = tweet.split_on_emoji()
    for val in vals:
      yield val

def test_tweet_reader(tweets):
  for tweet in tweets[0]:
    val = tweet.test_tweet()
    yield val

https://www.kaggle.com/datasets/rexhaif/emojifydata-en?resource=download


In [12]:
def summary(tweets):
  labels_counts = {}
  count = 0
  for tweet in tweets:
    count = count + 1

    for label in tweet.labels:
      if label != -1:
        labels_counts[label] = labels_counts.get(label, 0) + 1

  print(f"Number of Tweets: {count}")
  print(f"Number of Emojis (classes) {len(labels_counts)}")
  print(f"Emojis: {labels_counts}")


In [13]:
tweets = tweet_reader(training_data_dir, mapping)

summary(tweets)

Number of Tweets: 6567625
Number of Emojis (classes) 49
Emojis: {0: 1289713, 1: 172054, 2: 99114, 3: 108384, 4: 74819, 5: 172604, 6: 232379, 7: 505359, 8: 369943, 9: 167083, 10: 295265, 11: 436422, 12: 145759, 13: 218984, 14: 736664, 15: 137218, 16: 168722, 17: 75797, 18: 68810, 19: 81310, 20: 169041, 21: 107812, 22: 91693, 23: 107742, 24: 224002, 25: 102611, 26: 97316, 27: 71107, 28: 105933, 29: 59575, 30: 149174, 31: 79765, 32: 89699, 33: 135281, 34: 78206, 35: 62435, 36: 107182, 37: 76249, 38: 125194, 39: 149645, 40: 182706, 41: 95370, 42: 116567, 43: 57581, 44: 69093, 45: 73906, 46: 92821, 47: 121906, 48: 84255}


In [14]:
tweets = tweet_reader(test_data_dir, mapping)

summary(tweets)

Number of Tweets: 2052383
Number of Emojis (classes) 49
Emojis: {48: 26237, 0: 401715, 24: 70686, 4: 23137, 21: 33752, 40: 57364, 31: 24852, 12: 45441, 13: 68988, 7: 157650, 45: 23192, 1: 53689, 9: 52302, 10: 92027, 34: 23922, 29: 18655, 20: 52439, 5: 54333, 35: 19387, 2: 31141, 32: 28187, 46: 28768, 16: 53025, 23: 33976, 6: 73743, 14: 232276, 11: 136725, 3: 33968, 25: 31760, 8: 115349, 43: 17910, 33: 42433, 22: 28935, 30: 46254, 19: 25428, 38: 38265, 26: 30620, 39: 46719, 36: 33936, 44: 21227, 28: 33088, 47: 38392, 27: 21921, 15: 42740, 42: 36444, 18: 21442, 41: 30121, 37: 24138, 17: 23744}


In [None]:
class emojiDataset(Dataset):
  def __init__(self, dataframe, tokenizer, max_len=256):
    self.tokenizer = tokenizer
    self.data = dataframe
    self.text = dataframe["Tweet"]
    self.labels = dataframe["Label"]
    self.max_length = max_len

  def __len__(self):
    return len(self.data)

  def __getitem__(self, index):
    inputs = self.tokenizer.encode_plus(
            self.text[index],
            None,
            add_special_tokens=True,
            max_length=self.max_length,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
    ids = inputs['input_ids']
    mask = inputs['attention_mask']
    token_type_ids = inputs["token_type_ids"]

    return {
        'ids': torch.tensor(ids, dtype=torch.long),
        'mask': torch.tensor(mask, dtype=torch.long),
        'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
        'targets': torch.tensor(self.labels[index], dtype=torch.float)
      }

In [None]:
class testDataset(Dataset):
  def __init__(self, dataframe, tokenizer, max_len=256):
    self.tokenizer = tokenizer
    self.data = dataframe
    self.text = dataframe["Tweet"]
    self.labels = dataframe["Label"]
    self.max_length = max_len

  def __len__(self):
    return len(self.data)

  def __getitem__(self, index):
    inputs = self.tokenizer.encode_plus(
            self.text[index],
            None,
            add_special_tokens=True,
            max_length=self.max_length,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
    ids = inputs['input_ids']
    mask = inputs['attention_mask']
    token_type_ids = inputs["token_type_ids"]

    return {
        'ids': torch.tensor(ids, dtype=torch.long),
        'mask': torch.tensor(mask, dtype=torch.long),
        'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
        'targets': torch.tensor(self.labels[index], dtype=torch.float)
      }

In [None]:
tweets = tweet_reader(training_data_dir, mapping)
train_data = pd.DataFrame(data=tweets)
summary(train_data[0])
train_data = train_data.sample(TRAIN_SIZE)
summary(train_data[0])

tweets = tweet_reader(test_data_dir, mapping)
test_data = pd.DataFrame(data=tweets)
summary(test_data[0])
test_data = test_data.sample(TEST_SIZE)
summary(test_data[0])

Number of Tweets: 6567625
Number of Emojis (classes) 49
Emojis: {0: 1289713, 1: 172054, 2: 99114, 3: 108384, 4: 74819, 5: 172604, 6: 232379, 7: 505359, 8: 369943, 9: 167083, 10: 295265, 11: 436422, 12: 145759, 13: 218984, 14: 736664, 15: 137218, 16: 168722, 17: 75797, 18: 68810, 19: 81310, 20: 169041, 21: 107812, 22: 91693, 23: 107742, 24: 224002, 25: 102611, 26: 97316, 27: 71107, 28: 105933, 29: 59575, 30: 149174, 31: 79765, 32: 89699, 33: 135281, 34: 78206, 35: 62435, 36: 107182, 37: 76249, 38: 125194, 39: 149645, 40: 182706, 41: 95370, 42: 116567, 43: 57581, 44: 69093, 45: 73906, 46: 92821, 47: 121906, 48: 84255}
Number of Tweets: 50000
Number of Emojis (classes) 49
Emojis: {42: 858, 11: 3276, 24: 1592, 13: 1683, 8: 2827, 33: 1039, 0: 9926, 37: 565, 23: 824, 25: 749, 7: 3748, 40: 1365, 35: 490, 16: 1331, 12: 1099, 41: 671, 28: 835, 39: 1221, 48: 610, 29: 451, 2: 746, 26: 726, 30: 1105, 27: 523, 19: 653, 14: 5404, 44: 555, 45: 599, 18: 536, 34: 621, 5: 1377, 22: 724, 1: 1293, 31: 535

In [None]:
tweets = split_on_emoji_reader(train_data)

train = pd.DataFrame(data=tweets, columns=("Tweet", "Label"))
train["Label"] = train["Label"].astype('int32')

tweets = test_tweet_reader(test_data)

test = pd.DataFrame(data=tweets, columns=("Tweet", "Label"))

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

split_train_set = emojiDataset(train, tokenizer)
split_test_set = emojiDataset(test, tokenizer)

In [None]:
tweets = duplicate_tweet_reader(train_data)

train = pd.DataFrame(data=tweets, columns=("Tweet", "Label"))
train["Label"] = train["Label"].astype('int32')

tweets = test_tweet_reader(test_data)

test = pd.DataFrame(data=tweets, columns=("Tweet", "Label"))

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

duplicate_train_set = emojiDataset(train, tokenizer)
duplicate_test_set = emojiDataset(test, tokenizer)

In [None]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

split_train_loader = DataLoader(split_train_set, **train_params)
split_test_loader = DataLoader(split_test_set, **test_params)
duplicate_train_loader = DataLoader(duplicate_train_set, **train_params)
duplicate_test_loader = DataLoader(duplicate_test_set, **test_params)

# Get Model

https://colab.research.google.com/drive/1Ek5PxTLAx6u2yQiDzVCVZpNNlno6jwaD#scrollTo=c3Q9NDdmqEyo

https://huggingface.co/cardiffnlp/twitter-roberta-base-emoji

In [None]:
class EmojiModel(torch.nn.Module):
  def __init__(self):
    super(EmojiModel, self).__init__()
    self.roberta_base = RobertaModel.from_pretrained("roberta-base")
    # roberta_base has 768 output nodes
    self.pre_classifier = torch.nn.Linear(768, 768)
    self.dropout = torch.nn.Dropout(0.3)
    self.classifier = torch.nn.Linear(768, 49)

  def forward(self, input_ids, attention_mask, token_type_ids):
    output = self.roberta_base(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
    # output is the final hidden state and pooling output for each token, I only need the final hidden state
    output = output[0][:,0]
    output = self.pre_classifier(output)
    output = torch.nn.ReLU()(output)
    output = self.dropout(output)
    output = self.classifier(output)
    return output

In [None]:
split_model = EmojiModel()
split_model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


EmojiModel(
  (roberta_base): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Lay

In [None]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  split_model.parameters(), lr=LEARNING_RATE)

In [None]:
def accuracy(outputs, target):
  _, preds = torch.max(outputs, dim=1)
  return (preds==target).sum().item()

In [None]:
def accuracy_at_5(outputs, target):
  _, preds = torch.topk(outputs, 5, dim=1)
  return sum((target[i] in preds[i]) for i in range(len(target)))

In [None]:
tr_loss = 0
nb_tr_steps = 0
split_model.train()
for _,data in tqdm(enumerate(split_train_loader, 0)):
    ids = data['ids'].to(device, dtype = torch.long)
    mask = data['mask'].to(device, dtype = torch.long)
    token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
    targets = data['targets'].to(device, dtype = torch.long)

    outputs = split_model(ids, mask, token_type_ids)
    loss = loss_function(outputs, targets)
    tr_loss += loss.item()

    nb_tr_steps += 1

    if _%5000==0:
        loss_step = tr_loss/nb_tr_steps
        print(f"Training Loss per 5000 steps: {loss_step}")

    optimizer.zero_grad()
    loss.backward()

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
2it [00:00, 10.44it/s]

Training Loss per 5000 steps: 3.9767966270446777


5002it [08:49,  9.41it/s]

Training Loss per 5000 steps: 3.8904553523327774


6359it [11:13,  9.44it/s]


In [None]:
split_model.eval()
n_correct = 0; n_correct_5 = 0; n_wrong = 0; total = 0; tr_loss=0; nb_tr_steps=0; nb_tr_examples=0
with torch.no_grad():
    for _, data in tqdm(enumerate(split_test_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype = torch.long)
        outputs = split_model(ids, mask, token_type_ids)
        n_correct += accuracy(outputs.data, targets)
        n_correct_5 += accuracy_at_5(outputs.data, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)

        if _%5000==0:
            accu_step = (n_correct*100)/nb_tr_examples
            print(f"Validation Accuracy per 100 steps: {accu_step}")
accu = (n_correct*100)/nb_tr_examples
accu_5 = (n_correct_5*100)/nb_tr_examples
print(f"Accuracy: {accu}")
print(f"Accuracy at 5: {accu_5}")

Accuracy: 29
Accuracy at 5: 41


In [None]:
duplicate_model = EmojiModel()
duplicate_model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


EmojiModel(
  (roberta_base): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Lay

In [None]:
tr_loss = 0
nb_tr_steps = 0
duplicate_model.train()
for _,data in tqdm(enumerate(duplicate_train_loader, 0)):
    ids = data['ids'].to(device, dtype = torch.long)
    mask = data['mask'].to(device, dtype = torch.long)
    token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
    targets = data['targets'].to(device, dtype = torch.long)

    outputs = duplicate_model(ids, mask, token_type_ids)
    loss = loss_function(outputs, targets)
    tr_loss += loss.item()

    nb_tr_steps += 1

    if _%5000==0:
        loss_step = tr_loss/nb_tr_steps
        print(f"Training Loss per 5000 steps: {loss_step}")

    optimizer.zero_grad()
    loss.backward()

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
2it [00:00, 10.18it/s]

Training Loss per 5000 steps: 3.938277244567871


5002it [08:58,  9.21it/s]

Training Loss per 5000 steps: 3.9118153334283705


7168it [12:52,  9.28it/s]


In [None]:
duplicate_model.eval()
n_correct = 0; n_correct_5 = 0; n_wrong = 0; total = 0; tr_loss=0; nb_tr_steps=0; nb_tr_examples=0
with torch.no_grad():
    for _, data in tqdm(enumerate(duplicate_test_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype = torch.long)
        outputs = duplicate_model(ids, mask, token_type_ids)
        n_correct += accuracy(outputs.data, targets)
        n_correct_5 += accuracy_at_5(outputs.data, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)

        if _%5000==0:
            accu_step = (n_correct*100)/nb_tr_examples
            print(f"Validation Accuracy per 100 steps: {accu_step}")
accu = (n_correct*100)/nb_tr_examples
accu_5 = (n_correct_5 *100)/nb_tr_examples
print(f"Accuracy: {accu}")
print(f"Accuracy at 5: {accu_5}")

Accuracy: 36
Accuracy at 5: 47
