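This notebook was run on Google Colab with a GPU. It shows a more state-of-the-art approach to this problem: fine-tuning a pretrained RoBERTa model (via `simpletransformers`) to predict which account wrote a tweet.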

In [1]:
import torch

In [2]:
# Show whether PyTorch can see a CUDA device in this runtime.
torch.cuda.is_available()

True

In [3]:
from google.colab import files

# Open Colab's upload widget; the returned mapping is keyed by filename.
uploaded = files.upload()

# Echo the name and size of everything that arrived so a wrong or
# truncated upload is obvious before it is parsed.
for fn, payload in uploaded.items():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(payload)))

Saving tweets.csv to tweets.csv
User uploaded file "tweets.csv" with length 7823952 bytes


In [4]:
# Pin to the version this notebook was originally run with, and use %pip so the
# package is installed into the environment of the running kernel.
%pip install transformers==4.11.2

Collecting transformers
  Downloading transformers-4.11.2-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 5.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 34.4 MB/s 
[?25hCollecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.18-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.8 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 26.0 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 45.6 MB/s 
Collecting ruamel.yaml==0.17.16
  Downloading ruamel.yaml-0.17.16-py3-none-any.whl (109 kB)
[K     |████████████████████████████████| 109 kB 48.5 MB/s 
[?25h

In [5]:
import math

import pandas as pd
import numpy as np

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import AutoModel, AutoTokenizer

In [6]:
# Load the uploaded tweet dump (tweets.csv in the working directory) into a DataFrame.
tweets_df = pd.read_csv("tweets.csv")


In [7]:
# Display the loaded frame as the cell output to eyeball its columns and rows.
tweets_df

Unnamed: 0,author,content,country,date_time,id,language,latitude,longitude,number_of_likes,number_of_shares
0,katyperry,Is history repeating itself...?#DONTNORMALIZEH...,,12/01/2017 19:52,8.196330e+17,en,,,7900,3472
1,katyperry,@barackobama Thank you for your incredible gra...,,11/01/2017 08:38,8.191010e+17,en,,,3689,1380
2,katyperry,Life goals. https://t.co/XIn1qKMKQl,,11/01/2017 02:52,8.190140e+17,en,,,10341,2387
3,katyperry,Me right now 🙏🏻 https://t.co/gW55C1wrwd,,11/01/2017 02:44,8.190120e+17,en,,,10774,2458
4,katyperry,SISTERS ARE DOIN' IT FOR THEMSELVES! 🙌🏻💪🏻❤️ ht...,,10/01/2017 05:22,8.186890e+17,en,,,17620,4655
...,...,...,...,...,...,...,...,...,...,...
52537,ddlovato,Life couldn't be better right now. 😊,,06/01/2015 23:10,5.526030e+17,en,,,32799,23796
52538,ddlovato,First Monday back in action. I'd say 21.6 mile...,,06/01/2015 02:17,5.522880e+17,en,,,21709,12511
52539,ddlovato,"Crime shows, buddy, snuggles = the perfect Sun...",,05/01/2015 03:42,5.519470e+17,en,,,25269,15583
52540,ddlovato,❄️ http://t.co/sHCFdPpGPa,,05/01/2015 00:06,5.518920e+17,und,,,15985,10456


In [8]:
# Every distinct author handle becomes one target class.
unique_people = tweets_df['author'].unique()
print(unique_people)
NUM_CLASSES = len(unique_people)

# Two-way lookup between integer class ids and author handles.
id_to_person = dict(enumerate(unique_people))
person_to_id = {person: idx for idx, person in id_to_person.items()}

# Integer label column derived from the author handle of each row.
tweets_df['author_id'] = [person_to_id[handle] for handle in tweets_df['author']]

['katyperry' 'justinbieber' 'taylorswift13' 'BarackObama' 'rihanna'
 'YouTube' 'ladygaga' 'TheEllenShow' 'Twitter' 'jtimberlake'
 'KimKardashian' 'britneyspears' 'Cristiano' 'selenagomez' 'cnnbrk'
 'jimmyfallon' 'ArianaGrande' 'shakira' 'instagram' 'ddlovato']


In [9]:
# Pin to the versions this notebook was originally run with, and use %pip so the
# packages are installed into the environment of the running kernel.
%pip install simpletransformers==0.62.2
%pip install tensorboardx==2.4

Collecting simpletransformers
  Downloading simpletransformers-0.62.2-py3-none-any.whl (231 kB)
[?25l[K     |█▍                              | 10 kB 22.2 MB/s eta 0:00:01[K     |██▉                             | 20 kB 19.5 MB/s eta 0:00:01[K     |████▎                           | 30 kB 16.2 MB/s eta 0:00:01[K     |█████▊                          | 40 kB 14.1 MB/s eta 0:00:01[K     |███████                         | 51 kB 5.5 MB/s eta 0:00:01[K     |████████▌                       | 61 kB 6.0 MB/s eta 0:00:01[K     |██████████                      | 71 kB 5.4 MB/s eta 0:00:01[K     |███████████▍                    | 81 kB 6.1 MB/s eta 0:00:01[K     |████████████▊                   | 92 kB 6.0 MB/s eta 0:00:01[K     |██████████████▏                 | 102 kB 5.3 MB/s eta 0:00:01[K     |███████████████▋                | 112 kB 5.3 MB/s eta 0:00:01[K     |█████████████████               | 122 kB 5.3 MB/s eta 0:00:01[K     |██████████████████▍             | 133 kB 

Collecting tensorboardx
  Downloading tensorboardX-2.4-py2.py3-none-any.whl (124 kB)
[?25l[K     |██▋                             | 10 kB 22.6 MB/s eta 0:00:01[K     |█████▎                          | 20 kB 24.1 MB/s eta 0:00:01[K     |████████                        | 30 kB 12.0 MB/s eta 0:00:01[K     |██████████▌                     | 40 kB 9.3 MB/s eta 0:00:01[K     |█████████████▏                  | 51 kB 5.4 MB/s eta 0:00:01[K     |███████████████▉                | 61 kB 5.9 MB/s eta 0:00:01[K     |██████████████████▍             | 71 kB 6.3 MB/s eta 0:00:01[K     |█████████████████████           | 81 kB 6.4 MB/s eta 0:00:01[K     |███████████████████████▊        | 92 kB 4.9 MB/s eta 0:00:01[K     |██████████████████████████▎     | 102 kB 5.4 MB/s eta 0:00:01[K     |█████████████████████████████   | 112 kB 5.4 MB/s eta 0:00:01[K     |███████████████████████████████▋| 122 kB 5.4 MB/s eta 0:00:01[K     |████████████████████████████████| 124 kB 5.4 MB/s 
In

In [10]:
from simpletransformers.classification import ClassificationModel


In [11]:
# Build a RoBERTa sequence classifier (roberta-base checkpoint) with one output
# label per author.
# We only train for 5 epochs. More may result in a better accuracy.
model = ClassificationModel(
    'roberta',
    'roberta-base',
    # Use the GPU when one is visible instead of hard-coding True, so the cell
    # still runs (slowly) on a CPU-only runtime.
    use_cuda=torch.cuda.is_available(),
    num_labels=NUM_CLASSES,
    args={
        'train_batch_size': 64,
        'num_train_epochs': 5,
        'max_seq_length': 128,
        'learning_rate': 2e-5,
    },
)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [12]:

# Fix NumPy's global seed before shuffling.
np.random.seed(7)

n_train = 10000
n_test = 10000

# Shuffle all rows, keeping only the two columns handed to the classifier:
# the tweet text first, the integer label second.
df_full = tweets_df[['content', 'author_id']].sample(frac=1.0)

# Consecutive, non-overlapping row ranges of the shuffled frame.
df_train = df_full.iloc[:n_train]
df_test = df_full.iloc[n_train:n_train + n_test]


In [13]:
# Fine-tune on the training split. The frame has no `text`/`labels` headers, so
# the library falls back to column 0 as text and column 1 as labels (see the
# warning in the output).
model.train_model(df_train)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/157 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm


Running Epoch 1 of 5:   0%|          | 0/157 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/157 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/157 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/157 [00:00<?, ?it/s]

(785, 1.3890939660892365)

In [14]:
# Run the fine-tuned model over the training split; element [1] of the result
# is used by the train-accuracy computation.
raw_outputs_train = model.eval_model(df_train)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/10000 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1250 [00:00<?, ?it/s]

In [15]:
# train acc: share of rows whose highest-scoring class equals the true author id
train_pred_ids = np.argmax(raw_outputs_train[1], axis=1)
np.mean(train_pred_ids == df_train['author_id'].to_numpy())

0.8301

In [16]:
# Run the fine-tuned model over the held-out split; element [1] of the result
# is used by the test-accuracy computation.
raw_outputs_test = model.eval_model(df_test)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/10000 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1250 [00:00<?, ?it/s]

In [17]:
# test acc: share of rows whose highest-scoring class equals the true author id
test_pred_ids = np.argmax(raw_outputs_test[1], axis=1)
np.mean(test_pred_ids == df_test['author_id'].to_numpy())

0.7033