In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 19.7 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 53.5 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 58.6 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.24.0


In [3]:
import random as python_random
import argparse
import tensorflow as tf
import pandas as pd
import numpy as np
import pickle
import warnings
from tensorflow import keras
from keras.layers import LSTM, Activation, Dropout, Dense, Input, CuDNNLSTM
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.layers import BatchNormalization
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import accuracy_score, classification_report
from keras_preprocessing.sequence import pad_sequences
import transformers
from transformers import (OpenAIGPTTokenizer, TFOpenAIGPTForSequenceClassification, MobileBertTokenizer,
                          TFMobileBertForSequenceClassification, TFAutoModelForSequenceClassification,
                          AutoTokenizer, BertTokenizerFast, TFBertForSequenceClassification,
                          DistilBertTokenizer, TFDistilBertForSequenceClassification,
                          RobertaTokenizer, TFRobertaForSequenceClassification,
                          XLNetTokenizer, TFXLNetForSequenceClassification)

warnings.filterwarnings("ignore")


In [4]:


def train_model(model, tokens_train, Y_train_bin, tokens_dev, Y_dev_bin,encoder,output_file):
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
    model.fit(tokens_train, Y_train_bin, verbose=1, epochs=10,batch_size=32,callbacks=[callback], validation_data=(tokens_dev, Y_dev_bin))
    test_set_predict(model, tokens_dev, Y_dev_bin, "dev",encoder,output_file)
    return model

In [11]:
def test_set_predict(model,tokens_dev, Y_dev_bin, ident,encoder,output_file):
    '''Do predictions and measure accuracy on our own test set (that we split off train)'''
    # Get predictions using the trained model
    Y_pred = model.predict(tokens_dev)["logits"]
    Y_pred = Y_pred>0.5
    if output_file:
      pd.DataFrame(Y_pred).to_csv('/content/gdrive/MyDrive/Data/outputtest_dbert.csv')
    
    print(classification_report(Y_dev_bin, Y_pred,target_names= ["OFF",'NOT']))
    print('Accuracy on own {1} set: {0}'.format(round(accuracy_score(Y_dev_bin, Y_pred), 3), ident))

In [12]:
if __name__ == "__main__":
  train = pd.read_csv('/content/gdrive/MyDrive/Data/preprocessed data/processed_train.csv')
  val= pd.read_csv('/content/gdrive/MyDrive/Data/preprocessed data/processed_val.csv')
  X_train, Y_train = train['tweet'], train['task']
  X_dev, Y_dev = val['tweet'], val['task']

  encoder = LabelBinarizer()
  encoder = encoder.fit(Y_train.tolist())
  Y_train_bin = encoder.transform(Y_train.tolist())
  # Use encoder.classes_ to find mapping back
  Y_dev_bin = encoder.transform(Y_dev.tolist())
  filename = "/content/gdrive/MyDrive/Data/bert"
  bert_pretrained= False
  custom_test_set = False
  output_file = True
  val_set = False
  if bert_pretrained:
    model = TFAutoModelForSequenceClassification.from_pretrained(filename, local_files_only=True)
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    tokenizer.pad_token = "[PAD]"
    tokens_dev = tokenizer(X_dev.values.tolist(), padding=True, max_length=100, truncation=True,
                           return_tensors="np").data
  else:
    # lm = "bert-base-uncased"
    # lm= "roberta-base"
    lm = "distilbert-base-uncased"
    optim = Adam(learning_rate=5e-5)
    loss_function = tf.keras.losses.BinaryCrossentropy(from_logits=True) 
    tokenizer = AutoTokenizer.from_pretrained(lm)
    model = TFAutoModelForSequenceClassification.from_pretrained(lm, num_labels=1)
    tokens_train = tokenizer(X_train.tolist(), padding=True, max_length=100,truncation=True, return_tensors="np").data
    tokens_dev = tokenizer(X_dev.tolist(), padding=True, max_length=100,truncation=True, return_tensors="np").data
    model.compile(loss=loss_function, optimizer=optim, metrics=['accuracy'])
    model = train_model(model, tokens_train, Y_train_bin, tokens_dev, Y_dev_bin,output_file,encoder)
    test_file = True
    if test_file:
      test = pd.read_csv('/content/gdrive/MyDrive/Data/preprocessed data/processed_test.csv')
      X_test, Y_test = test['tweet'], test['task']
      Y_test_bin = encoder.fit_transform(Y_test)
      tokens_test = tokenizer(X_test.tolist(), padding=True, max_length=100,truncation=True, return_tensors="np").data
      test_set_predict(model, tokens_test, Y_test_bin, "test",encoder,output_file)
    model.save_pretrained(filename)



Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'vocab_layer_norm', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_133', 'classifier', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use 

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
              precision    recall  f1-score   support

         OFF       0.85      0.81      0.83       647
         NOT       0.67      0.73      0.70       352

    accuracy                           0.78       999
   macro avg       0.76      0.77      0.76       999
weighted avg       0.79      0.78      0.78       999

Accuracy on own dev set: 0.781
              precision    recall  f1-score   support

         OFF       0.88      0.89      0.89       620
         NOT       0.72      0.69      0.70       239

    accuracy                           0.84       859
   macro avg       0.80      0.79      0.80       859
weighted avg       0.84      0.84      0.84       859

Accuracy on own test set: 0.838
