<a href="https://colab.research.google.com/github/digitalepidemiologylab/covid-twitter-bert/blob/master/CT_BERT_Huggingface_(GPU_training).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<img align="right" width="450px" src="https://github.com/digitalepidemiologylab/covid-twitter-bert/raw/master/images/COVID-Twitter-BERT-medium.png">

# Finetuning COVID-Twitter-BERT using Huggingface
In this notebook we will finetune CT-BERT for sentiment classification using the transformer library by Huggingface.

Learn more about this library [here](https://huggingface.co/transformers/).

## Before proceeding
Create a copy of this notebook by going to "File - Save a Copy in Drive"


# Install transformers and import libraries

In [1]:
from transformers import (
   AutoConfig,
   AutoTokenizer,
   TFAutoModelForSequenceClassification,
   AdamW,
   glue_convert_examples_to_features
)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
import sklearn
import os
import re
import itertools
import emoji
#import tensorflow_datasets as tfds
import json

# Choose a Model from the Huggingface Library

In [2]:
# Choose model
# @markdown >The default model is <i><b>COVID-Twitter-BERT</b></i>. You can however choose <i><b>BERT Base</i></b> or <i><b>BERT Large</i></b> to compare these models to the <i><b>COVID-Twitter-BERT</i></b>. All these three models will be initiated with a random classification layer. If you go directly to the Predict-cell after having compiled the model, you will see that it still runs the predition. However the output will be random. The training steps below will finetune this for the specific task. <br /><br /> 
model_name = 'digitalepidemiologylab/covid-twitter-bert-v2' #@param ["digitalepidemiologylab/covid-twitter-bert", "bert-large-uncased", "bert-base-uncased"]

# Initialise tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Loadind and preparing training dataset and setting parameters

In [3]:
"""
# Paramteters
#@markdown >Batch size and sequence length needs to be set to prepare the data. The size of the batches depends on available memory. For Colab GPU limit batch size to 8 and sequence length to 96. By reducing the length of the input (max_seq_length) you can also increase the batch size. For a dataset like SST-2 with lots of short sentences. this will likely benefit training.
max_seq_length = 96 #@param {type: "integer"}
train_batch_size =  8#@param {type: "integer"} 
eval_batch_size = 8 #@param {type: "integer"}


#@markdown >The Glue dataset has around 62000 examples, and we really do not need them all for training a decent model. To cut down training time, please reduse this to only a percentage of the entire set.
use_percentage_of_data = 5 #@param {type: "slider", min: 1, max: 100}

# get dataset sizes
glue_builder = tfds.builder('glue/sst2')
num_train_examples = glue_builder.info.splits['train'].num_examples
num_dev_examples = glue_builder.info.splits['validation'].num_examples
num_labels = glue_builder.info.features['label'].num_classes

# download datasets and convert to training features
glue_builder.download_and_prepare()
train_data = glue_builder.as_dataset(split='train')
train_dataset = glue_convert_examples_to_features(train_data, tokenizer, max_length=max_seq_length, task='sst-2')
train_dataset = train_dataset.shuffle(100).batch(train_batch_size)

dev_data = glue_builder.as_dataset(split='validation')
dev_dataset = glue_convert_examples_to_features(dev_data, tokenizer, max_length=max_seq_length, task='sst-2')
dev_dataset = dev_dataset.shuffle(100).batch(eval_batch_size)

# Map the labels for printing
label_mapping = {i: glue_builder.info.features['label'].int2str(i) for i in range(num_labels)}

print(f'\n\nThe dataset is downloaded. The entire dataset has {num_train_examples + num_dev_examples} examples of which you are using {use_percentage_of_data}%. This will result in a train dataset with {int(num_train_examples * (use_percentage_of_data/100))} examples and a validation dataset with {int(num_dev_examples * (use_percentage_of_data/100))} examples.')
"""

'\n# Paramteters\n#@markdown >Batch size and sequence length needs to be set to prepare the data. The size of the batches depends on available memory. For Colab GPU limit batch size to 8 and sequence length to 96. By reducing the length of the input (max_seq_length) you can also increase the batch size. For a dataset like SST-2 with lots of short sentences. this will likely benefit training.\nmax_seq_length = 96 #@param {type: "integer"}\ntrain_batch_size =  8#@param {type: "integer"} \neval_batch_size = 8 #@param {type: "integer"}\n\n\n#@markdown >The Glue dataset has around 62000 examples, and we really do not need them all for training a decent model. To cut down training time, please reduse this to only a percentage of the entire set.\nuse_percentage_of_data = 5 #@param {type: "slider", min: 1, max: 100}\n\n# get dataset sizes\nglue_builder = tfds.builder(\'glue/sst2\')\nnum_train_examples = glue_builder.info.splits[\'train\'].num_examples\nnum_dev_examples = glue_builder.info.spli

In [3]:
# Training Paremeters
max_seq_length = 128 #@param {type: "integer"}
train_batch_size =  8 #@param {type: "integer"} 
eval_batch_size = 8 #@param {type: "integer"}
num_labels = 2

# Loading the Training dataset
t_train = pd.read_csv("..\\raw\\covid_labeled.csv", usecols=["text", "target"])
#t_train = t_train.sample(n=30000)

#t_train["text"] = t_train["text"].apply(remove_contractions)
#t_train["text"] = t_train["text"].apply(clean_text)
#t_train.drop_duplicates(subset=["text"], inplace=True)
#t_train.dropna(inplace=True)

#X_train = t_train["text"][:len(df)*0.64]
#X_val = t_train["text"][len(df)*0.64:len(df)*0.8]
#X_test = t_train["text"][len(df)*0.8:]

X_train, X_test, y_train, y_test = train_test_split(t_train['text'], t_train['target'], test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.20, random_state=1) # 0.25 x 0.8 = 0.2

train_text = []
val_text = []
test_text = []

train_label = []
val_label = []
test_label = []

for row in X_train:
    train_text.append(str(row))
    
for row in X_val:
    val_text.append(str(row))
    
for row in X_test:
    test_text.append(str(row))
    
for row in y_train:
    train_label.append(int(row))

for row in y_val:
    val_label.append(int(row))
    
for row in y_test:
    test_label.append(int(row))

#train_text = tokenizer(train_text, max_length=max_seq_length, truncation=True, padding=True)
#val_text = tokenizer(val_text, max_length=max_seq_length, truncation=True, padding=True)
#test_text = tokenizer(test_text, max_length=max_seq_length, truncation=True, padding=True)

train = np.zeros([np.size(train_text), max_seq_length], dtype=int)
val = np.zeros([np.size(val_text), max_seq_length], dtype=int)

for i in range(len(train_text)):
    tokens = np.asarray(tokenizer.encode(train_text[i], max_length=max_seq_length, truncation=True, padding=True))
    for j in range(np.size(tokens)):
        train[i][j] = tokens[j]

for i in range(len(val_text)):
    tokens = np.asarray(tokenizer.encode(val_text[i], max_length=max_seq_length, truncation=True, padding=True))
    for j in range(np.size(tokens)):
        val[i][j] = tokens[j]


#a = tokenizer.encode(train_text[0], max_length=max_seq_length, truncation=True, padding=True)
#print(np.shape(a))


#train_text = np.asarray(train_text)
#val_text = np.asarray(val_text)
#test_text = np.asarray(test_text)

train_label = np.asarray(train_label)
val_label = np.asarray(val_label)
test_label = np.array(test_label)

label_mapping = {0: "Real", 1: "Fake"}

In [4]:
#train_t = tf.convert_to_tensor(train_text)
#val_t = tf.convert_to_tensor(val_text)
#test_t = tf.convert_to_tensor(test_text)
#train_l = tf.convert_to_tensor(train_label)
#val_l = tf.convert_to_tensor(val_label)
#test_l = tf.convert_to_tensor(test_label)
print(np.shape(train_text))

np.shape(train)
np.shape(train_label)

#print(np.shape(train[0]))
#print(np.shape(train[5]))
print(train[0])
print(train[5])
#print(train_text)

(15638,)
[  101  6396  7917  9907  4341  2138  1997  1996 21887 23350  1012  3099
  4080 12731 19506  2003  6195  1037  5741  2416  2733  7221  2006  1996
  5096  1997 22863 19966  7028 15001  1999  2344  2000  5547  1996  2163
 21887 23350  2331  4175  1037  3120  5220  2007  2010  3241  4136  1996
  9519  1012  1996  7221  2071  2272  2004  2574  2004  6928  1998  2003
  3517  2000  2022  2443  1999  1037  5166  5468  2525  5115  2000  2022
  2623  1012  2047  2951  2041  1997  3304  6592  2008  1996  3741  2040
 14853 21887 23350  2331  3446  2085  8455  2184  1997  2216  2040  3231
  3893  2003  3811 23900  2000 19135  6582  1012   102     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]
[  101  7609  6090  3207  7712  1024  5604  2111  2005  2151 10178  1997
  1037 21887 23350  1010  2025  4919  2005  2522 17258  1011  2539 16770
  1024  1013  1013  1056  1012  2522  1013  1017  2615  2595  616

# Compile the Model, Train it on the SST-2 Task and Save the Result
You can skip this step if you are using the already finetuned model

In [4]:
#@markdown >The default learning rate of 2e5 will be fine in most cases
learning_rate = 2e-5 #@param {type: "number"}

#@markdown > Typically these type of models are finetuned for 3 epochs. This can be increased for small datasets and decreased for large datasets.
num_epochs = 3 #@param {type: "integer"}

# Initialise a Model for Sequence Classification with 2 labels
config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, config=config)

# Optimizer and loss
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
#loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

# Metrics and callbacks no false negatives bitch
metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy', dtype=tf.float32)]
#metrics = [tf.keras.metrics.Recall(), tf.keras.metrics.Precision()]
checkpoint_path = '..\\cvb\\checkpoints\\checkpoint.{epoch:02d}'
callbacks = [tf.keras.callbacks.ModelCheckpoint(checkpoint_path, save_weights_only=True)]

# Compute some variables
use_percentage_of_data = 100
train_steps_per_epoch = int(np.size(train[:,0]) * (use_percentage_of_data/100) / train_batch_size)
dev_steps_per_epoch = int(np.size(val[:,0]) * (use_percentage_of_data/100) / eval_batch_size)
#train_steps_per_epoch = 100
#dev_steps_per_epoch = 100


# Compile model
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Train the model
history = model.fit(train, train_label, epochs=num_epochs, steps_per_epoch=train_steps_per_epoch, validation_data=(val, val_label), validation_steps=dev_steps_per_epoch, batch_size=8, callbacks=callbacks)

'''
# Print some information about the training
print(f'\nThe training has finished training after {num_epochs} epochs.')
print('\nThe history contains the accuracy and loss at every epoch:')
print(json.dumps(history.history, indent=4))

print('\nThe checkpoint callback has generated a checkpoint after every epoch (loss being the training loss, val_loss is the validation loss):')
!ls -lha ./checkpoints/

print('\nWe will now save the finetuned model and the corresponding config file on your Colab disk.')
model.save_pretrained('..\\cvb\\huggingface_model\\')

print('\nTensorflow model and config-file is saved in ./huggingface_model/')
!ls -lha ./huggingface_model/
'''

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at digitalepidemiologylab/covid-twitter-bert-v2 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Epoch 2/3
Epoch 3/3


"\n# Print some information about the training\nprint(f'\nThe training has finished training after {num_epochs} epochs.')\nprint('\nThe history contains the accuracy and loss at every epoch:')\nprint(json.dumps(history.history, indent=4))\n\nprint('\nThe checkpoint callback has generated a checkpoint after every epoch (loss being the training loss, val_loss is the validation loss):')\n!ls -lha ./checkpoints/\n\nprint('\nWe will now save the finetuned model and the corresponding config file on your Colab disk.')\nmodel.save_pretrained('..\\cvb\\huggingface_model\\')\n\nprint('\nTensorflow model and config-file is saved in ./huggingface_model/')\n!ls -lha ./huggingface_model/\n"

In [5]:
# Print some information about the training
print(f'\nThe training has finished training after {num_epochs} epochs.')
print('\nThe history contains the accuracy and loss at every epoch:')
print(json.dumps(history.history, indent=4))

print('\nThe checkpoint callback has generated a checkpoint after every epoch (loss being the training loss, val_loss is the validation loss):')
!ls -lha ./cvb/checkpoints/

#print('\nWe will now save the finetuned model and the corresponding config file on your Colab disk.')
#model.save_pretrained('..\\cvb\\huggingface_model\\')

#print('\nTensorflow model and config-file is saved in ./huggingface_model/')
#!ls -lha ./cvb/huggingface_model/


The training has finished training after 3 epochs.

The history contains the accuracy and loss at every epoch:
{
    "loss": [
        0.10978447645902634,
        0.0614316463470459,
        0.07552564889192581
    ],
    "accuracy": [
        0.9575870037078857,
        0.9802942872047424,
        0.980486273765564
    ],
    "val_loss": [
        0.07738339900970459,
        0.07724835723638535,
        0.04962364211678505
    ],
    "val_accuracy": [
        0.9692623019218445,
        0.9769467115402222,
        0.9846311211585999
    ]
}

The checkpoint callback has generated a checkpoint after every epoch (loss being the training loss, val_loss is the validation loss):


'ls' is not recognized as an internal or external command,
operable program or batch file.


# Predict
Let's run some inference with the trained model

In [6]:

# Small function only used for formatting the output
def format_prediction(preds, label_mapping, label_name):
    preds = tf.nn.softmax(preds, axis=1)
    formatted_preds = []
    for pred in preds.numpy():
        # convert to Python types and sort
        pred = {label: float(probability) for label, probability in zip(label_mapping.values(), pred)}
        pred = {k: v for k, v in sorted(pred.items(), key=lambda item: item[1], reverse=True)}
        formatted_preds.append({label_name: list(pred.keys())[0], f'{label_name}_probabilities': pred})
    return formatted_preds

In [9]:
'''
#@markdown >Please input text that the model can try to classify
input_text = 'Happy little clouds'  #@param {type: "string"}

# Tokenize the input 
input_ids = tf.constant(tokenizer.encode(input_text, add_special_tokens=True))[None, :]

# Run predictions
preds = model(input_ids)

# format logits
formatted_preds = format_prediction(preds[0], label_mapping, 'sentiment')

print(f'\nLabel Mapping:{json.dumps(label_mapping, indent=4)}')
print(f'\nLogits: {preds}')
print(f'\nProbabilities:{json.dumps(formatted_preds, indent=4)}')
'''

'\n#@markdown >Please input text that the model can try to classify\ninput_text = \'Happy little clouds\'  #@param {type: "string"}\n\n# Tokenize the input \ninput_ids = tf.constant(tokenizer.encode(input_text, add_special_tokens=True))[None, :]\n\n# Run predictions\npreds = model(input_ids)\n\n# format logits\nformatted_preds = format_prediction(preds[0], label_mapping, \'sentiment\')\n\nprint(f\'\nLabel Mapping:{json.dumps(label_mapping, indent=4)}\')\nprint(f\'\nLogits: {preds}\')\nprint(f\'\nProbabilities:{json.dumps(formatted_preds, indent=4)}\')\n'

In [55]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import f1_score

def evaluation_summary(description, predictions, true_labels):
    print("Evaluation for: " + description)
    precision = precision_score(predictions, true_labels, average='macro')
    recall = recall_score(predictions, true_labels, average='macro')
    accuracy = accuracy_score(predictions, true_labels)
    f1 = f1_score(predictions, true_labels, average='macro')
    #f1 = fbeta_score(predictions, true_labels, 1, average='macro') #1 means f_1 measure
    print("Classifier '%s' has Acc=%0.3f P=%0.3f R=%0.3f F1=%0.3f" % (description,accuracy,precision,recall,f1))
    print(classification_report(predictions, true_labels, digits=3, zero_division=0))
    print('\nConfusion matrix:\n',confusion_matrix(true_labels, predictions))

In [35]:
test = np.zeros([np.size(test_text), max_seq_length], dtype=int)

for i in range(len(test_text)):
    tokens = np.asarray(tokenizer.encode(test_text[i], max_length=max_seq_length, truncation=True, padding=True))
    for j in range(np.size(tokens)):
        test[i][j] = tokens[j]

In [37]:
print(np.shape(test))

(4888, 128)


In [38]:
test_pred = model.predict(test)

In [41]:
formatted_test_preds = format_prediction(test_pred[0], label_mapping, 'fake?')

In [42]:
formatted_test_preds[0]

{'fake?': 'Fake',
 'fake?_probabilities': {'Fake': 0.9950114488601685,
  'Real': 0.004988573025912046}}

In [49]:
test_pred_labels = []
for i in range(len(test_text)):
    if (formatted_test_preds[i]['fake?_probabilities']['Real'] > 0.5):
        test_pred_labels.append(0)
    else:
        test_pred_labels.append(1)

(0,)


In [56]:
evaluation_summary("Covid-Twitter-Bert-v2", test_pred_labels, test_label)

Evaluation for: Covid-Twitter-Bert-v2
Classifier 'Covid-Twitter-Bert-v2' has Acc=0.984 P=0.983 R=0.985 F1=0.984
              precision    recall  f1-score   support

           0      0.995     0.974     0.985      2663
           1      0.970     0.995     0.982      2225

    accuracy                          0.984      4888
   macro avg      0.983     0.985     0.984      4888
weighted avg      0.984     0.984     0.984      4888


Confusion matrix:
 [[2595   12]
 [  68 2213]]


### Jan2020

In [7]:
dfA = pd.read_csv("..\\raw\\tweets\\Jan_2020A.csv", usecols=["text"])
dfB = pd.read_csv("..\\raw\\tweets\\Jan_2020B.csv", usecols=["text"])
df = pd.concat([dfA, dfB], ignore_index=True)

text = df["text"]

#for idx, row in df.iterrows():
    #text.append(str(row))
    

        
#predictions = []
#for p in text_in:
    #predictions.append(model(p))

#df.head()

In [8]:
text[0]

'Fast action will be key to containing new coronavirus from\xa0China https://t.co/QIIzHwPkNp https://t.co/N9QQvgTZFD'

In [14]:
#from IPython.display import clear_output

#predictions = []
#for i in range(len(text)):
    #print(i)
    #predictions.append(model(text[i]))
    #clear_output()

In [9]:
f_text = []
for i in range(len(text)):
    f_text.append(text[i][8:])

In [10]:
text_in = np.zeros([len(text), 128], dtype=int)
for i in range(len(text)):
    tokens = np.asarray(tokenizer.encode(text[i], max_length=max_seq_length, truncation=True, padding=True))
    for j in range(np.size(tokens)):
        text_in[i][j] = tokens[j]



In [101]:
text_in[0]

array([  101,  3435,  2895,  2097,  2022,  3145,  2000,  4820,  2047,
       21887, 23350,  2013,  2859, 16770,  1024,  1013,  1013,  1056,
        1012,  2522,  1013, 18816, 10993,  2232,  2860,  2361,  2243,
       16275, 16770,  1024,  1013,  1013,  1056,  1012,  2522,  1013,
        1050,  2683,  4160,  4160,  2615, 13512,  2480,  2546,  2094,
         102,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0]

In [11]:
predictions = model.predict(text_in)



In [12]:
label_mapping = {0: "Real", 1: "Fake"}
formatted_preds = format_prediction(predictions[0], label_mapping, 'fake?')

In [18]:
text[25002]

'IHR Emergency Committee on Novel Coronavirus (2019-nCoV) #nCoV declared as #PHEIC2020 #nCoVPHEIC  https://t.co/R630yHJi7d'

In [13]:
formatted_preds[25002]

{'fake?': 'Real',
 'fake?_probabilities': {'Real': 0.9736402630805969,
  'Fake': 0.026359790936112404}}

In [14]:
formatted_preds[25002]

{'fake?': 'Real',
 'fake?_probabilities': {'Real': 0.9736402630805969,
  'Fake': 0.026359790936112404}}

In [15]:
pred_labels = []
for i in range(len(text)):
    if (formatted_preds[i]['fake?_probabilities']['Real'] > 0.5):
        pred_labels.append(0)
    else:
        pred_labels.append(1)

In [16]:
pred_labels[25002]

0

In [23]:
text[5]

'Want to do something to battle the #coronavirus? Either drop off or mail (labeled, "destined for China") goggles, biohazard suits and surgical or N95 masks to ChinaSF 135 Market Street Suite 488 San Francisco, CA. Thanks @ChinaSF for organizing!'

In [24]:
zipped = list(zip(text, pred_labels))
df_out = pd.DataFrame(zipped, columns=['Text', 'Label'])

In [25]:
df_out.head()

Unnamed: 0,Text,Label
0,Fast action will be key to containing new coro...,0
1,That CoronaVirus is about to turn into that Ri...,1
2,It be the ones who have the worst hygiene that...,1
3,This has been a day:\n\n• No new witnesses: ht...,1
4,y’all @itslbern really thinks i have the coron...,1


In [26]:
df_out.to_csv('..\\processed\\Jan2020.csv', header = False, index = False)

### Apr2020

In [27]:
dfA = pd.read_csv("..\\raw\\tweets\\Apr_2020A.csv", usecols=["text"])
dfB = pd.read_csv("..\\raw\\tweets\\Apr_2020B.csv", usecols=["text"])
df = pd.concat([dfA, dfB], ignore_index=True)

text = df["text"]
    
text_in = np.zeros([len(text), 128], dtype=int)
for i in range(len(text)):
    tokens = np.asarray(tokenizer.encode(text[i], max_length=max_seq_length, truncation=True, padding=True))
    for j in range(np.size(tokens)):
        text_in[i][j] = tokens[j]
    
predictions = model.predict(text_in)

label_mapping = {0: "Real", 1: "Fake"}
formatted_preds = format_prediction(predictions[0], label_mapping, 'fake?')

pred_labels = []
for i in range(len(text)):
    if (formatted_preds[i]['fake?_probabilities']['Real'] > 0.5):
        pred_labels.append(0)
    else:
        pred_labels.append(1)

zipped = list(zip(text, pred_labels))
df_out = pd.DataFrame(zipped, columns=['Text', 'Label'])
df_out.to_csv('..\\processed\\Apr2020.csv', header = False, index = False)

### Jul2020

In [28]:
dfA = pd.read_csv("..\\raw\\tweets\\Jul_2020A.csv", usecols=["text"])
dfB = pd.read_csv("..\\raw\\tweets\\Jul_2020B.csv", usecols=["text"])
df = pd.concat([dfA, dfB], ignore_index=True)

text = df["text"]
    
text_in = np.zeros([len(text), 128], dtype=int)
for i in range(len(text)):
    tokens = np.asarray(tokenizer.encode(text[i], max_length=max_seq_length, truncation=True, padding=True))
    for j in range(np.size(tokens)):
        text_in[i][j] = tokens[j]
    
predictions = model.predict(text_in)

label_mapping = {0: "Real", 1: "Fake"}
formatted_preds = format_prediction(predictions[0], label_mapping, 'fake?')

pred_labels = []
for i in range(len(text)):
    if (formatted_preds[i]['fake?_probabilities']['Real'] > 0.5):
        pred_labels.append(0)
    else:
        pred_labels.append(1)

zipped = list(zip(text, pred_labels))
df_out = pd.DataFrame(zipped, columns=['Text', 'Label'])
df_out.to_csv('..\\processed\\Jul2020.csv', header = False, index = False)

### Oct2020

In [29]:
dfA = pd.read_csv("..\\raw\\tweets\\Oct_2020A.csv", usecols=["text"])
dfB = pd.read_csv("..\\raw\\tweets\\Oct_2020B.csv", usecols=["text"])
df = pd.concat([dfA, dfB], ignore_index=True)

text = df["text"]
    
text_in = np.zeros([len(text), 128], dtype=int)
for i in range(len(text)):
    tokens = np.asarray(tokenizer.encode(text[i], max_length=max_seq_length, truncation=True, padding=True))
    for j in range(np.size(tokens)):
        text_in[i][j] = tokens[j]
    
predictions = model.predict(text_in)

label_mapping = {0: "Real", 1: "Fake"}
formatted_preds = format_prediction(predictions[0], label_mapping, 'fake?')

pred_labels = []
for i in range(len(text)):
    if (formatted_preds[i]['fake?_probabilities']['Real'] > 0.5):
        pred_labels.append(0)
    else:
        pred_labels.append(1)

zipped = list(zip(text, pred_labels))
df_out = pd.DataFrame(zipped, columns=['Text', 'Label'])
df_out.to_csv('..\\processed\\Oct2020.csv', header = False, index = False)

### Jan2021

In [30]:
dfA = pd.read_csv("..\\raw\\tweets\\Jan_2021A.csv", usecols=["text"])
dfB = pd.read_csv("..\\raw\\tweets\\Jan_2021B.csv", usecols=["text"])
df = pd.concat([dfA, dfB], ignore_index=True)

text = df["text"]
    
text_in = np.zeros([len(text), 128], dtype=int)
for i in range(len(text)):
    tokens = np.asarray(tokenizer.encode(text[i], max_length=max_seq_length, truncation=True, padding=True))
    for j in range(np.size(tokens)):
        text_in[i][j] = tokens[j]
    
predictions = model.predict(text_in)

label_mapping = {0: "Real", 1: "Fake"}
formatted_preds = format_prediction(predictions[0], label_mapping, 'fake?')

pred_labels = []
for i in range(len(text)):
    if (formatted_preds[i]['fake?_probabilities']['Real'] > 0.5):
        pred_labels.append(0)
    else:
        pred_labels.append(1)

zipped = list(zip(text, pred_labels))
df_out = pd.DataFrame(zipped, columns=['Text', 'Label'])
df_out.to_csv('..\\processed\\Jan2021.csv', header = False, index = False)

### Apr2021

In [31]:
dfA = pd.read_csv("..\\raw\\tweets\\Apr_2021A.csv", usecols=["text"])
dfB = pd.read_csv("..\\raw\\tweets\\Apr_2021B.csv", usecols=["text"])
df = pd.concat([dfA, dfB], ignore_index=True)

text = df["text"]
    
text_in = np.zeros([len(text), 128], dtype=int)
for i in range(len(text)):
    tokens = np.asarray(tokenizer.encode(text[i], max_length=max_seq_length, truncation=True, padding=True))
    for j in range(np.size(tokens)):
        text_in[i][j] = tokens[j]
    
predictions = model.predict(text_in)

label_mapping = {0: "Real", 1: "Fake"}
formatted_preds = format_prediction(predictions[0], label_mapping, 'fake?')

pred_labels = []
for i in range(len(text)):
    if (formatted_preds[i]['fake?_probabilities']['Real'] > 0.5):
        pred_labels.append(0)
    else:
        pred_labels.append(1)

zipped = list(zip(text, pred_labels))
df_out = pd.DataFrame(zipped, columns=['Text', 'Label'])
df_out.to_csv('..\\processed\\Apr2021.csv', header = False, index = False)

### Jul2021

In [32]:
dfA = pd.read_csv("..\\raw\\tweets\\Jul_2021A.csv", usecols=["text"])
dfB = pd.read_csv("..\\raw\\tweets\\Jul_2021B.csv", usecols=["text"])
df = pd.concat([dfA, dfB], ignore_index=True)

text = df["text"]
    
text_in = np.zeros([len(text), 128], dtype=int)
for i in range(len(text)):
    tokens = np.asarray(tokenizer.encode(text[i], max_length=max_seq_length, truncation=True, padding=True))
    for j in range(np.size(tokens)):
        text_in[i][j] = tokens[j]
    
predictions = model.predict(text_in)

label_mapping = {0: "Real", 1: "Fake"}
formatted_preds = format_prediction(predictions[0], label_mapping, 'fake?')

pred_labels = []
for i in range(len(text)):
    if (formatted_preds[i]['fake?_probabilities']['Real'] > 0.5):
        pred_labels.append(0)
    else:
        pred_labels.append(1)

zipped = list(zip(text, pred_labels))
df_out = pd.DataFrame(zipped, columns=['Text', 'Label'])
df_out.to_csv('..\\processed\\Jul2021.csv', header = False, index = False)

### Oct2021

In [33]:
dfA = pd.read_csv("..\\raw\\tweets\\Oct_2021A.csv", usecols=["text"])
dfB = pd.read_csv("..\\raw\\tweets\\Oct_2021B.csv", usecols=["text"])
df = pd.concat([dfA, dfB], ignore_index=True)

text = df["text"]
    
text_in = np.zeros([len(text), 128], dtype=int)
for i in range(len(text)):
    tokens = np.asarray(tokenizer.encode(text[i], max_length=max_seq_length, truncation=True, padding=True))
    for j in range(np.size(tokens)):
        text_in[i][j] = tokens[j]
    
predictions = model.predict(text_in)

label_mapping = {0: "Real", 1: "Fake"}
formatted_preds = format_prediction(predictions[0], label_mapping, 'fake?')

pred_labels = []
for i in range(len(text)):
    if (formatted_preds[i]['fake?_probabilities']['Real'] > 0.5):
        pred_labels.append(0)
    else:
        pred_labels.append(1)

zipped = list(zip(text, pred_labels))
df_out = pd.DataFrame(zipped, columns=['Text', 'Label'])
df_out.to_csv('..\\processed\\Oct2021.csv', header = False, index = False)

##### Copyright 2020 Per Egil Kummervold and Martin Müller