In [None]:
import tensorflow as tf

# Ask TensorFlow for the name of the GPU it can see and abort early if it is
# not the expected first GPU device.
device = tf.test.gpu_device_name()
if device != '/device:GPU:0':
  raise SystemError('GPU Device Not Found')
print('GPU Found at: {}'.format(device))

GPU Found at: /device:GPU:0


In [None]:
import torch
# Choose the torch device once: CUDA when available, otherwise CPU.
# The GPU name is only reported on the CUDA path.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
if use_gpu:
  print('We will use the GPU:', torch.cuda.get_device_name(0))

We will use the GPU: Tesla T4


In [None]:
!pip install transformers
!pip install wget



In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive/ so the CSV can be read by path below.
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
import pandas as pd
# The file has no header row (header=None), so pandas numbers the columns 0..5.
RAW_CSV_PATH = '/content/drive/MyDrive/training.1600000.processed.noemoticon.csv'
df = pd.read_csv(RAW_CSV_PATH, header=None, encoding='latin')
df.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
# Name the six raw columns, then discard everything except the label and the text.
column_names = ['sentiments', 'id', 'date', 'query', 'username', 'text']
unused_columns = ['id', 'date', 'query', 'username']
df.columns = column_names
df.drop(columns=unused_columns, inplace=True)
df.head()

Unnamed: 0,sentiments,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [None]:
# Number of rows per raw label value.
df['sentiments'].value_counts()

0    800000
4    800000
Name: sentiments, dtype: int64

In [None]:
# Rows per label to keep.
num_samples = 50000

# Split the frame by raw label value: 4 -> df_pos, 0 -> df_neg.
df_pos = df.loc[df['sentiments'].eq(4)]
df_neg = df.loc[df['sentiments'].eq(0)]

# Draw the same number of rows from each label with a fixed seed, then stack
# them (label-4 rows first) into one balanced frame.
df_pos_downsampled = df_pos.sample(n=num_samples, random_state=2023)
df_neg_downsampled = df_neg.sample(n=num_samples, random_state=2023)
df_small = pd.concat([df_pos_downsampled, df_neg_downsampled])
df_small.head(10)

Unnamed: 0,sentiments,text
1381645,4,[Snoop Dogg - Gin 'N Juice] *raise glass to @A...
1021869,4,@SashiGamali lol i want all day everywhere lol
1345323,4,@rebeccablackett ahaaa here comes the aeropla...
1503851,4,@kricket_rc234 what do you need my last name f...
1065908,4,Actually i'm gonna Follow Friday @Kiiiiirsty t...
938164,4,cannot stop smiling today!
1279718,4,"and suspension by mae. fuck, this song will al..."
1527220,4,@gunjansatija will upload soon... and you can'...
1385768,4,@RhythmHippy thank you for the #followfriday
939649,4,chocolatee ice-cream Roocckss!!! i'm just eati...


In [None]:
# Lookup table from the raw label values (0 and 4) to the class indices 0 and 1.
lab_to_sentiment = dict([(0, 0), (4, 1)])

def label_decoder(label: int) -> int:
  """Return the class index for a raw label; raises KeyError for an unknown label."""
  class_index = lab_to_sentiment[label]
  return class_index
# Replace each raw label with its class index.
df_small['sentiments'] = df_small['sentiments'].apply(label_decoder)
df_small.head()

Unnamed: 0,sentiments,text
1381645,1,[Snoop Dogg - Gin 'N Juice] *raise glass to @A...
1021869,1,@SashiGamali lol i want all day everywhere lol
1345323,1,@rebeccablackett ahaaa here comes the aeropla...
1503851,1,@kricket_rc234 what do you need my last name f...
1065908,1,Actually i'm gonna Follow Friday @Kiiiiirsty t...


In [None]:
import re
# Raw string so that regex escapes such as \S are not parsed as (invalid) string
# escapes. Matches, in order: @mentions, http/https URLs, and any run of
# characters that are not ASCII letters or digits.
text_cleaning_re = r"@\S+|https?:\S+|[^A-Za-z0-9]+"

def preprocess_text(text):
  """Normalise one tweet for tokenization.

  The input is converted with str() and lower-cased; every match of
  `text_cleaning_re` is replaced by a space; the result is then collapsed to
  single spaces with no leading or trailing whitespace.

  Args:
    text: the raw text (any object; it is passed through str()).

  Returns:
    The cleaned, space-separated string (may be empty).
  """
  cleaned = re.sub(text_cleaning_re, ' ', str(text).lower())
  # split() with no argument drops all leading/trailing/repeated whitespace.
  return " ".join(cleaned.split())

# Clean every tweet, then display a few rows to check that the text is ready
# for tokenization and embedding.
df_small['text'] = df_small['text'].apply(preprocess_text)
df_small.head()

Unnamed: 0,sentiments,text
1381645,1,snoop dogg gin n juice raise glass to alonis
1021869,1,lol i want all day everywhere lol
1345323,1,ahaaa here comes the aeroplane xxx
1503851,1,what do you need my last name for
1065908,1,actually i m gonna follow friday kiiiiirsty to...


In [None]:
## Convert to numpy arrays
text, labels = df_small['text'].values, df_small['sentiments'].values

In [None]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# One list of token ids per cleaned tweet. add_special_tokens=True makes the
# tokenizer add its special tokens (ids 101 and 102 in the printed example).
input_ids = [
    tokenizer.encode(sentence, add_special_tokens=True)
    for sentence in text
]
print(input_ids[0])

[101, 29044, 28844, 18353, 1050, 10869, 5333, 3221, 2000, 2632, 27296, 102]


In [None]:
# Longest encoded sequence; used to sanity-check the padding length chosen next.
print('Max sentence length: ', max(map(len, input_ids)))

Max sentence length:  58


In [None]:
## Padding and attention
from keras.utils import pad_sequences
# Fixed sequence length fed to the model; the longest encoded tweet printed
# above is shorter than this, so nothing is truncated.
MAX_LEN = 64
# Use the tokenizer's own padding id instead of a hard-coded 0.
PAD_ID = tokenizer.pad_token_id

print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, PAD_ID))
# padding="post" appends PAD_ID after the real tokens up to MAX_LEN.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", value=PAD_ID, padding="post")
print('\nDone')

# Attention mask: 1 for real tokens, 0 for padding. Computed in one vectorised
# comparison; the result is an integer array with the same shape as input_ids.
attention_masks = (input_ids != PAD_ID).astype("int64")



Padding token: "[PAD]", ID: 0
\Done


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% as the test set. Token ids, attention masks and labels go
# through a single call, so the three stay row-aligned by construction instead
# of relying on two separate calls sharing the same random_state.
x, test_inputs, msk, test_masks, y, test_labels = train_test_split(
    input_ids, attention_masks, labels, random_state=2023, test_size=0.15)
# Split the remaining rows 80/20 into training and validation sets the same way.
train_inputs, validation_inputs, train_masks, validation_masks, train_labels, validation_labels = train_test_split(
    x, msk, y, random_state=2023, test_size=0.2)


In [None]:
# Rebind each training/validation split to a torch tensor built from it.
train_inputs, validation_inputs = torch.tensor(train_inputs), torch.tensor(validation_inputs)
train_labels, validation_labels = torch.tensor(train_labels), torch.tensor(validation_labels)
train_masks, validation_masks = torch.tensor(train_masks), torch.tensor(validation_masks)

In [None]:
# Batch iterators: training and validation consume the tensors one mini-batch at a time.
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 32

def _build_loader(sampler_cls, n_per_batch, *tensors):
  """Bundle `tensors` into a TensorDataset and wrap it in a DataLoader driven by `sampler_cls`.

  Returns the dataset, the sampler and the loader so that all three stay
  available under their own names.
  """
  dataset = TensorDataset(*tensors)
  sampler = sampler_cls(dataset)
  loader = DataLoader(dataset, batch_size=n_per_batch, sampler=sampler)
  return dataset, sampler, loader

# Training batches are drawn in random order.
train_data, train_sampler, train_dataloader = _build_loader(
    RandomSampler, batch_size, train_inputs, train_masks, train_labels)

# Validation batches are read in their stored order.
validation_data, validation_sampler, validation_dataloader = _build_loader(
    SequentialSampler, batch_size, validation_inputs, validation_masks, validation_labels)

In [None]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load the pre-trained bert-base-uncased weights with a 2-class classification
# head; attention maps and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)
# Move the model to the same `device` the batches are sent to, so the CPU
# fallback chosen earlier keeps working (model.cuda() would fail without a GPU).
model.to(device)
## BertPooler layer: produces one fixed-size pooled vector per sequence, which
## summarises the sequence for the classification head.
## Dropout layers: randomly zero a fraction of activations during training to
## reduce overfitting.

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# torch.optim.AdamW replaces the deprecated transformers.AdamW. weight_decay is
# set to 0.0 explicitly because that was the transformers default, whereas the
# torch optimizer defaults to 0.01.
optimizer  = torch.optim.AdamW(model.parameters(),
                               lr = 3e-5,
                               eps = 1e-8,
                               weight_decay = 0.0,
                               )
## Linear schedule with warm-up: the learning rate ramps up over the warm-up
## steps and then decays linearly as training progresses. With
## num_warmup_steps = 0 there is no ramp: decay starts from the first step.

from transformers import get_linear_schedule_with_warmup
epochs = 4
# The training loop steps the scheduler once per batch, so the schedule length
# is (number of epochs) x (batches per epoch).
total_steps = epochs*len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps
)



In [None]:
import numpy as np

def flat_accuracy(pred, label):
  """Return the fraction of rows of `pred` whose arg-max column equals the matching entry of `label`.

  `pred` is reduced with argmax along axis 1; `label` is flattened before the
  element-wise comparison.
  """
  predicted = np.argmax(pred, axis=1).ravel()
  expected = label.ravel()
  n_correct = (predicted == expected).sum()
  return n_correct / len(expected)

In [None]:
import random
seed_val = 42

# Seed every random number generator in play (Python, NumPy, torch, torch CUDA)
# with the same value.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
  seed_fn(seed_val)

# Average training loss of each epoch, kept so the values can be plotted later.
loss_values = list()

# One pass over the training data followed by one pass over the validation data
# per epoch. The average training loss of each epoch is appended to loss_values.
for epoch_i in range (0,epochs):
  total_loss = 0  # sum of the batch losses of this epoch
  model.train()   # training mode (dropout active)
  print("")
  print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
  print('Training...')

  for batch in train_dataloader:
    # Copy the batch tensors to the selected device and unpack them.
    b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

    # Gradients accumulate by default, so clear them before every batch.
    model.zero_grad()
    outputs = model(b_input_ids,
                    token_type_ids = None,
                    attention_mask = b_input_mask,
                    labels = b_labels
                    )
    # Because labels are passed, the first output is the loss.
    loss = outputs[0]
    total_loss += loss.item()
    loss.backward()
    # Clip the gradient norm to 1.0 before the parameter update.
    torch.nn.utils.clip_grad_norm_(model.parameters(),1.0)
    optimizer.step()
    # The learning-rate schedule advances once per batch.
    scheduler.step()

  avg_train_loss = total_loss/len(train_dataloader)
  print("  Average training loss: {0:.2f}".format(avg_train_loss))
  loss_values.append(avg_train_loss)

  print("Running Validation...")
  model.eval()  # evaluation mode (dropout layers behave differently)
  # Accuracy is accumulated per example rather than averaged per batch, so a
  # shorter final batch does not get the same weight as a full one.
  eval_correct = 0.0
  nb_eval_examples = 0

  for batch in validation_dataloader:
    b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
    with torch.no_grad():
      outputs = model(b_input_ids,
                      token_type_ids = None,
                      attention_mask = b_input_mask,
                      )
    # No labels were passed, so the first output is the logits.
    logits = outputs[0].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    n_in_batch = len(label_ids)
    eval_correct += flat_accuracy(logits, label_ids) * n_in_batch
    nb_eval_examples += n_in_batch
  print("  Accuracy: {0:.2f}".format(eval_correct/nb_eval_examples))




Training...
  Average training loss: 0.05
Running Validation...
  Accuracy: 0.82

Training...
  Average training loss: 0.16
Running Validation...
  Accuracy: 0.83

Training...
  Average training loss: 0.12
Running Validation...
  Accuracy: 0.83

Training...
  Average training loss: 0.12
Running Validation...
  Accuracy: 0.83


In [None]:
batch_size = 32

# Tensors for the held-out test split: token ids, attention masks, labels.
prediction_inputs, prediction_masks, prediction_labels = (
    torch.tensor(split) for split in (test_inputs, test_masks, test_labels)
)

# Test batches are read in their stored order.
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, batch_size=batch_size, sampler=prediction_sampler)

In [None]:
## Evaluation on the held-out test set
print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))

model.eval()
# Per-batch logits and per-batch true labels, in loader order.
predictions = []
true_labels = []

for batch in prediction_dataloader:
  # Copy the batch tensors to the selected device and unpack them.
  b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
  with torch.no_grad():
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
  # Bring logits and labels back to the CPU as numpy arrays.
  logits = outputs[0].detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  predictions.append(logits)
  true_labels.append(label_ids)
print('DONE.')




Predicting labels for 15,000 test sentences...
DONE.


In [None]:
from sklearn.metrics import matthews_corrcoef
# Matthews correlation coefficient computed separately for every test batch.
matthews_set = [
    matthews_corrcoef(batch_labels, np.argmax(batch_logits, axis=1).flatten())
    for batch_logits, batch_labels in zip(predictions, true_labels)
]

In [None]:
# Join the per-batch logits into one 2-D array and take the arg-max class per row.
flat_predictions = np.concatenate(predictions, axis=0).argmax(axis=1).flatten()

# Join the per-batch label arrays into one flat list in the same order.
flat_true_labels = list(np.concatenate(true_labels, axis=0))
mcc = matthews_corrcoef(flat_true_labels, flat_predictions)

print('MCC: %.3f' % mcc)

MCC: 0.652
