In [None]:
!pip install tweet-preprocessor
!pip install transformers



In [None]:
import pandas as pd
from transformers import BertTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import preprocessor as twpreprocessor
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
import random
import numpy as np 

In [None]:
# Run on the first CUDA GPU when one is visible, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [None]:
# Labelled tweets (columns: id, label, tweet), fetched from GitHub as raw CSV.
TRAIN_CSV_URL = 'https://github.com/hafidhfikri/Practice-Twitter-Sentiment-Analysis/blob/master/train_E6oV3lV.csv?raw=true'
train = pd.read_csv(TRAIN_CSV_URL)

# `DataFrame.info()` prints its summary itself and returns None, so it must
# not be wrapped in `print()` (that would emit a stray "None" line).
train.info()
train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB
None


Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [None]:
# Take at most `limit` rows from each label and stack them into one frame
# (label-0 rows first, then label-1 rows, with a fresh 0..n-1 index).
limit = 20000
positives = train.loc[train.label == 0, ['label', 'tweet']].head(limit)
negatives = train.loc[train.label == 1, ['label', 'tweet']].head(limit)
combination = pd.concat([positives, negatives], axis=0, ignore_index=True)

# Select which token categories `twpreprocessor.clean` handles.
twpreprocessor.set_options(
    twpreprocessor.OPT.EMOJI,
    twpreprocessor.OPT.ESCAPE_CHAR,
    twpreprocessor.OPT.MENTION,
    twpreprocessor.OPT.NUMBER,
    twpreprocessor.OPT.RESERVED,
    twpreprocessor.OPT.SMILEY,
    twpreprocessor.OPT.URL,
)
clean_tweets = np.vectorize(twpreprocessor.clean)
combination['tidy_tweet'] = clean_tweets(combination['tweet'])

# Keep only whitespace-separated words longer than three characters.
combination['tidy_tweet'] = combination['tidy_tweet'].apply(
    lambda text: ' '.join(word for word in text.split() if len(word) > 3)
)

# NOTE(review): the arrays below come from the raw `tweet` column, not from
# `tidy_tweet`; `tidy_tweet` is not read again in the cells visible here.
# Confirm whether the cleaned text was meant to be used instead.
tweets = combination['tweet'].values
labels = combination['label'].values

In [None]:
# Lower-casing WordPiece tokenizer matching the `bert-base-uncased` checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# One list of token ids per tweet; `add_special_tokens=True` asks the
# tokenizer to add its special tokens around each sequence.
tweetid = [tokenizer.encode(text, add_special_tokens=True) for text in tweets]

# Sanity check: show the first tweet next to its encoding.
print('Original: ', tweets[0])
print('Token IDs:', tweetid[0])

Original:   @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run
Token IDs: [101, 1030, 5310, 2043, 1037, 2269, 2003, 28466, 2389, 1998, 2003, 2061, 14337, 2002, 8011, 2015, 2010, 4268, 2046, 2010, 28466, 1012, 1001, 2448, 102]


In [None]:
# Fixed sequence length every encoded tweet is brought to.
MAX_LEN = 64
print(f'\n Truncating all sentences to {MAX_LEN} values...')
print(f'\nPadding token: {tokenizer.pad_token}, ID: {tokenizer.pad_token_id}')

# Longer sequences are cut and shorter ones are filled with 0, both at the
# end ("post"); the result replaces the list of lists bound to `tweetid`.
tweetid = pad_sequences(
    tweetid,
    maxlen=MAX_LEN,
    dtype="long",
    value=0,
    padding="post",
    truncating="post",
)


 Truncating all sentences to 64 values...

Padding token: [PAD], ID: 0


In [None]:
# Attention masks: 1 where the token id is positive, 0 where it is the
# zero used for padding.
masks = [
    [int(token_id > 0) for token_id in row]
    for row in tweetid
]

In [None]:
# Split token ids, attention masks and labels in ONE call so the three stay
# row-aligned by construction (train_test_split applies the same shuffled
# indices to every array it is given). 90% train / 10% validation; the seed
# is fixed so the split is repeatable.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    tweetid, masks, labels, random_state=2018, test_size=0.1
)

In [None]:
batch_size = 256

# --- training split: convert to tensors, iterate in random order ---
train_inputs = torch.tensor(train_inputs)
train_masks = torch.tensor(train_masks)
train_labels = torch.tensor(train_labels)

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=batch_size,
)

# --- validation split: convert to tensors, iterate in stored order ---
validation_inputs = torch.tensor(validation_inputs)
validation_masks = torch.tensor(validation_masks)
validation_labels = torch.tensor(validation_labels)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(
    validation_data,
    sampler=validation_sampler,
    batch_size=batch_size,
)

In [None]:
# Pretrained BERT with a two-label classification head; attention maps and
# hidden states are not requested in the outputs.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# The scheduler is stepped once per batch, so it needs the total number of
# batches over all epochs; no warm-up steps are used.
epochs = 4
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# Move the model's parameters to the selected device. Binding the result
# (``Module.to`` returns the module itself) keeps the cell from echoing the
# full multi-hundred-line module repr into the notebook output.
model = model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
def accuracy(preds, labels):
  """Return the fraction of rows whose arg-max class equals the label.

  preds:  2-D array of per-class scores, one row per example.
  labels: array of integer class ids; it is flattened before comparison.
  """
  truth = labels.flatten()
  predicted = np.argmax(preds, axis=1).flatten()
  hits = (predicted == truth).sum()
  return hits / truth.size

seed_val = 42

# Seed every random number generator used below so runs are repeatable.
# NOTE(review): the model and data loaders are created in earlier cells,
# i.e. before this seeding takes effect -- confirm that is acceptable.
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# One entry per epoch: the mean training loss over that epoch's batches.
loss_values = []

for epoch_i in range(0, epochs):
  print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))

  # ---------------- training pass ----------------
  total_loss = 0
  model.train()
  for step, batch in enumerate(train_dataloader):
    if step % 50 == 0 and not step == 0:
      print('  Batch {:>5,}  of  {:>5,}. '.format(step, len(train_dataloader)))
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    # Clear gradients left over from the previous step.
    model.zero_grad()

    # With `labels` supplied, the first element of the output is the loss.
    outputs = model(b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_input_mask,
                    labels=b_labels)

    loss = outputs[0]
    total_loss += loss.item()
    loss.backward()
    # Cap the gradient norm at 1.0 before the parameter update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    scheduler.step()

  # Epoch summary. This must run once per epoch, AFTER the batch loop:
  # inside the loop it would report a partial running sum on every step and
  # the `model.eval()` below would switch the model out of training mode
  # after the very first batch.
  avg_train_loss = total_loss / len(train_dataloader)
  loss_values.append(avg_train_loss)
  print("  Average training loss: {0:.2f}".format(avg_train_loss))

  # ---------------- validation pass (once per epoch) ----------------
  print("validation")
  model.eval()

  eval_accuracy = 0
  nb_eval_examples = 0
  for batch in validation_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    # No gradients are needed for evaluation.
    with torch.no_grad():
      outputs = model(b_input_ids,
                      token_type_ids=None,
                      attention_mask=b_input_mask)
    # Without `labels`, the first element of the output is the logits.
    logits = outputs[0].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    # Weight each batch by its size so a smaller final batch does not skew
    # the overall accuracy.
    eval_accuracy += accuracy(logits, label_ids) * len(label_ids)
    nb_eval_examples += len(label_ids)

  print("  Accuracy: {0:.2f}".format(eval_accuracy / nb_eval_examples))

  Average training loss: 0.01
validation
  Average training loss: 0.02
validation
  Average training loss: 0.02
validation
  Average training loss: 0.03
validation
  Average training loss: 0.04
validation
  Average training loss: 0.04
validation
  Average training loss: 0.05
validation
  Average training loss: 0.05
validation
  Average training loss: 0.06
validation
  Average training loss: 0.06
validation
  Average training loss: 0.06
validation
  Average training loss: 0.07
validation
  Average training loss: 0.07
validation
  Average training loss: 0.08
validation
  Average training loss: 0.08
validation
  Average training loss: 0.08
validation
  Average training loss: 0.09
validation
  Average training loss: 0.09
validation
  Average training loss: 0.09
validation
  Average training loss: 0.10
validation
  Average training loss: 0.10
validation
  Average training loss: 0.10
validation
  Average training loss: 0.11
validation
  Average training loss: 0.11
validation
  Average traini