<a href="https://colab.research.google.com/github/pythonuzgit/elmurodov/blob/master/Tweet_emotions_analysis_with_Hugging_Face_using_PyTprch_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import re
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

import transformers
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, AdamW, get_linear_schedule_with_warmup

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
#from ignite.metrics import Accuracy, Precision, Recall, Fbeta

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

from collections import defaultdict

import torch
import pandas as pd
from tqdm.notebook import trange, tqdm


In [7]:
df = pd.read_csv('/content/tweet_emotions.csv')
df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [8]:
df.set_index('tweet_id', inplace = True)
df.head()

Unnamed: 0_level_0,sentiment,content
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
1956967696,sadness,Funeral ceremony...gloomy friday...
1956967789,enthusiasm,wants to hang out with friends SOON!
1956968416,neutral,@dannycastillo We want to trade with someone w...


In [9]:
df.isnull().sum()

sentiment    0
content      0
dtype: int64

In [10]:
df.sentiment.value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

In [11]:
df = df[df.sentiment.isin(['neutral', 'worry', 'happiness', 'sadness', 'love', 'surprise',
                           'fun', 'relief', 'hate', 'empty', 'enthusiam', 'boredom', 'anger'])]

In [12]:
df.sentiment.value_counts()

neutral      8638
worry        8459
happiness    5209
sadness      5165
love         3842
surprise     2187
fun          1776
relief       1526
hate         1323
empty         827
boredom       179
anger         110
Name: sentiment, dtype: int64

In [13]:
possible_labels = df.sentiment.unique()

In [14]:
label_dict = {}

for index, possible_label in enumerate(possible_labels):
  label_dict[possible_label] = index

In [15]:
label_dict

{'anger': 11,
 'boredom': 9,
 'empty': 0,
 'fun': 6,
 'happiness': 8,
 'hate': 7,
 'love': 5,
 'neutral': 2,
 'relief': 10,
 'sadness': 1,
 'surprise': 4,
 'worry': 3}

In [16]:
df.sentiment = df['sentiment'].map(label_dict)
df.head()

Unnamed: 0_level_0,sentiment,content
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1956967341,0,@tiffanylue i know i was listenin to bad habi...
1956967666,1,Layin n bed with a headache ughhhh...waitin o...
1956967696,1,Funeral ceremony...gloomy friday...
1956968416,2,@dannycastillo We want to trade with someone w...
1956968477,3,Re-pinging @ghostridah14: why didn't you go to...


Create the function to preprocess every tweet

In [17]:

def process_content(content):
  """Process tweet function.
  Input:
      tweet: a string containing a tweet
  Output:
      tweets_clean: a list of words containing the processed tweet

  """
  # remove old style retweet text "RT"
  content = re.sub(r'^RT[\s]+', '', content)
  # remove hyperlinks
  content = re.sub(r'https?:\/\/.*[\r\n]*', '', content)
  content = re.sub(r'#', '', content)
  # removing hyphens
  content = re.sub('-', ' ', content)
  # remove linebreaks
  content = re.sub('<br\s?\/>|<br>', "", content)
  # remving numbers
  content = re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b",'', content)

  # tokenize tweets
  tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True,
                              reduce_len=True)
  tweet_tokens = tokenizer.tokenize(content)

  # remove numbers
  tweet_tokens = [i for i in tweet_tokens if not i.isdigit()]

  tweets_clean = []
  for word in tweet_tokens:
    tweets_clean.append(word)

  return ' '.join(tweets_clean)

In [18]:
df['content'] = df['content'].apply(process_content)
df.head()

Unnamed: 0_level_0,sentiment,content
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1956967341,0,i know i was listenin to bad habit earlier and...
1956967666,1,Layin n bed with a headache ughhh ... waitin o...
1956967696,1,Funeral ceremony ... gloomy friday ...
1956968416,2,We want to trade with someone who has Houston ...
1956968477,3,Re pinging : why didn't you go to prom ? BC my...


Classes are imbalanced as visible

Training/Validation split

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
X_train, X_val, y_train, y_val = train_test_split(
    df.index.values, df.sentiment.values, test_size = 0.15, random_state = 42,
    stratify = df.sentiment.values)

print(X_train.shape, X_val.shape, y_train.shape)

(33354,) (5887,) (33354,)


In [21]:
df['data_type'] = ['not_set'] * df.shape[0]
df.head()

Unnamed: 0_level_0,sentiment,content,data_type
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1956967341,0,i know i was listenin to bad habit earlier and...,not_set
1956967666,1,Layin n bed with a headache ughhh ... waitin o...,not_set
1956967696,1,Funeral ceremony ... gloomy friday ...,not_set
1956968416,2,We want to trade with someone who has Houston ...,not_set
1956968477,3,Re pinging : why didn't you go to prom ? BC my...,not_set


In [22]:
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

df.groupby(['sentiment', 'data_type']).count()


Unnamed: 0_level_0,Unnamed: 1_level_0,content
sentiment,data_type,Unnamed: 2_level_1
0,train,703
0,val,124
1,train,4390
1,val,775
2,train,7342
2,val,1296
3,train,7190
3,val,1269
4,train,1859
4,val,328


Loading Tokenizer and Encoding our data

In [23]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [24]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [25]:
df.data_type=='train'


tweet_id
1956967341    True
1956967666    True
1956967696    True
1956968416    True
1956968477    True
              ... 
1753918954    True
1753919001    True
1753919005    True
1753919043    True
1753919049    True
Name: data_type, Length: 39241, dtype: bool

In [26]:
df[df.data_type=='train']


Unnamed: 0_level_0,sentiment,content,data_type
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1956967341,0,i know i was listenin to bad habit earlier and...,train
1956967666,1,Layin n bed with a headache ughhh ... waitin o...,train
1956967696,1,Funeral ceremony ... gloomy friday ...,train
1956968416,2,We want to trade with someone who has Houston ...,train
1956968477,3,Re pinging : why didn't you go to prom ? BC my...,train
...,...,...,...
1753918954,2,,train
1753919001,5,Happy Mothers Day All my love,train
1753919005,5,Happy Mother's Day to all the mommies out ther...,train
1753919043,8,WASSUP BEAUTIFUL ! ! ! FOLLOW ME ! ! PEEP OUT ...,train


In [27]:
df[df.data_type=='train'].content.values


array(['i know i was listenin to bad habit earlier and i started freakin at his part =[',
       'Layin n bed with a headache ughhh ... waitin on your call ...',
       'Funeral ceremony ... gloomy friday ...', ...,
       "Happy Mother's Day to all the mommies out there , be you woman or man as long as you're ' momma ' to someone this is your day !",
       'WASSUP BEAUTIFUL ! ! ! FOLLOW ME ! ! PEEP OUT MY NEW HIT SINGLES WWW.MYSPACE.COM/IPSOHOT I DEF . WAT U IN THE VIDEO ! !',
       'bullet train from tokyo the gf and i have been visiting japan since thursday vacation / sightseeing gaijin godzilla'],
      dtype=object)

In [28]:
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].content.values,
    add_special_tokens=True,
    return_attention_mask=True,
    #pad_to_max_length=True,
    padding=True,
    truncation=True,
    max_length=256,
    return_tensors='pt'
    )


In [29]:
encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].content.values,
    add_special_tokens=True,
    return_attention_mask=True,
   # pad_to_max_length=True,
    padding=True,
    truncation=True,
    max_length=256,
    return_tensors='pt'
    )


For the train

In [30]:
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].sentiment.values) 


For the validation

In [31]:
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].sentiment.values) 


Create the TensorDataset

In [32]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)


In [33]:
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

BERT Pretrained Model

In [34]:
from transformers import BertForSequenceClassification

In [35]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = len(label_dict),
    output_attentions = False,
    output_hidden_states = False
    )

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Creating data loaders

In [36]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 4
dataloader_train = DataLoader(
    dataset_train,
    sampler = RandomSampler(dataset_train),
    batch_size = batch_size
    )

dataloader_val = DataLoader(
    dataset_val,
    sampler = RandomSampler(dataset_val),
    batch_size = batch_size
    )

Optimizer and Scheduler

In [37]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [38]:
optimizer = AdamW(model.parameters(), lr = 1e-5, eps = 1e-8)

In [39]:
epochs = 5

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = len(dataloader_train) * epochs
)

Defining our Performance metrics

In [40]:
import numpy as np
from sklearn.metrics import f1_score

In [41]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis =1 ).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')


In [42]:
def accuracy_per_class(preds, labels):
  label_dict_inverse = {v: k for k, v in label_dict.items()}

  preds_flat = np.argmax(preds, axis=1).flatten()

  preds_flat = np.argmax(preds, axis = 1).flatten()
  labels_flat = labels.flatten()
  #labels_flat = labels.flatten()

  for label in np.unique(labels_flat):
    y_preds = preds_flat[labels_flat == label]
    y_true = labels_flat[labels_flat == label]

    print(f'Class : {label_dict_inverse[label]}')
    print(f'Accuracy:{len(y_pred[y_pred == label])}/{len(y_true)}\n')
    

Creating Training Loop

In [43]:
import random
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [44]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)


cuda


In [45]:
def evaluate(dataloader_val):


  model.eval()

  loss_val_total = 0
  predictions, true_vals = [], []

  for batch in tqdm(dataloader_val):
    batch = tuple(b.to(device) for b in batch)

    inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

    with torch.no_grad():
      outputs = model(**inputs)   



    loss = outputs[0]
    logits = outputs[1]
    loss_val_total += loss.item()
    
    logits = logits.detach().cpu().numpy()
    label_ids = inputs['labels'].cpu().numpy()
    predictions.append(logits)
    true_vals.append(label_ids)

  loss_val_avg = loss_val_total/len(dataloader_val) 
  predictions = np.concatenate(predictions, axis = 0)

  true_vals = np.concatenate(true_vals, axis= 0)   
  return loss_val_avg, predictions, true_vals


In [46]:

for epoch in tqdm(range(1, epochs+1)):
  
  model.train()
  loss_train_total = 0

  progress_bar = tqdm(dataloader_train,
                      desc = 'Epoch {:1d}'. format(epoch), leave = False, disable = False)
  
  for batch in progress_bar:
    model.zero_grad()
    batch = tuple(b.to(device) for b in batch)
    inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }

    outputs = model(**inputs)
    loss = outputs[0]
    loss_train_total += loss.item()
    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    #torch.nn.utils.norm_(model.parameters(), 1.0)

    optimizer.step()
    scheduler.step()

    progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})


  tqdm.write('\nEpoch {epoch}')

  loss_train_avg = loss_train_total /len(dataloader_train)
  tqdm.write(f'Training loss : {loss_train_avg}')

  val_loss, predictions, true_vals = evaluate(dataloader_val)
  val_f1 = f1_score_func(predictions, true_vals)
  tqdm.write(f'Validation loss: {val_loss}')
  tqdm.write(f'F1 Score (weighted): {val_f1}')

       

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/8339 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 1.7935617063732825


  0%|          | 0/1472 [00:00<?, ?it/s]

Validation loss: 1.734498940975122
F1 Score (weighted): 0.3522631627156554


Epoch 2:   0%|          | 0/8339 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 1.5893775047708694


  0%|          | 0/1472 [00:00<?, ?it/s]

Validation loss: 1.7571142956654986
F1 Score (weighted): 0.3749482669587979


Epoch 3:   0%|          | 0/8339 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 1.385184557846888


  0%|          | 0/1472 [00:00<?, ?it/s]

Validation loss: 1.8971413272716429
F1 Score (weighted): 0.36641955770767765


Epoch 4:   0%|          | 0/8339 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 1.1818243551738887


  0%|          | 0/1472 [00:00<?, ?it/s]

Validation loss: 2.0845054488568366
F1 Score (weighted): 0.36286862175742884


Epoch 5:   0%|          | 0/8339 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 1.0246353062216627


  0%|          | 0/1472 [00:00<?, ?it/s]

Validation loss: 2.209968839017877
F1 Score (weighted): 0.3542051511752334
