In [23]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/twitterdata/finalSentimentdata2.csv
/kaggle/input/bert-base-uncased/vocab.txt
/kaggle/input/bert-base-uncased/pytorch_model.bin
/kaggle/input/bert-base-uncased/config.json


In [24]:
data = pd.read_csv('/kaggle/input/twitterdata/finalSentimentdata2.csv')

In [25]:
data.head()

Unnamed: 0.1,Unnamed: 0,sentiment,text
0,3204,sad,agree the poor in india are treated badly thei...
1,1431,joy,if only i could have spent the with this cutie...
2,654,joy,will nature conservation remain a priority in ...
3,2530,sad,coronavirus disappearing in italy show this to...
4,2296,sad,uk records lowest daily virus death toll since...


In [26]:
import nltk
import re
import string


def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [27]:
data['text']=data['text'].apply(lambda x: clean_text(x))

In [28]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [29]:
data['text']=data['text'].apply(lambda x: remove_emoji(x))

In [30]:
def encode_labels(sentiment):
    if sentiment == 'joy':
        return 1
    else:
        return 0

In [31]:
data['sentiment']=data['sentiment'].apply(lambda x: encode_labels(x))

In [32]:
data.head()

Unnamed: 0.1,Unnamed: 0,sentiment,text
0,3204,0,agree the poor in india are treated badly thei...
1,1431,1,if only i could have spent the with this cutie...
2,654,1,will nature conservation remain a priority in ...
3,2530,0,coronavirus disappearing in italy show this to...
4,2296,0,uk records lowest daily virus death toll since...


In [33]:
import torch
import torch.nn as nn

In [34]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader

In [35]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7f958b5f35d0>

In [36]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [37]:
print(device)

cuda:0


In [38]:
BERT_UNCASED = '/kaggle/input/bert-base-uncased'

In [39]:
tokenizer = BertTokenizer.from_pretrained(BERT_UNCASED)

In [40]:
tweet = 'Please practice social distancing amidst the pandemic.'
tokens = tokenizer.tokenize(tweet)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(f' Sentence: {tweet}')
print(f'   Tokens: {tokens}')
print(f'Token IDs: {token_ids}')

 Sentence: Please practice social distancing amidst the pandemic.
   Tokens: ['please', 'practice', 'social', 'di', '##stan', '##cing', 'amidst', 'the', 'pan', '##de', '##mic', '.']
Token IDs: [3531, 3218, 2591, 4487, 12693, 6129, 17171, 1996, 6090, 3207, 7712, 1012]


In [41]:
encoding = tokenizer.encode_plus(
tweet,
max_length=32,
add_special_tokens=True,
return_token_type_ids=False,
pad_to_max_length=True,
return_attention_mask=True,
return_tensors='pt')

encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [47]:
encoding['input_ids'].flatten()

tensor([  101,  3531,  3218,  2591,  4487, 12693,  6129, 17171,  1996,  6090,
         3207,  7712,  1012,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0])

In [48]:
encoding['attention_mask'].flatten()

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

In [42]:
token_lengths=[]
for text in data.text:
    token_lengths.append(len(tokenizer.encode(text, max_length=512)))

print(max(token_lengths))

92


In [43]:
MAX_LENGTH = 100

In [100]:
class Covid19Tweet(Dataset):
    
    def __init__(self, tweets, sentiment, tokenizer, max_len):
        
        
        self.tweets = tweets
        self.sentiment = sentiment
        self.tokenizer = tokenizer
        self.max_len = max_len
    def __len__(self):
        return len(self.tweets)
    def __getitem__(self, item):
        
        tweets = str(self.tweets[item])
        sentiment = self.sentiment[item]
        encoding = self.tokenizer.encode_plus(
        tweets,
        add_special_tokens=True,
        max_length=self.max_len,
        return_token_type_ids=False,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt')
        return {
        'tweet_text': tweets,
         'input_ids': encoding['input_ids'].flatten(),
         'attention_mask': encoding['attention_mask'].flatten(),
         'sentiments': torch.tensor(sentiment, dtype=torch.long)
          }

In [102]:
from sklearn.model_selection import train_test_split

In [103]:
train, val = train_test_split(
  data,
  test_size=0.1,
  random_state=RANDOM_SEED
)

In [50]:
train.shape, val.shape

((2472, 3), (618, 3))

In [105]:
def create_data_loader(data, tokenizer, max_len, batch_size):
    
    ds = Covid19Tweet(tweets=data.text.to_numpy(),
    sentiment=data.sentiment.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len)
    return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=4)


BATCH_SIZE = 32
train_data_loader = create_data_loader(train, tokenizer, MAX_LENGTH, BATCH_SIZE)
val_data_loader = create_data_loader(val, tokenizer, MAX_LENGTH, BATCH_SIZE)

In [55]:
df = next(iter(train_data_loader))
df.keys()

dict_keys(['tweet', 'input_ids', 'attention_mask', 'sentiments'])

In [58]:
print('tweet : ', df['tweet'][1])
print('input_ids : ', df['input_ids'][1])
print('attention_mask : ', df['attention_mask'][1])
print('sentiments : ', df['sentiments'][1])

tweet :  you forgot the third india one that is try their best to spread the last and the most important one forth india those protecting the ones spreading this disease
input_ids :  tensor([  101,  3531,  3218,  2591,  4487, 12693,  6129, 17171,  1996,  6090,
         3207,  7712,  1012,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0])
attention_mask :  tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])
sentiments :  tensor(0)


In [60]:
bert_model = BertModel.from_pretrained(BERT_UNCASED)

In [106]:
class SentimentClassifier(nn.Module):
    
    def __init__(self, n_classes):
        
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
    def forward(self, input_ids, attention_mask):
        
        _, pooled_output = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask
    )
        output = self.drop(pooled_output)
        return self.out(output)

In [93]:
n_classes = 2 # positive, negative
model = SentimentClassifier(n_classes)
model = model.to(device)

In [94]:
input_ids = df['input_ids'].to(device)
attention_mask = df['attention_mask'].to(device)

In [95]:
import torch.nn.functional as F


F.softmax(model(input_ids, attention_mask), dim=1)

tensor([[0.4931, 0.5069],
        [0.3982, 0.6018],
        [0.6037, 0.3963],
        [0.5324, 0.4676],
        [0.4075, 0.5925],
        [0.4295, 0.5705],
        [0.6103, 0.3897],
        [0.4982, 0.5018],
        [0.4552, 0.5448],
        [0.6315, 0.3685],
        [0.4313, 0.5687],
        [0.5429, 0.4571],
        [0.3773, 0.6227],
        [0.4825, 0.5175],
        [0.4351, 0.5649],
        [0.5248, 0.4752],
        [0.5042, 0.4958],
        [0.4639, 0.5361],
        [0.4922, 0.5078],
        [0.4194, 0.5806],
        [0.3633, 0.6367],
        [0.4049, 0.5951],
        [0.5927, 0.4073],
        [0.4585, 0.5415],
        [0.4017, 0.5983],
        [0.4702, 0.5298],
        [0.4054, 0.5946],
        [0.4638, 0.5362],
        [0.4530, 0.5470],
        [0.5254, 0.4746],
        [0.3918, 0.6082],
        [0.3992, 0.6008]], device='cuda:0', grad_fn=<SoftmaxBackward>)

In [66]:
model.parameters

<bound method Module.parameters of SentimentClassifier(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Layer

In [107]:
EPOCHS = 10
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)

In [108]:
def train_epoch(model,data_loader,loss_fn,optimizer,device,scheduler, n_examples):  
    model = model.train()
    losses = []
    correct_predictions = 0
    for d in data_loader:
        
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["sentiments"].to(device)
        outputs = model(input_ids=input_ids,attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return correct_predictions.double() / n_examples, np.mean(losses)

In [109]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    
    model = model.eval()
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["sentiments"].to(device)
            outputs = model(input_ids=input_ids,attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())
        return correct_predictions.double() / n_examples, np.mean(losses)
            

In [110]:
%%time
history = defaultdict(list)
best_accuracy = 0
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)
    train_acc, train_loss = train_epoch(model,train_data_loader,loss_fn,optimizer,device,scheduler,len(train))
    print(f'Train loss {train_loss} accuracy {train_acc}')
    val_acc, val_loss = eval_model(model,val_data_loader,loss_fn,device,len(val))
    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    if val_acc > best_accuracy:
        
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc

Epoch 1/10
----------
Train loss 0.4187071273724238 accuracy 0.823804386911183
Val   loss 0.22082368060946464 accuracy 0.9158576051779935

Epoch 2/10
----------
Train loss 0.2161241812395981 accuracy 0.9241280115066522
Val   loss 0.21779102105647324 accuracy 0.9288025889967638

Epoch 3/10
----------
Train loss 0.12591336070206657 accuracy 0.9661992089176555
Val   loss 0.20893345195800067 accuracy 0.9449838187702266

Epoch 4/10
----------
Train loss 0.06210743753646297 accuracy 0.9859762675296656
Val   loss 0.2355087656993419 accuracy 0.9449838187702266

Epoch 5/10
----------
Train loss 0.04543946995453416 accuracy 0.9906508450197771
Val   loss 0.25116687421686945 accuracy 0.9449838187702266

Epoch 6/10
----------
Train loss 0.03228857728716885 accuracy 0.9938870909744696
Val   loss 0.27514614432584494 accuracy 0.9385113268608415

Epoch 7/10
----------
Train loss 0.022422273702191554 accuracy 0.9960445882775979
Val   loss 0.29156487630680206 accuracy 0.941747572815534

Epoch 8/10
------

In [115]:
sad_sample_tweet = 'It is very sad to see the corona pandemic increasing at such an alarming rate'

In [121]:
happy_sample_tweet = 'It is amazing to see that New Zealand reaches 100 days without Covid transmission!'

In [116]:
encoded_review = tokenizer.encode_plus(sad_sample_tweet,max_length=MAX_LENGTH,add_special_tokens=True,
                                           return_token_type_ids=False,pad_to_max_length=True,return_attention_mask=True,
                                           return_tensors='pt')

In [122]:
encoded_happy_review = tokenizer.encode_plus(happy_sample_tweet,max_length=MAX_LENGTH,add_special_tokens=True,
                                           return_token_type_ids=False,pad_to_max_length=True,return_attention_mask=True,
                                           return_tensors='pt')

In [120]:
classes = ['sad', 'happy']


input_ids = encoded_review['input_ids'].to(device)
attention_mask = encoded_review['attention_mask'].to(device)
output = model(input_ids, attention_mask)
_, prediction = torch.max(output, dim=1)
print('Review text :{}'.format(sad_sample_tweet))
print('Sentiment :{}'.format(classes[prediction]))

Review text :It is very sad to see the corona pandemic increasing at such an alarming rate
Sentiment :sad


In [123]:
input_ids = encoded_happy_review['input_ids'].to(device)
attention_mask = encoded_happy_review['attention_mask'].to(device)
output = model(input_ids, attention_mask)
_, prediction = torch.max(output, dim=1)
print('Review text : {}'.format(happy_sample_tweet))
print('Sentiment : {}'.format(classes[prediction]))

Review text :It is amazing to see that New Zealand reaches 100 days without Covid transmission!
Sentiment :happy
