In [None]:
# Run `nvidia-smi` through the IPython shell escape; the output is captured as a list of lines.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# If the captured text contains 'failed', show a hint about enabling a GPU runtime;
# otherwise echo the nvidia-smi report.
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

In [None]:
import collections
import os
import random
import re
import string
import warnings
warnings.filterwarnings("ignore")

import nltk.corpus # for stopwords
import numpy as np
import pandas as pd

# visualization lib 
import matplotlib.pyplot as plt 
from PIL import Image
from plotly import graph_objs, express, figure_factory
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import seaborn as sns

# ml
import spacy.util
import tensorflow as tf
import torch
import torch.nn as nn
import tokenizers
import transformers
from sklearn.model_selection import StratifiedKFold

%matplotlib inline

In [None]:
# Record which TensorFlow version this kernel is running.
print('TF version', tf.__version__)

In [None]:
# Directory holding the competition CSVs; change this one constant to run outside Kaggle.
DATA_DIR = '/kaggle/input/tweet-sentiment-extraction'

train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

In [None]:
# Report the (rows, columns) shape of the train and test frames, one per line.
for frame in (train, test):
    print(frame.shape)

In [None]:
# Column dtypes and non-null counts of the raw training frame (before dropping missing rows).
train.info()

In [None]:
# Summary statistics of the training frame's columns.
train.describe()

In [None]:
# Drop rows with missing values and renumber the index 0..n-1.
# Rebinding (instead of inplace=True) keeps the cell idempotent, and the contiguous
# index matters because later cells look rows up with `train.loc[k, ...]` for k in range(n).
train = train.dropna().reset_index(drop=True)

In [None]:
# Re-inspect the training frame after the missing rows were dropped.
train.info()

In [None]:
# Column dtypes and non-null counts of the test frame.
test.info()

In [None]:
# Check that every selected_text is a substring of its text.
# `.all()` reduces the per-row booleans to a single answer; taking `len(...)` of the
# result only counted the rows and said nothing about whether the check passed.
train.apply(lambda row: row.selected_text in row.text, axis=1).all()

In [None]:
# Number of tweets per sentiment class, largest class first.
# The count lands in the 'text' column, next to the 'sentiment' label column.
temp = (
    train.groupby('sentiment')['text']
    .count()
    .reset_index()
    .sort_values(by='text', ascending=False)
)

In [None]:
# Render the per-sentiment counts with a purple background gradient on the cells.
temp.style.background_gradient(cmap='Purples')

In [None]:
# Bar chart of how many training tweets fall into each sentiment class.
_, sentiment_ax = plt.subplots(figsize=(12, 6))
sns.countplot(x='sentiment', data=train, ax=sentiment_ax)

In [None]:
# Funnel-area chart of the sentiment distribution, built from the per-class counts in `temp`.
funnel_title = {
    "position": "top center",
    "text": "Funnel-Chart of Sentiment Distribution",
}
funnel_trace = graph_objs.Funnelarea(
    text=temp.sentiment,
    values=temp.text,
    title=funnel_title,
)
fig = graph_objs.Figure(funnel_trace)
fig.show()

In [None]:
# Measure how similar selected_text and text are with the Jaccard similarity.
def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is
    |A ∩ B| / |A ∪ B| over the resulting sets of words.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A float in [0.0, 1.0]. Two strings that both contain no words are
        treated as identical and score 1.0 (previously this raised
        ZeroDivisionError).
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        # Both word sets are empty: the ratio is undefined, so define it as "identical".
        return 1.0
    return float(len(c)) / union_size


In [None]:

# Score every (text, selected_text) pair and store the result as a new column.
# Assigning the column directly avoids two problems of building a side frame and merging:
#   * rebinding the name `jaccard` to a DataFrame shadowed the function, so the cell
#     could not be re-run;
#   * an outer merge on (text, selected_text) can duplicate or reorder rows.
# The index is renumbered 0..n-1 because later cells address rows with `train.loc[k, ...]`.
train = train.reset_index(drop=True)
train['jaccard_score'] = [
    jaccard(text, selected_text)
    for text, selected_text in zip(train['text'], train['selected_text'])
]

In [None]:
def _word_count(value):
    """Number of whitespace-separated tokens in the string form of `value`."""
    return len(str(value).split())


# Word counts of the selected span and of the full tweet, plus how many words were left out.
train['num_word_selected'] = train['selected_text'].map(_word_count)
train['num_word_text'] = train['text'].map(_word_count)
train['difference_in_words'] = train['num_word_text'].sub(train['num_word_selected'])

In [None]:
# Preview the first rows, including the newly added feature columns.
train.head()

In [None]:
# Compare the word-count distributions of selected_text and text on one axis.
fig, axes = plt.subplots(figsize=(12, 6))

# Both plots share one category order; otherwise each countplot would lay out its own
# set of word counts and bars for the same count could end up at different x positions.
word_counts = sorted(set(train['num_word_selected']) | set(train['num_word_text']))

# Pass the series as `x=`: newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x=train['num_word_selected'], order=word_counts, ax=axes,
              color='blue', alpha=0.3, label='selected_text')
sns.countplot(x=train['num_word_text'], order=word_counts, ax=axes,
              color='red', alpha=0.3, label='text')
axes.set_xlabel('number of words')
axes.legend()
# plt.show() works with the inline backend, whereas Figure.show() warns about a non-GUI backend.
plt.show()

In [None]:
# Positive vs. negative tweets: distribution of (words in text - words in selected_text).
plt.figure(figsize=(12, 6))
p1 = sns.kdeplot(
        train[train['sentiment'] == 'positive']['difference_in_words'],
        shade=True,
        color='b',
        label='positive').set_title('Kernel Distribution of Difference in Number of words')
# This curve is the negative class; it was previously mislabeled 'positive'.
p2 = sns.kdeplot(
        train[train['sentiment'] == 'negative']['difference_in_words'],
        shade=True,
        color='r',
        label='negative')
# Draw the legend explicitly so the two curves can be told apart.
plt.legend()

In [None]:
# Neutral tweets: histogram (no KDE curve) of the word-count difference between
# text and selected_text.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases — confirm the installed version.
plt.figure(figsize=(12, 6))
sns.distplot(train[train['sentiment'] == 'neutral']['difference_in_words'], kde=False)

In [None]:
# Positive vs. negative tweets: distribution of the Jaccard score between text and selected_text.
plt.figure(figsize=(12, 6))
positive_scores = train.loc[train['sentiment'] == 'positive', 'jaccard_score']
negative_scores = train.loc[train['sentiment'] == 'negative', 'jaccard_score']

p1 = sns.distplot(positive_scores, color='b', label='positive')
p1 = p1.set_title('KDE of Jaccard Scores across different Sentiments')
p2 = sns.distplot(negative_scores, color='r', label='negative')
plt.legend(labels=['positive', 'negative'])

In [None]:
# Same comparison as a shaded kernel density estimate: Jaccard score for positive vs. negative tweets.
plt.figure(figsize=(12, 6))
positive_scores = train.loc[train['sentiment'] == 'positive', 'jaccard_score']
negative_scores = train.loc[train['sentiment'] == 'negative', 'jaccard_score']

p1 = sns.kdeplot(positive_scores, shade=True, color='b', label='positive')
p1 = p1.set_title('KDE of Jaccard Scores across different Sentiments')
p2 = sns.kdeplot(negative_scores, shade=True, color='r', label='negative')
plt.legend(labels=['positive', 'negative'])

In [None]:
# Fixed sequence length (in tokens) used for every model input array.
# NOTE(review): the same constant is assigned again in the tokenizer cell below.
MAX_LEN = 96

## Modeling
- RoBERTa
- BART

### RoBERTa (TensorFlow)


In [None]:
# Sequence length of the model inputs and location of the RoBERTa vocabulary/weights.
MAX_LEN = 96
PATH = '../input/tf-roberta/'

# Byte-level BPE tokenizer built from the RoBERTa vocab and merges files;
# input is lower-cased and a leading space is added before encoding.
_tokenizer_options = {
    'vocab': PATH + 'vocab-roberta-base.json',
    'merges': PATH + 'merges-roberta-base.txt',
    'lowercase': True,
    'add_prefix_space': True,
}
tokenizer = tokenizers.ByteLevelBPETokenizer(**_tokenizer_options)


In [None]:
# Map each sentiment label to the vocabulary id of its word.
# The ids are written straight into `input_ids`, so they must be real token ids.
# Enumeration indices (0, 1, 2) collide with the literal 0 and 2 marker ids placed around
# the text in `input_ids`, which would make the sentiment slot indistinguishable from those markers.
unique_sentiment = train.sentiment.unique()
print(unique_sentiment)
sentiment_id = collections.defaultdict(int)
for sentiment in unique_sentiment:
    # Assumes each label encodes to a single token; the first id is used — TODO confirm.
    sentiment_id[sentiment] = tokenizer.encode(sentiment).ids[0]


In [None]:
# Build the fixed-length model inputs and the one-hot start/end targets for every training row.
shape0 = train.shape[0]
# input_ids is pre-filled with 1 (presumably the padding id — TODO confirm); the rest start at 0.
input_ids = np.ones((shape0, MAX_LEN), dtype='int32')
attention_mask = np.zeros((shape0, MAX_LEN), dtype='int32')
token_type_ids = np.zeros((shape0, MAX_LEN), dtype='int32')
start_tokens = np.zeros((shape0, MAX_LEN), dtype='int32')
end_tokens = np.zeros((shape0, MAX_LEN), dtype='int32')

# NOTE(review): `train.loc[k, ...]` is label-based, so this loop requires the index to be 0..shape0-1.
for k in range(shape0):
    # Locate text2 (selected_text) inside text1 (text) and mark those character positions with 1 in chars.
    # Both strings are whitespace-normalized; text1 gets a leading space.
    text1 = " " + " ".join(train.loc[k, 'text'].split())
    text2 = " ".join(train.loc[k, 'selected_text'].split())
    # NOTE(review): str.find returns -1 when text2 is not found; that case is not handled here.
    idx = text1.find(text2)
    chars = np.zeros((len(text1)))
    chars[idx: idx + len(text2)] = 1
    # Also mark the space directly before the selection, if there is one.
    if text1[idx - 1] == ' ':
        chars[idx - 1] = 1
    # Encode the text1 sentence with the tokenizer.
    enc = tokenizer.encode(text1)
    # offsets[i] is the (start, end) character span of token i, accumulated from the length of
    # each decoded token (a token's leading ' ' is included in its length).
    offsets = []
    idx = 0
    for t in enc.ids:
        w = tokenizer.decode([t])
        offsets.append((idx, idx + len(w)))
        idx += len(w)
        
    # Collect the indices of the tokens whose character span overlaps the marked selection.
    toks = []
    for i, (a, b) in enumerate(offsets):
        sm = np.sum(chars[a:b])
        if sm>0: toks.append(i)
    s_tok = sentiment_id[train.loc[k, 'sentiment']]
    # Row layout: 0, text tokens, 2, 2, sentiment id, 2 — five extra positions in total.
    # NOTE(review): the assignment fails if len(enc.ids) + 5 exceeds MAX_LEN.
    input_ids[k, :len(enc.ids) + 5] = [0] + enc.ids + [2, 2] + [s_tok] + [2]
    attention_mask[k, :len(enc.ids)+5] = 1
    # One-hot targets; +1 because position 0 of input_ids holds the leading 0 token.
    if len(toks) > 0:
        start_tokens[k, toks[0]+1] = 1
        end_tokens[k, toks[-1]+1] = 1
        

In [None]:
# Inspect the encoded arrays for the row labelled 1.
print(train['text'][1])
print(input_ids[1])
print(attention_mask[1])
# Look the id up by the row's sentiment label. Indexing with the integer 1 is not a valid
# key; the defaultdict would silently insert it and return 0.
print(sentiment_id[train['sentiment'][1]])
print(start_tokens[1])
print(end_tokens[1])

In [None]:
# Encode one tweet to inspect the tokenizer output.
# A `tokenizers` ByteLevelBPETokenizer is not callable; `.encode` returns an Encoding
# whose `.ids` / `.tokens` attributes hold the result.
enc = tokenizer.encode(train['text'][1])
enc
# print(tokenizer.decode(enc.ids))

In [None]:
def build_roberta():
    """Build and compile the RoBERTa span-extraction model.

    The model takes three int32 inputs of length MAX_LEN (token ids, attention
    mask, token type ids), runs them through the pretrained TF RoBERTa encoder
    loaded from PATH, and attaches two identical heads to the first encoder
    output. Each head produces a softmax over the MAX_LEN positions: the first
    for the start of the span, the second for the end.

    Returns:
        A tf.keras Model compiled with categorical cross-entropy and Adam (lr 3e-5).
    """
    def _token_input():
        # One int32 input of fixed length MAX_LEN.
        return tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)

    def _position_head(sequence_output):
        # Dropout -> 1x1 convolution to a single channel -> flatten -> softmax over positions.
        head = tf.keras.layers.Dropout(0.1)(sequence_output)
        head = tf.keras.layers.Conv1D(1, 1)(head)
        head = tf.keras.layers.Flatten()(head)
        return tf.keras.layers.Activation('softmax')(head)

    ids = _token_input()
    att = _token_input()
    tok = _token_input()

    config = transformers.RobertaConfig.from_pretrained(PATH + 'config-roberta-base.json')
    bert_model = transformers.TFRobertaModel.from_pretrained(
        PATH + 'pretrained-roberta-base.h5',
        config=config,
    )
    encoder_outputs = bert_model(ids, attention_mask=att, token_type_ids=tok)

    start_probs = _position_head(encoder_outputs[0])
    end_probs = _position_head(encoder_outputs[0])

    model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=[start_probs, end_probs])
    model.compile(
        loss='categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    )
    return model


    
    

In [None]:
# Build the model once outside the training loop to print its layer summary.
model = build_roberta()
model.summary()

In [None]:
# 5-fold stratified cross-validation of the RoBERTa span model.
# For each fold: train a fresh model, checkpoint the weights with the best val_loss,
# reload them, and store the out-of-fold start/end predictions.
VER = 'v0'
DISPLAY = 1  # Keras verbosity for fit/predict
val_start = np.zeros((input_ids.shape[0], MAX_LEN))
val_end = np.zeros((input_ids.shape[0], MAX_LEN))

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=777)
best_model = 0
history = []
for fold, (idx_train, idx_val) in enumerate(skf.split(input_ids, train.sentiment.values)):
    print(fold, (idx_train, idx_val))
    # Free the previous fold's graph before building a new model.
    tf.keras.backend.clear_session()
    model = build_roberta()

    weights_path = '/kaggle/working/%s-roberta-%i.h5' % (VER, fold)
    sv = tf.keras.callbacks.ModelCheckpoint(
            weights_path,
            monitor='val_loss',
            verbose=1,
            save_best_only=True,
            # The keyword is `save_weights_only`; the misspelled `save_weight_only`
            # did not select weights-only checkpoints.
            save_weights_only=True,
            mode='auto',
            save_freq='epoch'
            )

    train_inputs = [input_ids[idx_train], attention_mask[idx_train], token_type_ids[idx_train]]
    train_targets = [start_tokens[idx_train], end_tokens[idx_train]]
    val_inputs = [input_ids[idx_val], attention_mask[idx_val], token_type_ids[idx_val]]
    val_targets = [start_tokens[idx_val], end_tokens[idx_val]]

    history.append(
        model.fit(train_inputs, train_targets,
                  epochs=3,
                  batch_size=32,
                  verbose=DISPLAY,
                  callbacks=[sv],
                  validation_data=(val_inputs, val_targets)))

    # Predict the held-out fold with the best checkpoint, not the last epoch's weights.
    model.load_weights(weights_path)
    val_start[idx_val], val_end[idx_val] = model.predict(val_inputs, verbose=DISPLAY)

In [None]:
# Training vs. validation loss per epoch for the first fold.
first_fold_metrics = history[0].history
for metric_name in ('loss', 'val_loss'):
    plt.plot(first_fold_metrics[metric_name])
plt.title('Model Loss')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

### **BART**

In [None]:
from transformers import BartModel, BartConfig
# Initializing a BART facebook/bart-large style configuration
configuration = BartConfig()
# Initializing a model from the facebook/bart-large style configuration.
# The model is constructed from the configuration only (no `from_pretrained` call here),
# and this rebinds the name `model`.
model = BartModel(configuration)
# Accessing the model configuration
configuration = model.config

In [None]:
from transformers import BartTokenizer, BartModel
import torch

# Load the pretrained BART tokenizer and encoder-decoder once. The original cell loaded
# both a second time at the end, repeating an expensive load for no effect.
# Note: this rebinds `tokenizer` and `model` to the BART objects.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
model = BartModel.from_pretrained('facebook/bart-large')

# Smoke test: run one sentence through the model and keep the final hidden states.
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
with torch.no_grad():  # inference only, no need to build the autograd graph
    outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state

In [None]:
from transformers import BartTokenizer, BartModel, AdamW
import torch.nn as nn


class BartQA(nn.Module):
    """Span-extraction head on top of a BART-style encoder.

    A single linear layer maps every token's hidden state to two scores:
    one for "span starts here" and one for "span ends here".

    Args:
        bart: Module called as `bart(input_ids, attention_mask=...)` whose first
            output element holds the per-token hidden states.
        config: Object exposing `hidden_size`, the width of those hidden states.
    """

    def __init__(self, bart, config):
        super(BartQA, self).__init__()
        self.bart = bart
        self.qa = nn.Linear(config.hidden_size, 2)
        # Kept for compatibility; forward() returns raw logits and does not apply it.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_ids, attention_mask):
        """Return `(start_logits, end_logits)`, each with the last (size-1) axis squeezed away.

        The per-batch shape debugging prints were removed: they produced several
        lines of output on every forward pass.
        """
        output = self.bart(input_ids, attention_mask=attention_mask)
        qa_output = self.qa(output[0])
        # Split the two output channels into separate start / end score tensors.
        start, end = qa_output.split(1, dim=-1)
        start_logits = start.squeeze(-1)
        end_logit = end.squeeze(-1)
        return start_logits, end_logit



# Wrap the pretrained BART model with the span head and move it to the GPU when one
# is available (a hard-coded 'cuda' raises on CPU-only machines).
bartqa = BartQA(model, model.config)
bartqa.to('cuda' if torch.cuda.is_available() else 'cpu')



In [None]:
from torch.utils.data import  TensorDataset, DataLoader

In [None]:
# Convert the numpy training arrays to torch tensors:
# ids and start/end targets as int64, the attention mask as float32.
input_ids_tensor, start_tokens_tensor, end_tokens_tensor = (
    torch.tensor(array, dtype=torch.long)
    for array in (input_ids, start_tokens, end_tokens)
)
attention_mask_tensor = torch.tensor(attention_mask, dtype=torch.float32)

In [None]:
# Bundle the four tensors row-wise and serve them in shuffled mini-batches of two.
_train_tensors = (
    input_ids_tensor,
    attention_mask_tensor,
    start_tokens_tensor,
    end_tokens_tensor,
)
train_data = TensorDataset(*_train_tensors)
train_dataloader = DataLoader(dataset=train_data, batch_size=2, shuffle=True)

In [None]:
# One training epoch of the BART span model.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
bartqa.to(device)

# The original loop called `optimizer.step()` without ever creating an optimizer.
# The learning rate mirrors the one used for the RoBERTa model above.
optimizer = torch.optim.AdamW(bartqa.parameters(), lr=3e-5)
# No ignore_index: position 1 is a legitimate start/end position and must not be skipped.
loss_fct = nn.CrossEntropyLoss()

# Put the whole wrapper (BART + span head) in training mode, not just the inner model.
bartqa.train()
total_loss = 0
total_preds = []
for step, batch in enumerate(train_dataloader):
    # progress update after every 50 batches.
    if step % 50 == 0 and not step == 0:
        print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

    sent_id, mask, start, end = [r.to(device) for r in batch]

    optimizer.zero_grad()
    start_logits, end_logits = bartqa(sent_id, mask)

    # Targets are stored one-hot over positions; CrossEntropyLoss expects class indices.
    # NOTE(review): rows whose target is all zeros map to position 0 — confirm that is acceptable.
    start_loss = loss_fct(start_logits, start.argmax(dim=1))
    end_loss = loss_fct(end_logits, end.argmax(dim=1))
    loss = (start_loss + end_loss) / 2
    total_loss += loss.item()

    loss.backward()
    # Clip in place over every trainable parameter, including the span head.
    nn.utils.clip_grad_norm_(bartqa.parameters(), 1.0)
    optimizer.step()

    # Keep the logits on the CPU as (batch, 2, seq_len): index 0 = start, 1 = end.
    total_preds.append(
        np.stack(
            [start_logits.detach().cpu().numpy(), end_logits.detach().cpu().numpy()],
            axis=1,
        )
    )

# Epoch-level aggregation belongs after the loop: doing it per batch turned
# `total_preds` into an ndarray and broke the next `.append`.
avg_loss = total_loss / len(train_dataloader)
total_preds = np.concatenate(total_preds, axis=0)

print(avg_loss, total_preds.shape)