In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import copy
import math
import transformers
from transformers import AdamW
import nltk

In [None]:
# Load the three competition files: training rows, test rows and the
# submission template.
train, test, sample_submission = (
    pd.read_csv("../input/tweet-sentiment-extraction/train.csv"),
    pd.read_csv("../input/tweet-sentiment-extraction/test.csv"),
    pd.read_csv("../input/tweet-sentiment-extraction/sample_submission.csv"),
)

In [None]:
# Drop training rows that contain any missing value (rebinding instead of
# mutating in place keeps the cell safe to re-run).
train = train.dropna()

In [None]:
def clean_text(text):
    """Normalize a piece of text for tokenization.

    Steps, in order: lowercase, remove URLs (``http(s)://...`` and
    ``www....``), remove every ASCII punctuation character, remove newline
    characters, and remove any word that contains a digit. Whitespace that
    surrounded a removed piece is left as is, so double spaces can remain.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    # Raw strings: `\S`, `\.`, `\w` and `\d` are regex escapes, not string
    # escapes, and non-raw literals containing them trigger invalid-escape
    # warnings on recent Python versions.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# For every text column: keep the original strings in a "<column>_raw" copy,
# then overwrite the column with its cleaned version.
for frame, text_columns in ((train, ("text", "selected_text")), (test, ("text",))):
    for column in text_columns:
        frame[column + "_raw"] = frame[column]
        frame[column] = frame[column].apply(clean_text)

In [None]:
# One sub-frame per sentiment label, for the training and the test data.
train_positive, train_neutral, train_negative = (
    train.loc[train["sentiment"] == label]
    for label in ("positive", "neutral", "negative")
)

test_positive, test_neutral, test_negative = (
    test.loc[test["sentiment"] == label]
    for label in ("positive", "neutral", "negative")
)

In [None]:
# Sentiment subsets in a fixed order: 0 = positive, 1 = neutral, 2 = negative.
# The training cell iterates over these lists by index and uses the same index
# for its per-sentiment settings (`epochs`, `color`, and the special threshold
# applied when the index is 1), so the order here must not change.
train_all = [train_positive, train_neutral, train_negative]
test_all = [test_positive, test_neutral, test_negative]

In [None]:
# Display the number of rows in the positive training subset.
len(train_positive)

In [None]:
# Display the number of sentiment subsets (the list built above has three entries).
len(train_all)

In [None]:
# Preview the first rows of the positive test subset.
test_positive.head()

In [None]:
def jaccard(str1, str2):
    """Word-level Jaccard similarity of two strings.

    Both strings are lowercased and split on whitespace; the score is the
    number of shared distinct words divided by the number of distinct words
    in either string. Two strings without any word score 1.
    """
    words_a = set(str1.lower().split())
    words_b = set(str2.lower().split())
    shared = words_a & words_b
    union_size = len(words_a) + len(words_b) - len(shared)
    if union_size == 0:
        # Both inputs are empty (or whitespace only): treat as identical.
        return 1
    return float(len(shared)) / union_size

In [None]:
# class BertModel(torch.nn.Module):
#     def __init__(self,UNCASED, outputSize, droupout, std):
#         super(BertModel, self).__init__()
#         self.config = transformers.BertConfig.from_pretrained(UNCASED, output_hidden_states=True)
#         self.bert_model = transformers.BertModel.from_pretrained(UNCASED, config=self.config)
#         self.drop_out = torch.nn.Dropout(droupout)
#         self.con_model = torch.nn.Conv1d(in_channels=768, out_channels=outputSize, kernel_size=1)
#         self.linear_model1 = torch.nn.Linear(768, 128)
#         self.relu = torch.nn.ReLU()
#         self.linear_model2 = torch.nn.Linear(128, outputSize)
#         torch.nn.init.normal_(self.linear_model1.weight, std=std)
#         torch.nn.init.normal_(self.linear_model2.weight, std=std)
#         self.sigmoid = torch.nn.Sigmoid()

#     def forward(self, input_ids, attention_mask):
            
#         _,_,hidden_states = self.bert_model(input_ids, attention_mask=attention_mask.float())
#         embedding_output = hidden_states[0]
#         attention_hidden_states = hidden_states[1:]
#         summed_last_4_layers = torch.stack(attention_hidden_states[-4:]).sum(0)
#         sentence_embedding = torch.mean(summed_last_4_layers, dim=1)

#         out = self.drop_out(sentence_embedding)
#         out = self.linear_model1(out)
#         out = self.relu(out)
#         out = self.linear_model2(out)
#         out = self.sigmoid(out)
#         return out

In [None]:
def from_predicted_positon_to_text(predicted, threshold, padded_tokens, raw_text, tokenizer):
    """Convert per-position scores into one predicted text span per sequence.

    For every row, the positions whose score is ``>= threshold`` are collected;
    position 0 and positions holding ``[SEP]`` / ``[PAD]`` are ignored. The
    prediction is the text from the first to the last remaining position,
    inclusive. When no position remains, the whole text of the row (without
    special tokens) is returned as a fallback.

    The span is cut out of the token list by position. The previous
    implementation searched the decoded string with a regular expression
    built from the raw token text, which failed on tokens containing regex
    metacharacters, matched the wrong occurrence of repeated words, and
    returned everything from the start of the text when a single token was
    selected.

    Args:
        predicted: 2-D scores (numpy array or torch tensor), one row per
            sequence and one column per token position. It is not modified.
        threshold: minimum score for a position to count as selected.
        padded_tokens: 2-D array of token ids, one row per sequence, aligned
            column-by-column with ``predicted``.
        raw_text: unused; kept so existing calls keep working.
        tokenizer: object providing ``decode(ids) -> str`` and
            ``tokenize(str) -> list[str]``.

    Returns:
        A list with one predicted string per row. Word pieces are re-joined by
        removing the ``" ##"`` separators.
    """
    special_tokens = ("[CLS]", "[SEP]", "[PAD]")

    # Boolean mask instead of overwriting `predicted`, so the caller's scores survive.
    selected_mask = predicted >= threshold

    predicted_val_text = []
    for token_ids, row_mask in zip(padded_tokens, selected_mask):
        # Same token source as before: decode the ids, then re-tokenize.
        tokens = tokenizer.tokenize(tokenizer.decode(token_ids))

        positions = [
            pos
            for pos, is_selected in enumerate(row_mask.tolist())
            # Skip position 0, anything beyond the token list (guards against
            # a re-tokenization that came out shorter), and the end / padding
            # markers.
            if is_selected
            and 0 < pos < len(tokens)
            and tokens[pos] not in ("[SEP]", "[PAD]")
        ]

        if positions:
            span = tokens[positions[0]:positions[-1] + 1]
        else:
            # Nothing selected: fall back to the full text of the row.
            span = tokens

        words = [token for token in span if token not in special_tokens]
        predicted_val_text.append(" ".join(words).replace(" ##", "").strip())

    return predicted_val_text

In [None]:
def from_predicted_positon_to_text_0(predicted, threshold, padded_tokens,raw_text, tokenizer): 
    """Decode, for every sequence, only the tokens whose score reaches the threshold.

    Token ids at positions scoring below ``threshold`` are zeroed, all zero
    ids are dropped, and the remaining ids are decoded. ``[CLS]`` and
    ``[SEP]`` markers are removed from the decoded string; surrounding
    whitespace is left untouched.

    Args:
        predicted: 2-D scores (torch tensor or anything ``torch.as_tensor``
            accepts), one row per sequence. It is not modified.
        threshold: minimum score for a position to be kept.
        padded_tokens: 2-D numpy array of token ids with the same shape as
            ``predicted``; id 0 is treated as "no token".
        raw_text: unused; kept so existing calls keep working.
        tokenizer: object providing ``decode(ids) -> str``.

    Returns:
        A list with one decoded string per row.
    """
    scores = torch.as_tensor(predicted)
    # Boolean mask instead of writing 0/1 into the caller's tensor.
    keep = scores >= threshold

    # Follow the device of the scores rather than relying on a notebook-level
    # `device` variable that is only defined in a later cell.
    token_ids = torch.from_numpy(padded_tokens).to(scores.device)
    # integer ids * boolean mask stays integer, so `decode` receives real ids.
    selected_tokens = token_ids * keep

    predicted_text = []
    for row in selected_tokens:
        kept_ids = row[row != 0]
        decoded = tokenizer.decode(kept_ids)
        predicted_text.append(decoded.replace("[CLS]", "").replace("[SEP]", ""))
    return predicted_text

In [None]:
class BertModel(torch.nn.Module):
    """BERT encoder followed by three 1x1 convolutions and a sigmoid.

    The encoder's first output is permuted so its 768 features become the
    channel axis, passed through dropout and the convolution stack
    (768 -> 256 -> 64 -> ``outputSize`` channels), summed over the sequence
    axis and squashed with a sigmoid, giving ``outputSize`` values in (0, 1)
    per input sequence.

    Args:
        UNCASED: path or name handed to ``from_pretrained`` for both the
            config and the weights.
        outputSize: number of output channels of the last convolution.
        droupout: dropout probability applied to the encoder output.
        std: accepted for compatibility with existing calls; not used.
    """

    def __init__(self,UNCASED, outputSize, droupout, std):
        super().__init__()
        self.config = transformers.BertConfig.from_pretrained(UNCASED, output_hidden_states=True)
        self.bert_model = transformers.BertModel.from_pretrained(UNCASED, config=self.config)
        self.drop_out = torch.nn.Dropout(droupout)
        # kernel_size=1: every sequence position is transformed independently.
        self.con_model1 = torch.nn.Conv1d(768, 256, kernel_size=1)
        self.con_model2 = torch.nn.Conv1d(256, 64, kernel_size=1)
        self.con_model3 = torch.nn.Conv1d(64, outputSize, kernel_size=1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return the sigmoid scores for a batch of token ids and attention masks."""
        encoder_outputs = self.bert_model(input_ids, attention_mask=attention_mask.float())
        # Conv1d expects channels on axis 1, so swap the last two axes.
        features = self.drop_out(encoder_outputs[0].permute(0, 2, 1))
        for conv in (self.con_model1, self.con_model2, self.con_model3):
            features = conv(features)
        # Collapse the sequence axis, leaving one value per output channel.
        return self.sigmoid(features.sum(dim=2))

In [None]:
# Train one model per sentiment subset, score it on a held-out tail of the
# training data, and collect predictions for the matching test subset in
# `predicted_text` (textID -> predicted string).

# Release cached GPU memory before building new models.
torch.cuda.empty_cache()

##################################################### Configuration #####################################################
# Per-sentiment settings (`color`, `epochs`) are indexed like `train_all` / `test_all`:
# 0 = positive, 1 = neutral, 2 = negative.
category = ["positive", "neutral", "negative"]
color = ['b', 'r', 'g']
max_sequence_length = 32
val_frac = 0.25
# Smallest validation-set size. The size actually used for a sentiment
# (`num_of_val`) is derived from this constant inside the loop; deriving it
# from the previous iteration's value made it grow from one sentiment to the next.
min_num_of_val = 1000
num_of_val = min_num_of_val
learningRate = 3e-5 # 0.01
max_length = max_sequence_length
# One output per token position.
outputSize = max_sequence_length
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = [10, 1, 12]
predicted_text = {}

VOCAB='/kaggle/input/bertbaseuncased/vocab.txt' # your path for model and vocab 
UNCASED='/kaggle/input/bertbaseuncased/'
std = 0.02
droupout = 0.1
batch_size = 16
# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
tokens_dict = {}

# The tokenizer does not depend on the sentiment, so it is loaded once.
tokenizer = transformers.BertTokenizer.from_pretrained(VOCAB)

##################################################### For Loop #####################################################    

for i in range(len(train_all)):
    # Threshold 0 selects every position for the neutral subset (the model
    # ends in a sigmoid, so no score is below 0); the others use 0.5.
    threshold = 0 if i == 1 else 0.5

    ##################################################### Model Parameters #####################################################
    model = BertModel(UNCASED, outputSize, droupout, std)
    model.to(device)
    criterion = torch.nn.MSELoss()
    param_optimizer = list(model.named_parameters())
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_parameters, lr=learningRate)

    ##################################################### Input Preparation #####################################################
    train_data = train_all[i]
    test_data = test_all[i]

    ##################################################### Get Text, Selected Text #####################################################
    X_train_text_raw = train_data.loc[:, "text_raw"].tolist()
    y_train_text_raw = train_data.loc[:, "selected_text_raw"].tolist()
    X_test_text_raw = test_data.loc[:, "text_raw"].tolist()

    X_train_text = train_data.loc[:, "text"].tolist()
    y_train_text = train_data.loc[:, "selected_text"].tolist()
    X_test = test_data.loc[:, ["textID", "text"]]

    ##################################################### Tokenizing #####################################################
    X_train_tokens = [tokenizer.encode(t, add_special_tokens=True, max_length = max_length) for t in X_train_text]    
    y_train_tokens = [tokenizer.encode(t, add_special_tokens=True, max_length = max_length) for t in y_train_text]
    X_test_tokens = [tokenizer.encode(t, add_special_tokens=True, max_length = max_length) for t in X_test.text]

    ##################################################### Padding #####################################################
    # Right-pad every id list with 0 up to max_length.
    X_train_all_input_ids = np.array([ids + [0]*(max_length - len(ids)) for ids in X_train_tokens])
    y_train_all_input_ids = np.array([ids + [0]*(max_length - len(ids)) for ids in y_train_tokens])
    X_test_input_ids = np.array([ids + [0]*(max_length - len(ids)) for ids in X_test_tokens])

    ##################################################### Attention Masks #####################################################
    # 1 for real tokens, 0 for padding.
    X_train_all_attention_mask = np.where(X_train_all_input_ids != 0, 1, 0)
    y_train_all_attention_mask = np.where(y_train_all_input_ids != 0, 1, 0)
    X_test_attention_mask = np.where(X_test_input_ids != 0, 1, 0)

    ##################################################### Convert Training Label to Position (0/1) #####################################################
    # A text position is labelled 1 when its token id also occurs anywhere in
    # the tokenized selected text. Padding (0) and ids 101 / 102 are always
    # labelled 0 (presumably the [CLS] / [SEP] ids of this vocabulary).
    y_train_bool = []
    for j in range(len(y_train_all_input_ids)):
        a = [1 if x > 0 and x in y_train_all_input_ids[j,:] and (x not in [101, 102]) else 0 for x in X_train_all_input_ids[j,:].tolist()]
        y_train_bool.append(a)
    y_train_bool = np.array(y_train_bool)

    ##################################################### Split into Train and Evaluation Data #####################################################
    # The last `num_of_val` rows are held out; the size is padded so that the
    # remaining training rows form whole batches.
    num_of_val = min_num_of_val + (len(X_train_all_input_ids) - min_num_of_val) % batch_size
    num_of_train = len(X_train_all_input_ids) - num_of_val

    X_val = X_train_all_input_ids[-num_of_val:,:]
    X_val_attention_mask = X_train_all_attention_mask[-num_of_val:,:]
    y_val = y_train_bool[-num_of_val:,:]

    training_loss = []

    ##################################################### Training #####################################################
    model.train() 
    num_of_batches = num_of_train // batch_size
    for epoch in range(epochs[i]):
        for k_fold in range(num_of_batches):
            batch_rows = slice(k_fold*batch_size, (k_fold+1)*batch_size)

            X_train = torch.from_numpy(X_train_all_input_ids[batch_rows,:]).to(device)
            X_train_attention_mask = torch.from_numpy(X_train_all_attention_mask[batch_rows,:]).to(device)
            y_train = torch.from_numpy(y_train_bool[batch_rows,:]).to(device)

            outputs = model(X_train, X_train_attention_mask)
            loss = criterion(outputs.float(), y_train.float())
            training_loss.append(loss.item())
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            if k_fold%100 == 0:
                print('{}-th category, total {} fold, now {}, epoch {}, loss {}'.format(i, num_of_batches, k_fold, epoch, loss.item()))

    ##################################################### Drawing Training Loss #####################################################
    plt.plot(range(len(training_loss)), training_loss, color[i], label='training loss')

    ##################################################### Evaluation:  Generate Selected Positions #####################################################
    model.eval()
    with torch.no_grad(): # we don't need gradients in the testing phase
        X_val = torch.from_numpy(X_val).to(device)
        X_val_attention_mask = torch.from_numpy(X_val_attention_mask).to(device)
        y_val = torch.from_numpy(y_val).to(device)

        eval_predicted = model(X_val, X_val_attention_mask)

    predicted_val_text = from_predicted_positon_to_text(eval_predicted, threshold, X_train_all_input_ids[-num_of_val:,:], X_train_text_raw[-num_of_val:], tokenizer) 

    ##################################################### Jaccard Scores #####################################################
    jaccard_score_list = []
    for str1, str2 in zip(predicted_val_text, y_train_text[-num_of_val:]):
        jaccard_score_list.append(jaccard(str1, str2))
    result = pd.Series(jaccard_score_list)
    print(result.describe())

    ##################################################### Prediction on Test Data #####################################################
    with torch.no_grad(): # we don't need gradients in the testing phase
        X_test_input_ids_ = torch.from_numpy(X_test_input_ids).to(device)
        X_test_attention_mask_ = torch.from_numpy(X_test_attention_mask).to(device)

        test_predicted = model(X_test_input_ids_, X_test_attention_mask_)

    predicted_test_text = from_predicted_positon_to_text(test_predicted, threshold, X_test_input_ids, X_test_text_raw, tokenizer) 
    for p, idx in zip(predicted_test_text, X_test.textID.tolist()):
        predicted_text[idx] = p

In [None]:
# num = -num_of_val
# for i, j,k,l, m, n,o in zip(eval_predicted, predicted_val_text, X_train_text[num:], X_train_all_input_ids[num:],  y_train_text[num:], y_train_all_input_ids[num:], y_train_bool[num:,:]):
#     print("i --- ", i)
#     print("j --- ", j)
#     print("k --- ", k)
#     print("l --- ", l)
#     print("m --- ", m)
#     print("n --- ", n)
#     print("o --- ", o)
#     print()

In [None]:
# tokenizer.decode([2034 , 2028 , 2003 , 3976 , 3238  ,1010 , 2021 , 1996 , 3177 , 2080 , 2003,
#   2307 , 1010 , 2205  ,1012 ,13970 ,12269 , 2000  ,3566   ])

# tokenizer.encode("[PAD]")

In [None]:
# Fill the submission template with the predicted text of every textID.
# The column is built in one step: `DataFrame.append` no longer exists in
# pandas 2.x, and growing a frame row by row is quadratic.
# A textID without a prediction raises KeyError, as the direct dict lookup
# did before. `sample_submission` itself is left unchanged.
sample_submission_df = sample_submission.copy()
sample_submission_df["selected_text"] = [
    predicted_text[text_id] for text_id in sample_submission_df["textID"]
]
display(sample_submission_df)

In [None]:
# Write the filled-in submission to the working directory, without the index column.
sample_submission_df.to_csv("submission.csv", index = False)