In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file that Kaggle mounted under the input directory.
for input_dir, _, input_files in os.walk('/kaggle/input'):
    for input_file in input_files:
        full_path = os.path.join(input_dir, input_file)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Quick look at the training data: read the CSV, discard rows with missing
# values and renumber the remaining rows from zero.
df = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/train.csv')
df = df.dropna()
df = df.reset_index(drop=True)
df.head()

In [None]:
import os
import re
import string
import random
import torch
import torch.nn as nn
import transformers
from sklearn import model_selection
from tqdm import tqdm
from tokenizers.implementations import BertWordPieceTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
# import tokenizers

In [None]:
class Settings:
    """Central configuration shared by the model, dataset, training and inference code.

    Every value is a class attribute, so the class itself is used as a
    namespace (``Settings.MAX_LEN``) and is never instantiated. Note that the
    tokenizer is built when this class body executes, so the vocabulary file
    must exist at ``VOCAB_PATH`` at that point.
    """

    # --- sequence length and batching ---------------------------------------
    MAX_LEN = 128
    TRAIN_BATCH_SIZE = 32
    VALID_BATCH_SIZE = 16
    EPOCHS = 10
    RANDOM_STATE = 42
    TRAIN_NUM_WORKERS = 4
    VAL_NUM_WORKERS = 2

    # --- data and artefact locations -----------------------------------------
    # training data directory
    TRAIN_DATA = '/kaggle/input/tweet-sentiment-extraction/train.csv'

    # test data directory
    TEST_DATA = '/kaggle/input/tweet-sentiment-extraction/test.csv'

    # weights path
    WEIGHTS_PATH = "text_extraction_model.bin"

    # BERT path
    BERT_PATH = "/kaggle/input/bert-base-uncased/"

    # vocab path
    VOCAB_PATH = "/kaggle/input/bert-base-uncased/vocab.txt"

    # --- model ----------------------------------------------------------------
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_dim = 768   # input width of the linear head applied to the encoder output
    output_dim = 2    # one logit for "span start" and one for "span end"
    bert_model_name = 'bert-base-uncased'

    # Use the single VOCAB_PATH constant instead of re-deriving the location
    # from BERT_PATH (which ends in "/" and previously produced ".../" + "/vocab.txt").
    TOKENIZER = BertWordPieceTokenizer(
        VOCAB_PATH,
        lowercase=True
    )

    DROPOUT = 0.3
    # Parameter-name fragments that are excluded from weight decay.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    # --- reproducibility / data split -----------------------------------------
    seed_value = RANDOM_STATE  # kept as a separate name for existing callers
    test_size = 0.1

    # Presumably the bert-base-uncased vocabulary ids of the three sentiment
    # words -- TODO verify against vocab.txt.
    sentiment_id = {
        'positive': 3893,
        'negative': 4997,
        'neutral': 8699
    }

    # Probability cut-off applied to the sigmoid start/end outputs when decoding.
    threshold = 0.3

    SPECIAL_TOKENS = ("[CLS]", "[SEP]")


In [None]:
class BERTBaseUncased(nn.Module):
    """BERT encoder followed by a linear head that scores span start and end.

    The head maps every token representation to ``Settings.output_dim``
    values; ``forward`` splits them into one start logit and one end logit
    per token.
    """

    def __init__(self):
        super().__init__()
        self.settings = Settings
        self.bert = transformers.BertModel.from_pretrained(
            self.settings.bert_model_name,
            return_dict=False,
        )
        self.l0 = nn.Linear(self.settings.input_dim, self.settings.output_dim)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return ``(start_logits, end_logits)``, each with the last logits axis removed.

        No separate sentiment input is taken: the model only sees whatever
        the caller encoded into ``input_ids``.
        """
        # With return_dict=False the encoder returns a tuple; only the
        # per-token output is used, the pooled output is discarded.
        token_states, _pooled = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        span_logits = self.l0(token_states)
        # Cut the last axis into width-1 slices (start, end) and drop that axis.
        start_logits, end_logits = (
            piece.squeeze(-1) for piece in torch.split(span_logits, 1, dim=-1)
        )
        return start_logits, end_logits


In [None]:
class Preprocess:
    """Text cleaning and single-example feature construction helpers."""

    def __init__(self):
        self.settings = Settings

    def clean_text(self, text):
        """Lower-case ``text`` and strip bracketed spans, URLs, tags, punctuation,
        newlines and any word containing a digit.

        Whitespace left behind by the removed pieces is not collapsed.
        """
        # Raw strings: sequences such as "\[" or "\S" are invalid escapes in a
        # normal string literal and trigger warnings on recent Python versions.
        text = text.lower()
        text = re.sub(r'\[.*?\]', '', text)
        text = re.sub(r'https?://\S+|www\.\S+', '', text)
        text = re.sub(r'<.*?>+', '', text)
        text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
        text = re.sub('\n', '', text)
        text = re.sub(r'\w*\d\w*', '', text)
        return text

    def process_data(self, tweet, selected_text, sentiment, tokenizer, max_len):
        """Build model inputs and start/end token targets for one tweet.

        Args:
            tweet: raw tweet text (converted with ``str`` and whitespace-normalised).
            selected_text: the span of ``tweet`` to locate (normalised the same way).
            sentiment: key into ``Settings.sentiment_id``.
            tokenizer: object whose ``encode(text)`` result exposes ``ids`` and
                ``offsets`` with one leading and one trailing special token.
            max_len: length the sequences are padded to; longer inputs are
                NOT truncated here.

        Returns:
            dict with ``ids``, ``mask``, ``token_type_ids``, ``offsets`` (lists),
            ``targets_start`` / ``targets_end`` (token indices into ``ids``),
            and the normalised ``orig_tweet``, ``orig_selected``, ``sentiment``.

        Raises:
            IndexError: if no token overlaps ``selected_text`` (for example
                when it is empty or does not occur in ``tweet``).
            KeyError: if ``sentiment`` is not in ``Settings.sentiment_id``.
        """
        tweet = " ".join(str(tweet).split())
        selected_text = " ".join(str(selected_text).split())

        # Character-level mask of the first occurrence of selected_text.
        char_targets = [0] * len(tweet)
        idx0 = tweet.find(selected_text)
        if idx0 != -1:
            idx1 = idx0 + len(selected_text)
            for ct in range(idx0, idx1):
                char_targets[ct] = 1

        tok_tweet = tokenizer.encode(tweet)
        # Drop the special tokens the tokenizer adds at both ends.
        input_ids_orig = tok_tweet.ids[1:-1]
        tweet_offsets = tok_tweet.offsets[1:-1]

        # A token belongs to the target span when it covers a marked character.
        target_idx = [
            j for j, (offset1, offset2) in enumerate(tweet_offsets)
            if sum(char_targets[offset1: offset2]) > 0
        ]

        targets_start = target_idx[0]
        targets_end = target_idx[-1]

        # Layout: 101, sentiment id, 102, tweet tokens, 102
        # (101 / 102 are presumably the [CLS] / [SEP] ids -- confirm against the vocab).
        input_ids = [101] + [self.settings.sentiment_id[sentiment]] + [102] + input_ids_orig + [102]
        token_type_ids = [0, 0, 0] + [1] * (len(input_ids_orig) + 1)
        mask = [1] * len(token_type_ids)
        tweet_offsets = [(0, 0)] * 3 + tweet_offsets + [(0, 0)]
        # Shift the targets past the three prefix tokens.
        targets_start += 3
        targets_end += 3

        padding_length = max_len - len(input_ids)
        if padding_length > 0:
            input_ids = input_ids + ([0] * padding_length)
            mask = mask + ([0] * padding_length)
            token_type_ids = token_type_ids + ([0] * padding_length)
            tweet_offsets = tweet_offsets + ([(0, 0)] * padding_length)

        return {
            'ids': input_ids,
            'mask': mask,
            'token_type_ids': token_type_ids,
            'targets_start': targets_start,
            'targets_end': targets_end,
            'orig_tweet': tweet,
            'orig_selected': selected_text,
            'sentiment': sentiment,
            'offsets': tweet_offsets
        }


In [None]:
class AverageMeter:
    """Running weighted mean: remembers the latest value, the weighted total,
    the total weight and their ratio."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Return every statistic to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the latest value, counted ``n`` times."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


class EarlyStopping:
    """Stop training after ``patience`` consecutive epochs without sufficient improvement.

    Args:
        patience: number of non-improving calls after which ``early_stop`` is set.
        mode: ``"max"`` if a larger score is better, ``"min"`` if smaller is better.
        delta: an epoch only counts as an improvement when its score exceeds
            the best score so far by at least this amount.
    """

    def __init__(self, patience=7, mode="max", delta=0.001):
        self.patience = patience
        self.counter = 0
        self.mode = mode
        self.best_score = None
        self.early_stop = False
        self.delta = delta
        # np.inf rather than the np.Inf alias, which NumPy 2.0 removed.
        if self.mode == "min":
            self.val_score = np.inf
        else:
            self.val_score = -np.inf

    def __call__(self, epoch_score, model, model_path):
        """Register one epoch's score; checkpoint on improvement, otherwise count towards patience."""
        # Internally "higher is better": negate the score in "min" mode.
        if self.mode == "min":
            score = -1.0 * epoch_score
        else:
            score = np.copy(epoch_score)

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print('EarlyStopping counter: {} out of {}'.format(self.counter, self.patience))
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
            self.counter = 0

    def save_checkpoint(self, epoch_score, model, model_path):
        """Write ``model.state_dict()`` to ``model_path`` unless the score is NaN or infinite."""
        # np.isfinite also rejects NaN values produced by a computation; a list
        # membership test only catches the np.nan object itself, because NaN
        # never compares equal to anything.
        if np.isfinite(epoch_score):
            print('Validation score improved ({} --> {}). Saving model!'.format(self.val_score, epoch_score))
            torch.save(model.state_dict(), model_path)
        self.val_score = epoch_score


def jaccard(str1, str2):
    """Word-level Jaccard similarity of two strings (case-insensitive).

    Both strings are lower-cased and split on whitespace; the result is
    ``|A & B| / |A | B|`` over the two word sets.

    Returns:
        A float in ``[0, 1]``. Two strings without any words are treated as
        identical and score ``1.0`` (this previously raised ZeroDivisionError).
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        # Both word sets are empty: avoid 0 / 0.
        return 1.0
    return float(len(c)) / union_size

In [None]:
class TextExtractionDataset:
    """Map-style dataset that turns (tweet, sentiment, selected_text) rows into
    padded model inputs and start/end token targets.

    Each encoded example is laid out as
    ``<sentiment encoding> + <tweet tokens> + <closing special token>``,
    i.e. ``[CLS] sentiment [SEP] tweet ... [SEP]`` for a BERT tokenizer.
    """

    def __init__(self, tweet, sentiment, selected_text):
        self.settings = Settings
        self.tweet = tweet
        self.sentiment = sentiment
        self.selected_text = selected_text
        self.tokenizer = self.settings.TOKENIZER
        self.max_len = self.settings.MAX_LEN

    def __len__(self):
        return len(self.tweet)

    def __getitem__(self, item):
        """Encode row ``item``.

        Returns a dict with ``ids``, ``mask``, ``token_type_ids``, ``targets``,
        ``targets_start``, ``targets_end`` (long tensors of length ``max_len``),
        ``padding_len``, a one-hot ``sentiment`` tensor, the space-joined
        ``tweet_tokens`` and the original strings.
        """
        sentiment_label = self.sentiment[item]
        tweet = " ".join(str(self.tweet[item]).split())
        selected_text = " ".join(str(self.selected_text[item]).split())

        # Character mask of the first occurrence of selected_text in the tweet
        # (spaces are never marked). An empty or missing span marks nothing.
        char_targets = [0] * len(tweet)
        span_start = tweet.find(selected_text) if selected_text else -1
        if span_start != -1:
            for j in range(span_start, span_start + len(selected_text)):
                if tweet[j] != " ":
                    char_targets[j] = 1

        # Encode sentiment and tweet separately so that the tweet offsets are
        # always relative to the tweet itself. Pair encoding needed a
        # hard-coded correction (len("positive") == 8, len("neutral") == 7)
        # that is only right when the tokenizer reports cumulative offsets.
        sentiment_enc = self.tokenizer.encode(str(sentiment_label))
        tweet_enc = self.tokenizer.encode(tweet)

        prefix_ids = list(sentiment_enc.ids)        # e.g. [CLS] sentiment [SEP]
        prefix_tokens = list(sentiment_enc.tokens)
        prefix_len = len(prefix_ids)

        # Keep room for the prefix and the closing token so that every example
        # has exactly max_len entries and can be stacked into a batch.
        room = max(self.max_len - prefix_len - 1, 0)
        body_ids = list(tweet_enc.ids[1:-1])[:room]
        body_tokens = list(tweet_enc.tokens[1:-1])[:room]
        body_offsets = list(tweet_enc.offsets[1:-1])[:room]

        tok_tweet_ids = prefix_ids + body_ids + [tweet_enc.ids[-1]]
        tok_tweet_tokens = prefix_tokens + body_tokens + [tweet_enc.tokens[-1]]

        # A tweet token is a target when it covers at least one marked character.
        body_targets = [
            1 if sum(char_targets[offset1:offset2]) > 0 else 0
            for offset1, offset2 in body_offsets
        ]
        targets = [0] * prefix_len + body_targets + [0]

        # One-hot markers for the first and last target token.
        targets_start = [0] * len(targets)
        targets_end = [0] * len(targets)
        non_zero = np.nonzero(targets)[0]
        if len(non_zero) > 0:
            targets_start[non_zero[0]] = 1
            targets_end[non_zero[-1]] = 1

        mask = [1] * len(tok_tweet_ids)
        token_type_ids = [0] * prefix_len + [1] * (len(tok_tweet_ids) - prefix_len)

        padding_length = max(self.max_len - len(tok_tweet_ids), 0)
        ids = tok_tweet_ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        targets = targets + ([0] * padding_length)
        targets_start = targets_start + ([0] * padding_length)
        targets_end = targets_end + ([0] * padding_length)

        # One-hot sentiment: anything that is not positive/negative is
        # encoded like "neutral".
        sentiment = [1, 0, 0]
        if sentiment_label == "positive":
            sentiment = [0, 0, 1]
        if sentiment_label == "negative":
            sentiment = [0, 1, 0]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'tweet_tokens': " ".join(tok_tweet_tokens),
            'targets': torch.tensor(targets, dtype=torch.long),
            'targets_start': torch.tensor(targets_start, dtype=torch.long),
            'targets_end': torch.tensor(targets_end, dtype=torch.long),
            'padding_len': torch.tensor(padding_length, dtype=torch.long),
            'orig_tweet': self.tweet[item],
            'orig_selected': self.selected_text[item],
            'sentiment': torch.tensor(sentiment, dtype=torch.float),
            'orig_sentiment': self.sentiment[item]
        }

In [None]:
class Engine:
    """Loss, seeding, and the per-epoch training / evaluation loops."""

    # Tokens placed before the tweet by the dataset: [CLS], sentiment word, [SEP].
    PREFIX_TOKENS = 3

    def __init__(self):
        self.settings = Settings

    def loss_fn(self, start_logits, end_logits, start_positions, end_positions):
        """Sum of the binary cross-entropy (with logits) of the start and end predictions."""
        criterion = nn.BCEWithLogitsLoss()
        l1 = criterion(start_logits, start_positions)
        l2 = criterion(end_logits, end_positions)
        return l1 + l2

    def set_seed(self, seed_value=42):
        """Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)."""
        random.seed(seed_value)
        np.random.seed(seed_value)
        torch.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)

    @staticmethod
    def _join_wordpieces(tokens):
        """Glue word-piece tokens back into text.

        ``##`` continuations and single punctuation characters are attached to
        the preceding text; every other token is preceded by a space.
        """
        text = ""
        for token in tokens:
            if token.startswith("##"):
                text += token[2:]
            elif len(token) == 1 and token in string.punctuation:
                text += token
            else:
                text += " " + token
        return text.strip()

    def calculate_jaccard_score(self,
                                original_tweet,
                                target_string,
                                sentiment_val,
                                idx_start,
                                idx_end,
                                offsets_start,
                                offsets_end,
                                verbose=False):
        """Rebuild the predicted span from character offsets and score it against the target.

        Args:
            original_tweet: tweet text; whitespace is normalised before slicing.
            target_string: the reference selected text.
            sentiment_val: for ``"neutral"`` the whole tweet is used as the prediction.
            idx_start, idx_end: inclusive token indices of the predicted span;
                an end before the start is clamped to the start.
            offsets_start, offsets_end: per-token character start / end offsets.
            verbose: print mismatching non-neutral predictions.

        Returns:
            Word-level Jaccard score between target and prediction.
        """
        offsets = list(zip(offsets_start, offsets_end))

        if idx_end < idx_start:
            idx_end = idx_start

        filtered_output = ""
        original_tweet_sp = " ".join(original_tweet.split())
        for ix in range(idx_start, idx_end + 1):
            # (0, 0) offsets mark special / padding positions: nothing to copy.
            if offsets[ix][0] == 0 and offsets[ix][1] == 0:
                continue
            filtered_output += original_tweet_sp[offsets[ix][0]: offsets[ix][1]]
            # Re-insert a space when the next token does not start where this one ends.
            if (ix + 1) < len(offsets) and offsets[ix][1] < offsets[ix + 1][0]:
                filtered_output += " "

        filtered_output = filtered_output.replace(" .", ".")
        filtered_output = filtered_output.replace(" ?", "?")
        filtered_output = filtered_output.replace(" !", "!")
        filtered_output = filtered_output.replace(" ,", ",")
        filtered_output = filtered_output.replace(" ' ", "'")
        filtered_output = filtered_output.replace(" n't", "n't")
        filtered_output = filtered_output.replace(" 'm", "'m")
        filtered_output = filtered_output.replace(" do not", " don't")
        filtered_output = filtered_output.replace(" 's", "'s")
        filtered_output = filtered_output.replace(" 've", "'ve")
        filtered_output = filtered_output.replace(" 're", "'re")

        if sentiment_val == "neutral":
            filtered_output = original_tweet

        if sentiment_val != "neutral" and verbose:
            if filtered_output.strip().lower() != target_string.strip().lower():
                print("********************************")
                print(f"Output= {filtered_output.strip()}")
                print(f"Target= {target_string.strip()}")
                print(f"Tweet= {original_tweet.strip()}")
                print("********************************")

        # `jaccard` is the module-level helper; there is no `utils` module in this notebook.
        jac = jaccard(target_string.strip(), filtered_output.strip())
        return jac

    def train_fn(self, data_loader, model, optimizer, device, schedular):
        """Run one training epoch and return the average loss over all samples.

        Each batch must provide ``ids``, ``token_type_ids``, ``mask``,
        ``targets_start`` and ``targets_end``. The scheduler is stepped once
        per batch.
        """
        print("Starting training...\n")
        model.train()
        losses = AverageMeter()
        tk0 = tqdm(data_loader, total=len(data_loader))
        for d in tk0:
            # moving tensors to device
            ids = d["ids"].to(device, dtype=torch.long)
            token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)
            targets_start = d["targets_start"].to(device, dtype=torch.float)
            targets_end = d["targets_end"].to(device, dtype=torch.float)

            # Gradients accumulate across backward passes, so clear them first.
            model.zero_grad()

            outputs_start, outputs_end = model(
                input_ids=ids,
                attention_mask=mask,
                token_type_ids=token_type_ids,
            )

            loss = self.loss_fn(outputs_start, outputs_end, targets_start, targets_end)
            loss.backward()

            # Clip the gradient norm to 1.0 to guard against exploding gradients.
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            # Update the learning rate
            schedular.step()

            losses.update(loss.item(), ids.size(0))
            # Only the loss is shown: no Jaccard score is computed during training.
            tk0.set_postfix(loss=losses.avg)

        return losses.avg

    def eval_fn(self, data_loader, model, device):
        """Evaluate ``model`` and return the mean word-level Jaccard score.

        For every example the first start position and the first end position
        whose sigmoid probability reaches ``Settings.threshold`` delimit the
        predicted span; only tweet tokens (not the sentiment prefix or the
        closing token) are candidates. Neutral tweets and tweets with fewer
        than four words are predicted as the whole tweet.
        """
        model.eval()
        losses = AverageMeter()
        start_prob_batches = []
        end_prob_batches = []
        all_tweet_tokens = []
        all_selected = []
        all_sentiments = []
        all_tweets = []

        with torch.no_grad():
            tk0 = tqdm(data_loader, total=len(data_loader))
            for d in tk0:
                ids = d["ids"].to(device, dtype=torch.long)
                token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
                mask = d["mask"].to(device, dtype=torch.long)
                targets_start = d["targets_start"].to(device, dtype=torch.float)
                targets_end = d["targets_end"].to(device, dtype=torch.float)

                outputs_start, outputs_end = model(
                    input_ids=ids,
                    attention_mask=mask,
                    token_type_ids=token_type_ids
                )
                loss = self.loss_fn(outputs_start, outputs_end, targets_start, targets_end)

                start_prob_batches.append(torch.sigmoid(outputs_start).cpu().detach().numpy())
                end_prob_batches.append(torch.sigmoid(outputs_end).cpu().detach().numpy())

                all_tweet_tokens.extend(d["tweet_tokens"])
                all_sentiments.extend(d["orig_sentiment"])
                all_selected.extend(d["orig_selected"])
                all_tweets.extend(d["orig_tweet"])

                losses.update(loss.item(), ids.size(0))
                tk0.set_postfix(loss=losses.avg)

        start_probs = np.vstack(start_prob_batches)
        end_probs = np.vstack(end_prob_batches)
        threshold = self.settings.threshold
        prefix = self.PREFIX_TOKENS

        jaccard_scores = []
        for j, tweet_tokens in enumerate(all_tweet_tokens):
            target_string = all_selected[j]
            original_tweet = all_tweets[j]
            sentiment_val = all_sentiments[j]

            # The token string holds exactly the unpadded tokens; the last one
            # is the closing special token. Slice probabilities and tokens with
            # the same bounds so that index k refers to the same tweet token.
            tokens = tweet_tokens.split()
            closing = len(tokens) - 1
            candidates = tokens[prefix:closing]
            start_hits = np.nonzero(start_probs[j, prefix:closing] >= threshold)[0]
            end_hits = np.nonzero(end_probs[j, prefix:closing] >= threshold)[0]

            if len(start_hits) > 0:
                idx_start = int(start_hits[0])
                idx_end = int(end_hits[0]) if len(end_hits) > 0 else idx_start
                # An end before the start would select nothing: fall back to one token.
                if idx_end < idx_start:
                    idx_end = idx_start
            else:
                idx_start = 0
                idx_end = 0

            output_tokens = [
                x for x in candidates[idx_start: idx_end + 1]
                if x not in self.settings.SPECIAL_TOKENS
            ]
            final_output = self._join_wordpieces(output_tokens)

            # Compare the per-example sentiment string (not the batch tensor).
            if sentiment_val == "neutral" or len(original_tweet.split()) < 4:
                final_output = original_tweet

            jaccard_scores.append(jaccard(target_string.strip(), final_output.strip()))

        mean_jac = np.mean(jaccard_scores)
        return mean_jac


In [None]:
class Train:
    """Orchestrates data loading, model / optimiser set-up and the training loop."""

    def __init__(self):
        # initialize required class
        self.settings = Settings
        self.engine = Engine()
        self.preprocess = Preprocess()

        # initialize required variables
        self.bert_text_model = None
        self.optimizer = None
        self.scheduler = None
        self.train_data_loader = None
        self.validation_data_loader = None
        self.val_data_loader = None  # alias of validation_data_loader, kept for existing callers
        self.total_steps = None
        self.best_jaccard = 0
        self.param_optimizer = None
        self.optimizer_parameters = None
        self.model_config = None

    def optimizer_params(self):
        """Split the model parameters into a weight-decay group and a no-decay group.

        Parameters whose name contains any entry of ``Settings.no_decay`` get
        ``weight_decay=0.0``; all others get ``0.001``.
        """
        self.param_optimizer = list(self.bert_text_model.named_parameters())
        self.optimizer_parameters = [
            {
                "params": [
                    p for n, p in self.param_optimizer if not any(nd in n for nd in self.settings.no_decay)
                ],
                "weight_decay": 0.001,
            },
            {
                "params": [
                    p for n, p in self.param_optimizer if any(nd in n for nd in self.settings.no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]

    def __initialize(self):
        """Create the model, optimiser and linear learning-rate schedule.

        Must run after ``load_data`` because the schedule needs ``total_steps``.
        """
        self.bert_text_model = BERTBaseUncased()
        self.bert_text_model.to(self.settings.DEVICE)
        self.optimizer_params()

        # Create the optimizer
        self.optimizer = AdamW(self.optimizer_parameters,
                               lr=5e-5,
                               eps=1e-8
                               )

        # Set up the learning rate scheduler
        self.scheduler = get_linear_schedule_with_warmup(self.optimizer,
                                                         num_warmup_steps=0,
                                                         num_training_steps=self.total_steps)

    def create_data_loaders(self, tweet, sentiment, selected_text, batch_size, num_workers, shuffle=False):
        """Wrap the given columns in a ``TextExtractionDataset`` and a ``DataLoader``.

        ``shuffle`` defaults to ``False``, which matches the previous behaviour.
        """
        dataset = TextExtractionDataset(tweet=tweet, sentiment=sentiment, selected_text=selected_text)
        data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=shuffle)

        return data_loader

    def load_data(self, csv_data_path):
        """Read the CSV, make a sentiment-stratified train/validation split and build both loaders."""
        df = pd.read_csv(csv_data_path).dropna().reset_index(drop=True)

        df_train, df_valid = model_selection.train_test_split(
            df,
            random_state=self.settings.seed_value,
            test_size=self.settings.test_size,
            stratify=df.sentiment.values
        )

        df_train = df_train.reset_index(drop=True)
        df_valid = df_valid.reset_index(drop=True)

        # Training batches are reshuffled every epoch.
        self.train_data_loader = self.create_data_loaders(tweet=df_train.text.values,
                                                          sentiment=df_train.sentiment.values,
                                                          selected_text=df_train.selected_text.values,
                                                          batch_size=self.settings.TRAIN_BATCH_SIZE,
                                                          num_workers=self.settings.TRAIN_NUM_WORKERS,
                                                          shuffle=True)
        # The validation loader uses its own batch size / worker settings.
        self.validation_data_loader = self.create_data_loaders(tweet=df_valid.text.values,
                                                               sentiment=df_valid.sentiment.values,
                                                               selected_text=df_valid.selected_text.values,
                                                               batch_size=self.settings.VALID_BATCH_SIZE,
                                                               num_workers=self.settings.VAL_NUM_WORKERS)
        self.val_data_loader = self.validation_data_loader

        # One scheduler step is taken per batch, including the final partial
        # batch of each epoch, so count batches rather than dividing row counts.
        self.total_steps = len(self.train_data_loader) * self.settings.EPOCHS

    def train(self):
        """Train for up to ``Settings.EPOCHS`` epochs with early stopping on the validation Jaccard score."""
        early_stopping = EarlyStopping(patience=5, mode="max")
        for _ in range(self.settings.EPOCHS):
            self.engine.train_fn(data_loader=self.train_data_loader,
                                 model=self.bert_text_model,
                                 optimizer=self.optimizer,
                                 device=self.settings.DEVICE,
                                 schedular=self.scheduler)

            epoch_jaccard = self.engine.eval_fn(data_loader=self.validation_data_loader,
                                                model=self.bert_text_model,
                                                device=self.settings.DEVICE)
            # Keep the best score seen so far instead of overwriting it every epoch.
            self.best_jaccard = max(self.best_jaccard, epoch_jaccard)

            print(f"Jaccard Score = {epoch_jaccard}")
            early_stopping(epoch_score=epoch_jaccard,
                           model=self.bert_text_model,
                           model_path=self.settings.WEIGHTS_PATH)
            if early_stopping.early_stop:
                print("Early stopping")
                break

    def run(self):
        """Full pipeline: seed, load data, initialise the model, train.

        Raises:
            Exception: any failure is reported and then re-raised, so that
                later notebook cells do not continue with missing or stale weights.
        """
        try:
            # Seed before the split and the model construction so both are reproducible.
            self.engine.set_seed(self.settings.seed_value)
            print("Loading and Preparing the Dataset-----!! ")
            self.load_data(csv_data_path=self.settings.TRAIN_DATA)
            print("Dataset Successfully Loaded and Prepared-----!! ")
            print()
            print("-" * 70)
            print("Loading and Initializing the Bert Model -----!! ")
            self.__initialize()
            print("Model Successfully Loaded and Initialized-----!! ")
            print()
            print("-" * 70)
            print("------------------Starting Training-----------!!")
            self.train()
            print("Training complete-----!!!")

        except Exception as ex:
            # `Exception` (not BaseException) so Ctrl-C / kernel interrupts are not swallowed.
            print("Following Exception Occurred---!! ", str(ex))
            raise


In [None]:
# Entry point: build the training pipeline and run it end to end
# (data loading, model initialisation, training).
if __name__ == "__main__":
    t1=Train()
    t1.run()

## Inference

In [None]:
# Device selected in Settings; reused by the inference cells below.
device = Settings.DEVICE

In [None]:
# Rebuild the network and restore the weights written during training.
model = BERTBaseUncased()
model.to(device)
# map_location lets weights saved from a GPU session load on a CPU-only one.
model.load_state_dict(torch.load(Settings.WEIGHTS_PATH, map_location=device))
# Switch to inference mode.
model.eval()

In [None]:
# Keep every test row: dropping rows here would leave fewer predictions than
# rows in sample_submission.csv. A missing tweet becomes an empty string.
df_test = pd.read_csv(Settings.TEST_DATA).reset_index(drop=True)
df_test["text"] = df_test["text"].fillna("")
# The dataset class requires a selected_text column; at test time the full
# tweet is used as a placeholder.
df_test["selected_text"] = df_test["text"].values

In [None]:
# Preview the first rows of the test frame.
df_test.head()

In [None]:
# Wrap the test frame in the same dataset class that was used for training.
test_dataset = TextExtractionDataset(
    df_test.text.values,
    df_test.sentiment.values,
    df_test.selected_text.values,
)

# Order is preserved (no shuffling) so predictions line up with the test rows.
data_loader = DataLoader(
    test_dataset,
    batch_size=Settings.VALID_BATCH_SIZE,
    num_workers=Settings.VAL_NUM_WORKERS,
    shuffle=False,
)

In [None]:
# --- 1. Run the model over the test set and collect per-token probabilities ---
all_outputs = []
fin_outputs_start = []
fin_outputs_end = []
fin_tweet_tokens = []
fin_padding_lens = []
fin_orig_selected = []
fin_orig_sentiment = []
fin_orig_tweet = []
fin_tweet_token_ids = []

with torch.no_grad():
    for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
        ids = d["ids"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)

        outputs_start, outputs_end = model(
            input_ids=ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )

        fin_outputs_start.append(torch.sigmoid(outputs_start).cpu().detach().numpy())
        fin_outputs_end.append(torch.sigmoid(outputs_end).cpu().detach().numpy())

        fin_padding_lens.extend(d["padding_len"].cpu().detach().numpy().tolist())
        fin_tweet_token_ids.append(ids.cpu().detach().numpy().tolist())

        fin_tweet_tokens.extend(d["tweet_tokens"])
        fin_orig_sentiment.extend(d["orig_sentiment"])
        fin_orig_selected.extend(d["orig_selected"])
        fin_orig_tweet.extend(d["orig_tweet"])

fin_outputs_start = np.vstack(fin_outputs_start)
fin_outputs_end = np.vstack(fin_outputs_end)
fin_tweet_token_ids = np.vstack(fin_tweet_token_ids)

# --- 2. Decode one predicted span per tweet -----------------------------------
# Same cut-off as the validation loop.
threshold = Settings.threshold
for j in range(len(fin_tweet_tokens)):
    target_string = fin_orig_selected[j]
    tweet_tokens = fin_tweet_tokens[j]
    padding_len = fin_padding_lens[j]
    original_tweet = fin_orig_tweet[j]
    sentiment_val = fin_orig_sentiment[j]

    # Tweet tokens live between the 3-token prefix ([CLS], sentiment, [SEP])
    # and the closing [SEP], which sits just before the padding.
    closing = fin_outputs_start.shape[1] - padding_len - 1
    mask_start = fin_outputs_start[j, 3:closing] >= threshold
    mask_end = fin_outputs_end[j, 3:closing] >= threshold
    tweet_token_ids = fin_tweet_token_ids[j, 3:closing]

    idx_start = np.nonzero(mask_start)[0]
    idx_end = np.nonzero(mask_end)[0]
    if len(idx_start) > 0:
        idx_start = int(idx_start[0])
        idx_end = int(idx_end[0]) if len(idx_end) > 0 else idx_start
        # An end before the start would select nothing: fall back to one token.
        if idx_end < idx_start:
            idx_end = idx_start
    else:
        idx_start = 0
        idx_end = 0

    # Slicing (instead of indexing a mask) also copes with a tweet that has no tokens.
    output_tokens = [int(x) for x in tweet_token_ids[idx_start: idx_end + 1]]

    filtered_output = Settings.TOKENIZER.decode(output_tokens)
    filtered_output = filtered_output.strip().lower()

    # Neutral tweets are predicted as the whole tweet.
    if sentiment_val == "neutral":
        filtered_output = original_tweet

    all_outputs.append(filtered_output.strip())

In [None]:
# Write the predictions in the competition's submission format.
sample = pd.read_csv("../input/tweet-sentiment-extraction/sample_submission.csv")
# Fail with a clear message if predictions and submission rows are out of step.
if len(all_outputs) != len(sample):
    raise ValueError(
        "Expected {} predictions but got {}".format(len(sample), len(all_outputs))
    )
# Replace the column outright: the placeholder column may have been read with
# a non-string dtype (e.g. all-NaN floats -- TODO confirm), and writing strings
# into it through .loc relies on deprecated dtype upcasting.
sample["selected_text"] = all_outputs
sample.to_csv("submission.csv", index=False)

In [None]:
# Preview the first rows of the submission frame.
sample.head()