In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# !pip install --upgrade transformers 
# !pip install tokenizers==0.9.4

In [None]:
# the OG 
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import style


# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.utils.class_weight import compute_class_weight

#NLP Specifics
from nltk import word_tokenize
import nltk
from tokenizers import BertWordPieceTokenizer
from transformers import BertModel, BertConfig,TFBertForTokenClassification
from transformers import (AdamW,get_linear_schedule_with_warmup,get_cosine_schedule_with_warmup,RobertaTokenizerFast,DistilBertTokenizerFast,PreTrainedTokenizerFast,AutoModel,
                          DistilBertTokenizerFast,GPT2TokenizerFast,AutoTokenizer,BertTokenizer,TFBertModel,TFOpenAIGPTModel,OpenAIGPTTokenizer,DistilBertTokenizer, TFDistilBertModel,XLMTokenizer, TFXLMModel,TFBertForSequenceClassification,TFGPT2Model,TFXLMRobertaModel)

#tensorflow
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.initializers import Constant

#other imports
import urllib
import os
import gc
from tqdm import tqdm
import re
import random
from typing import Callable, List, Optional, Union

In [None]:
# Read the train and test CSV files into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tweet-sentiment-extraction/train.csv',
        '/kaggle/input/tweet-sentiment-extraction/test.csv',
    )
)

In [None]:
# Preview the first rows of the training frame (shown as the cell's output).
df_train.head()

In [None]:
# Preview the first rows of the test frame (shown as the cell's output).
df_test.head()

In [None]:
# Drop, in place, every training row that contains at least one missing value.
df_train.dropna(axis=0, how='any', inplace=True)

In [None]:
## Known glitch in the data: patch one cell by hand (row position 18, column position 2).
## NOTE(review): column position 2 is presumably `selected_text` and position 1 `text` -- confirm against train.csv.
df_train.iloc[18,2]='gonna'
# Spot-check another row (position 32) by printing its column-2 and column-1 values.
print(df_train.iloc[32,2])
print(df_train.iloc[32,1])

In [None]:
def max_text_len():
    """Return the length of the longest entry in ``df_train['text']``."""
    text_lengths = df_train['text'].map(len)
    return max(text_lengths)

print('The max len in train-set is : {}'.format(max_text_len()))

In [None]:
## Configuration
BATCH_SIZE = 64
EPOCHS=100
# NOTE(review): 1e-7 is far below the rates usually used to fine-tune BERT
# (roughly 2e-5 to 5e-5) -- confirm this value is intentional.
LEARNING_RATE=1e-7
BETA_1=0.9
BETA_2=0.999
AUTO = tf.data.experimental.AUTOTUNE
# NOTE(review): this is derived from BATCH_SIZE and the full frame, while the
# training dataset is built with batch_size=32 and a 5% validation hold-out,
# so one "epoch" does not correspond to one pass over the training data.
steps_per_epoch=df_train.shape[0]//BATCH_SIZE
# Stops training after 10 epochs without improvement of the monitored quantity.
EARLY_STOP=tf.keras.callbacks.EarlyStopping(patience=10)
MAX_LEN=150
# The model outputs one raw logit per token and the targets are per-token 0/1
# masks, i.e. an independent binary decision per position. Categorical
# cross-entropy (which treats the positions as one probability distribution
# and expects probabilities, not logits) does not fit that setup.
LOSS=tf.keras.losses.BinaryCrossentropy(from_logits=True)
OPTIMIZER=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE,beta_1=BETA_1,beta_2=BETA_2)


In [None]:
class LossHistory(tf.keras.callbacks.Callback):
    """Keras callback that records the ``'loss'`` entry reported after each batch.

    Attributes:
        losses: list of the collected values; recreated whenever training begins.
    """
    def on_train_begin(self, logs=None):
        # Start every training run with an empty record.
        self.losses = []
    def on_batch_end(self, batch, logs=None):
        # `logs` defaults to None instead of a shared mutable dict; fall back
        # to an empty mapping so a missing `logs` records None rather than failing.
        self.losses.append((logs or {}).get('loss'))

In [None]:
class preprocessing_data(object):
    """Wraps a 'bert-base-uncased' tokenizer to encode (text, sentiment) pairs and label text.

    Args:
        tokenizer: a tokenizer *class* (not an instance) exposing ``from_pretrained``.
        MAXLEN: target sequence length; when truthy, sequences are padded and
            truncated to exactly this many tokens.
        ATTENTION_MASK: forwarded as ``return_attention_mask``.
        PADDING: padding strategy used only when ``MAXLEN`` is falsy.
    """
    def __init__(self,tokenizer,MAXLEN=MAX_LEN,ATTENTION_MASK=True,PADDING=True):
        self.tokenizer=tokenizer.from_pretrained('bert-base-uncased')
        # Use the public setter rather than writing the private `_pad_token`
        # attribute, whose storage is an implementation detail of the library.
        self.tokenizer.pad_token='[PAD]'
        self.maxlen=MAXLEN
        self.padding=PADDING
        self.attention_mask=ATTENTION_MASK

    def encode_text(self,text,sentiment):
        """Encode ``text`` paired with ``sentiment``.

        Returns:
            tuple ``(input_ids, attention_mask, token_type_ids)`` as produced
            by the tokenizer's ``encode_plus``.
        """
        # With a fixed length every row must come out with the same size, so
        # pad to `max_length` and also truncate anything longer; without
        # truncation an overlong input would yield a ragged row.
        fixed_length=bool(self.maxlen)
        padding='max_length' if fixed_length else self.padding
        tokenize=self.tokenizer.encode_plus(text=text,text_pair=sentiment,max_length=self.maxlen,
                                            padding=padding,truncation=fixed_length,
                                            return_attention_mask=self.attention_mask,
                                            return_token_type_ids=True)
        input_ids=tokenize['input_ids']
        attention_mask=tokenize['attention_mask']
        token_type_ids=tokenize['token_type_ids']

        return (input_ids,attention_mask,token_type_ids)

    def encode_labels(self,selected_text):
        """Return the token ids of ``selected_text`` as produced by ``encode_plus``."""
        input_ids=self.tokenizer.encode_plus(text=selected_text)
        return input_ids['input_ids']
    

In [None]:
def _label_mask(input_row,selected_ids,cls_id=101,sep_id=102,pad_id=0):
    """Build a 0/1 mask over ``input_row`` marking the tokens of ``selected_ids``.

    Only the part of ``input_row`` before its first ``sep_id``/``pad_id`` is
    searched. ``cls_id`` entries of ``selected_ids`` are ignored and everything
    from its first ``sep_id`` on is dropped. The defaults are presumably the
    [CLS]/[SEP]/[PAD] ids of the BERT vocabulary in use -- confirm if the
    tokenizer changes.

    The selected ids are first looked up as one contiguous run, so a token that
    occurs several times is marked at the position where the whole phrase
    occurs. If no contiguous run exists, each selected id marks its first
    still-unmarked occurrence.
    """
    row=np.asarray(input_row).tolist()
    mask=np.zeros((len(row),))

    # Selected ids without the special tokens.
    target=[]
    for token_id in np.asarray(selected_ids).tolist():
        if token_id==cls_id:
            continue
        if token_id==sep_id:
            break
        target.append(token_id)
    if not target:
        return mask

    # End (exclusive) of the searchable segment.
    text_end=len(row)
    for position,token_id in enumerate(row):
        if token_id==pad_id or token_id==sep_id:
            text_end=position
            break

    # Preferred: mark the first contiguous occurrence of the whole phrase.
    span=len(target)
    for start in range(text_end-span+1):
        if row[start:start+span]==target:
            mask[start:start+span]=1
            return mask

    # Fallback: greedy per-token matching on first unmarked occurrence.
    for token_id in target:
        for position in range(text_end):
            if row[position]==token_id and mask[position]!=1:
                mask[position]=1
                break
    return mask


def preprocess_dataset(data,tokenizer,train=True):
    """Tokenize ``data`` and, for training data, build per-token label masks.

    Args:
        data: mapping/frame with 'text' and 'sentiment' columns, plus
            'selected_text' when ``train`` is true.
        tokenizer: tokenizer class handed to ``preprocessing_data``.
        train: when true, also compute 'y_labels'.

    Returns:
        dict with arrays 'input_ids', 'attention_mask', 'token_type_ids' and,
        when ``train`` is true, 'y_labels' (one 0/1 mask per row, same width
        as 'input_ids').
    """
    input_ids=[]
    attention_mask=[]
    token_type_ids=[]
    final_dict={}

    print('Creating X- Data')

    preprocessor=preprocessing_data(tokenizer)
    # `total` lets tqdm show real progress; zip() alone has no length.
    for text,sentiment in tqdm(zip(data['text'],data['sentiment']),total=len(data['text'])):
        instance=preprocessor.encode_text(text=text,sentiment=sentiment)
        input_ids.append(instance[0])
        attention_mask.append(instance[1])
        token_type_ids.append(instance[2])
    X_input_ids=np.array(input_ids)
    X_attention_mask=np.array(attention_mask)
    X_token_type_ids=np.array(token_type_ids)

    if train:
        print('Creating Y-Labels')
        y_labels=[
            _label_mask(X_input_id,preprocessor.encode_labels(selected_text))
            for X_input_id,selected_text in tqdm(zip(X_input_ids,data['selected_text']),total=len(X_input_ids))
        ]
        final_dict['y_labels']=np.array(y_labels)

    final_dict['input_ids']=X_input_ids
    final_dict['attention_mask']=X_attention_mask
    final_dict['token_type_ids']=X_token_type_ids

    return final_dict
    
        
    
    
def tensorflow_dataset(dict_data,batch_size,train=True):
    """Wrap the preprocessed arrays in ``tf.data`` pipelines.

    Args:
        dict_data: dict with 'input_ids', 'attention_mask', 'token_type_ids'
            and, when ``train`` is true, 'y_labels'.
        batch_size: batch size used for every dataset built here.
        train: when true build a train/validation pair, otherwise a single
            feature-only dataset.

    Returns:
        ``train=True``: tuple ``(train_dataset, valid_dataset, y_labels)``
        where ``y_labels`` is the full, unsplit label array.
        ``train=False``: one batched dataset of the three feature arrays.
    """
    feature_keys=('input_ids','attention_mask','token_type_ids')

    if not train:
        # tf.data.Dataset is abstract and cannot be instantiated directly;
        # build the dataset from the feature arrays instead.
        features={key:dict_data[key] for key in feature_keys}
        return tf.data.Dataset.from_tensor_slices(features).batch(batch_size).prefetch(AUTO)

    y_labels=dict_data['y_labels']

    # One call splits all arrays with the same shuffle; the result is laid out
    # as [a_train, a_valid, b_train, b_valid, ...] in argument order.
    split=train_test_split(*(dict_data[key] for key in feature_keys),y_labels,
                           test_size=0.05,random_state=42)
    train_dict={key:split[2*i] for i,key in enumerate(feature_keys)}
    valid_dict={key:split[2*i+1] for i,key in enumerate(feature_keys)}
    y_train,y_valid=split[-2],split[-1]

    # Training data repeats indefinitely, so the caller must pass steps_per_epoch to fit().
    train_dataset=tf.data.Dataset.from_tensor_slices((train_dict,y_train)).repeat().shuffle(1024).batch(batch_size).prefetch(AUTO)
    # Validation uses the `batch_size` argument as well (not the global BATCH_SIZE).
    valid_dataset=tf.data.Dataset.from_tensor_slices((valid_dict,y_valid)).batch(batch_size).cache().prefetch(AUTO)

    return train_dataset,valid_dataset,y_labels
# Tokenize the training frame, then wrap the arrays in train/validation pipelines.
# Note: the third value returned by tensorflow_dataset is the full label array,
# not only the training split, even though it is bound to `y_train` here.
dict_data = preprocess_dataset(df_train, PreTrainedTokenizerFast, train=True)
train, valid, y_train = tensorflow_dataset(dict_data, 32)

In [None]:
def TFBertForTokenClassification_Model():
    """Build a Keras model that outputs one selection logit per input token.

    The model takes three int32 inputs of shape ``(MAX_LEN,)`` named
    'input_ids', 'attention_mask' and 'token_type_ids', runs them through a
    pretrained BERT token-classification head with two labels, and returns the
    logit of label index 1 for every position, i.e. shape ``(batch, MAX_LEN)``.
    """
    input_ids = tf.keras.layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_ids")
    attention_mask=tf.keras.layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name="attention_mask")
    token_type_ids=tf.keras.layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name="token_type_ids")

    # Load the checkpoint matching the 'bert-base-uncased' tokenizer used for
    # preprocessing; a cased checkpoint has a different vocabulary, so the
    # token ids would not line up with its embeddings. from_pretrained is
    # called on the class directly, so no throwaway model is built first.
    bert = TFBertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=2)

    # `training` is intentionally not fixed here: Keras passes the right flag
    # at run time, so dropout is active in fit() and off in evaluate/predict.
    output=bert([input_ids, attention_mask, token_type_ids],return_dict=True)
    token_logits=output.logits
    # Keep only the logit of label 1 for each token.
    selected_logit=token_logits[:,:,1]
    return tf.keras.Model(inputs=[input_ids, attention_mask, token_type_ids],outputs=selected_logit)

def compile_TFBertForTokenClassification_Model(optim):
    """Build the token-classification model and compile it.

    Args:
        optim: optimizer handed to ``model.compile``.

    Returns:
        The compiled Keras model (compiled with the module-level ``LOSS`` and
        an accuracy metric). The model summary is printed as a side effect.
    """
    model=TFBertForTokenClassification_Model()
    model.compile(optimizer=optim,loss=LOSS,metrics=['accuracy'])
    # summary() prints on its own and returns None; wrapping it in print()
    # only added a stray "None" line.
    model.summary()
    return model


# Compile the model, attach the per-batch loss recorder and start training.
model = compile_TFBertForTokenClassification_Model(OPTIMIZER)
history = LossHistory()
model.fit(
    x=train,
    validation_data=valid,
    epochs=EPOCHS,
    steps_per_epoch=steps_per_epoch,
    callbacks=[history, EARLY_STOP],
)
  

In [None]:
## Debug scratch: recompute the label mask of one training row by hand and
## print it next to the stored one for comparison.
test_id=9202
preprocessor=preprocessing_data(PreTrainedTokenizerFast)
print(df_train.iloc[test_id,:])
print(dict_data['input_ids'][test_id,:])
empty_list=np.zeros((150,))
print(preprocessor.encode_labels(df_train.iloc[test_id,2]))
print(y_train[test_id,:])
for wanted_id in preprocessor.encode_labels(df_train.iloc[test_id,2]):
    # 101 is skipped and 102 ends the label sequence
    # (presumably the tokenizer's [CLS]/[SEP] ids -- confirm).
    if wanted_id==101:
        continue
    if wanted_id==102:
        break
    for position,row_id in enumerate(dict_data['input_ids'][test_id,:]):
        # Stop scanning the row at the first 0 or 102.
        if row_id in (0,102):
            break
        # Mark the first not-yet-marked position holding this id.
        if row_id==wanted_id and empty_list[position]!=1:
            empty_list[position]=1
            break
print(empty_list)