## DistilBERT Question And Answer - PreEntrenado con el dataset Stanford Question Answering Dataset

Para la implementación inicial de BERT usaremos DistilBERT y la librería simpletransformers, que ofrece una implementación predefinida de Question and Answer basada en BERT. El modelo preentrenado elegido será distilbert-base-uncased-distilled-squad, el cual fue entrenado con un extenso dataset de la Universidad de Stanford enfocado a problemas de QA.

In [None]:
import pandas as pd
import numpy as np
import json
import re

In [None]:
# Load the train, test and sample-submission CSV files from the Kaggle
# input directory of the tweet-sentiment-extraction dataset.
train = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/train.csv")
test = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/test.csv")
sample_submission = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/sample_submission.csv")

In [None]:
# Display the (rows, columns) shape of both frames as the cell output.
train.shape, test.shape

In [None]:
#Reference https://www.kaggle.com/parulpandey/eda-and-preprocessing-for-bert

def clean(tweet):
    """Normalize a tweet: lower-case it and strip markup noise.

    The input is converted with ``str()`` first, so non-string values
    (numbers, ``None``, NaN) are accepted and cleaned as their textual
    representation.

    Args:
        tweet: The raw tweet (any object convertible with ``str``).

    Returns:
        The lower-cased text with HTML tags, square-bracketed text and
        hyperlinks removed. Surrounding whitespace is left untouched.
    """
    tweet = str(tweet).lower()

    # Raw strings keep the regex escapes (\[, \S, \.) from being read as
    # (invalid) Python string escapes.

    # Remove HTML tags; non-greedy so each tag is matched on its own.
    tweet = re.sub(r'<.*?>', '', tweet)

    # Remove text in square brackets, brackets included.
    tweet = re.sub(r'\[.*?\]', '', tweet)

    # Remove hyperlinks starting with http://, https:// or www.
    tweet = re.sub(r'https?://\S+|www\.\S+', '', tweet)

    return tweet

In [None]:
# Discard rows containing any missing value, then trim leading/trailing
# whitespace from the tweet text and from its labelled span.
train.dropna(inplace=True)
for column in ("text", "selected_text"):
    train[column] = train[column].map(str.strip)
train.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled tweets for validation, stratifying on the
# sentiment column and fixing the seed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    train[['text', 'textID', 'sentiment']],
    train['selected_text'],
    test_size=0.2,
    random_state=42,
    stratify=train['sentiment'],
)

# Renumber every partition from zero so features and targets share the
# same index and can be re-joined later.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
Y_train = Y_train.reset_index(drop=True)
Y_test = Y_test.reset_index(drop=True)

print('X_train Forma',X_train.shape,' Y_train Forma ',Y_train.shape)
print('X_test Forma',X_test.shape,' Y_test Forma ',Y_test.shape)

In [None]:
# Work on a copy of the training features with the target span attached
# as an extra column (aligned on the row index).
X_train_Temp = X_train.assign(selected_text=Y_train)

In [None]:
# Work on a copy of the held-out features with the target span attached
# as an extra column (aligned on the row index).
X_test_Temp = X_test.assign(selected_text=Y_test)

In [None]:
# Fix the column order; later cells read these columns by position
# (0 = textID, 1 = text, 2 = selected_text, last = sentiment).
X_train_Temp = X_train_Temp.loc[
    :, ['textID', 'text', 'selected_text', 'sentiment']
]
X_train_Temp.head()

In [None]:
# Fix the column order; later cells read these columns by position
# (0 = textID, 1 = text, 2 = selected_text, last = sentiment).
X_test_Temp = X_test_Temp.loc[
    :, ['textID', 'text', 'selected_text', 'sentiment']
]
X_test_Temp.head()

In [None]:
# Convert both partitions to NumPy arrays so rows can be read by position.
# Column order follows the frames: textID, text, selected_text, sentiment.
train_array = np.array(X_train_Temp)
test_array = np.array(X_test_Temp)
# Passed to the model constructor's `use_cuda` argument further below.
use_cuda = True

In [None]:
def start_index(text, selected_text):
    """Find where ``selected_text`` begins inside ``text``.

    The search is case-insensitive: both strings are lower-cased first.

    Args:
        text: The full tweet text.
        selected_text: The span to locate.

    Returns:
        The zero-based character offset of the first match within the
        lower-cased text, or ``-1`` when the span does not occur.
    """
    return text.lower().find(selected_text.lower())


# Answer start offsets for the training rows, in row order
# (column 1 holds the tweet text, column 2 the selected span).
l = [start_index(row[1], row[2]) for row in train_array]

In [None]:
# question --> sentiment
# context  --> tweet text
# answer   --> selected text

def quesa_format_train(train):
    """Convert labelled rows into SQuAD-style question-answering records.

    Args:
        train: Iterable of rows indexable by position, where index 0 is
            the tweet id, index 1 the tweet text, index 2 the selected
            text and the last index the sentiment.

    Returns:
        A list with one dict per row of the form
        ``{"context": ..., "qas": [{"question", "id", "is_impossible",
        "answers": [{"answer_start", "text"}]}]}``. Context and answer
        text are lower-cased; ``answer_start`` is the offset of the answer
        inside the lower-cased context, or ``-1`` if it does not occur.
    """
    out = []
    for row in train:
        context = row[1].lower()
        answer = row[2].lower()
        # Compute the offset from this very row rather than looking it up
        # in a separately built global list, so the result cannot become
        # misaligned with the rows actually passed in.
        answers = [{"answer_start": context.find(answer), "text": answer}]
        qas = [{
            "question": row[-1],
            "id": row[0],
            "is_impossible": False,
            "answers": answers,
        }]
        out.append({"context": context, "qas": qas})

    return out
        
    
# Build the question-answering records for the training rows and persist
# them as train.json, the file later given to model.train_model().
train_json_format = quesa_format_train(train_array)
with open('train.json', 'w') as outfile:
    json.dump(train_json_format, outfile)

In [None]:
# Same layout as the training records, but with a placeholder answer.

def quesa_format_test(train):
    """Convert rows into SQuAD-style records for prediction.

    Args:
        train: Iterable of rows indexable by position, where index 0 is
            the tweet id, index 1 the tweet text and the last index the
            sentiment. Any selected text in the row is ignored.

    Returns:
        A list with one dict per row of the form
        ``{"context": ..., "qas": [{"question", "id", "is_impossible",
        "answers": [...]}]}``. The context is lower-cased and every record
        carries the same placeholder answer
        (``answer_start`` 1000000, text ``"__None__"``).
    """
    out = []
    for row in train:
        # Placeholder answer: the true span is not used at prediction time.
        ans = [{"answer_start": 1000000, "text": "__None__"}]
        qas = [{
            "question": row[-1],
            "id": row[0],
            "is_impossible": False,
            "answers": ans,
        }]
        out.append({"context": row[1].lower(), "qas": qas})
    return out
        
    
# Build the prediction records for the held-out rows; the in-memory list
# is what model.predict() receives, and a copy is saved as test.json.
test_json_format = quesa_format_test(test_array)

with open('test.json', 'w') as outfile:
    json.dump(test_json_format, outfile)

In [None]:
# Install seqeval and simpletransformers from the wheel files bundled under
# ../input/simple-transformers-pypi (-q keeps pip's output quiet).
!pip install '../input/simple-transformers-pypi/seqeval-0.0.12-py3-none-any.whl' -q
!pip install '../input/simple-transformers-pypi/simpletransformers-0.22.1-py3-none-any.whl' -q

In [None]:
from simpletransformers.question_answering import QuestionAnsweringModel

# Directory holding the pretrained distilbert-base-uncased-distilled-squad
# weights, and the directory where the fine-tuned model is written.
model_path = '/kaggle/input/transformers-pretrained-distilbert/distilbert-base-uncased-distilled-squad/'
model_path_ready = './model-distilbert'

# Training configuration handed to simpletransformers.
train_args = {
    'reprocess_input_data': True,
    'overwrite_output_dir': True,
    'learning_rate': 5e-5,
    'num_train_epochs': 4,
    'max_seq_length': 128,
    # Reuse the variable instead of repeating the path literal, so the
    # output location is defined in exactly one place.
    'output_dir': model_path_ready,
    'doc_stride': 64,
    'fp16': False,
}

# Model creation
model = QuestionAnsweringModel(
    'distilbert',
    model_path,
    args=train_args,
    use_cuda=use_cuda,
)

# Fine-tune on the records previously written to train.json.
model.train_model('train.json')

In [None]:
# Run inference on the held-out records (the in-memory list, not test.json).
pred = model.predict(test_json_format)

In [None]:
# Tabulate the predictions and attach the predicted span to a copy of the
# held-out frame; the column assignment aligns on the row index.
# NOTE(review): assumes `pred` converts to a frame with an 'answer' column
# whose rows come back in the same order as X_test_Temp — confirm against
# the simpletransformers version in use.
df = pd.DataFrame.from_dict(pred)
df_final = X_test_Temp.copy()
df_final['pred'] =  df['answer']

In [None]:
def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both inputs are converted with ``str()``, lower-cased and split on
    whitespace; the score is the size of the intersection of the two word
    sets divided by the size of their union.

    Args:
        str1: First text (any object convertible with ``str``).
        str2: Second text (any object convertible with ``str``).

    Returns:
        A float in ``[0.0, 1.0]``. When neither input contains any word
        the two (empty) sets are identical, so ``1.0`` is returned instead
        of dividing by zero.
    """
    a = set(str(str1).lower().split())
    b = set(str(str2).lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Both sides are empty: avoid ZeroDivisionError.
        return 1.0
    return len(a & b) / union_size

In [None]:
def compute_jaccard(Y):
    """Return the mean Jaccard score between labels and predictions.

    Args:
        Y: DataFrame with a ``selected_text`` column (reference span) and
            a ``pred`` column (predicted span).

    Returns:
        The arithmetic mean of the per-row ``jaccard`` scores.
    """
    scores = [
        jaccard(truth, guess)
        for truth, guess in zip(Y["selected_text"], Y["pred"])
    ]
    return np.mean(np.array(scores))

In [None]:
# Mean Jaccard score over the held-out split; the bare name on the last
# line displays the value as the cell output.
score_total = compute_jaccard(df_final)
score_total