In [None]:
import pandas as pd
import numpy as np
import torch
import os
import sklearn
import json


In [None]:
# Install the necessary packages from the wheel files shipped in the
# `required-packages` input folder (-q keeps pip's output quiet).

!pip install '../input/required-packages/seqeval-0.0.12-py3-none-any.whl' -q
!pip install '../input/required-packages/simpletransformers-0.22.1-py3-none-any.whl' -q

In [None]:
# Directory prefix for the competition CSV files read below (train, test, sample submission).
INPUT_FILES_PATH = "/kaggle/input/tweet-sentiment-extraction/"

In [None]:
# Load the labelled training tweets, drop rows with missing values and
# renumber the index from 0.
# (For a quick trial run, slice with `[:1000]` right after read_csv.)
df_train = (
    pd.read_csv(INPUT_FILES_PATH + "train.csv")
    .dropna()
    .reset_index(drop=True)
)

# Same clean-up for the test tweets.
df_test = (
    pd.read_csv(INPUT_FILES_PATH + "test.csv")
    .dropna()
    .reset_index(drop=True)
)

# Submission template; its `selected_text` column is filled in at the end of the notebook.
sample_submission = pd.read_csv(INPUT_FILES_PATH + "sample_submission.csv")


In [None]:
# True when PyTorch reports that CUDA is available; this flag is later passed
# to the question-answering models as `use_cuda`.
use_cuda = torch.cuda.is_available()
use_cuda

In [None]:
# Preview the first rows of the training data.
df_train.head()

In [None]:
# Summarise the training frame's columns after the dropna() clean-up.
df_train.info()

In [None]:
# Summarise the test frame's columns after the dropna() clean-up.
df_test.info()

In [None]:
# Preview the first rows of the test data.
df_test.head()

In [None]:
def getSelectedTextStartIndex(text, selected_text):
    """Return the offset of ``selected_text`` inside ``text``, ignoring case.

    Both strings are lowercased before searching, so the offset refers to the
    lowercased form of ``text``. Returns -1 when the selection is not found.
    """
    haystack = text.lower()
    needle = selected_text.lower()
    return haystack.find(needle)

# Answer start offset for every training row, in row order.
start_indices = [
    getSelectedTextStartIndex(tweet_text, tweet_selection)
    for tweet_text, tweet_selection in zip(df_train["text"], df_train["selected_text"])
]

In [None]:
# To train a question-answering model, we need a context, a question and an answer:
#   - the sentiment is treated as the question
#   - the text is the context
#   - the selected_text is the answer

In [None]:
# Data format

# For question answering tasks, the input data can be in JSON files or in a Python list of dicts in the correct format.

# The file should contain a single list of dictionaries. A dictionary represents a single context and its associated questions.

# Each such dictionary contains two attributes, the "context" and "qas".

#     context: The paragraph or text from which the question is asked.
#     qas: A list of questions and answers.

# Questions and answers are represented as dictionaries. Each dictionary in qas has the following format.

#     id: (string) A unique ID for the question. Should be unique across the entire dataset.
#     question: (string) A question.
#     is_impossible: (bool) Indicates whether the question can be answered correctly from the context.
#     answers: (list) The list of correct answers to the question.

# A single answer is represented by a dictionary with the following attributes.

#     text: (string) The answer to the question. Must be a substring of the context.
#     answer_start: (int) Starting index of the answer in the context.


In [None]:
# convert data into JSON and follows simpletransformers format
#preprocess train data
def train_preprocessing(df_train):
    """Convert labelled tweets into simpletransformers question-answering records.

    Each row becomes one context with a single question:
    the lowercased ``text`` is the context, ``sentiment`` is the question and
    the lowercased ``selected_text`` is the answer.

    Args:
        df_train: DataFrame with ``text``, ``selected_text`` and ``sentiment``
            columns. The row's index label is used as the question id.

    Returns:
        A list of ``{"context": ..., "qas": [...]}`` dictionaries, one per row.
        ``answer_start`` is -1 when the answer does not occur in the context.
    """
    out = []
    for tupl in df_train.itertuples():
        context = tupl.text.lower()
        answer = tupl.selected_text.lower()

        # Locate the answer in the exact string stored as the context. Computing
        # it here (instead of reading a notebook-level list by row label) keeps
        # the offset correct for filtered or re-indexed frames and for cells
        # executed out of order.
        answer_start = context.find(answer)

        ans = [{"answer_start": answer_start, "text": answer}]
        qas = [{"question": tupl.sentiment, "id": tupl.Index, "is_impossible": False, "answers": ans}]
        out.append({"context": context, "qas": qas})

    return out

#preprocess test data, no answer provided
def test_preprocessing(df_test):
    out = []
    for tupl in df_test.itertuples():
        #question and answer
        qas = []
        con = []
        ans = []
        question = tupl.sentiment
        context = tupl.text
        qid = tupl.Index
        
        ans = [{"answer_start": 1000000, "text": "__None__"}]
        qas = [{"question": question, "id": qid, "is_impossible": False, "answers": ans}]
        out.append({"context": context.lower(), "qas": qas})

    return out

# Preparing paths to use

In [None]:
# Output locations: one directory per sentiment-specific model.
MODEL_PATH = './bert_qa_data/output/'
MODEL_POS = MODEL_PATH + 'model_pos'
MODEL_NEG = MODEL_PATH + 'model_neg'

# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of testing os.path.exists() first.
for path in [MODEL_PATH, MODEL_POS, MODEL_NEG]:
    os.makedirs(path, exist_ok=True)

# Create the model using simpletransformers

## Using distilbert-base-uncased-distilled-squad
DistilBERT is a smaller, general-purpose language representation model distilled from BERT that retains most of BERT's accuracy.


This particular checkpoint has been trained specifically for the question-answering task using SQuAD, the Stanford Question Answering Dataset.

In [None]:
from simpletransformers.question_answering import QuestionAnsweringModel
import time
# Local copy of the pretrained checkpoint (loaded from the notebook's input folder).
BERT_PATH = "../input/required-packages/distilbert-base-uncased-distilled-squad"
# Time how long it takes to construct the two models below.
start = time.time()

args={
 'reprocess_input_data': True,# If True, the input data is reprocessed even if a cached file of the input data exists in the cache_dir.
 'overwrite_output_dir': True, # If True, the trained model is saved to the output_dir and overwrites existing saved models in the same directory.
 'learning_rate': 5e-5, # The learning rate for training.
 'num_train_epochs': 4, # The number of epochs the model will be trained for.
 'doc_stride': 64, # When splitting up a long document into chunks, how much stride to take between chunks.
 'fp16': False,# fp16 (half-precision floating point) mode on or off.
}

# Create the models: one per sentiment, both starting from the same checkpoint
# and sharing the same training arguments.
QA_model_pos = QuestionAnsweringModel('distilbert', BERT_PATH, args=args, use_cuda=use_cuda)
QA_model_neg = QuestionAnsweringModel('distilbert', BERT_PATH, args=args, use_cuda=use_cuda)

end = time.time()
# Elapsed construction time in seconds.
print(end - start)

# Training positive tweets

In [None]:
# Fine-tune the positive model on the positive tweets only.
train_data = train_preprocessing(df_train.loc[df_train["sentiment"] == "positive"])
QA_model_pos.train_model(
    train_data=train_data,
    output_dir=MODEL_POS,
    show_running_loss=True,
)

# Training negative tweets

In [None]:
# Fine-tune the negative model on the negative tweets only. It is saved to its
# own directory (MODEL_NEG) so it does not overwrite the positive model's output.
train_data = train_preprocessing(df_train[df_train["sentiment"] == "negative"])

QA_model_neg.train_model(train_data = train_data, output_dir=MODEL_NEG, show_running_loss=True )

# Ignore neutral tweets

# Make prediction

## Predict positive data

In [None]:
# Positive test tweets, copied so later changes do not touch df_test.
df_test_pos = df_test.loc[df_test["sentiment"] == "positive"].copy()
df_test_pos.head()

In [None]:
# Build the QA inputs for the positive test tweets, then predict their answers.
test_data_pos = test_preprocessing(df_test_pos)
predictions_pos = QA_model_pos.predict(test_data_pos)

## Predict negative data

In [None]:
# Negative test tweets, copied so later changes do not touch df_test.
df_test_neg = df_test.loc[df_test["sentiment"] == "negative"].copy()
df_test_neg.head()

In [None]:
# Build the QA inputs for the negative test tweets, then predict their answers.
test_data_neg = test_preprocessing(df_test_neg)
predictions_neg = QA_model_neg.predict(test_data_neg)

## Ignore neutral tweets (no model is run for them; their full text is used as the answer in the submission)

# Make submission

In [None]:
# Combine predictions: give each model's answers the row labels of the test
# tweets they were predicted from, so all parts can be sorted back into test order.
prediction_test_pos = pd.DataFrame(predictions_pos)
prediction_test_pos.index = df_test_pos.index.copy()

prediction_test_neg = pd.DataFrame(predictions_neg)
prediction_test_neg.index = df_test_neg.index.copy()

# Neutral tweets get no model prediction: their own text is the answer.
is_neutral = df_test["sentiment"] == "neutral"
prediction_test_neu = df_test[is_neutral].rename(columns={"text": "answer"})

submission_df = pd.concat(
    [prediction_test_pos, prediction_test_neg, prediction_test_neu],
    axis=0,
).sort_index()

In [None]:
# Preview the combined predictions before writing the submission file.
submission_df.head()

In [None]:
# Match predictions to the submission template by tweet id instead of by
# positional index. df_test went through dropna()/reset_index(), so its index
# only lines up with sample_submission's when no test row was dropped; a
# single dropped row would otherwise shift every later answer onto the wrong id.
text_ids = df_test.loc[submission_df.index, "textID"]
answer_by_text_id = dict(zip(text_ids, submission_df["answer"]))
sample_submission["selected_text"] = sample_submission["textID"].map(answer_by_text_id)

sample_submission.to_csv("submission.csv", index = False)
print("File submitted successfully.")

