In [None]:
import pandas as pd
import numpy as np
import torch
import sklearn
import json
import tokenizers
import os

In [None]:
# Install the necessary packages from local wheel files under ../input (-q keeps pip's output quiet)
!pip install '../input/required-packages/seqeval-0.0.12-py3-none-any.whl' -q
!pip install ../input/simpletransformers-0323-pypi/simpletransformers-0.32.3-py3-none-any.whl -q

In [None]:
# Directory holding the CSV files read by this notebook (train, test, sample submission).
INPUT_FILES_PATH = "/kaggle/input/tweet-sentiment-extraction/"

In [None]:
# Training data: drop rows with missing values, then renumber the index from 0.
# (For a quick debugging run, subsample here, e.g. df_train[:1000].)
df_train = (
    pd.read_csv(INPUT_FILES_PATH + "train.csv")
    .dropna()
    .reset_index(drop=True)
)

# Test data receives the same clean-up.
# NOTE(review): any test row removed by dropna() will be absent from the
# submission -- confirm the test set has no missing values.
df_test = (
    pd.read_csv(INPUT_FILES_PATH + "test.csv")
    .dropna()
    .reset_index(drop=True)
)

sample_submission = pd.read_csv(INPUT_FILES_PATH + "sample_submission.csv")


In [None]:
# Use the GPU for computation when the machine supports it.
use_cuda = torch.cuda.is_available()
# Show the flag as the cell output.
use_cuda

In [None]:
# Preview the first rows of the training data.
df_train.head()

In [None]:
# Column dtypes and non-null counts of the training data.
df_train.info()

In [None]:
# Column dtypes and non-null counts of the test data.
df_test.info()

In [None]:
# Preview the first rows of the test data.
df_test.head()

In [None]:
def getSelectedTextStartIndex(text, selected_text):
    """Return the character offset at which ``selected_text`` starts in ``text``.

    An exact (case-sensitive) match is preferred, so that an earlier occurrence
    differing only in letter case is not picked by mistake. If there is no
    exact match, the search is repeated case-insensitively.

    Args:
        text: The full tweet text.
        selected_text: The substring to locate.

    Returns:
        The 0-based offset of the first match, or -1 when ``selected_text``
        does not occur in ``text`` under either comparison.
    """
    exact_index = text.find(selected_text)
    if exact_index != -1:
        return exact_index
    # Fallback: tolerate a selection whose letter case differs from the text.
    return text.lower().find(selected_text.lower())

# Start offset of selected_text inside text for every training row, in row order.
start_indices = [
    getSelectedTextStartIndex(tupl.text, tupl.selected_text)
    for tupl in df_train.itertuples()
]

# Format of training data for NER task

In [None]:
# Data format
# Must be a DataFrame like below

# train_data = [
#     [0, "Harry", "B-PER"],
#     [0, "Potter", "I-PER"],
#     [0, "was", "O"],
#     [0, "a", "O"],
#     [0, "student", "B-MISC"],
#     [0, "at", "O"],
#     [0, "Hogwarts", "B-LOC"],
# ]
# train_df = pd.DataFrame(
#     train_data, columns=["sentence_id", "words", "labels"]
# )


In [None]:
# There will be 2 labels:
# IGNORE: the word is not in selected_text
# SELECTED_TEXT: the word is in selected_text; this is the label we care about

In [None]:
#preprocess train data
def train_preprocessing(df_train):
    """Convert tweets into word-level rows suitable for NER training.

    Each tweet's ``text`` is split on single spaces and every resulting word
    becomes one output row. A word is labelled ``SELECTED_TEXT`` when it starts
    inside the character span occupied by ``selected_text`` and contains at
    least one of the selected words (compared case-insensitively); otherwise
    it is labelled ``IGNORE``.

    The span is located per row with ``getSelectedTextStartIndex``, so the
    result does not depend on a precomputed global list being aligned with the
    frame's index. When ``selected_text`` cannot be found in ``text`` every
    word of that tweet is labelled ``IGNORE``.

    Args:
        df_train: DataFrame with string columns ``text`` and ``selected_text``.

    Returns:
        DataFrame with columns ``sentence_id`` (the source row's index label),
        ``words`` and ``labels``.
    """
    rows = []
    for tupl in df_train.itertuples():
        # Character span [start_index, end_index) covered by selected_text.
        start_index = getSelectedTextStartIndex(tupl.text, tupl.selected_text)
        found = start_index != -1
        end_index = start_index + len(tupl.selected_text)
        # Lower-cased once per tweet so word matching agrees with a
        # case-insensitive span search.
        sel_words = [sel_word.lower() for sel_word in tupl.selected_text.split(" ")]

        # Offset of the current word's first character within text.
        char_index = 0
        # split(" ") keeps empty strings for consecutive spaces, which keeps
        # char_index in step with the original text.
        for word in tupl.text.split(" "):
            in_span = found and start_index <= char_index < end_index
            lowered = word.lower()
            if in_span and any(sel_word in lowered for sel_word in sel_words):
                label = "SELECTED_TEXT"
            else:
                label = "IGNORE"
            rows.append((tupl.Index, word, label))
            char_index += len(word) + 1
    return pd.DataFrame(rows, columns=["sentence_id", "words", "labels"])

# Preparing paths to use

In [None]:
# Output locations for the two fine-tuned models.
MODEL_PATH = './bert_qa_data/output/'
MODEL_POS = MODEL_PATH + 'model_pos'
MODEL_NEG = MODEL_PATH + 'model_neg'

# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of testing os.path.exists() first.
for path in [MODEL_PATH, MODEL_POS, MODEL_NEG]:
    os.makedirs(path, exist_ok=True)

# Create the model using simpletransformers

## Using distilbert
DistilBERT is a smaller general-purpose language representation model derived from BERT that retains most of BERT's accuracy.



In [None]:
from simpletransformers.ner import NERModel

BERT_PATH = "../input/distilbertbaseuncased"

labels = ["SELECTED_TEXT", "IGNORE"]

# Training settings shared by both sentiment-specific models.
NER_MODEL_ARGS = {
    # If True, the input data is reprocessed even if a cached file of it exists in the cache_dir.
    'reprocess_input_data': True,
    # If True, the trained model is saved to the output_dir and overwrites models already saved there.
    'overwrite_output_dir': True,
    # The learning rate for training.
    'learning_rate': 5e-5,
    # The number of epochs the model is trained for.
    'num_train_epochs': 2,
    # When splitting up a long document into chunks, how much stride to take between chunks.
    'doc_stride': 64,
    # fp16 (half-precision floating point) mode on or off.
    'fp16': False,
}

# Create the models: one tagger per sentiment, each given its own copy of the settings.
NER_model_pos = NERModel(
    'distilbert',
    BERT_PATH,
    args=dict(NER_MODEL_ARGS),
    labels=labels,
    use_cuda=use_cuda,
)
NER_model_neg = NERModel(
    'distilbert',
    BERT_PATH,
    args=dict(NER_MODEL_ARGS),
    labels=labels,
    use_cuda=use_cuda,
)

## Training positive tweets

In [None]:
import time

# Fine-tune the positive-sentiment model and report the wall-clock seconds taken.
start = time.time()

train_data_formatted_df = train_preprocessing(
    df_train.loc[df_train["sentiment"] == "positive"]
)
NER_model_pos.train_model(
    train_data=train_data_formatted_df,
    output_dir=MODEL_POS,
    show_running_loss=True,
)

end = time.time()
print(end - start)

## Training negative tweets

In [None]:
import time

# Fine-tune the negative-sentiment model and report the wall-clock seconds taken.
start = time.time()

train_data_formatted_df = train_preprocessing(
    df_train.loc[df_train["sentiment"] == "negative"]
)
NER_model_neg.train_model(
    train_data=train_data_formatted_df,
    output_dir=MODEL_NEG,
    show_running_loss=True,
)

end = time.time()
print(end - start)

## Ignore neutral tweets

# Make prediction

## Predict positive data

In [None]:
# Independent copy of the positive test tweets, so a column can be added to it later.
df_test_pos = df_test[df_test["sentiment"]=="positive"].copy()
df_test_pos.head()

In [None]:
# Plain list of the positive tweet texts, passed to predict() below.
test_data = list(df_test_pos["text"])

In [None]:
# predict() returns a pair; only the first element (the predictions) is kept.
predictions_pos, _ = NER_model_pos.predict(test_data)

## Predict negative data

In [None]:
# Independent copy of the negative test tweets, so a column can be added to it later.
df_test_neg = df_test[df_test["sentiment"]=="negative"].copy()
df_test_neg.head()

In [None]:
# Plain list of the negative tweet texts (reuses the test_data name from the positive run).
test_data = list(df_test_neg["text"])

In [None]:
# predict() returns a pair; only the first element (the predictions) is kept.
predictions_neg, _ = NER_model_neg.predict(test_data)

## Ignore neutral tweets

# Make submission

In [None]:
#convert predictions to a format we can use
def prediction_to_string(predictions):
    """Collapse tagged predictions into one string per sentence.

    Args:
        predictions: Iterable of sentences, where each sentence is an iterable
            of dicts mapping a word to its predicted tag.

    Returns:
        A list with one entry per sentence: the words tagged ``SELECTED_TEXT``,
        in their original order, joined by single spaces. A sentence with no
        such word yields an empty string.
    """
    return [
        " ".join(
            token
            for tagged_word in sentence
            for token, label in tagged_word.items()
            if label == "SELECTED_TEXT"
        )
        for sentence in predictions
    ]

In [None]:
# combine predictions
# Positive / negative tweets: attach the words their model tagged as selected
# (one string per row, in row order).
df_test_pos["selected_text"] = prediction_to_string(predictions_pos)
df_test_neg["selected_text"] = prediction_to_string(predictions_neg)

# Neutral tweets: no model is used; the whole tweet text becomes the selection.
df_test_neu = df_test[df_test["sentiment"] == "neutral"].rename(
    columns={"text": "selected_text"}
)

# Stack the three subsets and restore the original test-row order.
submission_df = pd.concat([df_test_pos, df_test_neg, df_test_neu], axis=0).sort_index()

In [None]:
# Preview the first rows of the combined submission frame.
submission_df.head()

In [None]:
# Write only the textID and selected_text columns to submission.csv, without the index.
submission_df[["textID", "selected_text"]].to_csv("submission.csv", index = False)
print("File submitted successfully.")

