In [None]:
import string
import numpy as np 
import pandas as pd 
from tqdm import tqdm
import os
import spacy
import random
from spacy.util import compounding
from spacy.util import minibatch

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Directory that holds the competition CSV files
BASE_PATH = '../input/tweet-sentiment-extraction/'

# Read the train / test splits.
# For a quick dry run, slice either frame afterwards, e.g. train_df = train_df[:1000]
train_df = pd.read_csv(os.path.join(BASE_PATH, 'train.csv'))
test_df = pd.read_csv(os.path.join(BASE_PATH, 'test.csv'))

In [None]:
# Print a concise summary of the training frame
train_df.info()

In [None]:
# Print a concise summary of the test frame
test_df.info()

In [None]:
# Drop every training row that has a missing value in any column
train_df = train_df.dropna(axis=0, how="any")

In [None]:
# Preview the first rows of the training frame (after dropping NaN rows)
train_df.head()

In [None]:
# Preview the first rows of the test frame
test_df.head()

In [None]:
# Summary statistics for the training frame
train_df.describe()

In [None]:
def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is
    ``|A ∩ B| / |A ∪ B|`` over the resulting sets of tokens.

    Parameters
    ----------
    str1, str2 : str
        Strings to compare.

    Returns
    -------
    float
        Similarity in ``[0.0, 1.0]``. When neither string contains a token
        (both empty or whitespace-only) the two token sets are identical, so
        ``1.0`` is returned instead of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        # Both token sets are empty: avoid ZeroDivisionError.
        return 1.0
    return float(len(c)) / union_size

In [None]:
# Preview the training frame once more before building the training data
train_df.head()

# Begin training data

In [None]:
def train(train_data, nlp, n_iter=20):
    """Train the entity recognizer of ``nlp`` on ``train_data`` (spaCy v2 API).

    Parameters
    ----------
    train_data : sequence of (text, annotations)
        ``annotations`` is a dict whose ``"entities"`` value is a list of
        ``[start, end, label]`` spans. The caller's sequence is not modified;
        a private copy is shuffled on every iteration.
    nlp : spaCy Language object
        Pipeline to train in place. A ``"ner"`` pipe is created and appended
        when the pipeline does not already contain one.
    n_iter : int, default 20
        Number of passes over the training data.

    Returns
    -------
    None
        The model is updated in place.
    """
    # Work on a private list so shuffling never reorders the caller's data.
    examples = list(train_data)

    # create the built-in pipeline component and add it to the pipeline;
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # register every label that occurs in the annotations
    for _, annotations in examples:
        for ent in annotations.get("entities") or []:
            ner.add_label(ent[2])

    # disable every other pipe during training; only 'ner' is of interest
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # NOTE(review): begin_training is expected to (re)initialise the
        # weights, which suits the blank models used here - confirm before
        # passing in a pretrained pipeline.
        optimizer = nlp.begin_training()

        progress = tqdm(range(n_iter))
        for _ in progress:
            random.shuffle(examples)
            # Batch sizes come from `compounding`, an infinite generator that
            # multiplies the previous value by the compound rate each time:
            # here it starts at 4, grows by a factor of 1.001 per batch and
            # is capped at 500.
            batches = minibatch(
                examples,
                size=compounding(start=4.0, stop=500.0, compound=1.001),
            )
            # losses accumulated by nlp.update over this iteration
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout rate, to reduce overfitting
                    losses=losses,
                    sgd=optimizer,
                )
            # Surface the loss on the progress bar instead of discarding it.
            progress.set_postfix(ner_loss=losses.get("ner"))



In [None]:
# Build training data in the spaCy input format, one tuple per tweet:
# (
#     text_string,
#     {"entities": [[start_index, end_index, ENTITY]]}
# )

def preprocess_data(train_df, sentiment):
    """Convert the rows of one sentiment into spaCy NER training examples.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame with ``text``, ``selected_text`` and ``sentiment`` columns.
    sentiment : str
        Only rows whose ``sentiment`` equals this value are used.

    Returns
    -------
    list of (str, dict)
        ``(text, {"entities": [[start, end, 'selected_text']]})`` where
        ``start``/``end`` are the character offsets of the first occurrence
        of ``selected_text`` in ``text``. Rows whose ``selected_text`` is
        empty, not a string, or not contained in ``text`` are skipped,
        because they cannot be expressed as a valid character span.
    """
    train_data = []
    for row in train_df.itertuples():
        if row.sentiment != sentiment:
            continue
        text = row.text
        selected_text = row.selected_text
        if not isinstance(text, str) or not isinstance(selected_text, str):
            continue
        if not selected_text:
            # an empty selection would give a zero-length entity
            continue
        start = text.find(selected_text)
        if start == -1:
            # str.find signals "not found" with -1, which would otherwise
            # turn into the bogus span [-1, len(selected_text) - 1]
            continue
        end = start + len(selected_text)
        train_data.append((text, {"entities": [[start, end, 'selected_text']]}))
    return train_data


In [None]:
# Training data for the positive sentiment

sentiment = 'positive'

processed_train_data = preprocess_data(train_df, sentiment)

In [None]:
# create a blank English language model for the positive tweets
model_pos = spacy.blank("en")
# train for 4 iterations, more tends to overfit
# (processed_train_data must still hold the positive examples built in the previous cell)
train(processed_train_data, model_pos, n_iter=4 )

In [None]:
# Training data for the negative sentiment
# (this overwrites processed_train_data from the positive run)

sentiment = 'negative'

processed_train_data = preprocess_data(train_df, sentiment)


In [None]:
# create a blank English language model for the negative tweets
model_neg = spacy.blank("en")
# train for 4 iterations
# (processed_train_data must hold the negative examples built in the previous cell)
train(processed_train_data, model_neg, n_iter=4 )



In [None]:
# Neutral sentiment is not modeled: for neutral tweets the full tweet text is used as the prediction.

In [None]:
# Making predictions

In [None]:
def predict_entities(text, model):
    """Run ``model`` on ``text`` and return the predicted selected text.

    The span of the first entity found by the model is located in ``text``
    and that slice is returned; when the model finds no entity at all, the
    whole ``text`` is returned unchanged.
    """
    first_ent = next(iter(model(text).ents), None)
    if first_ent is None:
        return text
    # locate the entity's surface string inside the original text
    begin = text.find(first_ent.text)
    return text[begin:begin + len(first_ent.text)]


In [None]:
selected_texts = []
# loop through the test data and generate one prediction per tweet
for row in test_df.itertuples():
    text = row.text
    if row.sentiment == 'neutral':
        # neutral tweets are not modeled; the whole tweet is the prediction
        selected_texts.append(text)
    elif row.sentiment == 'positive':
        selected_texts.append(predict_entities(text, model_pos))
    else:
        # every remaining sentiment value is handled by the negative model
        selected_texts.append(predict_entities(text, model_neg))

# Read the submission template from the same directory as train/test rather
# than from a hard-coded absolute path.
# NOTE(review): assumes the template rows are in the same order as test_df - confirm.
df_submission = pd.read_csv(BASE_PATH + 'sample_submission.csv')
df_submission['selected_text'] = selected_texts
df_submission.to_csv("submission.csv", index=False)