# Tweet sentiment extraction - SpaCy
post 2020-11-10

# Notes
* Pinched from [sentiment Extaction-Analysis,EDA and Model](https://www.kaggle.com/dplutcho/twitter-sentiment-extaction-analysis-eda-and-model/edit)

# Prep

## Imports

In [None]:
import os
from random import sample
import zipfile
from zipfile import ZipFile

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Spacy model building related.
import spacy
from tqdm import tqdm
import random
from spacy.util import minibatch, compounding

import warnings
warnings.filterwarnings("ignore")

In [None]:
spacy.__version__

##  Prep train & test sets

In [None]:
df_test = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')
df_test.head()

In [None]:
df_train = pd.read_csv('../input/tweet-sentiment-extraction/train.csv')
display(len(df_train))
df_train.head()

In [None]:
# Drop incomplete rows (only one is expected) and show the row count before and after.
rows_before = len(df_train)
print(rows_before)
df_train.dropna(axis='index', how='any', inplace=True)
print(len(df_train))

In [None]:
# Add whitespace-split token lists and their lengths for the full text and the selected text.
token_sources = {'text_tokes': 'text', 'select_tokes': 'selected_text'}
# Token lists first, counts second, so the new columns keep their order.
for token_col, source_col in token_sources.items():
    df_train[token_col] = df_train[source_col].str.split()
for token_col in token_sources:
    df_train[f'{token_col}_cnt'] = df_train[token_col].str.len()
df_train.head(5)

In [None]:
# Gather every sample treated as answering with its full text (no extraction needed):
# rows of at most two tokens first, then all neutral rows.
# Rows matching both filters end up in df_neut twice.
df_neut1 = df_train.loc[df_train.text_tokes_cnt <= 2]
df_neut2 = df_train.loc[df_train.sentiment == 'neutral']
df_neut = pd.concat([df_neut1, df_neut2]).reset_index()
display(len(df_neut))
display(df_neut.sentiment.value_counts())
df_neut.sample(5)

In [None]:
pd.concat([df_neut1, df_neut2]).reset_index()

In [None]:
# df with all the rows whose result is the text itself: tweets of at most two
# tokens plus every neutral tweet.
# The two filters are OR-ed into one mask; assigning them one after the other
# would keep only the last filter and lose the short positive/negative tweets.
# .copy() makes this an independent frame, so adding columns to it later is safe.
df_neut = df_train[(df_train.text_tokes_cnt <= 2) | (df_train.sentiment == 'neutral')].copy()
display(len(df_neut))
display(df_neut.sentiment.value_counts())
df_neut.sample(5)

In [None]:
# Keep only the rows that need extraction: drop tweets of at most two tokens
# and all neutral tweets, since those are answered with the text itself.
is_short = df_train.text_tokes_cnt <= 2
is_neutral = df_train.sentiment == 'neutral'
df_train = df_train[~is_short & ~is_neutral]
display(len(df_train))
display(df_train.sentiment.value_counts())
df_train.sample(5)

# Helpers

## App Settings

In [None]:
!pwd
!ls

In [None]:
# Root folder (relative to the working directory) under which models are saved.
MODELS_BASE_PATH = "models"

## Train

In [None]:
def get_model_dir(model_name, sent):
    """ Returns the directory of the model for one sentiment.

    model_name - sub-folder of MODELS_BASE_PATH that groups the models of one run.
    sent - sentiment suffix of the model folder, e.g. 'positive' or 'negative'.
    """
    return f"{MODELS_BASE_PATH}/{model_name}/model_{sent}"

In [None]:
def save_model(model_dir, nlp):
    ''' Saves the model to model_dir and stores the model name in its meta.

    model_dir - target directory shaped like models/[model_name]/model_pos;
                its second-to-last '/'-separated component becomes
                nlp.meta["name"]. Nothing is done when this is None.
    nlp - object to persist; must expose a `meta` mapping and `to_disk(path)`.
    '''
    if model_dir is None:
        return

    # exist_ok avoids the check-then-create race and makes re-saving a no-op.
    os.makedirs(model_dir, exist_ok=True)
    nlp.meta["name"] = model_dir.split('/')[-2]
    nlp.to_disk(model_dir)
    print(f"Saved '{model_dir}'")

In [None]:
def get_training_data(sentiment, df_input):
    '''
    Returns training data in the format needed to train spacy NER.

    For every row of df_input with the requested sentiment, the start and end
    character offsets of `selected_text` inside `text` are located and emitted
    as a single entity labelled 'selected_text':
        (text, {"entities": [[start, end, 'selected_text']]})

    sentiment - 'negative' or 'positive'; anything else raises ValueError.
    df_input - DataFrame with `text`, `selected_text` and `sentiment` columns.

    Rows whose selected_text is empty or does not occur in text are skipped,
    because they cannot be expressed as a valid character span.
    '''
    SENTIMENT = ['negative', 'positive']
    if sentiment not in SENTIMENT:
        raise ValueError(f"{sentiment} not in {SENTIMENT}")

    rows = df_input[df_input["sentiment"] == sentiment]
    train_data = []
    for text, selected_text in zip(rows["text"], rows["selected_text"]):
        start = text.find(selected_text)
        # find() returns -1 on a miss, which would yield a bogus [-1, len-1] span.
        if start < 0 or not selected_text:
            continue
        end = start + len(selected_text)
        train_data.append((text, {"entities": [[start, end, 'selected_text']]}))
    return train_data

In [None]:
# pass model = nlp if you want to train on top of existing model 

def train(train_data, model_dir, n_iter=20, model=None, revision_data=None):
    """Set up the pipeline, train the entity recognizer and save the model.

    train_data - list of (text, {"entities": [[start, end, label], ...]}) pairs.
    model_dir - directory handed to save_model() once training has finished.
    n_iter - number of passes over the training data.
    model - existing pipeline to keep training (it is updated in place);
            None starts from a blank English pipeline.
    revision_data - optional extra examples in the same format, mixed in as
            pseudo-rehearsal data to inhibit catastrophic forgetting.

    NOTE(review): the calls used here (create_pipe, add_pipe with a component
    object, begin_training/resume_training, update(texts, annotations)) look
    like the spaCy v2 training API - confirm against the installed version.
    """
    # Uses given model or instantiates a blank model.
    if model is not None:
        nlp = model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # Create the built-in NER component if the pipeline lacks one,
    # otherwise fetch the existing one so labels can be added to it.
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Add labels to pipeline for novel entity.
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # Add in pseudo rehearsal data to inhibit catastrophic forgetting.
    # Either branch yields a new list, so the shuffling below never reorders
    # the list object owned by the caller.
    if revision_data:
        train_data = revision_data + train_data
    else:
        train_data = list(train_data)

    # Get names of other pipes to disable them during training.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        if model is None:
            nlp.begin_training()
        else:
            nlp.resume_training()

        for _ in tqdm(range(n_iter)):
            random.shuffle(train_data)
            # Batch up the examples using spaCy's minibatch.
            batches = minibatch(train_data, size=compounding(4.0, 500.0, 1.001))
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)
    save_model(model_dir, nlp)

In [None]:
def run_train(model_name, n_iter=20, model=None, revision_data=None):
    """ Auto gen pos and neg models in a single call.

    Trains one model per sentiment on the module-level df_train and saves each
    under the directory returned by get_model_dir(model_name, sentiment).

    model_name - folder name grouping the two saved models.
    n_iter, model, revision_data - forwarded unchanged to train().

    NOTE(review): when `model` is given, the same object is passed to both
    train() calls and train() updates it in place, so the negative run starts
    from the positive-trained weights - confirm this is intended.
    """
    for sentiment in ['positive', 'negative']:
        # Shared helper keeps the save path in sync with the one pred_set loads.
        model_dir = get_model_dir(model_name, sentiment)
        train_data = get_training_data(sentiment, df_train)
        train(train_data, model_dir, n_iter=n_iter, model=model, revision_data=revision_data)

In [None]:
# !rm -r "gdrive/My Drive/nlp_learn/nlp_kaggle/models/from_scratch"

## Predict

In [None]:
def predict_entities(text, model):
    """ Returns the text of the first entity the model finds, else the whole text.

    text - string to run the model on.
    model - callable returning an object with an `ents` sequence whose items
            expose a `.text` attribute (e.g. a loaded spaCy pipeline).

    Only the first entity is ever used, so it is read directly instead of
    collecting every entity and searching the text for it again.
    """
    doc = model(text)
    if not doc.ents:
        # No entity predicted: fall back to the full text.
        return text
    return doc.ents[0].text

def pred_set(df_set, model_name):
    """ Run NER models on data.

    df_set - DataFrame with `text` and `sentiment` columns; it is not modified.
    model_name - folder name of the saved positive/negative models, resolved
                 with get_model_dir().

    Returns a copy of df_set with a `predicted_text` column added.
    """
    df_pred = df_set.copy()

    ## Load models.
    pdir = get_model_dir(model_name, 'positive')
    print(f"Loading Models  from - {pdir}")
    model_pos = spacy.load(pdir)

    ndir = get_model_dir(model_name, 'negative')
    print(f"Loading Models  from - {ndir}")
    model_neg = spacy.load(ndir)

    # Extract sentiment text row by row.
    selected_texts = []
    for text, sentiment in zip(df_pred['text'], df_pred['sentiment']):
        # Neutral or very short tweets: the whole text is the prediction.
        if sentiment == 'neutral' or len(text.split()) <= 2:
            selected_texts.append(text)
        # Positives and negatives go through their own model.
        elif sentiment == 'positive':
            selected_texts.append(predict_entities(text, model_pos))
        else:
            selected_texts.append(predict_entities(text, model_neg))

    df_pred['predicted_text'] = selected_texts

    return df_pred

## Testing

In [None]:
# Metric.
def jaccard(compare_strings):
    """ Word-level Jaccard similarity of two strings.

    compare_strings - pair (str1, str2); both are lower-cased and split on
                      whitespace into sets of words.

    Returns |intersection| / |union| as a float. When neither string has any
    words the sets are identical, so 1.0 is returned instead of dividing by zero.
    """
    str1, str2 = compare_strings
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        return 1.0
    return len(a & b) / union_size

In [None]:
def test_model(df_set, model_name):
    """ Input set with truth and predictions.
    Returns jaccard results in columns.
    Adds back in the rows held in the module-level df_neut.

    df_set - DataFrame with `text`, `selected_text` and `sentiment` columns.
    model_name - folder name of the saved models, forwarded to pred_set().

    Returns the combined DataFrame with `predicted_text` and `jaccard` columns
    and prints the mean jaccard overall and per sentiment.
    """
    # Extract text into predicted_text column.
    df_set = pred_set(df_set, model_name)

    # Add the held-out rows back in, predicted as their full text - the same
    # rule pred_set applies to neutral/short tweets. Using selected_text here
    # would score the ground truth against itself and inflate the result.
    # assign() works on a copy, so the shared df_neut is left untouched.
    df_neut_pred = df_neut.assign(predicted_text=df_neut['text'])
    df_set = pd.concat([df_set, df_neut_pred]).reset_index()

    # Get jaccard score comparing Y with predicted text.
    pairs = df_set[['selected_text', 'predicted_text']].values.tolist()
    df_set['jaccard'] = [jaccard(pair) for pair in pairs]

    jpsn_mean = df_set.jaccard.mean()

    print(f"Pos+neg+neut = {jpsn_mean}")
    print(f"pos={df_set[df_set.sentiment=='positive'].jaccard.mean()}")
    print(f"neg={df_set[df_set.sentiment=='negative'].jaccard.mean()}")
    print(f"neut={df_set[df_set.sentiment=='neutral'].jaccard.mean()}")

    return df_set

# Training variations

## From scratch
* All testing is done on the training set itself.
* Need to really test on the kaggle hidden test set via submitting.

2 epochs.

* Loading Models  from  gdrive/My Drive/nlp_learn/nlp_kaggle/models
* Pos+neg = 0.444540415582728
* pos=0.44967222034198867
* neg=0.4389391038586638
* CPU times: user 58.8 s, sys: 88.5 ms, total: 58.9 s
* Wall time: 59 s

10 epochs  (17 minutes)

* Pos+neg+neut = 0.7791013349736154
* pos=0.5068864667881017
* neg=0.48142976931119386
* neut=0.9884371171078085

20 epochs (? minutes)

* Pos+neg+neut = 0.7961334081312461
* pos=0.535177882070669
* neg=0.534812077335709
* neut=0.9884371171078085



### Train

In [None]:
MODEL_NAME = 'from_scratch'

In [None]:
%%time
run_train(MODEL_NAME, n_iter=20, model=None)

### Test

In [None]:
%%time
MODEL_NAME = 'from_scratch'
df_pred = test_model(df_train, MODEL_NAME)

In [None]:
pd.set_option("display.max_colwidth", 1000)
display(df_pred[['text','selected_text', 'sentiment', 'predicted_text', 'jaccard']][df_pred.sentiment=='negative'].sample(20))

### Submit
General notes
* Extract using the test set as this contains the text
* Then prep the submit file by either dropping all other fields or adding the prediction to the submission data (test and submission are exactly the same data).
* You must have exact same number of rows as sample submission

Colab specific notes
* Can't submit from Colab via the API, since this is a kernel-only competition.
* As such just download from colab then create a new nb in kaggle and upload the notebook into this new kaggle notebook (then adjust as needed)

Note for when submitting in Kaggle - not from Colab.
* Internet must be turned off.
* After saving and running you will need to go to the output notebook and tell it which file to submit.

In [None]:
!ls

In [None]:
# See test set before.
df_test.sample(2)

In [None]:
# Extract sentiment text.
MODEL_NAME = 'from_scratch'
df_test = pred_set(df_test, MODEL_NAME)
df_test.sample(10)

In [None]:
# Prep the submission file.
# Load the sample submission file and add the extracted text to it from the df_test set.
df_submission = pd.read_csv('../input/tweet-sentiment-extraction/sample_submission.csv')
df_submission['selected_text'] = df_test['predicted_text']
print(len(df_submission))
df_submission.head(2)

In [None]:
# save file.
df_submission.to_csv("submission.csv", index=False)
!ls