In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import string

from wordcloud import WordCloud, STOPWORDS
import nltk
from nltk.corpus import stopwords
import spacy
from spacy.util import compounding
from spacy.util import minibatch
from tqdm import tqdm
import os

import warnings
warnings.filterwarnings("ignore")

In [None]:
import os
# Walk /kaggle/input and print the full path of every file found there,
# so the available dataset files are visible before loading.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
def random_colors(num_of_colors):
    """Return a list of `num_of_colors` random colour strings of the form '#RRGGBB'.

    Each colour is '#' followed by six characters drawn independently with
    random.choice from the upper-case hexadecimal digits.
    """
    hex_digits = '0123456789ABCDEF'

    def _one_color():
        # Six independent draws, in order, form one colour code.
        return "#" + ''.join(random.choice(hex_digits) for _ in range(6))

    return [_one_color() for _ in range(num_of_colors)]

In [None]:
# Load train.csv and test.csv from the tweet-sentiment-extraction input folder.
df_train = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/train.csv")
df_test = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/test.csv")

In [None]:
# Dimensions (rows, columns) of the training frame.
df_train.shape

In [None]:
# Call info() -- without the parentheses the cell only shows the bound method's repr
# instead of the per-column dtype / non-null summary.
df_train.info()

In [None]:
# Preview the first rows of the training data.
df_train.head()

In [None]:
# Number of training rows for each value of the 'sentiment' column.
df_train['sentiment'].value_counts()

In [None]:
# Bar chart of the sentiment counts; random_colors supplies one random colour per distinct sentiment.
df_train['sentiment'].value_counts().plot.bar(color = random_colors(len(df_train['sentiment'].value_counts())))

In [None]:
# Count of missing values in each column.
df_train.isna().sum()

In [None]:
# Discard every row that has a missing value in any column.
df_train = df_train.dropna()

In [None]:
# Count of missing values in each column (re-checked at this point in the notebook).
df_train.isna().sum()

In [None]:
# Whitespace-delimited word counts for the full tweet ('text') and for the
# labelled span ('selected_text'); values are passed through str() before splitting.
df_train['Num_of_words_text'] = df_train['text'].apply(lambda x : len(str(x).split()))
df_train['Num_of_words_ST'] = df_train['selected_text'].apply(lambda x : len(str(x).split()))
# How many more words the tweet has than its selected span.
df_train['Difference'] = df_train['Num_of_words_text'] - df_train['Num_of_words_ST']
df_train.head()

In [None]:
def jaccard_similarity(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is
    |A ∩ B| / |A ∪ B| over the resulting sets of distinct words.

    Args:
        str1: first string.
        str2: second string.

    Returns:
        float in [0.0, 1.0]. When neither string contains any word the two
        (empty) sets are identical, so 1.0 is returned instead of dividing
        by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    if not a and not b:
        # Empty union: the plain formula would raise ZeroDivisionError.
        return 1.0
    c = a & b
    return len(c) / (len(a) + len(b) - len(c))

In [None]:
# Score every row directly instead of building a side table and merging it back:
# a merge keyed on (text, selected_text) multiplies rows whenever the same pair
# occurs more than once, and the row-wise iterrows loop was unnecessary.
# The index is reset so the frame keeps the fresh 0..n-1 index the merge used to produce.
df_train = df_train.reset_index(drop=True)
df_train['jaccard_similarity'] = [
    jaccard_similarity(text, selected_text)
    for text, selected_text in zip(df_train['text'], df_train['selected_text'])
]
df_train.head()

In [None]:
# Overlay the kernel density estimates of 'Difference' for positive and negative rows on one figure.
plt.figure(figsize=(16,6))
# Note: p1 is bound to the return value of set_title(), not to the object returned by kdeplot.
# NOTE(review): newer seaborn releases deprecate `shade` in favour of `fill` -- confirm the installed version.
p1 = sns.kdeplot(df_train[df_train['sentiment']=='positive']['Difference'], shade=True, color='y').set_title("Kernel Distribution of Difference in Number of Words(Pos/Neg)")
p2 = sns.kdeplot(df_train[df_train['sentiment']=='negative']['Difference'], shade=True, color='c')

In [None]:
# Kernel density estimate of 'Difference' for neutral rows, on its own figure.
plt.figure(figsize=(16,6))
# Note: p3 is bound to the return value of set_title(), not to the object returned by kdeplot.
p3 = sns.kdeplot(df_train[df_train['sentiment']=='neutral']['Difference'], shade=True, color='r').set_title("Kernel Distribution of Difference in Number of Words(Neutral)")

In [None]:
# Overlay the kernel density estimates of 'jaccard_similarity' for positive and negative rows.
plt.figure(figsize=(16,6))
# Note: p1 is bound to the return value of set_title(), not to the object returned by kdeplot.
p1 = sns.kdeplot(df_train[df_train['sentiment']=='positive']['jaccard_similarity'], shade=True, color='y').set_title("Kernel Distribution of Jaccard Similarity(Pos/Neg)")
p2 = sns.kdeplot(df_train[df_train['sentiment']=='negative']['jaccard_similarity'], shade=True, color='g')

In [None]:
# Kernel density estimate of 'jaccard_similarity' for neutral rows, on its own figure.
plt.figure(figsize=(16,6))
# Note: p1 is bound to the return value of set_title(), not to the object returned by kdeplot.
p1 = sns.kdeplot(df_train[df_train['sentiment']=='neutral']['jaccard_similarity'], shade=True, color='r').set_title("Kernel Distribution of Jaccard Similarity(Neutral)")


In [None]:
def clean_text(text):
    '''Normalise a piece of text for word-frequency analysis.

    The input is converted with str() and lower-cased, then the following are
    removed, in this order: text in square brackets, http(s)/www links,
    angle-bracket tags, punctuation (string.punctuation), newline characters
    and any word containing a digit.

    Removed pieces are replaced by the empty string, so surrounding spaces are
    kept and words separated only by a newline are joined together.

    Args:
        text: value to clean; non-strings are stringified first.

    Returns:
        The cleaned, lower-cased string.
    '''
    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d ...) from being treated as
    # invalid string-literal escape sequences by the Python compiler.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Overwrite both text columns with their clean_text() output; the raw text is
# no longer available in df_train after this cell.
df_train['text'] = df_train['text'].apply(lambda x : clean_text(x))
df_train['selected_text'] = df_train['selected_text'].apply(lambda x : clean_text(x))
df_train.head(10)

In [None]:
# Tokenise both text columns on whitespace.
df_train['st_list'] = df_train['selected_text'].apply(lambda x : str(x).split())
df_train['text_list'] = df_train['text'].apply(lambda x : str(x).split())

# Build the English stop-word collection once: calling stopwords.words('english')
# for every single token (as the list comprehension used to do) repeats the lookup
# needlessly, and a set gives constant-time membership tests.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

def remove_stopwords(x):
    """Return the tokens of `x` (an iterable of words) that are not English stop words."""
    return [y for y in x if y not in _ENGLISH_STOPWORDS]

df_train['st_list'] = df_train['st_list'].apply(remove_stopwords)
df_train['text_list'] = df_train['text_list'].apply(remove_stopwords)
df_train.head()

In [None]:
'''most common words in positive sentiment selected text'''
# Flatten the st_list token lists of the positive rows and count each token.
top = Counter([item for sublist in df_train[df_train['sentiment']=='positive']['st_list'] for item in sublist])
# Keep the 20 most frequent tokens as a (word, count) table.
top_pos = pd.DataFrame(top.most_common(20), columns=['Common Words', 'Count'])
top_pos.style.background_gradient(cmap='Greens')

In [None]:
'''most common words in negative sentiment selected text'''
# Flatten the st_list token lists of the negative rows and count each token.
top = Counter([item for sublist in df_train[df_train['sentiment']=='negative']['st_list'] for item in sublist])
# Keep the 20 most frequent tokens as a (word, count) table.
top_neg = pd.DataFrame(top.most_common(20), columns=['Common Words', 'Count'])
top_neg.style.background_gradient(cmap='Oranges')

In [None]:
'''most common words in neutral sentiment selected text'''
# Flatten the st_list token lists of the neutral rows and count each token.
top = Counter([item for sublist in df_train[df_train['sentiment']=='neutral']['st_list'] for item in sublist])
# Keep the 20 most frequent tokens as a (word, count) table.
top_neu = pd.DataFrame(top.most_common(20), columns=['Common Words', 'Count'])
top_neu.style.background_gradient(cmap='Blues')

In [None]:
def unique_words(sentiment, num, df=None):
    """Most frequent words that occur only in one sentiment's selected text.

    A word counts as unique to `sentiment` when it appears in the 'st_list'
    tokens of rows with that sentiment and in no row of any other sentiment.

    Args:
        sentiment: value of the 'sentiment' column to analyse.
        num: maximum number of words to return.
        df: frame with 'sentiment' and 'st_list' columns; defaults to the
            notebook-level df_train.

    Returns:
        DataFrame with columns 'Words' and 'Count', most frequent first.
    """
    frame = df_train if df is None else df
    is_target = frame['sentiment'] == sentiment
    # A set makes each "seen elsewhere?" test O(1); the previous list made the
    # whole computation quadratic in the number of tokens.
    other_words = {
        word
        for sublist in frame.loc[~is_target, 'st_list']
        for word in sublist
    }
    unique = Counter(
        word
        for sublist in frame.loc[is_target, 'st_list']
        for word in sublist
        if word not in other_words
    )
    return pd.DataFrame(unique.most_common(num), columns=['Words', 'Count'])

In [None]:
# Top 20 words found only in positive selected text (heading typo "postive" corrected).
unique_pos = unique_words('positive',20)
print("20 unique positive words:")
unique_pos.style.background_gradient(cmap='Greens')

In [None]:
# Top 20 words that unique_words reports for the negative sentiment.
unique_neg = unique_words('negative',20)
print("20 unique negative words:")
unique_neg.style.background_gradient(cmap='Oranges')

In [None]:
# Top 20 words that unique_words reports for the neutral sentiment.
unique_neu = unique_words('neutral',20)
print("20 unique neutral words:")
unique_neu.style.background_gradient(cmap='Blues')

In [None]:
def create_wordcloud(text):
    """Draw a word cloud (at most 50 words) for the given text.

    Args:
        text: either a single string, or an iterable of texts such as a pandas
            Series; the items of an iterable are joined with spaces.

    The wordcloud package's STOPWORDS plus 'u' and 'im' are excluded.
    The figure is drawn with matplotlib; nothing is returned.
    """
    if isinstance(text, str):
        corpus = text
    else:
        try:
            # str(Series) would yield pandas' truncated repr (index labels, "...",
            # "Name:", "dtype:") rather than the tweets, so join the items instead.
            corpus = ' '.join(str(item) for item in text)
        except TypeError:
            # Not iterable: fall back to the plain string form.
            corpus = str(text)

    # Named so that it does not shadow the nltk `stopwords` import.
    cloud_stopwords = set(STOPWORDS).union({'u', 'im'})
    wordcloud = WordCloud(background_color = 'white',
                          stopwords = cloud_stopwords,
                          max_words = 50,
                          max_font_size = 40)
    wordcloud.generate(corpus)
    plt.figure(figsize=(12,8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')

In [None]:
# Word cloud built from the 'text' column of the positive rows.
create_wordcloud(df_train[df_train['sentiment']=='positive']['text'])

In [None]:
# Word cloud built from the 'text' column of the negative rows.
create_wordcloud(df_train[df_train['sentiment']=='negative']['text'])

In [None]:
# Word cloud built from the 'text' column of the neutral rows.
create_wordcloud(df_train[df_train['sentiment']=='neutral']['text'])

**NER Model**

https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da

https://spacy.io/usage/training#ner

https://towardsdatascience.com/train-ner-with-custom-training-data-using-spacy-525ce748fab7

In [None]:
# Reload the CSVs from disk: the df_train used for the EDA above had its text
# columns overwritten with cleaned text and gained helper columns, whereas the
# model below works on the original strings.
df_train = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/train.csv')
df_test = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/test.csv')
# Drop rows with any missing value.
df_train = df_train.dropna()

In [None]:
def get_training_data(sentiment, df=None):
    """Return training data for one sentiment in the format needed for spaCy NER.

    Args:
        sentiment: only rows whose 'sentiment' column equals this value are used.
        df: frame with 'text', 'selected_text' and 'sentiment' columns;
            defaults to the notebook-level df_train.

    Returns:
        list of (text, {"entities": [[start, end, 'selected_text']]}) tuples,
        where start/end are the character offsets of the first occurrence of
        selected_text inside text. Rows whose selected_text does not occur
        verbatim in text are skipped.
    """
    frame = df_train if df is None else df
    # Restrict to the requested sentiment; previously the argument was ignored
    # and every model was trained on all rows.
    rows = frame[frame['sentiment'] == sentiment]

    train_data = []
    for text, selected_text in zip(rows['text'], rows['selected_text']):
        start = text.find(selected_text)
        if start < 0:
            # find() returned -1: no valid span can be built for this row.
            continue
        end = start + len(selected_text)
        train_data.append((text, {"entities":[[start, end, 'selected_text']]}))
    return train_data

In [None]:
def get_model_out_path(sentiment):
    """Return the model output path for a sentiment.

    'positive' maps to 'models/model_pos' and 'negative' to 'models/model_neg';
    any other value yields None.
    """
    if sentiment == 'positive':
        return 'models/model_pos'
    if sentiment == 'negative':
        return 'models/model_neg'
    return None

In [None]:
def trim_entity_spans(data: list) -> list:
    """Shrink entity spans so they neither start nor end on whitespace.

    Args:
    data (list): (text, {'entities': [[start, end, label], ...]}) pairs in
        spaCy JSON training format.

    Returns:
    list: New [text, {'entities': [...]}] items with adjusted offsets; the
        input containers are not modified.
    """
    whitespace = re.compile(r'\s')

    def _tighten(text, start, end):
        # Move the start forward past whitespace, never beyond the end of the text.
        while start < len(text) and whitespace.match(text[start]):
            start += 1
        # Move the end backward over trailing whitespace (lower bound of 1).
        while end > 1 and whitespace.match(text[end - 1]):
            end -= 1
        return start, end

    cleaned_data = []
    for text, annotations in data:
        spans = []
        for start, end, label in annotations['entities']:
            new_start, new_end = _tighten(text, start, end)
            spans.append([new_start, new_end, label])
        cleaned_data.append([text, {'entities': spans}])
    return cleaned_data

In [None]:
def train(train_data, output_dir, n_iter=20, model=None):
    """Train a spaCy NER component on the given examples and save the pipeline.

    Args:
        train_data: list of (text, {"entities": [[start, end, label], ...]})
            items; spans are whitespace-trimmed with trim_entity_spans first.
        output_dir: directory name handed to save_model.
        n_iter: number of passes over the (reshuffled) training data.
        model: name or path given to spacy.load; None starts from a blank
            English pipeline.

    Examples for which nlp.update raises are skipped (best effort); the number
    skipped in each pass is printed so failures are not silent.
    """
    train_data = trim_entity_spans(train_data)
    if model is not None:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # begin_training() (re)initialises the weights, which is only wanted for
        # a new model; a loaded model continues from its weights via resume_training().
        if model is None:
            optimizer = nlp.begin_training()
        else:
            optimizer = nlp.resume_training()

        for itn in tqdm(range(n_iter)):
            random.shuffle(train_data)
            losses = {}
            skipped = 0
            last_error = None
            for text, annotations in train_data:
                try:
                    nlp.update(
                        [text],
                        [annotations],
                        drop=0.2,
                        sgd=optimizer,
                        losses=losses)
                except Exception as error:
                    # Best effort: keep training, but remember what was dropped.
                    skipped += 1
                    last_error = error
            print(losses)
            if skipped:
                print("Skipped %d examples that raised during update (last error: %r)"
                      % (skipped, last_error))
    save_model(output_dir, nlp, 'st_ner')

In [None]:
def save_model(output_dir, nlp, new_model_name):
    '''Save a model to the given output directory under ../working/.

    Args:
        output_dir: directory name relative to '../working/'; when None,
            nothing is saved.
        nlp: object exposing a `meta` mapping and a `to_disk(path)` method.
        new_model_name: stored as nlp.meta["name"] before saving.
    '''
    # Check for None before building the path: once formatted into the f-string
    # the value is always a string and the model would land in '../working/None'.
    if output_dir is None:
        print("No output directory given; model not saved")
        return

    output_dir = f'../working/{output_dir}'
    os.makedirs(output_dir, exist_ok=True)
    nlp.meta["name"] = new_model_name
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

In [None]:
# Build the training examples and output path for the 'positive' setting, then
# train for 2 passes starting from a blank pipeline (model=None).
sentiment = 'positive'

train_data = get_training_data(sentiment)
model_path = get_model_out_path(sentiment)
train(train_data, model_path, n_iter=2, model=None)

In [None]:
# Build the training examples and output path for the 'negative' setting, then
# train for 2 passes starting from a blank pipeline (model=None).
sentiment = 'negative'

train_data = get_training_data(sentiment)
model_path = get_model_out_path(sentiment)
train(train_data, model_path, n_iter=2, model=None)

In [None]:
def predict_entities(text, model):
    """Return the text of the first entity the model finds, or `text` itself.

    Args:
        text: string to analyse.
        model: callable returning an object whose `ents` items expose
            `text` and `label_`.

    Each entity is located in `text` with str.find (first occurrence) and
    de-duplicated on (start, end, label); the first resulting span is sliced
    out of `text`. With no entities the input is returned unchanged.
    """
    spans = []
    for entity in model(text).ents:
        begin = text.find(entity.text)
        span = [begin, begin + len(entity.text), entity.label_]
        if span not in spans:
            spans.append(span)

    if not spans:
        return text
    first_start, first_end = spans[0][0], spans[0][1]
    return text[first_start:first_end]

In [None]:
# Predict a selected_text for every test row.
selected_texts = []
# NOTE(review): save_model writes under '../working/<dir>', while the models are
# loaded from this path relative to the current directory; the two only coincide
# when the notebook runs from a directory named 'working' -- confirm.
MODELS_BASE_PATH = 'models/'

if MODELS_BASE_PATH is not None:
    print("Loading Models  from ", MODELS_BASE_PATH)
    model_pos = spacy.load(MODELS_BASE_PATH + 'model_pos')
    model_neg = spacy.load(MODELS_BASE_PATH + 'model_neg')

    for index, row in df_test.iterrows():
        text = row.text
        # Neutral tweets and tweets of at most two words are returned whole;
        # otherwise the model matching the row's sentiment picks the span.
        if row.sentiment == 'neutral' or len(text.split()) <= 2:
            selected_texts.append(text)
        elif row.sentiment == 'positive':
            selected_texts.append(predict_entities(text, model_pos))
        else:
            selected_texts.append(predict_entities(text, model_neg))

df_test['selected_text'] = selected_texts

In [None]:
# Load the sample submission file; its 'selected_text' column is filled in below.
df_submission = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/sample_submission.csv')

In [None]:
# Dimensions (rows, columns) of the submission frame.
df_submission.shape

In [None]:
# Dimensions (rows, columns) of the test frame.
df_test.shape

In [None]:
# Copy the predictions into the submission frame and write it to submission.csv.
# NOTE(review): the column assignment aligns on the row index, so this presumes
# df_submission and df_test list the same rows in the same order -- confirm.
df_submission['selected_text'] = df_test['selected_text']
df_submission.to_csv("submission.csv", index=False)
display(df_submission.head(10))