In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Step 1: Let us start by loading the training set and exploring it, for example its null values and its distributions.

In [None]:
# Load the labelled training tweets and preview the first five rows.
train_df = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/train.csv")
train_df.head()

Here `text` is the full-length tweet, `sentiment` is the class label, and `selected_text` is the part of the text that supports the given sentiment.

In [None]:
# Column dtypes and non-null counts; used here to spot missing values.
train_df.info()

Step 2: There is a row with null values. Since it is only a single row, it can be dropped.

In [None]:
# Drop rows that contain missing values; the frame is rebound rather than mutated in place.
train_df = train_df.dropna()

Step 3: To find the relation between `text` and `selected_text`, we compute the Jaccard similarity between the two texts. It is the size of the intersection of their word sets divided by the size of their union: $J(A, B) = |A \cap B| / |A \cup B|$.

In [None]:
def jacquard_f(text1, text2):
    """Return the Jaccard similarity of two texts, compared as sets of words.

    Both texts are lower-cased and split on whitespace; the score is the size
    of the intersection of the two word sets divided by the size of their union.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A float in [0, 1]. When neither text contains a word the union is
        empty; the texts are then treated as identical and 1.0 is returned
        instead of dividing by zero.
    """
    words1 = set(text1.lower().split())
    words2 = set(text2.lower().split())
    union_size = len(words1 | words2)
    if union_size == 0:
        # Two empty (or whitespace-only) texts: avoid ZeroDivisionError.
        return 1.0
    return len(words1 & words2) / union_size
    

In [None]:
# Score every (text, selected_text) pair with the Jaccard helper.
jacquard = pd.DataFrame(
    [
        [tweet, selection, jacquard_f(tweet, selection)]
        for tweet, selection in zip(train_df["text"], train_df["selected_text"])
    ],
    columns=["text", "selected_text", "jac"],
)
# Both frames carry a selected_text column, so merging on "text" yields the
# suffixed columns selected_text_x / selected_text_y next to "jac".
train_df = train_df.merge(jacquard, how="outer", on="text")
train_df.head(3)

Step 4: Let us plot the Jaccard similarity for the different sentiment categories, i.e. positive, negative and neutral. We use a KDE (kernel density estimate) plot, which is similar to a smoothed histogram.


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One shaded KDE curve of the Jaccard score per sentiment class, drawn in order.
p1, p2, p3 = (
    sns.kdeplot(train_df[train_df['sentiment'] == label]['jac'], shade=True, color=colour)
    for label, colour in (('positive', 'r'), ('negative', 'b'), ('neutral', 'g'))
)

It can be seen that there are peaks towards 1, i.e. high similarity between text and selected text. We can devise a method that exploits this high similarity, since the selected text is often close to the given text. One particular case is when the text has only a few words: there is a high chance that the selected text is the same as the text.

Step4: To work on texts with few words, we need a column that holds the number of words in each text.

In [None]:
# Number of whitespace-separated tokens in each tweet.
train_df['num_words_text'] = train_df['text'].map(lambda tweet: len(str(tweet).split()))

Step 5: Let us look at the Jaccard similarity for texts with few words.

In [None]:
# Tweets containing at most two words.
less_three = train_df[train_df['num_words_text'] <= 2]
# Select 'jac' before aggregating: averaging the whole frame would also try to
# average the text columns, which newer pandas versions reject with a TypeError.
# Printed explicitly because only the last expression of a cell is displayed.
print(less_three.groupby('sentiment')['jac'].mean())
less_three.head(5)

It can be seen that for these very short tweets text and selected_text are the same. Now let's look at tweets with three or more words. First we clean the text, which is very important for getting good models: junk data reduces the effectiveness of the models.

In [None]:
from nltk.corpus import stopwords
# English stop-word list; clean_text drops any token found in it.
stopword = stopwords.words('english')

In [None]:
import re
#
import string
def clean_text(text, stop_words=None):
    """Normalise a tweet and return its remaining words as a list.

    Steps, in order: lower-case; remove square-bracketed spans, URLs,
    angle-bracketed tags, punctuation, newlines and every token that contains
    a digit; split on whitespace; drop stop words.

    Args:
        text: Raw text to clean.
        stop_words: Optional collection of words to discard. When omitted, the
            notebook-level ``stopword`` list is used.

    Returns:
        List of cleaned, lower-cased words in their original order.
    """
    if stop_words is None:
        stop_words = stopword
    # A set gives constant-time membership tests instead of scanning a list per token.
    stop_words = set(stop_words)

    # Raw strings keep regex escapes such as \[ \S \w \d from being read as
    # (invalid) Python string escapes.
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return [token for token in text.split() if token not in stop_words]
# Tokenise the full tweet and the selected_text_x column into cleaned word lists.
train_df['list_words'] = train_df['text'].apply(clean_text)
train_df['list_words_selected'] = train_df['selected_text_x'].apply(clean_text)

In [None]:
# Peek at the cleaned word lists of the first three rows.
train_df['list_words'].head(3)

Step 6: Let us obtain the most common words in the selected text and in the text, as they might play a greater role in the process.

In [None]:
from collections import Counter
# Corpus-wide word frequencies for the cleaned tweets and the cleaned selected texts.
top_words_text = Counter(word for words in train_df['list_words'] for word in words)
top_words_selected_text = Counter(word for words in train_df['list_words_selected'] for word in words)

Step 7: Now let us see the common words for each category, i.e. positive, negative and neutral.

In [None]:
# Split the frame into one subset per sentiment label.
positives, negatives, neutrals = (
    train_df[train_df['sentiment'] == label]
    for label in ('positive', 'negative', 'neutral')
)

In [None]:
# The 20 most frequent cleaned words in positive tweets.
top = Counter(word for words in positives['list_words'] for word in words)
temp_positive = pd.DataFrame(top.most_common(20), columns=['Common_words', 'count'])
temp_positive

In [None]:
# The 20 most frequent cleaned words in negative tweets.
top = Counter(word for words in negatives['list_words'] for word in words)
temp_negative = pd.DataFrame(top.most_common(20), columns=['Common_words', 'count'])
temp_negative

In [None]:
# The 20 most frequent cleaned words in neutral tweets.
top = Counter(word for words in neutrals['list_words'] for word in words)
temp_neutral = pd.DataFrame(top.most_common(20), columns=['Common_words', 'count'])
temp_neutral

In [None]:
import plotly.express as px
# One treemap per sentiment, with tiles sized by the 'count' column.
for frame, title in (
    (temp_positive, 'Common Postive Words'),
    (temp_negative, 'Common Negative Words'),
    (temp_neutral, 'Common Neutral Words'),
):
    fig = px.treemap(frame, path=['Common_words'], values='count', title=title)
    fig.show()

However, some common words appear across all the categories. It would be more meaningful to have words that are specific to a single category, so that they can strongly indicate the sentiment.

In [None]:
def get_unique_words(sentiment, numwords, raw_words):
    """Return the most frequent words that occur only in tweets of one sentiment.

    Reads the cleaned word lists from the notebook-level ``train_df``
    (columns ``sentiment`` and ``list_words``).

    Args:
        sentiment: Sentiment label whose exclusive words are wanted.
        numwords: Maximum number of words to return.
        raw_words: Iterable of candidate words; only words present here can be kept.

    Returns:
        DataFrame with columns ``words`` and ``count``, most frequent first.
    """
    # Words seen in any tweet of a different sentiment. Sets replace the earlier
    # lists, whose `not in` scans made the filtering quadratic in vocabulary size.
    other_words = {
        word
        for item in train_df[train_df.sentiment != sentiment]['list_words']
        for word in item
    }
    category_words = {word for word in raw_words if word not in other_words}

    # Count every word of the requested sentiment ...
    newcounter = Counter()
    for item in train_df[train_df.sentiment == sentiment]['list_words']:
        newcounter.update(item)
    # ... then discard the ones that are not exclusive to it.
    for word in list(newcounter):
        if word not in category_words:
            del newcounter[word]
    return pd.DataFrame(newcounter.most_common(numwords), columns=['words', 'count'])

In [None]:
# Flatten every cleaned tweet into one list of candidate words.
raw_text = []
for word_list in train_df['list_words']:
    raw_text.extend(word_list)
# Top 20 sentiment-exclusive words for each class.
unique_positive, unique_negative, unique_neutral = (
    get_unique_words(label, 20, raw_text)
    for label in ('positive', 'negative', 'neutral')
)
unique_positive

Step 8: Given these strong words, let us train models for the given task. First let us try NER, i.e. Named Entity Recognition. Let us begin by creating a model for positive sentiments. Our data must be converted to entities in order to model it as an NER problem. In particular, we consider only those tweets with at least 3 words. For the rest, the whole text is used as the selected text.

In [None]:
# Reload the raw CSVs for modelling; train_df is rebound, so columns added to it earlier are gone.
train_df = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/train.csv')
test_df = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/test.csv')
# Keep only tweets of three or more words for training.
train_df['text_length'] = train_df['text'].map(lambda tweet: len(str(tweet).split()))
train_df = train_df.loc[train_df['text_length'] >= 3]

In [None]:
def format_data(sentiment, df=None):
    """Build NER training examples for one sentiment.

    Each example is ``(text, {"entities": [[start, end, 'selected_text']]})``
    where ``start``/``end`` are the character offsets of ``selected_text``
    inside ``text``.

    Args:
        sentiment: Only rows whose ``sentiment`` equals this label are used.
        df: DataFrame with ``text``, ``selected_text`` and ``sentiment``
            columns. Defaults to the notebook-level ``train_df``.

    Returns:
        List of training tuples. Rows whose text or selected text is not a
        string, or whose selected text does not occur in the text, are skipped
        instead of being emitted with a bogus ``-1`` start offset.
    """
    if df is None:
        df = train_df
    formatted_data = []
    rows = df[df['sentiment'] == sentiment]
    for text, selected_text in zip(rows['text'], rows['selected_text']):
        # Missing values are read as non-string objects; they cannot be searched.
        if not isinstance(text, str) or not isinstance(selected_text, str):
            continue
        start = text.find(selected_text)
        if start == -1:
            # str.find signals "not found" with -1, which is not a valid offset.
            continue
        end = start + len(selected_text)
        formatted_data.append((text, {"entities": [[start, end, 'selected_text']]}))
    return formatted_data

Step 9: We train two NLP pipelines that perform NER, one for positive texts and one for negative texts.

In [None]:
def train(train_data, output_path, n_iter=20, model=None):
    """Train (or continue training) a spaCy NER pipeline and save it to disk.

    Written against the spaCy v2-style training calls already used here
    (``create_pipe``, ``begin_training``, ``nlp.update(texts, annotations, ...)``).

    Args:
        train_data: Sequence of ``(text, {"entities": [[start, end, label]]})``
            examples. The sequence itself is not modified.
        output_path: Directory the trained pipeline is written to.
        n_iter: Number of passes over the training data.
        model: ``None`` starts from a blank English pipeline. A string is
            loaded as the name/path of the pipeline to continue training; any
            other non-None value resumes from ``output_path``.
    """
    if model is not None:
        # Previously the argument was ignored and output_path was always loaded.
        # A string now selects the pipeline to load; other values keep the old
        # "resume from output_path" meaning.
        source = model if isinstance(model, str) else output_path
        nlp = spacy.load(source)
    else:
        nlp = spacy.blank("en")

    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Register every entity label that appears in the training data.
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # Shuffle a private copy so the caller's list keeps its order.
    train_data = list(train_data)

    # Only the NER component is trained; every other pipe is disabled meanwhile.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):
        if model is None:
            nlp.begin_training()
        else:
            nlp.resume_training()

        for itn in tqdm(range(n_iter)):
            random.shuffle(train_data)
            batches = minibatch(train_data, size=compounding(4.0, 500.0, 1.001))
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,  # batch of texts
                           annotations,  # batch of annotations
                           drop=0.5,   # dropout - make it harder to memorise data
                           losses=losses,
                           )
            print("Losses", losses)
    nlp.meta["name"] = "st_ner"
    nlp.to_disk(output_path)

In [None]:
import spacy
from tqdm import tqdm
import random
from spacy.util import minibatch, compounding
# Train the extractor for positive tweets: 3 passes, starting from a blank pipeline.
sentiment = 'positive'
train_data_positive = format_data(sentiment)
train(train_data_positive, output_path='positive', n_iter=3)

In [None]:
# Train the extractor for negative tweets: 3 passes, starting from a blank pipeline.
sentiment = 'negative'
train_data_negative = format_data(sentiment)
train(train_data_negative, output_path='negative', n_iter=3)

Step 10: Using the trained model, we predict the named entity selected_text for each text

In [None]:
def predict_entities(text, model):
    """Return the text of the first entity the model finds, or the whole text.

    Args:
        text: Text to run the model on.
        model: Callable that takes the text and returns an object whose
            ``ents`` are items with ``text`` and ``label_`` attributes.

    Returns:
        The slice of ``text`` covered by the first distinct entity span, where
        each span is located by searching ``text`` for the entity's text.
        ``text`` itself is returned when no entity is found.
    """
    seen_spans = []
    for entity in model(text).ents:
        begin = text.find(entity.text)
        span = [begin, begin + len(entity.text), entity.label_]
        if span not in seen_spans:
            seen_spans.append(span)
    if not seen_spans:
        return text
    first_begin, first_end, _ = seen_spans[0]
    return text[first_begin:first_end]

In [None]:
# Load the two trained extractors once, before looping over the test set.
model_pos = spacy.load('positive')
model_neg = spacy.load('negative')

predicted_selected_text = []
# Iterate over the two needed columns directly instead of building a Series per
# row with iterrows; the unused output_str placeholder is gone.
for text, sentiment_label in zip(test_df['text'], test_df['sentiment']):
    if sentiment_label == 'neutral' or len(text.split()) <= 2:
        # Neutral and very short tweets: predict the whole tweet.
        predicted_selected_text.append(text)
    elif sentiment_label == 'positive':
        predicted_selected_text.append(predict_entities(text, model_pos))
    else:
        predicted_selected_text.append(predict_entities(text, model_neg))

test_df['selected_text'] = predicted_selected_text

Step 11: we submit the predicted selected texts!!

In [None]:
# Fill the sample submission with the predictions (values are aligned on the row index) and save it.
submission_df = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/sample_submission.csv')
submission_df = submission_df.assign(selected_text=test_df['selected_text'])
submission_df.to_csv("submission.csv", index=False)
submission_df.head(10)
