In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
import sys # to get error message when exception occurs.
import re
import datetime as dt

import spacy
import random
from spacy.util import compounding
from spacy.util import minibatch
from spacy.util import decaying
from spacy import displacy

import warnings
warnings.filterwarnings("ignore")

In [None]:
#activated = spacy.require_gpu()#prefer_gpu()
#print(f'is GPU activited for spacy: {activated}')

In [None]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
#Load train and test data
train_data = pd.read_csv('../input/tweet-sentiment-extraction/train.csv')
test_data = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')

print(f'train_data.shape: {train_data.shape}')
print(f'test_data.shape: {test_data.shape}')
train_data.head(10)

In [None]:
test_data.head()

In [None]:
#Train data: Check for null entries
train_data.isnull().mean()

In [None]:
#Test data: Check for null entries
test_data.isnull().mean()

In [None]:
#train_data contains 1 null entry, therefore it's safe to drop it.
train_data[train_data.text.isnull() == True]

In [None]:
# Drop the rows with missing values and rebuild a clean 0..n-1 index.
# `drop=True` discards the old index instead of inserting it as an extra
# 'index' column, and plain reassignment keeps this cell safe to re-run
# (repeated in-place reset_index() calls keep inserting index columns and
# eventually raise because the column name already exists).
train_data = train_data.dropna().reset_index(drop=True)
len(train_data)

In [None]:
train_data.sentiment.value_counts()

In [None]:
print(f'train_data sentiment unique entries: {train_data.sentiment.unique()}')
print(f'test_data sentiment unique entries: {test_data.sentiment.unique()}')

In [None]:
#Validate selected_text match with text in train_data

# Positions of rows whose selected_text is / is not a substring of text.
validIdx = []
invalidIdx = []
# Walk both columns in lock-step; enumerate() yields the same positional row
# numbers that were previously used for the .iloc lookups.
for row, (txt, sel_txt) in enumerate(zip(train_data.text, train_data.selected_text)):
    try:
        if sel_txt in txt:
            validIdx.append(row)
        else:
            invalidIdx.append(row)
    except TypeError as e:
        # A non-string cell (e.g. NaN) cannot be used with `in`; report the
        # offending row and keep going instead of aborting the whole check.
        print(f'{type(e).__name__}: {e}')
        print(txt)

len(validIdx), len(invalidIdx)

In [None]:
# Predicate selecting the neutral-sentiment rows; reused by later cells.
f1 = lambda d: d.sentiment == 'neutral'
train_data.loc[f1]

# Character-count gap between each tweet and its selected span, computed on
# the raw strings and again after trimming surrounding whitespace.
train_data['txt_and_sel_txt_diff'] = (
    train_data.text.str.len() - train_data.selected_text.str.len()
)
train_data['txt_and_sel_txt_diff_strip'] = (
    train_data.text.str.strip().str.len()
    - train_data.selected_text.str.strip().str.len()
)


In [None]:
# Frequency of every positive length gap among neutral tweets, for the raw
# and the whitespace-stripped measurements respectively.
_neutral = train_data[train_data.sentiment == 'neutral']
lst = _neutral.txt_and_sel_txt_diff[_neutral.txt_and_sel_txt_diff > 0].value_counts()
lst_strip = _neutral.txt_and_sel_txt_diff_strip[_neutral.txt_and_sel_txt_diff_strip > 0].value_counts()

In [None]:
lst[lst > 10]

In [None]:
lst_strip[lst_strip > 3]

In [None]:
train_data.loc[lambda d: d.sentiment == 'neutral'].loc[lambda d: d.txt_and_sel_txt_diff > 10]

In [None]:
train_data.loc[f1].loc[lambda d: d.txt_and_sel_txt_diff_strip > 10]

In [None]:
train_data.loc[f1].loc[lambda d: d.txt_and_sel_txt_diff_strip > 1].loc[lambda d: d.text.str.find('http') > -1]

In [None]:
def find_url(string): 
    """Return every URL-like substring of ``string`` in order of appearance.

    A match starts with ``http://``/``https://``, a ``www.`` style prefix, or
    a ``domain.tld/`` prefix; the pattern's first capture group holds the
    whole URL and is what gets returned.
    """
    url_pattern = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    return [match.group(1) for match in re.finditer(url_pattern, string)]

def replace_url(txt):
    """Return ``txt`` with every URL reported by ``find_url`` removed.

    Each URL is deleted with ``str.replace``, so all of its occurrences
    disappear; the surrounding whitespace is left untouched.
    """
    cleaned = txt
    for url in find_url(txt):
        cleaned = cleaned.replace(url, "")
    return cleaned

print(replace_url('Hello this www.google.com is google website!!!'))

def cust_strip(df, lst):
    """Trim leading/trailing whitespace in the given string columns, in place.

    df  : DataFrame whose columns are overwritten.
    lst : iterable of column names to clean.
    Returns None; ``df`` itself is modified.
    """
    for column in lst:
        stripped = df[column].str.strip()
        df[column] = stripped

#Remove space at the begining of text and selected_text.
cust_strip(train_data, ['text', 'selected_text'])

#Remove url as they are not part of selected text as per train data
#train_data.text = train_data.text.apply(replace_url)

train_data.head()

In [None]:
#Remove space at the begining of text and selected_text.
cust_strip(test_data, ['text'])

#Remove url as they are not part of selected text as per train data
#test_data.text = test_data.text.apply(replace_url)
test_data.head()

In [None]:
#Max length of: text (all rows), then selected_text for neutral, positive and negative rows
print(train_data['text'].str.len().max(), 
    train_data.loc[lambda d: d.sentiment == 'neutral']['selected_text'].str.len().max(),
    train_data.loc[lambda d: d.sentiment == 'positive']['selected_text'].str.len().max(),
    train_data.loc[lambda d: d.sentiment == 'negative']['selected_text'].str.len().max())

#Min length of: text (all rows), then selected_text for neutral, positive and negative rows
print(train_data['text'].str.len().min(), 
    train_data.loc[lambda d: d.sentiment == 'neutral']['selected_text'].str.len().min(),
    train_data.loc[lambda d: d.sentiment == 'positive']['selected_text'].str.len().min(),
    train_data.loc[lambda d: d.sentiment == 'negative']['selected_text'].str.len().min())

In [None]:
train_data.loc[lambda d: d.selected_text.str.len() <3]

In [None]:
#Function prepares data for given row for Spacy model.

def parse_data(df, idx,lFlag=False):
    """Build one spaCy NER training example from row ``idx`` of ``df``.

    The row's ``text``, ``selected_text`` and ``sentiment`` are lower-cased.
    When the selected text occurs in the text, the first occurrence becomes
    a single entity labelled with the sentiment.

    Parameters
    ----------
    df : DataFrame with ``text``, ``selected_text`` and ``sentiment`` columns.
    idx : positional row number (used with ``.iloc``).
    lFlag : when True, print the inputs and whether the computed span
        reproduces the selected text.

    Returns
    -------
    ``(text, {'entities': [(start, end, sentiment)]})`` on a match, or the
    empty string ``''`` when the selected text is not found in the text.
    """
    row = df.iloc[idx]  # fetch the row once instead of once per column
    txt = row.text.lower()
    sel_txt = row.selected_text.lower()
    senti = row.sentiment.lower()
    parsedTxt = ''

    if lFlag: # Logging
        print(f'row: {idx}')
        print(f'text:{txt}')
        print(f'sel_txt:{sel_txt}')

    start = txt.find(sel_txt)
    if start != -1:
        end = start + len(sel_txt)
        parsedTxt = (txt, {'entities': [(start, end, senti)]})

    if lFlag: # Logging
        if parsedTxt:
            s, e = parsedTxt[1]["entities"][0][:2]
            print(f'sel_txt and parsedTxt matched => {txt[s:e] == sel_txt}')
        else:
            # No span was built, so there is nothing to index into; report
            # the mismatch instead of failing on the empty result.
            print('sel_txt and parsedTxt matched => False')

    return parsedTxt

#Validate function parse_train_data 
for row in range(5):
    print(parse_data(train_data, row))
    print()


#Function to create spacy blank model and add custom labels to ner.
def create_blank_nlp(parsed_data):
    """Create a blank English pipeline with an NER component and its labels.

    Parameters
    ----------
    parsed_data : iterable of ``(text, {'entities': [(start, end, label), ...]})``
        examples as produced by ``parse_data``. Falsy entries (``parse_data``
        returns ``''`` when it cannot locate the span) are skipped.

    Returns
    -------
    The new pipeline; every label found in ``parsed_data`` has been added to
    its ``ner`` component.
    """
    nlp = spacy.blank('en')
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
    for example in parsed_data:
        if not example:
            continue  # row without a usable annotation
        _, annotations = example
        # Register the label of every well-formed entity, not only the first,
        # and tolerate examples whose entity list is empty.
        for entity in annotations.get("entities", []):
            if len(entity) == 3:
                ner.add_label(entity[2])
    print(f'These labels added to ner: {ner.labels}')
    return nlp

 
def train_model(df, model, epoch=20, minBatch=4.0, maxBatch=16.0, lr=1.01, drop=0.5, 
                enableBatchScheme=False, batchScheme=[100,200,300,400,500]):
    """Train the NER component of ``model`` and plot the per-epoch loss.

    Parameters
    ----------
    df : list of ``(text, annotations)`` training examples. NOTE: the list is
        shuffled in place at the start of every epoch.
    model : pipeline exposing ``begin_training()`` and
        ``update(texts, annotations, sgd=..., drop=..., losses=...)``.
    epoch : number of passes over ``df``.
    minBatch, maxBatch, lr : start, stop and compounding factor handed to
        ``compounding`` to generate the mini-batch sizes.
    drop : dropout rate passed to ``model.update``.
    enableBatchScheme : when True (and ``epoch >= len(batchScheme)``), the
        batch size is fixed to successive ``batchScheme`` values, advancing
        every ``epoch / len(batchScheme)`` epochs.
    batchScheme : batch sizes used by the scheme above (never modified here).

    Returns
    -------
    None. Draws the loss curve with a dashed polynomial trend line and prints
    the loss of the last epoch.
    """
    nextBatchIdx = 0
    losses_output = []
    optimizer = model.begin_training()
    # An empty scheme can never be applied; excluding it here also avoids a
    # division by zero in the step computation below.
    use_scheme = enableBatchScheme and len(batchScheme) > 0 and epoch >= len(batchScheme)
    for i in range(epoch):
        random.shuffle(df)
        losses = {}
        if use_scheme and i % (epoch / len(batchScheme)) == 0:
            # Pin both bounds to the next scheme entry: constant batch size.
            minBatch = batchScheme[nextBatchIdx]
            maxBatch = batchScheme[nextBatchIdx]
            nextBatchIdx += 1
        batches = minibatch(df, size=compounding(minBatch, maxBatch, lr))
        for batch in batches:
            txt, annotations = zip(*batch)
            model.update(txt, annotations, sgd=optimizer,
                         drop=drop,
                         losses=losses)
        # np.max accepts a plain float, a numpy scalar or an array alike.
        losses_output.append(float(np.max(losses['ner'])))

    if not losses_output:
        return  # zero epochs requested: nothing to plot or report

    plt.figure(figsize=[20,5])
    ax = plt.axes()
    x = list(range(len(losses_output)))
    # A cubic trend needs at least four points; lower the degree for very
    # short runs so the fit stays well-defined.
    z = np.polyfit(x, losses_output, min(3, len(x) - 1))
    p = np.poly1d(z)
    ax.plot(x, p(x), 'r--')
    ax.plot(x, losses_output)
    # Report the loss itself (the epoch index was printed here before).
    print(f'final losses: {losses_output[-1]}')
        

def create_model(df):
    """Parse every row of ``df`` and build a blank NER pipeline for it.

    Parameters
    ----------
    df : DataFrame with ``text``, ``selected_text`` and ``sentiment`` columns.
        It is not modified: a re-indexed copy is used internally, so passing
        a filtered slice is safe.

    Returns
    -------
    ``(nlp, parsed)`` where ``parsed`` is the list of ``parse_data`` results
    (one per row, in row order) and ``nlp`` is the pipeline returned by
    ``create_blank_nlp(parsed)``.
    """
    df = df.reset_index(drop=True)  # local copy; the caller's index is untouched
    parse_dt = [parse_data(df, row) for row in range(len(df))]
    return create_blank_nlp(parse_dt), parse_dt

# Registry of trained pipelines, keyed by the sentiment they were trained for.
models = dict()

def collect_model(key, model):
    """Store ``model`` in the ``models`` registry under ``key`` and report it."""
    models.update({key: model})
    print(f'Model added for {key} sentiment.')
    
def get_doc(txt, sentiment, is_lower=True):
    """Run the pipeline registered for ``sentiment`` on ``txt``.

    The text is lower-cased first unless ``is_lower`` is False. Raises
    KeyError when no pipeline was registered under ``sentiment``.
    """
    pipeline = models[sentiment]
    text_in = txt.lower() if is_lower else txt
    return pipeline(text_in)
    

#Method to evaluate for submission
def jaccard(str1, str2): 
    """Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is
    ``|A & B| / |A | B|`` over the resulting word sets.

    Returns a float in [0, 1]. When neither string contains any word the two
    sets are identical, so 1.0 is returned (the plain formula would divide
    by zero).
    """
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        return 1.0
    return float(len(c)) / union_size

In [None]:
#redudent
#parsed_train_data = [parse_data(train_data,row) for row in train_data.index.tolist()] 
#nlp = create_blank_nlp(parsed_train_data)

#Visual inpection of parsed_train_data for word 'happy'. it returns ('text', 'annotation', 'selected_text')
#parsed_train_data[0:5]
search_word='happy'
#g = ((dt, anno, dt[anno['entities'][0][0]:anno['entities'][0][1]]) for dt, anno in parsed_train_data if search_word in dt.lower())

#for i in range(5):
#    print(next(g))

#train_model(parsed_train_data,nlp,70)

#nlp.to_disk('/kaggle/working/my_model')
#nlp = nlp.from_disk('/kaggle/working/my_model')

In [None]:
%%time
#Create Seaparate Models each for one sentiment
#nlp_neg, parse_dt_neg = create_model(train_data.loc[lambda d: d.sentiment == 'negative'])
#nlp_pos, parse_dt_pos = create_model(train_data.loc[lambda d: d.sentiment == 'positive'])
#nlp_nu, parse_dt_nu = create_model(train_data.loc[lambda d: d.sentiment == 'neutral'])
#collect_model('negative', nlp_neg)
#collect_model('positive', nlp_pos)
#collect_model('neutral', nlp_nu)

#Create single model for all sentiment
nlp_all , parse_dt_all = create_model(train_data)
collect_model('all', nlp_all)

In [None]:
#Visual inpection of train_data for search_word
train_data.loc[lambda d: d['text'].str.lower().str.contains(search_word)].head(10)

In [None]:
custBatchScheme = [(cnt+1)*100 for cnt in range(10)]
#custBatchScheme

In [None]:
#%%time
#train_model(parse_dt_neg,nlp_neg, 50, enableBatchScheme=True, lr=1.1, batchScheme=custBatchScheme)

In [None]:
#%%time
#train_model(parse_dt_pos,nlp_pos, 50, enableBatchScheme=True, lr=1.1, batchScheme=custBatchScheme)

In [None]:
#%%time
#train_model(parse_dt_nu,nlp_nu, 50, enableBatchScheme=True, lr=1.1, batchScheme=custBatchScheme)

In [None]:
%%time
train_model(parse_dt_all,nlp_all, 300, enableBatchScheme=True, lr=1.1, batchScheme=custBatchScheme)

In [None]:
#Validate to see if NER is working as expected

#doc = nlp_neg('Sooo SAD I will miss you here in San Diego!!!'.lower())
doc = nlp_all('Sooo SAD I will miss you here in San Diego!!!'.lower())
displacy.render(doc, style='ent')

In [None]:
#Validate the first 25 training records to see if NER is working as expected
#(the random pick is commented out below, so idx simply runs 0..24)
for i in range(25):
    idx = i#random.choices(train_data.index)[0]
    doc = nlp_all(train_data.iloc[idx].text.lower())
    #doc = get_doc(train_data.iloc[idx].text, train_data.iloc[idx].sentiment)
    displacy.render(doc, style='ent')
    print(f'row index: {idx}')
    print(f'predicted selected_text: {doc.ents}')
    print(f'   actual selected_text: {train_data.iloc[idx].selected_text}')
    # Stays an empty tuple when the model found no entity for this row.
    senti = ()
    if len(doc.ents) > 0:
        # Only the label of the first predicted entity is reported.
        senti = doc.ents[0].label_
    print(f'predicted sentiment: {senti}')
    print(f'   actual sentiment: {train_data.iloc[idx].sentiment}')


In [None]:
#Prediction with nlp model with all entities i.e. neutral AND positive AND negative
# Results are collected in plain lists and written back as whole columns.
# Writing cell by cell through test_data[col].iloc[row] is chained assignment,
# which pandas does not guarantee to propagate to test_data.
pred_sentiments = []
pred_spans = []
pred_flags = []
for txt in test_data['text']:
    doc = nlp_all(txt.lower())
    senti = ''
    txt_predict = ''
    is_predi = 0
    if len(doc.ents) > 0:
        # label -> position of the last entity carrying that label
        all_senti = {ent.label_: idx for idx, ent in enumerate(doc.ents)}
        # Priority when several labels are present: negative, then positive,
        # otherwise neutral.
        if 'negative' in all_senti:
            senti = 'negative'
        elif 'positive' in all_senti:
            senti = 'positive'
        else:
            senti = 'neutral'
        is_predi = 1
        txt_predict = doc.ents[all_senti[senti]].text
    pred_sentiments.append(senti)
    pred_spans.append(txt_predict)
    pred_flags.append(is_predi)

test_data["predict"] = pred_sentiments
test_data["txt_predict"] = pred_spans
test_data["is_predicted"] = pred_flags
    

In [None]:
# Only the combined model is registered (under the key 'all'); asking for
# 'positive' would raise KeyError.
doc = get_doc('I know him. he is good guy!', 'all')
displacy.render(doc, style='ent')
# Show every predicted entity; this is simply an empty list when the model
# finds none, instead of failing on doc.ents[0].
[(ent.label_, ent.text) for ent in doc.ents]

In [None]:
#Prediction with individual nlp model with entity neutral OR positive OR negative
# test_data.head()
# test_data["predict"] = ''
# test_data['txt_predict']=''
# test_data['is_predicted'] = 0
# for row in test_data.index.tolist():
#     doc = get_doc(test_data.iloc[row].text, test_data.iloc[row].sentiment)
#     test_data["is_predicted"].iloc[row] = 0
#     test_data["predict"].iloc[row] = 'cannot_predict'
#     if len(doc.ents) > 0:
#         test_data["predict"].iloc[row] = doc.ents[0].label_
#         test_data["txt_predict"].iloc[row] = doc.ents[0].text
#         test_data["is_predicted"].iloc[row] = 1

In [None]:
d = test_data[test_data["is_predicted"] == 1]
d.head()

In [None]:
# Summary of how many test rows received no NER prediction at all.
print(f'            sentiment: {test_data.sentiment.unique()}')
print(f'              predict: {test_data.predict.unique()}')
total_rec = len(test_data)
cnt = len(test_data[test_data.is_predicted == 0])
print(f'   count of test_data: {total_rec}')
print(f'couldnt predict count: {cnt}')
# Share of unpredicted rows; guarded so an empty frame does not divide by zero.
cnt_percent = cnt/total_rec if total_rec else 0.0
# This line reports a proportion, so label and format it as one.
print(f'couldnt predict ratio: {cnt_percent:.2%}')

In [None]:
confusion_matrix(test_data.sentiment, test_data.predict)

In [None]:
print(classification_report(test_data.sentiment, test_data.predict))

# Sentiment Analysis using VADER (Valence Aware Dictionary and sEntiment Reasoner)

In [None]:
pip install vaderSentiment

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
analyzer = SentimentIntensityAnalyzer()

In [None]:
def v_sentiment(x):
    """Return the VADER polarity scores computed by ``analyzer`` for text ``x``."""
    scores = analyzer.polarity_scores(x)
    return scores

def v_sentiment_class(x):
    """Map a VADER score dict to 'negative', 'neutral' or 'positive'.

    Uses the thresholds documented by VADER on ``x['compound']``:
    ``<= -0.05`` is negative, ``>= 0.05`` is positive, anything strictly in
    between is neutral. Both boundary values are inclusive, so a compound
    score of exactly 0.05 counts as positive.
    """
    compound = x['compound']
    if compound <= -0.05:
        return 'negative'
    if compound >= 0.05:
        return 'positive'
    return 'neutral'

nlp1 = spacy.load('en_core_web_sm')
def rem_stop_word(txt):
    """Return ``txt`` with stop words removed.

    The text is tokenised with ``nlp1``; tokens whose vocabulary entry is
    flagged ``is_stop`` are dropped and the rest are joined with single
    spaces.
    """
    kept = []
    for word in nlp1(txt):
        if not nlp1.vocab[word.text].is_stop:
            kept.append(word.text)
    return ' '.join(kept)
    

In [None]:
#Prediction - with stop words

# Score every tweet once; the class and the compound score are both derived
# from that stored result instead of running VADER over the column twice.
test_data['v_senti_sw'] = test_data['text'].apply(v_sentiment)
test_data['v_senti_sw_class'] = test_data['v_senti_sw'].apply(v_sentiment_class)
test_data['v_senti_sw_c_score'] = [scores['compound'] for scores in test_data['v_senti_sw']]
test_data.head()


In [None]:
test_data.sentiment.unique()

In [None]:
test_data.v_senti_sw_class.unique()

In [None]:
confusion_matrix(test_data.sentiment, test_data.v_senti_sw_class)

In [None]:
#Prediction - including stop words
print(classification_report(test_data.sentiment, test_data.v_senti_sw_class))

In [None]:
# find out data for which sentiment and vader prediction doesn't match
df = test_data.loc[lambda d: d['sentiment'] != d['v_senti_sw_class']]
df.head()

In [None]:
#1230 records sentiment doesn't match with vader prediction
df.sentiment.value_counts()

In [None]:
#Neutral sentiment (given data) was predicted incorrect by vader 
df.loc[lambda d: d.sentiment == 'neutral'].v_senti_sw_class.value_counts()

In [None]:
#Positive sentiment (given data) was predicted incorrect by vader 
df.loc[lambda d: d.sentiment == 'positive'].v_senti_sw_class.value_counts()

In [None]:
#Negative sentiment (given data) was predicted incorrect by vader 
df.loc[lambda d: d.sentiment == 'negative'].v_senti_sw_class.value_counts()

In [None]:
#Sample of data in which given sentiment wasn't predicted by vader correctly
test_data[test_data.textID == '00d5195223']

In [None]:
#Prediction - without stop words

# Stop-word removal runs the spaCy pipeline on every tweet, so do it (and the
# VADER scoring) once and derive the other columns from the stored scores.
test_data['v_senti_nsw'] = test_data['text'].apply(rem_stop_word).apply(v_sentiment)
test_data['v_senti_nsw_class'] = test_data['v_senti_nsw'].apply(v_sentiment_class)
test_data['v_senti_nsw_c_score'] = [scores['compound'] for scores in test_data['v_senti_nsw']]
test_data.head()

In [None]:
#Matrix between vader prediction "with" and vader prediction "without" stop word.
confusion_matrix(test_data.v_senti_sw_class, test_data.v_senti_nsw_class)

In [None]:
#Classification report between vader prediction "with" and vader prediction "without" stop word.
print(classification_report(test_data.v_senti_sw_class, test_data.v_senti_nsw_class))

In [None]:
#Classification report between given sentiment and vader prediction "with" stop word.
print(classification_report(test_data.sentiment, test_data.v_senti_sw_class))

In [None]:
#Classification report between given sentiment and vader prediction "without" stop word.
print(classification_report(test_data.sentiment, test_data.v_senti_nsw_class))

In [None]:
#Classification report between given sentiment and spacy ner prediction "with" stop word.

print(classification_report(test_data.sentiment, test_data.predict))

In [None]:
# Build the submission frame: textID plus the predicted span, renamed to the
# 'selected_text' column. rename() returns a new frame with a flat column
# index; assigning a nested list to .columns would create a MultiIndex and
# would do so on a slice of test_data.
sub_df = test_data[['textID', 'txt_predict']].rename(columns={'txt_predict': 'selected_text'})
sub_df.to_csv('/kaggle/working/submission.csv', index=False, header=True)
print("done")

In [None]:
test_data.loc[test_data.textID =='1fa8e6ad66']

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session