In [None]:
import pandas as pd
import os
import re
import string
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.figure_factory as ff
from plotly.offline import iplot
from collections import Counter
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import nltk
from nltk.corpus import stopwords
import spacy
import random
from spacy.util import compounding
from spacy.util import minibatch

import warnings
warnings.filterwarnings("ignore")

import os
# Enumerate every data file mounted under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)





In [None]:
# Load the train and test CSVs of the tweet-sentiment-extraction dataset
# (paths are relative to the notebook's working directory).
train_data=pd.read_csv(r"../input/tweet-sentiment-extraction/train.csv")
test_data=pd.read_csv(r"../input/tweet-sentiment-extraction/test.csv")
# Bare last expression: the notebook renders the training frame as the cell output.
train_data

In [None]:
# Preview the first rows of the test set.
test_data.head()

In [None]:
# Summary statistics of the test set columns.
test_data.describe()

In [None]:
# Summary statistics of the training set columns.
train_data.describe()

In [None]:
# Missing values per column: the train counts are printed, the test counts are
# rendered as the cell's output value.
train_null_counts = train_data.isnull().sum()
print(train_null_counts)
test_data.isnull().sum()

In [None]:
# Drop rows that contain missing values. `DataFrame.dropna()` returns a new
# frame instead of modifying the original, so the result has to be assigned
# back; otherwise this cell has no effect. The index is rebuilt so the frame
# keeps a gap-free 0..n-1 index after rows are removed.
train_data = train_data.dropna().reset_index(drop=True)
train_data.head()

In [None]:
# Class balance of the sentiment label, one figure per split. Each figure is
# drawn on its own explicit Axes and given a title, so the two otherwise
# identical-looking charts can be told apart when the notebook is skimmed.
for split_name, frame in (('train', train_data), ('test', test_data)):
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.countplot(x='sentiment', data=frame, ax=ax)
    ax.set_title('Sentiment counts (%s data)' % split_name)

In [None]:

# 2x2 grid of 'domain'-type subplots showing the sentiment label distribution:
# a donut chart and a funnel-area chart per split, train data on the first row
# and test data on the second. Titles are built from the split name so the
# train/test captions cannot drift apart (the original train donut title was
# misspelled "tain data").
fig = make_subplots(
    rows=2, cols=2,
    specs=[[{'type': 'domain'}, {'type': 'domain'}],
           [{'type': 'domain'}, {'type': 'domain'}]],
)
for grid_row, (split_name, frame) in enumerate((('train', train_data), ('test', test_data)), start=1):
    labels = frame['sentiment']
    fig.add_trace(
        go.Pie(labels=labels, hole=.3,
               title={"position": "top center",
                      "text": "Donut_chart of Sentiment Distribution(%s data)" % split_name}),
        grid_row, 1)
    fig.add_trace(
        go.Funnelarea(labels=labels,
                      title={"position": "top center",
                             "text": "Funnel-Chart of Sentiment Distribution(%s data)" % split_name}),
        grid_row, 2)
# Render explicitly, matching the other plotly cells in this notebook.
fig.show()


In [None]:
def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both inputs are cast with ``str()``, lower-cased and split on whitespace;
    the score is ``|A & B| / |A | B|`` over the resulting sets of words.

    Args:
        str1: First text (any object; converted with ``str``).
        str2: Second text (any object; converted with ``str``).

    Returns:
        float in [0, 1]. When neither input contains any word the union is
        empty and the ratio is undefined; 0.5 is returned in that case instead
        of raising ``ZeroDivisionError`` (the same convention used by the
        competition's published metric).
    """
    a = set(str(str1).lower().split())
    b = set(str(str2).lower().split())
    # Guard the empty-vs-empty case: the denominator below would be zero.
    if not a and not b:
        return 0.5
    c = a & b
    return float(len(c)) / (len(a) + len(b) - len(c))

In [None]:
# Jaccard overlap between every tweet and its selected text, in row order.
results = [
    jaccard(tweet, selection)
    for tweet, selection in zip(train_data['text'], train_data['selected_text'])
]
    

In [None]:
def _word_count(value):
    """Number of whitespace-separated tokens in ``str(value)``."""
    return len(str(value).split())


# Attach the per-row Jaccard score and word-count features to the training frame.
# Note: the score column name really does contain surrounding spaces.
train_data[' jaccard_score '] = np.array(results)
train_data['Num_words_ST'] = train_data['selected_text'].apply(_word_count)
train_data['Num_word_text'] = train_data['text'].apply(_word_count)
# How many more words the full tweet has than its selected span.
train_data['difference_in_words'] = train_data['Num_word_text'].sub(train_data['Num_words_ST'])
train_data.head()

In [None]:
# List the columns now present on the training frame.
train_data.columns

In [None]:
# Mean Jaccard score per sentiment class (the column name includes the
# surrounding spaces), displayed as a percentage.
k=train_data.groupby('sentiment')[' jaccard_score '].mean()
k*100

In [None]:
# Compare the word-count distribution of the selected text with that of the full tweet.
hist_data = [train_data['Num_words_ST'],train_data['Num_word_text']]
group_labels =['Selected_Text', 'Text']
# Histogram-only distplot (show_curve=False); no bin_size argument is passed here.
fig = ff.create_distplot(hist_data, group_labels,show_curve=False)
fig.update_layout(title_text='Distribution of Number Of words')
fig.show()
# Same comparison as kernel density estimates on one matplotlib figure:
# red = selected text, blue = full tweet.
plt.figure(figsize=(12,6))
# p1 is first bound to the Text object returned by set_title, then rebound to the second kdeplot's return value.
p1=sns.kdeplot(train_data['Num_words_ST'], shade=True, color="r").set_title('Kernel Distribution of Number Of words')
p1=sns.kdeplot(train_data['Num_word_text'], shade=True, color="b")

In [None]:
# KDE of (tweet word count - selected-text word count) for positive (blue)
# and negative (red) tweets, drawn on the same figure.
plt.figure(figsize=(12,6))
# p1 holds the title Text object (return value of set_title), not the Axes.
p1=sns.kdeplot(train_data[train_data['sentiment']=='positive']['difference_in_words'], shade=True, color="b").set_title('Kernel Distribution of Difference in Number Of words')
p2=sns.kdeplot(train_data[train_data['sentiment']=='negative']['difference_in_words'], shade=True, color="r")


In [None]:
# KDE of the per-row Jaccard score for positive (blue) and negative (red)
# tweets, drawn on the same figure.
plt.figure(figsize=(12,6))
# p1 holds the title Text object (return value of set_title), not the Axes.
p1=sns.kdeplot(train_data[train_data['sentiment']=='positive'][' jaccard_score '], shade=True, color="b").set_title('Kernel Distribution of Difference in jaccard_score')
p2=sns.kdeplot(train_data[train_data['sentiment']=='negative'][' jaccard_score '], shade=True, color="r")


In [None]:
# Lower-cased copy of the tweet text; used below as the input for punctuation extraction.
train_data['tar_p']=train_data['text'].str.lower()
train_data.head()

In [None]:
def find_punct(text):
    """Return every punctuation character found in ``text``, in order of appearance.

    The character class below defines which symbols count as punctuation.
    Because the pattern uses ``*``, ``re.findall`` also yields empty matches;
    they contribute no characters to the result.
    """
    punct_runs = re.findall(r'[!"\$%&\'()*+,\-.\/:;=#@?\[\\\]^_`{|}~]*', text)
    return [symbol for run in punct_runs for symbol in run]

In [None]:
# Extract the punctuation characters of each lower-cased tweet once, then
# derive the count from the stored list.
train_data['target_punct'] = train_data['tar_p'].apply(lambda tweet: find_punct(str(tweet)))
train_data['target_punct_len'] = train_data['target_punct'].apply(len)

In [None]:
# Per-sentiment views of the training frame. The variable names (including the
# misspelled `postive` / `netural`) are kept exactly as they were.
sentiment_col = train_data['sentiment']
postive = train_data.loc[sentiment_col == 'positive']
negative = train_data.loc[sentiment_col == "negative"]
netural = train_data.loc[sentiment_col == "neutral"]

In [None]:
# One row per (tweet, punctuation character): keep only tweets that contain at
# least one punctuation mark, then explode the per-tweet lists into rows.
punc_df = pd.DataFrame(train_data, columns=['target_punct', 'sentiment'])
punc_df = punc_df[punc_df['target_punct'].map(len) > 0]
punc_df = punc_df.explode('target_punct')


def _punct_counts(frame, sentiment, count_col):
    """Frequency table of punctuation marks for one sentiment class.

    Returns a DataFrame with columns ``punct`` (the character) and
    ``count_col`` (its number of occurrences), most frequent first.

    The column names are set explicitly via ``rename_axis`` /
    ``reset_index(name=...)`` instead of renaming the defaults produced by
    ``value_counts().reset_index()``: those defaults differ between pandas
    versions, so renaming ``'index'`` / ``'target_punct'`` can silently yield a
    frame without a ``punct`` column.
    """
    counts = frame.loc[frame['sentiment'] == sentiment, 'target_punct'].value_counts()
    return counts.rename_axis('punct').reset_index(name=count_col)


positive_df = _punct_counts(punc_df, "positive", 'pos_punct')
negative_df = _punct_counts(punc_df, "negative", 'neg_punct')
neutral_df = _punct_counts(punc_df, "neutral", 'neut_punct')


In [None]:
# Side-by-side bar charts of the ten most frequent punctuation marks per sentiment.
fig = make_subplots(rows=1, cols=3)

bar_specs = [
    (positive_df.punct, positive_df.pos_punct, 'Positive', 'green'),
    (negative_df.punct, negative_df.neg_punct, 'Negative', 'red'),
    (neutral_df.punct, neutral_df.neut_punct, 'Neutral', 'orange'),
]
for col_idx, (marks, counts, label, colour) in enumerate(bar_specs, start=1):
    bar = go.Bar(x=marks[:10], y=counts[:10], name=label, marker_color=colour)
    fig.append_trace(bar, row=1, col=col_idx)

# Same boxed-axis styling on both axes of every subplot.
axis_style = dict(showline=True, linewidth=2, linecolor='black', mirror=True)
fig.update_xaxes(**axis_style)
fig.update_yaxes(**axis_style)
fig.update_layout(title_text="Selected Text - Sentiment vs Punctuation",title_x=0.5)
fig.show()


In [None]:
# Work on an independent copy of the four original columns. Later cells assign
# new columns into `train`; without `.copy()` those writes target a slice of
# `train_data` (pandas' chained-assignment / SettingWithCopy situation).
train = train_data[['textID', 'text', 'selected_text', 'sentiment']].copy()
train.head()

In [None]:
def clean_text(text):
    """Normalize a tweet for word-frequency analysis.

    Steps, applied in order:
      1. cast to ``str`` and lower-case;
      2. remove text inside square brackets;
      3. remove ``http(s)://`` and ``www.`` links;
      4. remove ``<...>`` tags;
      5. remove every character in ``string.punctuation``;
      6. remove newline characters (nothing is inserted in their place);
      7. remove any word that contains a digit.

    Args:
        text: Input value; converted with ``str``.

    Returns:
        The cleaned string. Whitespace left behind by removed fragments is
        not collapsed.
    """
    # Raw string literals keep regex escapes such as \[ \S \w from being
    # interpreted as (invalid) Python string escape sequences.
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Clean both the full tweet and its selected span with the same normalizer
# (values are cast to str before cleaning).
for text_column in ('text', 'selected_text'):
    train[text_column] = train[text_column].apply(lambda raw: clean_text(str(raw)))

In [None]:
# Character length and whitespace-token count of the tweet text, added to both
# the (cleaned) training frame and the raw test frame.
for frame in (train, test_data):
    frame['text_len'] = frame['text'].astype(str).apply(len)
    frame['text_word_count'] = frame['text'].apply(lambda tweet: len(str(tweet).split()))

In [None]:
# Character-length distribution per sentiment, one histogram per split.
# Each call is given the frame that actually owns the plotted columns and
# refers to them by name; previously the test histogram was handed `train` as
# its data frame while plotting series taken from `test_data`.
fig = px.histogram(train, x='text_len', color='sentiment', title="Length of the text(train)")
fig.show()
fig = px.histogram(test_data, x='text_len', color='sentiment', title="Length of the text(test)")
fig.show()

In [None]:
# Word-count distribution per sentiment, one histogram per split.
# Each call is given the frame that actually owns the plotted columns and
# refers to them by name; previously the test histogram was handed `train` as
# its data frame while plotting series taken from `test_data`.
fig = px.histogram(train, x='text_word_count', color='sentiment', title="word count(train)")
fig.show()
fig = px.histogram(test_data, x='text_word_count', color='sentiment', title="word count(test)")
fig.show()

In [None]:
# Tokenize each cleaned tweet on whitespace, then count every token across the
# whole training set and keep the 20 most frequent.
train['temp_list'] = train['text'].apply(lambda tweet: str(tweet).split())
top = Counter(word for tokens in train['temp_list'] for word in tokens)
temp = pd.DataFrame(top.most_common(20))
temp.columns = ['Common_words', 'count']

In [None]:
# Bar chart and treemap of the 20 most frequent words held in `temp`.
# The bar title's "Commmon" typo is corrected.
# NOTE(review): the title says "Selected Text", but in this notebook `temp` is
# built from the `text` column — confirm which one is intended.
fig = px.bar(temp, x="count", y="Common_words", title='Common Words in Selected Text', orientation='h',color='Common_words')
fig.show()
fig = px.treemap(temp, path=['Common_words'], values='count',title='Tree Of Most Common Words')
fig.show()

In [None]:
# Lazily-filled cache of the NLTK English stopword list (see remove_stopword).
_ENGLISH_STOPWORDS = None


def remove_stopword(x, stop_words=None):
    """Return the tokens of ``x`` that are not stopwords, preserving order.

    Args:
        x: Iterable of word tokens.
        stop_words: Optional collection of words to drop. When omitted, the
            NLTK English stopword list is used; it is loaded once and cached
            as a frozenset, instead of being re-read (and scanned linearly)
            for every single token.

    Returns:
        A new list with the non-stopword tokens.
    """
    global _ENGLISH_STOPWORDS
    if stop_words is None:
        if _ENGLISH_STOPWORDS is None:
            _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOPWORDS
    return [y for y in x if y not in stop_words]
# Filter stopwords out of every tweet's token list (overwrites `temp_list`).
train['temp_list'] = train['temp_list'].apply(lambda x:remove_stopword(x))

In [None]:
# Per-sentiment views of the cleaned training frame. `postive` keeps its
# original (misspelled) name because a later cell reads it.
sentiment_values = train['sentiment']
postive = train.loc[sentiment_values == 'positive']
negative = train.loc[sentiment_values == 'negative']
neutral = train.loc[sentiment_values == 'neutral']


In [None]:
# Token frequencies across all neutral tweets; keep the 20 most frequent.
top = Counter(word for tokens in neutral['temp_list'] for word in tokens)
temp_neutral = pd.DataFrame(top.most_common(20))
temp_neutral.columns = ['Common_words', 'count']

In [None]:
# Three views of the 20 most frequent words in neutral tweets: horizontal bar
# chart, treemap and donut chart. Title fixes: "Commmon" typo corrected, and
# the donut chart no longer reuses the treemap's "Tree Of ..." title.
fig = px.bar(temp_neutral, x="count", y="Common_words", title='Common Words in neutral Text', orientation='h',color='Common_words')
fig.show()
fig = px.treemap(temp_neutral, path=['Common_words'], values='count',title='Tree Of Most Common neutral Words')
fig.show()
fig = go.Figure(data=[go.Pie(labels=temp_neutral['Common_words'], values=temp_neutral['count'], hole=.7)])
fig.update_layout(
    title_text='Donut Chart Of Most Common neutral Words',
    # Label drawn in the middle of the donut hole.
    annotations=[dict(text=' neutral ', x=0.5, y=0.5, font_size=20, showarrow=False)])
fig.show()


In [None]:
# Token frequencies across all negative tweets; keep the 20 most frequent.
top = Counter(word for tokens in negative['temp_list'] for word in tokens)
temp_negative = pd.DataFrame(top.most_common(20))
temp_negative.columns = ['Common_words', 'count']

In [None]:
# Three views of the 20 most frequent words in negative tweets: horizontal bar
# chart, treemap and donut chart. Title fixes: "Commmon" typo corrected, and
# the donut chart no longer reuses the treemap's "Tree Of ..." title.
fig = px.bar(temp_negative, x="count", y="Common_words", title='Common Words in negative Text', orientation='h',color='Common_words')
fig.show()
fig = px.treemap(temp_negative, path=['Common_words'], values='count',title='Tree Of Most Common negative Words')
fig.show()
fig = go.Figure(data=[go.Pie(labels=temp_negative['Common_words'], values=temp_negative['count'], hole=.7)])
fig.update_layout(
    title_text='Donut Chart Of Most Common negative Words',
    # Label drawn in the middle of the donut hole.
    annotations=[dict(text=' negative ', x=0.5, y=0.5, font_size=20, showarrow=False)])
fig.show()

In [None]:
# Token frequencies across all positive tweets; keep the 20 most frequent.
top = Counter(word for tokens in postive['temp_list'] for word in tokens)
temp_positive = pd.DataFrame(top.most_common(20))
temp_positive.columns = ['Common_words', 'count']

In [None]:
# Three views of the 20 most frequent words in positive tweets: horizontal bar
# chart, treemap and donut chart. Title fixes: "Commmon" typo corrected, and
# the donut chart no longer reuses the treemap's "Tree Of ..." title.
fig = px.bar(temp_positive, x="count", y="Common_words", title='Common Words in positive Text', orientation='h',color='Common_words')
fig.show()
fig = px.treemap(temp_positive, path=['Common_words'], values='count',title='Tree Of Most Common positive Words')
fig.show()
fig = go.Figure(data=[go.Pie(labels=temp_positive['Common_words'], values=temp_positive['count'], hole=.7)])
fig.update_layout(
    title_text='Donut Chart Of Most Common positive Words',
    # Label drawn in the middle of the donut hole.
    annotations=[dict(text=' positive ', x=0.5, y=0.5, font_size=20, showarrow=False)])
fig.show()

In [None]:
# One word cloud per sentiment, side by side.
#
# The clouds are built with `generate_from_frequencies` from the
# (word, count) pairs of each top-20 table. Previously the words were joined
# into a string in which every word occurs exactly once, so the computed
# counts had no influence on the rendered word sizes.
#
# NOTE(review): the panel titles say "Selected Text", but in this notebook the
# top-20 tables are derived from the `text` column — confirm which is intended.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=[30, 15])

cloud_panels = (
    (ax1, temp_positive, "Greens", 'Positive Selected Text'),
    (ax2, temp_negative, "Reds", 'Negative Selected Text'),
    (ax3, temp_neutral, "Blues", 'Neutral Selected Text'),
)
clouds = []
for ax, words_df, colormap, panel_title in cloud_panels:
    frequencies = dict(zip(words_df['Common_words'], words_df['count']))
    cloud = WordCloud(background_color='white', colormap=colormap,
                      width=600,
                      height=400).generate_from_frequencies(frequencies)
    ax.imshow(cloud)
    ax.axis('off')
    ax.set_title(panel_title, fontsize=35)
    clouds.append(cloud)

# Keep the original variable names available to any later cell.
wordcloud1, wordcloud2, wordcloud3 = clouds


In [None]:

from tqdm import tqdm
import tokenizers

In [None]:
# Fixed width (in token ids) of every row in the model input arrays built below.
MAX_LEN = 96

# Directory holding the pretrained RoBERTa vocabulary and merges files.
PATH = '../input/tf-roberta/'
# Byte-level BPE tokenizer built from those files; input is lower-cased and a
# leading space is added before tokenizing.
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file=PATH+'vocab-roberta-base.json', 
    merges_file=PATH+'merges-roberta-base.txt', 
    lowercase=True,
    add_prefix_space=True
)
# Token id appended to each input row for its sentiment label.
# NOTE(review): presumably the vocabulary ids of the words "positive",
# "negative" and "neutral" — confirm against the vocab file.
sentiment_id = {'positive': 1313, 'negative': 2430, 'neutral': 7974}


In [None]:
# Reload the raw training data (missing values become empty strings) and
# allocate the fixed-size model input / target arrays, one row per tweet.
train = pd.read_csv('../input/tweet-sentiment-extraction/train.csv').fillna('')
ct = len(train)

array_shape = (ct, MAX_LEN)
input_ids = np.ones(array_shape, dtype="int32")        # filled with 1 until encoded
attention_mask = np.zeros(array_shape, dtype="int32")  # 1 marks positions in use
token_type_ids = np.zeros(array_shape, dtype="int32")  # left as all zeros in this notebook's visible cells
start_tokens = np.zeros(array_shape, dtype="int32")    # one-hot start position of the selected text
end_tokens = np.zeros(array_shape, dtype="int32")      # one-hot end position of the selected text


In [None]:
# Encode every training row into the preallocated arrays:
#   input_ids[k]      = [0] + tweet token ids + [2, 2] + [sentiment id] + [2]
#   attention_mask[k] = 1 over the positions written above
#   start_tokens[k] / end_tokens[k] = one-hot position of the first / last
#                       tweet token that overlaps the selected text
# NOTE(review): assumes len(enc.ids) + 5 <= MAX_LEN for every tweet; a longer
# row would make the slice assignment below fail — confirm for new data.
for k in range(train.shape[0]):
    # 1. FIND OVERLAP: normalize whitespace (tweet gets a leading space) and
    # locate the selected text inside the tweet.
    text1 = " " + " ".join(train.loc[k, 'text'].split())
    text2 = " ".join(train.loc[k, 'selected_text'].split())
    idx = text1.find(text2)

    # Per-character mask: 1 where the character belongs to the selected text.
    chars = np.zeros((len(text1)))
    if idx >= 0:
        chars[idx:idx + len(text2)] = 1
        # Also mark the space directly before the selection. The `idx > 0`
        # guard (together with `idx >= 0` above) prevents a failed or
        # position-0 match from wrapping around to the end of the string
        # through a negative index.
        if idx > 0 and text1[idx - 1] == ' ':
            chars[idx - 1] = 1
    enc = tokenizer.encode(text1)

    # 2. ID_OFFSETS: character span (start, end) of each token, rebuilt from
    # the decoded length of every token id.
    offsets = []
    idx = 0
    for t in enc.ids:
        w = tokenizer.decode([t])
        offsets.append((idx, idx + len(w)))
        idx += len(w)

    # 3. Tokens whose character span overlaps the selected text. The list is
    # reset once per row, outside the token loop, so a row with no tokens
    # neither reuses the previous row's list nor hits an undefined name.
    toks = []
    for i, (a, b) in enumerate(offsets):
        sm = np.sum(chars[a:b])  # number of selected characters inside this token
        if sm > 0:
            toks.append(i)

    s_tok = sentiment_id[train.loc[k, 'sentiment']]  # token id for this row's sentiment

    # Format the model input for this row.
    input_ids[k, :len(enc.ids) + 5] = [0] + enc.ids + [2, 2] + [s_tok] + [2]
    attention_mask[k, :len(enc.ids) + 5] = 1

    if len(toks) > 0:
        # +1 accounts for the leading [0] token placed before the tweet tokens.
        start_tokens[k, toks[0] + 1] = 1
        end_tokens[k, toks[-1] + 1] = 1
    

In [None]:
# Load the test set (missing values become empty strings) and allocate its
# fixed-size input arrays, one row per tweet.
test = pd.read_csv('../input/tweet-sentiment-extraction/test.csv').fillna('')

ct_test = test.shape[0]

input_ids_t = np.ones((ct_test, MAX_LEN), dtype='int32')        # filled with 1 until encoded
attention_mask_t = np.zeros((ct_test, MAX_LEN), dtype='int32')  # 1 marks positions in use
token_type_ids_t = np.zeros((ct_test, MAX_LEN), dtype='int32')  # left as all zeros here

# Encode each test tweet using the same row layout as the training data:
# [0] + tweet token ids + [2, 2] + [sentiment id] + [2]
for k in range(ct_test):
    text1 = " " + " ".join(test.loc[k, 'text'].split())
    enc = tokenizer.encode(text1)

    # Token id for this row's sentiment label.
    s_tok = sentiment_id[test.loc[k, 'sentiment']]

    row_ids = [0, *enc.ids, 2, 2, s_tok, 2]
    used = len(row_ids)
    input_ids_t[k, :used] = row_ids
    attention_mask_t[k, :used] = 1