In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's train and test CSV files into DataFrames.
train = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/train.csv')
test = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/test.csv')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# (rows, columns) of the train and test frames, one per line.
print(train.shape, test.shape, sep='\n')

In [None]:
# Column names, dtypes and non-null counts for train (reveals missing values).
train.info()

In [None]:
# Column names, dtypes and non-null counts for test.
test.info()

In [None]:
# Discard every training row that has a missing value in any column.
train = train.dropna()

In [None]:
# Re-check non-null counts now that rows with missing values have been dropped.
train.info()

In [None]:
# Summary statistics for train's columns.
train.describe()

In [None]:
# Number of training rows per sentiment label.
train['sentiment'].value_counts()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Bar chart of the number of training rows per sentiment label.
sns.countplot(x='sentiment', data = train)

In [None]:
# Non-null tweet count per sentiment as a two-column frame, largest group first.
temp = (
    train.groupby('sentiment')['text']
    .count()
    .reset_index()
    .sort_values(by='text', ascending=False)
)

In [None]:
# Display the per-sentiment tweet counts.
temp

In [None]:
from plotly import graph_objs as go
import plotly.express as px

In [None]:
# Funnel-area chart: one segment per sentiment label, sized by its tweet count in `temp`.
fig = go.Figure(go.Funnelarea(text=temp['sentiment'], values=temp['text']))
fig.show()

In [None]:
def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    size of the intersection of the two word sets divided by the size of
    their union.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A float in [0.0, 1.0]. When neither string contains any word the
        union is empty; the two (empty) sets are identical, so 1.0 is
        returned instead of raising ZeroDivisionError.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Both inputs are empty or whitespace-only: avoid dividing by zero.
        return 1.0
    return len(a & b) / union_size

In [None]:
# One [text, selected_text, jaccard score] triple per training row.
result_jaccard = [
    [tweet, selection, jaccard(tweet, selection)]
    for tweet, selection in zip(train['text'], train['selected_text'])
]

In [None]:
# Attach the per-row Jaccard score to `train`.
# The frame is named `jaccard_df` (not `jaccard`) so the jaccard() function is
# not overwritten and the scoring cell stays re-runnable.
jaccard_df = pd.DataFrame(result_jaccard, columns=['text', 'selected_text', 'jaccard_score'])
# The score is a pure function of (text, selected_text), so repeated pairs are
# exact duplicate rows; dropping them keeps the merge many-to-one and prevents
# duplicated tweets from being multiplied in `train`.
jaccard_df = jaccard_df.drop_duplicates()
# Every key in `train` exists in `jaccard_df`, so a left join returns the same
# rows as an outer join while preserving train's original row order.
train = train.merge(jaccard_df, how='left')

In [None]:
# Preview train with the merged jaccard_score column.
train.head()

In [None]:
# Whitespace-separated word counts for the selected span and the full tweet,
# plus how many more words the full tweet has than the selection.
train['num_words_st'] = train['selected_text'].astype(str).str.split().str.len()
train['num_words_t'] = train['text'].astype(str).str.split().str.len()
train['difference_in_words'] = train['num_words_t'] - train['num_words_st']

In [None]:
# Preview train with the new word-count columns.
train.head()

In [None]:
import plotly.figure_factory as ff

In [None]:
# Overlaid histograms (no KDE curve) of words per tweet: selected span vs. full text.
hist_data = [train[column] for column in ('num_words_st', 'num_words_t')]
group_labels = ['Selected_text', 'Text']

fig = ff.create_distplot(hist_data, group_labels, show_curve=False)
# Title, fixed 900x700 canvas and background colour applied in one layout update.
fig.update_layout(
    title_text='Distribution of Number Of words',
    autosize=False,
    width=900,
    height=700,
    paper_bgcolor="LightSteelBlue",
)
fig.show()

In [None]:
# Kernel density of words per tweet: selected span (red) vs. full text (blue).
fig = plt.figure(figsize=(12, 6))

# `fill=` replaces the deprecated `shade=` keyword of sns.kdeplot.
p1 = sns.kdeplot(train['num_words_st'], fill=True, color='r')
p2 = sns.kdeplot(train['num_words_t'], fill=True, color='b')

In [None]:
# Kernel density of (text words - selected words) for positive (blue) and negative (red) tweets.
fig = plt.figure(figsize=(12, 6))

# `fill=` replaces the deprecated `shade=` keyword of sns.kdeplot.
sns.kdeplot(train[train['sentiment']=='positive']['difference_in_words'], fill=True, color='b')
sns.kdeplot(train[train['sentiment']=='negative']['difference_in_words'], fill=True, color='r')

In [None]:
# Histogram (no KDE overlay) of the word-count difference for neutral tweets.
plt.figure(figsize=(12, 6))
sns.histplot(train[train['sentiment']=='neutral']['difference_in_words'], kde=False)

In [None]:
# Kernel density of the Jaccard score for positive (blue) and negative (red) tweets.
fig = plt.figure(figsize=(12, 6))

# `fill=` replaces the deprecated `shade=` keyword of sns.kdeplot.
sns.kdeplot(train[train['sentiment']=='positive']['jaccard_score'], fill=True, color='b')
sns.kdeplot(train[train['sentiment']=='negative']['jaccard_score'], fill=True, color='r')

In [None]:
# Histogram (no KDE overlay) of the Jaccard score for neutral tweets.
# Uses sns.histplot, like the neutral word-difference plot, instead of the
# deprecated sns.distplot.
plt.figure(figsize=(12, 6))
sns.histplot(train[train['sentiment']=='neutral']['jaccard_score'], kde=False)

In [None]:
# Tweets whose full text has at most two words.
k = train[train['num_words_t'] <= 2]

# Mean Jaccard score per sentiment for those short tweets. The column is
# selected before aggregating so mean() is never applied to the text columns
# (which raises TypeError on pandas >= 2.0).
k.groupby('sentiment')['jaccard_score'].mean()

In [None]:
# Inspect the positive tweets among those with at most two words.
k[k['sentiment'] == 'positive']

In [None]:
import re
import string

def clean_text(text):
    """Normalise a tweet for word-frequency analysis.

    Steps, applied in this order: lower-case; remove text in square
    brackets; remove http(s)/www links; remove anything between angle
    brackets; remove all ASCII punctuation; remove newlines; remove any
    word that contains a digit.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Whitespace left behind by removed spans is
        kept, so consecutive spaces may appear.
    """
    # Raw strings keep regex escapes such as \[ and \S from being parsed as
    # (invalid) Python string escapes.
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Overwrite both text columns with their cleaned versions.
for column in ('text', 'selected_text'):
    train[column] = train[column].apply(clean_text)

In [None]:
# Preview train after text cleaning.
train.head()

In [None]:
# Split each selected_text on whitespace into a list of words.
train['temp_list'] = train['selected_text'].apply(lambda x: str(x).split())

In [None]:
from collections import Counter
# Frequency of every word across all selected_text token lists.
top = Counter(token for tokens in train['temp_list'] for token in tokens)

In [None]:
# The 20 most frequent words as a (Common_word, count) table, shaded by value.
temp = pd.DataFrame(top.most_common(20), columns=['Common_word', 'count'])
temp.style.background_gradient(cmap='Blues')

In [None]:
# Horizontal bar chart of the most frequent words in `temp`, one colour per word.
# Title typo fixed ("Commmon" -> "Common").
fig = px.bar(temp, y='Common_word', x='count', title='Common Words in Selected Text', orientation='h',
             width=700, height=700, color='Common_word')
fig.show()

In [None]:
import nltk
from nltk.corpus import stopwords

from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
    """Load NLTK's English stop-word list once and cache it as a frozenset."""
    return frozenset(stopwords.words('english'))


def remove_stopwords(text, stop_words=None):
    """Return the words of `text` that are not stop words, in order.

    Args:
        text: Iterable of word strings (e.g. one row's token list).
        stop_words: Optional collection of words to drop. When omitted,
            NLTK's English stop-word list is used; it is loaded once and
            cached instead of being re-read for every word.

    Returns:
        A new list containing the words not found in `stop_words`.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    return [word for word in text if word not in stop_words]

In [None]:
# Drop stop words from every row's selected_text token list.
train['temp_list'] = train['temp_list'].apply(remove_stopwords)

In [None]:
# Recount selected_text words now that stop words are removed.
top = Counter(token for tokens in train['temp_list'] for token in tokens)
# Take the 20 most frequent words, then drop the first row (the single most
# frequent word), leaving ranks 2-20.
temp = pd.DataFrame(top.most_common(20), columns=['Common_word', 'count']).iloc[1:, :]
temp.style.background_gradient(cmap='Blues')

In [None]:
# Treemap with one tile per word in `temp`, sized by its count.
fig = px.treemap(temp, path=['Common_word'], values='count', title='tree map of common words')
fig.show()

In [None]:
# Whitespace-tokenise each full tweet and drop stop words in a single pass.
train['temp_list1'] = train['text'].apply(lambda x: remove_stopwords(str(x).split()))

In [None]:
# Word frequencies over the full-tweet token lists (stop words removed).
top = Counter(token for tokens in train['temp_list1'] for token in tokens)
# Take the 20 most frequent words, then drop the first row (the single most
# frequent word), leaving ranks 2-20.
temp = pd.DataFrame(top.most_common(20), columns=['Common_word', 'count']).iloc[1:, :]
temp.style.background_gradient(cmap='Blues')

In [None]:
# Horizontal bar chart of the most frequent words in `temp`, one colour per word.
# At this point `temp` is built from the full-tweet tokens (temp_list1), so the
# title names "Text" rather than "Selected Text"; the "Commmon" typo is fixed too.
fig = px.bar(temp, y='Common_word', x='count', title='Common Words in Text', orientation='h',
             width=700, height=700, color='Common_word')
fig.show()

In [None]:
# Split train into one frame per sentiment label.
positive_sent = train[train['sentiment']=='positive']
negative_sent = train[train['sentiment']=='negative']
neutral_sent = train[train['sentiment']=='neutral']

In [None]:
# 20 most frequent selected_text words among positive tweets, shaded by count.
top = Counter(token for tokens in positive_sent['temp_list'] for token in tokens)
temp = pd.DataFrame(top.most_common(20), columns=['Common_word', 'count'])
temp.style.background_gradient(cmap='Blues')

In [None]:
# 20 most frequent selected_text words among negative tweets, shaded by count.
top = Counter(token for tokens in negative_sent['temp_list'] for token in tokens)
temp = pd.DataFrame(top.most_common(20), columns=['Common_word', 'count'])
temp.style.background_gradient(cmap='Reds')

In [None]:
# 20 most frequent selected_text words among neutral tweets, shaded by count.
top = Counter(token for tokens in neutral_sent['temp_list'] for token in tokens)
temp = pd.DataFrame(top.most_common(20), columns=['Common_word', 'count'])
temp.style.background_gradient(cmap='Blues')

In [None]:
# Flatten every row's temp_list1 tokens into one list of words.
raw_text = [word for word_list in train['temp_list1'] for word in word_list]

In [None]:
def unique_words(sentiment, num_words, raw_text):
    """Return the most frequent words that occur only in one sentiment.

    Reads the module-level `train` frame and uses its 'sentiment' and
    'temp_list1' (list of words per row) columns.

    Args:
        sentiment: Sentiment label whose exclusive words are wanted.
        num_words: Maximum number of words to return.
        raw_text: Iterable of candidate words; only words present here
            (and absent from every other sentiment) are kept.

    Returns:
        DataFrame with columns ['word', 'count'], ordered by descending
        count within the given sentiment.
    """
    is_target = train['sentiment'] == sentiment

    # Sets give constant-time membership tests; scanning lists here would make
    # the function quadratic in the vocabulary size.
    other_words = {
        word
        for words in train.loc[~is_target, 'temp_list1']
        for word in words
    }
    specific_only = {word for word in raw_text if word not in other_words}

    # Count only the words exclusive to this sentiment, in order of first
    # appearance so ties in most_common() keep a stable order.
    counts = Counter(
        word
        for words in train.loc[is_target, 'temp_list1']
        for word in words
        if word in specific_only
    )
    return pd.DataFrame(counts.most_common(num_words), columns=['word', 'count'])

In [None]:
# Up to 20 most frequent words that appear only in positive tweets.
unique_positive = unique_words('positive', 20, raw_text)
unique_positive

In [None]:
# Up to 20 most frequent words that appear only in negative tweets.
unique_negative = unique_words('negative', 20, raw_text)
unique_negative

In [None]:
# Up to 20 most frequent words that appear only in neutral tweets.
unique_neutral = unique_words('neutral', 20, raw_text)
unique_neutral

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
def plot_wordcloud(text, mask=None, max_words=200, max_font_size=100, figure_size=(24.0,16.0), color = 'white',
                   title = None, title_size=40, image_color=False):
    """Render a word cloud of `text` with matplotlib.

    Args:
        text: A single string, or an iterable of strings (e.g. a pandas
            Series of tweets) whose items are joined with spaces.
        mask: Optional image array passed to WordCloud as its mask and,
            when `image_color` is true, used to derive the colours.
        max_words: Maximum number of words drawn.
        max_font_size: Largest font size used.
        figure_size: Matplotlib figure size (width, height).
        color: Background colour of the cloud.
        title: Figure title.
        title_size: Title font size.
        image_color: When true, recolour the words from `mask`.
    """
    # WordCloud's built-in stop words plus two tokens common in cleaned tweets.
    stop_words = set(STOPWORDS) | {'u', "im"}

    # str() of a pandas Series is its display repr: index labels, a "..."
    # truncation for long Series, and a dtype footer. Joining the items feeds
    # every tweet, and only the tweets, to the word cloud.
    if isinstance(text, str):
        corpus = text
    else:
        try:
            corpus = " ".join(str(item) for item in text)
        except TypeError:
            # Not iterable: fall back to the plain string form.
            corpus = str(text)

    wordcloud = WordCloud(background_color=color,
                    stopwords = stop_words,
                    max_words = max_words,
                    max_font_size = max_font_size,
                    random_state = 42,
                    width=400,
                    height=200,
                    mask = mask)
    wordcloud.generate(corpus)

    plt.figure(figsize=figure_size)
    title_font = {'size': title_size, 'verticalalignment': 'bottom'}
    if image_color:
        image_colors = ImageColorGenerator(mask)
        plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation="bilinear")
    else:
        plt.imshow(wordcloud)
        title_font['color'] = 'black'
    plt.title(title, fontdict=title_font)
    plt.axis('off')
    plt.tight_layout()
# Directory holding the mask images used for the word clouds.
d = '/kaggle/input/masksforwordclouds/'


In [None]:
from PIL import Image
# Load the mask image as an array; later word-cloud cells reuse `pos_mask`.
pos_mask = np.array(Image.open(d+ 'twitter_mask3.jpg'))
# Word cloud of the neutral tweets' text.
plot_wordcloud(neutral_sent.text,mask=pos_mask,color='white',max_font_size=100,title_size=30,title="WordCloud of Neutral Tweets")

In [None]:
# Word cloud of the positive tweets' text.
plot_wordcloud(positive_sent.text,mask=pos_mask,title="Word Cloud Of Positive tweets",title_size=30)

In [None]:
# Word cloud of the negative tweets' text (title corrected: it previously said "Positive").
plot_wordcloud(negative_sent.text,mask=pos_mask,title="Word Cloud Of Negative tweets",title_size=30)

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Reload the raw CSVs under new names for the modelling part, plus the sample submission.
df_train = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/train.csv')
df_test = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/test.csv')
df_submission = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/sample_submission.csv')

In [None]:
# Whitespace-separated word count of each tweet (values are stringified first).
df_train['num_words_text'] = df_train['text'].astype(str).str.split().str.len()

In [None]:
# Keep only tweets with at least three words.
df_train = df_train.loc[lambda frame: frame['num_words_text'] >= 3]

In [None]:
# Preview the filtered training frame.
df_train.head()

In [None]:
import os
import nltk
import spacy
import random
from spacy.util import compounding
from spacy.util import minibatch

from tqdm import tqdm
import warnings
# Suppress all warnings from this point on.
warnings.filterwarnings("ignore")

In [None]:
def save_model(output_dir, nlp, new_model_name):
    """Save a spaCy pipeline under ../working/<output_dir>.

    Args:
        output_dir: Directory relative to ../working, or None to skip saving.
        nlp: Pipeline object exposing a `meta` dict and a `to_disk(path)` method.
        new_model_name: Value stored in nlp.meta['name'] before saving.

    Nothing is written (and `nlp` is left untouched) when `output_dir` is None.
    """
    # Check for None before building the path: once formatted into the f-string
    # the value is always a string, so a later None test could never succeed and
    # the model would be written to '../working/None'.
    if output_dir is None:
        return
    output_dir = f'../working/{output_dir}'
    os.makedirs(output_dir, exist_ok=True)
    nlp.meta['name'] = new_model_name
    nlp.to_disk(output_dir)
    print('Saved model to', output_dir)

In [None]:
def train(train_data, output_dir, n_iter=20, model=None):
    """Set up a spaCy pipeline, train its entity recognizer and save it.

    Args:
        train_data: List of (text, {"entities": [[start, end, label], ...]})
            tuples. The list is shuffled in place on every iteration.
        output_dir: Directory (relative to ../working) handed to save_model.
        n_iter: Number of passes over `train_data`.
        model: Name or path of an existing spaCy model to resume from, or
            None to start from a blank English pipeline.

    NOTE(review): create_pipe / add_pipe(component) / update(texts, annotations)
    follow the spaCy v2 training API - confirm against the installed version.
    """
    if model is not None:
        # Load the model that was actually requested; `output_dir` is only the
        # destination for the trained result.
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")
        print("Created blank 'en' model")

    # Reuse the pipeline's entity recognizer, adding one when it is missing.
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Register every entity label that occurs in the training annotations.
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # Train only the NER component; every other pipe is disabled meanwhile.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):
        if model is None:
            nlp.begin_training()
        else:
            nlp.resume_training()

        for itn in tqdm(range(n_iter)):
            random.shuffle(train_data)
            # Batch size compounds from 4 towards 500 by a factor of 1.001.
            batches = minibatch(train_data, size=compounding(4.0, 500.0, 1.001))
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, drop=0.5, losses=losses)
            print("Losses", losses)
    save_model(output_dir, nlp, 'st_ner')

In [None]:
def get_model_out_path(sentiment):
    '''Map a sentiment label to the relative directory of its model.

    Returns 'models/model_pos' for 'positive', 'models/model_neg' for
    'negative', and None for anything else.
    '''
    if sentiment == 'positive':
        return 'models/model_pos'
    if sentiment == 'negative':
        return 'models/model_neg'
    return None

In [None]:
def get_training_data(sentiment):
    '''Returns training data in the format needed to train spacy NER.

    Reads the module-level `df_train` frame ('text', 'selected_text' and
    'sentiment' columns) and keeps the rows whose sentiment equals
    `sentiment`.

    Returns a list of (text, {"entities": [[start, end, 'selected_text']]})
    tuples, where [start, end) is the character span of the first occurrence
    of selected_text inside text. Rows whose selected_text does not occur in
    text are skipped, because str.find() returns -1 for them and the
    resulting span would be invalid.
    '''
    train_data = []
    rows = df_train[df_train['sentiment'] == sentiment]
    for text, selected_text in zip(rows['text'], rows['selected_text']):
        start = text.find(selected_text)
        if start < 0:
            # No match: a span starting at -1 would mislabel the tweet.
            continue
        end = start + len(selected_text)
        train_data.append((text, {"entities": [[start, end, 'selected_text']]}))
    return train_data

In [None]:
# Train the NER model for positive tweets.
sentiment = 'positive'

# Build the training tuples and resolve the model's output directory.
train_data = get_training_data(sentiment)
model_path = get_model_out_path(sentiment)
# model=None starts from a blank English pipeline; 5 passes over the data.
train(train_data, model_path, n_iter=5, model=None)

In [None]:
# Train the NER model for negative tweets.
sentiment = 'negative'

# Build the training tuples and resolve the model's output directory.
train_data = get_training_data(sentiment)
model_path = get_model_out_path(sentiment)

# model=None starts from a blank English pipeline; 5 passes over the data.
train(train_data, model_path, n_iter=5, model=None)

In [None]:
def predict_entities(text, model):
    """Return the text of the first entity `model` finds in `text`.

    `model` is called on `text`; each entity in the result's `.ents` is
    located in `text` with str.find and recorded once as
    [start, end, label]. The slice of `text` covered by the first recorded
    span is returned, or `text` itself when no entity was found.
    """
    spans = []
    for ent in model(text).ents:
        begin = text.find(ent.text)
        candidate = [begin, begin + len(ent.text), ent.label_]
        if candidate not in spans:
            spans.append(candidate)

    if not spans:
        return text
    begin, finish, _ = spans[0]
    return text[begin:finish]
            

In [None]:
# Predict selected_text for every test tweet.
selected_texts = []
# NOTE(review): models are loaded from an input dataset, not from ../working
# where the training cells save theirs - confirm this is intended.
MODELS_BASE_PATH = '../input/tse-spacy-model/models/'

if MODELS_BASE_PATH is not None:
    print("Loading Models from", MODELS_BASE_PATH)
    model_pos = spacy.load(MODELS_BASE_PATH + 'model_pos')
    model_neg = spacy.load(MODELS_BASE_PATH + 'model_neg')

    for index, row in df_test.iterrows():
        text = row.text
        # Neutral tweets and tweets of at most two words are returned whole;
        # the rest go through the model matching their sentiment.
        if row.sentiment == 'neutral' or len(text.split()) <= 2:
            selected_texts.append(text)
        elif row.sentiment == 'positive':
            selected_texts.append(predict_entities(text, model_pos))
        else:
            selected_texts.append(predict_entities(text, model_neg))
    df_test['selected_text'] = selected_texts

In [None]:
# Preview the test frame with its predicted selected_text column.
df_test.head()

In [None]:
import os
# Make /kaggle/working the current working directory.
os.chdir(r'/kaggle/working')

In [None]:
# Copy the predictions into the submission frame. Column assignment aligns on
# the index - assumes df_submission and df_test share the same index and row
# order; TODO confirm.
df_submission['selected_text'] = df_test['selected_text']
# Write the submission file without the index column.
df_submission.to_csv("/kaggle/working/submission.csv", index=False)
# Preview the first 10 rows of the submission.
df_submission.head(10)