# 1. Load libraries

In [None]:
import datetime
import collections
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os
from keras.layers import Concatenate, BatchNormalization, Bidirectional
from keras.layers import SimpleRNN, Input, Dropout, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.core import Dense, Activation
from keras.layers.recurrent import LSTM
import tensorflow as tf
from tensorflow.keras.models import load_model, Model
from tensorflow.keras import regularizers, optimizers
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.preprocessing import sequence, text
from tensorflow.keras.preprocessing.text import Tokenizer
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from tqdm import tqdm
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")

# load in all the modules we're going to need

# 2. Functions

In [None]:
# English stop-word list consulted by preprocess_data when filtering tokens.
stop_words = stopwords.words('english')
# Snowball stemmer; only referenced by the commented-out stemming step in
# preprocess_data.
stemmer = nltk.SnowballStemmer("english")


def read_train(path='../input/tweet-sentiment-extraction/train.csv'):
    """Load the training set and drop every row that has a missing value.

    Args:
        path: Location of the CSV (anything ``pandas.read_csv`` accepts).
            Defaults to the Kaggle competition input path.

    Returns:
        pandas.DataFrame: the CSV contents without rows containing NaN.
    """
    return pd.read_csv(path).dropna()


def read_test(path='../input/tweet-sentiment-extraction/test.csv'):
    """Load the test set and drop every row that has a missing value.

    Args:
        path: Location of the CSV (anything ``pandas.read_csv`` accepts).
            Defaults to the Kaggle competition input path.

    Returns:
        pandas.DataFrame: the CSV contents without rows containing NaN.
    """
    return pd.read_csv(path).dropna()


def clean_text(text):
    '''
        Normalise a raw tweet: lowercase it, remove links, replace
        punctuation with spaces, drop words containing digits and
        collapse whitespace.

        Args:
            text: any object; it is converted with ``str()`` first.

        Returns:
            str: the cleaned text with single spaces between tokens and
            no leading/trailing whitespace.
    '''
    text = str(text).lower()
    # Links must go first: once punctuation is stripped, "://" and "." are
    # gone and the URL pattern can no longer match.
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    # re.escape keeps "\", "]", "^" and "-" literal inside the character class
    # (unescaped, "\]" is read as an escaped bracket and "\" is never matched).
    pat = r"[{}]".format(re.escape(string.punctuation))
    text = re.sub(pat, ' ', text)  # punctuation -> space
    text = re.sub(r'\w*\d\w*', ' ', text)  # words containing digits
    # Collapse last so gaps left by the removals above (and newlines) become
    # a single separator instead of gluing neighbouring words together.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


def preprocess_data(text):
    """
       Clean punctuation, URLs, and so on (via ``clean_text``), then
       remove English stop-words.

       Args:
           text: raw tweet text (any object accepted by ``clean_text``).

       Returns:
           str: the remaining tokens joined by single spaces; an empty
           string when nothing survives the filtering.
    """
    # Clean exactly once; the previous `clean_text(text)==0` test compared a
    # string with an int, was always False, and only doubled the work.
    text = clean_text(text)
    # Remove stop-words
    text = ' '.join(word for word in text.split()
                    if word not in stop_words)
    # Stem all the words in the sentence
    # text = ' '.join(stemmer.stem(word) for word in text.split())
    return text


def wrong_words(text, selected):
    """Return the words of ``selected`` that do not occur in ``text``.

    Both arguments are split on whitespace. The missing words are returned
    space-joined in their original order (duplicates kept); when every word
    is present the sentinel ``'++++'`` is returned instead.
    """
    text_tokens = text.split()
    missing = [token for token in selected.split() if token not in text_tokens]
    return " ".join(missing) if missing else '++++'


def remove_text(x):
    """Remove the first occurrence of a word from a selection.

    Args:
        x: sequence (tuple, list or DataFrame row) whose first two values
            are the selection string and the word to remove.

    Returns:
        str: the selection re-joined with single spaces. It is returned
        unchanged (apart from whitespace normalisation) when the word is
        not part of it.
    """
    # Unpack by iteration: works for tuples/lists and for label-indexed
    # pandas rows, where positional `x[0]` is deprecated.
    selected, spelling, *_ = x
    words = selected.split()
    if spelling in words:  # list.remove would raise ValueError otherwise
        words.remove(spelling)
    return " ".join(words)


def remove_text_end(x):
    """Remove a word from a selection only when it is the last word.

    Args:
        x: sequence (tuple, list or DataFrame row) whose first two values
            are the selection string and the candidate trailing word.

    Returns:
        str: the selection re-joined with single spaces, without its last
        word when that word equals the candidate.
    """
    selected, spelling, *_ = x
    words = selected.split()
    # pop() drops the trailing word itself; list.remove() would delete the
    # first occurrence, which is wrong when the word also appears earlier.
    # The emptiness check avoids an IndexError on a blank selection.
    if words and words[-1] == spelling:
        words.pop()
    return " ".join(words)


def matching(x, tg):
    """Replace unmatched selection words with similar words from the text.

    For every word listed in the spelling field, the text tokens are
    scanned in order and the first one whose ``fuzz.ratio`` with the word
    exceeds ``tg`` replaces that word in the selection.

    Args:
        x: sequence (tuple, list or DataFrame row) whose first three values
            are the text, the selection and the space-separated words to fix.
        tg: similarity threshold; a replacement needs a ratio strictly
            greater than this value.

    Returns:
        str: the (possibly corrected) selection joined with single spaces.
    """
    # Unpack by iteration: works for tuples/lists and for label-indexed
    # pandas rows, where positional `x[0]` is deprecated.
    text, selected, spelling, *_ = x
    text_tokens = text.split()
    selected_tokens = selected.split()
    for wrong in spelling.split():
        for candidate in text_tokens:
            if wrong not in selected_tokens:
                # Nothing left to replace for this word; later candidates
                # could not change anything either.
                break
            if fuzz.ratio(candidate, wrong) > tg:
                selected_tokens[selected_tokens.index(wrong)] = candidate
    return " ".join(selected_tokens)


def start_index(x):
    """Return the token position in the text of the selection's first word.

    Args:
        x: sequence (tuple, list or DataFrame row) whose first two values
            are the text and the selection.

    Returns:
        int: index of the first occurrence of the selection's first word
        among the whitespace-split text tokens, or 0 when the selection is
        empty or the word is not found.
    """
    # Unpack by iteration: works for tuples/lists and for label-indexed
    # pandas rows, where positional `x[0]` is deprecated.
    text, selected, *_ = x
    text_tokens = text.split()
    selected_tokens = selected.split()
    try:
        return text_tokens.index(selected_tokens[0])
    except (IndexError, ValueError):
        # IndexError: empty selection; ValueError: word absent from text.
        return 0


def end_index(x):
    """Return the token position in the text of the selection's last word.

    The word is searched from the start position onwards first, then
    anywhere in the text.

    Args:
        x: sequence (tuple, list or DataFrame row) whose first three values
            are the text, the selection and the start token index.

    Returns:
        int: index of the matching token, or ``len(tokens) - 1`` when the
        selection is empty or its last word does not occur in the text.
    """
    # Unpack by iteration: works for tuples/lists and for label-indexed
    # pandas rows, where positional `x[0]` is deprecated.
    text, selected, first, *_ = x
    text_tokens = text.split()
    selected_tokens = selected.split()
    last_position = len(text_tokens) - 1
    if not selected_tokens:
        # Previously an IndexError; fall back like an unmatched word does.
        return last_position
    word = selected_tokens[-1]
    try:
        return text_tokens.index(word, first)
    except (ValueError, TypeError):
        # Not found at/after the start (or unusable start value): retry
        # over the whole text.
        pass
    try:
        return text_tokens.index(word)
    except ValueError:
        return last_position


def jaccard(str1, str2):
    """Word-level Jaccard similarity of two strings (case-insensitive).

    Args:
        str1: first string, split on whitespace.
        str2: second string, split on whitespace.

    Returns:
        float: ``|A ∩ B| / |A ∪ B|`` over the two word sets. Two empty
        strings are treated as identical and score 1.0 (this used to
        raise ZeroDivisionError).
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    if not a and not b:
        return 1.0
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))


def get_text(x):
    """Pick the words of a text at the given token positions.

    Args:
        x: sequence (tuple, list or DataFrame row) whose first two values
            are the text and an iterable of token indices.

    Returns:
        list[str]: the tokens at those positions, in the order given;
        indices at or beyond the number of tokens are skipped.
    """
    # Unpack by iteration: works for tuples/lists and for label-indexed
    # pandas rows, where positional `x[0]` is deprecated.
    text, index, *_ = x
    tokens = text.split()
    n_tokens = len(tokens)
    return [tokens[i] for i in index if i < n_tokens]

# 3. Load data

In [None]:
# Load the training set (read_train drops rows with missing values) and preview it
train = read_train()
train.head()

In [None]:
# Load the test set (read_test drops rows with missing values) and preview it
test = read_test()
test.head()

# 4. Data processing

#### Clearing text

In [None]:
# Add cleaned copies of the raw columns; the originals are left untouched.
train['clean_text'] = train['text'].apply(preprocess_data)
train['clean_selected'] = train['selected_text'].apply(preprocess_data)
test['clean_text'] = test['text'].apply(preprocess_data)

#### Removing rows with blank Text or Selected text

In [None]:
# preprocess_data re-joins tokens with single spaces, so a row that lost all
# of its words is '' and never ' '. Strip before comparing so blank rows are
# actually found whichever form they take.
blank_text = train["clean_text"].str.strip() == ''
blank_selected = train["clean_selected"].str.strip() == ''
print(blank_text.sum())
print(blank_selected.sum())
train.drop(train[blank_text | blank_selected].index, inplace=True)

#### Working with spelling

Let's mark every row where selected_text contains words that do not exist in text, and count the rows with correct spelling.

In [None]:
# 'spelling' holds the selection words missing from the text, or the
# sentinel '++++' when every selected word is present.
train['spelling'] = train.apply(
    lambda x: wrong_words(x.clean_text, x.clean_selected), axis=1)
# Per-column count of the rows whose selection matches completely
train[train['spelling'] == '++++'].count()

##### One or two wrong symbols

In [None]:
# Rows whose unmatched fragment is at most two characters long
train[train['spelling'].apply(len) <= 2]

and remove them

In [None]:
# Drop the unmatched fragment from the selection when it is at most two
# characters long; the '++++' sentinel is four characters, so rows without
# a problem keep their selection as is.
train['clean_selected'] = train[['clean_selected', 'spelling']].apply(
    lambda x: remove_text(x) if len(x['spelling']) <= 2 else x['clean_selected'], axis=1)
# Re-derive the marker column from the updated selections
train['spelling'] = train.apply(
    lambda x: wrong_words(x.clean_text, x.clean_selected), axis=1)
train[train['spelling'] == '++++'].count()

In [None]:
# Character lengths of the cleaned text and of the cleaned selection
train['len_cl_txt'] = train['clean_text'].apply(len)
train['len_cl_sel'] = train['clean_selected'].apply(len)

In [None]:
# Check lengths: rows where the selection is longer than the text it came from
train[train['len_cl_txt'] < train['len_cl_sel']]

In [None]:
# Where the selection is longer than the text, fall back to the whole text
too_long = train['len_cl_txt'] < train['len_cl_sel']
train.loc[too_long, 'clean_selected'] = train.loc[too_long, 'clean_text']

# Refresh the marker column and report how many rows are now clean
train['spelling'] = train.apply(
    lambda row: wrong_words(row.clean_text, row.clean_selected), axis=1)
print(train[train['spelling'] == '++++'].count())

# Recompute both lengths and confirm that no over-long selection remains
train['len_cl_txt'] = train['clean_text'].apply(lambda s: len(s))
train['len_cl_sel'] = train['clean_selected'].apply(lambda s: len(s))
train[train['len_cl_txt'] < train['len_cl_sel']]

For the other cases we will use the fuzzywuzzy library, which measures the degree of similarity between two strings.

In [None]:
# Quick look at fuzz.ratio: a partial match, then two identical strings
print(fuzz.ratio('geeksforgeeks', 'geeksgeeks'))
print(fuzz.ratio('GeeksforGeeks', 'GeeksforGeeks'))

First step: set the matching level to 70% and look at the result.

In [None]:
# Disabled 70% pass: the code is wrapped in a string literal, so none of it
# runs. Note it targets the raw 'text'/'selected_text' columns.
"""train['selected_text'] = train[['text', 'selected_text', 'spelling']].apply(
    lambda x: matching(x, 70) if x['spelling'] != '++++' else x['selected_text'], axis=1)
train['spelling'] = train.apply(
    lambda x: wrong_words(x.text, x.selected_text), axis=1)
train[(train['spelling'] != '++++')]"""

next step - 55%

In [None]:
# Disabled 55% pass: the code is wrapped in a string literal, so none of it
# runs. Note it targets the raw 'text'/'selected_text' columns.
"""train['selected_text'] = train[['text', 'selected_text', 'spelling']].apply(
    lambda x: matching(x, 55) if x['spelling'] != '++++' else x['selected_text'], axis=1)
train['spelling'] = train.apply(
    lambda x: wrong_words(x.text, x.selected_text), axis=1)
train[(train['spelling'] != '++++')]"""

And finally - 35%

In [None]:
# Fuzzy-correct only the flagged rows, with a similarity threshold of 35
train['clean_selected'] = train[['clean_text', 'clean_selected', 'spelling']].apply(
    lambda x: matching(x, 35) if x['spelling'] != '++++' else x['clean_selected'], axis=1)
# Re-derive the marker column, then list the rows that are still flagged
train['spelling'] = train.apply(
    lambda x: wrong_words(x.clean_text, x.clean_selected), axis=1)
train[(train['spelling'] != '++++')]

In [None]:
# Same clean-up as before the fuzzy pass: drop unmatched fragments of at most
# two characters (the four-character '++++' sentinel is never affected).
train['clean_selected'] = train[['clean_selected', 'spelling']].apply(
    lambda x: remove_text(x) if len(x['spelling']) <= 2 else x['clean_selected'], axis=1)
# Re-derive the marker column and count the rows that are now clean
train['spelling'] = train.apply(
    lambda x: wrong_words(x.clean_text, x.clean_selected), axis=1)
train[train['spelling'] == '++++'].count()

Let's look at the remaining rows more carefully.

In [None]:
# Still-flagged rows with positive sentiment
train.loc[(train['spelling'] != '++++') & (train['sentiment'] == 'positive')]

In [None]:
# Manual corrections addressed by index label. The labels are only meaningful
# for this exact train.csv and while the index has not been reset yet.
train.loc[5189, 'clean_selected'] = 'fun'
train.loc[6395, 'clean_selected'] = 'great'

In [None]:
# Still-flagged rows with negative sentiment
train.loc[(train['spelling'] != '++++') & (train['sentiment'] == 'negative')]

In [None]:
# Manual correction addressed by index label (only meaningful for this exact
# train.csv and while the index has not been reset yet).
train.loc[6113, 'clean_selected'] = 'going die'

In [None]:
# Still-flagged rows with neutral sentiment
train.loc[(train['spelling']!='++++') & (train['sentiment']=='neutral')]

In [None]:
# Second fuzzy-matching pass over the rows that are still flagged
train['clean_selected'] = train[['clean_text', 'clean_selected', 'spelling']].apply(
    lambda x: matching(x, 35) if x['spelling'] != '++++' else x['clean_selected'], axis=1)
# Refresh the marker before using it again: matching() may have replaced the
# very words 'spelling' lists, so the old value would make the removal below
# act on (or look for) words that are no longer in the selection.
train['spelling'] = train.apply(
    lambda x: wrong_words(x.clean_text, x.clean_selected), axis=1)
# Drop what is left when it is a single stray character
train['clean_selected'] = train[['clean_selected', 'spelling']].apply(
    lambda x: remove_text(x) if len(x['spelling']) == 1 else x['clean_selected'], axis=1)
train['spelling'] = train.apply(
    lambda x: wrong_words(x.clean_text, x.clean_selected), axis=1)

In [None]:
# Last resort: rows that are still flagged use the whole cleaned text as
# their selection.
train['clean_selected'] = train[['clean_text', 'clean_selected', 'spelling']].apply(
    lambda row: row['clean_selected'] if row['spelling'] == '++++' else row['clean_text'],
    axis=1)

In [None]:
# Final refresh of the marker column; the listing shows any row still flagged
train['spelling'] = train.apply(
    lambda x: wrong_words(x.clean_text, x.clean_selected), axis=1)
train[(train['spelling'] != '++++')]

In [None]:
# Renumber the rows 0..n-1, discarding the old index labels
train.reset_index(drop=True, inplace=True)
train

In [None]:
# Discard rows whose cleaned selection is the empty string
empty_selection = train['clean_selected'].apply(len) == 0
train.drop(train[empty_selection].index, inplace=True)

In [None]:
# Token positions of the selection's first and last word inside the text;
# end_index also receives the start position computed just before.
span_start = train[['clean_text', 'clean_selected']].apply(start_index, axis=1)
train['start_index'] = span_start
span_end = train[['clean_text', 'clean_selected', 'start_index']].apply(
    end_index, axis=1)
train['end_index'] = span_end
train.head()

In [None]:
# Rows whose span is inverted (start position after end position)
train[train.start_index > train.end_index]

In [None]:
# Keep only rows with a valid span; the second line should display no rows
train = train[train.start_index <= train.end_index]
train[train.start_index > train.end_index]

In [None]:
# Renumber the rows 0..n-1 after the filtering; the label loop below
# addresses rows by these positions.
train.reset_index(drop=True, inplace=True)
print(train.shape)
train.head()

In [None]:
# Preview the test frame (now including the clean_text column)
test.head()

In [None]:
# Number of tokens per cleaned tweet; the maximum sets the sequence length
text_split = [len(str(tweet).split()) for tweet in train['clean_text']]
max(text_split)

In [None]:
# train_ = train.loc[train.sentiment != 'neutral'].reset_index(drop=True, inplace=False)

# One row per tweet with a 1 at every token position inside the selected
# span; the width is the longest tweet plus one extra column.
y = np.zeros((train.shape[0], max(text_split)+1))
spans = zip(train['start_index'], train['end_index'])
for row, (start, end) in enumerate(spans):
    y[row, start:end+1] = 1

In [None]:
# Spot-check a few rows: span indices, cleaned text, cleaned selection and
# the resulting target vector.
for i in [1, 6, 11, 22]:
    print(train['start_index'][i], train['end_index'][i], '\n')
    print(train['clean_text'][i], '\n')
    print(train['clean_selected'][i], '\n')
    print(y[i])
    print("="*150)

In [None]:
# (number of tweets, longest tweet in tokens + 1)
y.shape

In [None]:
# Hold out 15% of the rows for validation (fixed seed for reproducibility)
X = train[['textID', 'clean_text', 'clean_selected', 'sentiment']]
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.15, random_state=42)
# The second shape on each line is the validation split, so label it as such
# rather than as "test".
print("X_train shape ", X_train.shape, "  X_valid shape ", X_valid.shape)
print("y_train shape ", y_train.shape, "  y_valid shape ", y_valid.shape)

In [None]:
# Add a trailing axis of size 1: targets become (rows, positions, 1)
y_train = np.expand_dims(y_train, -1)
y_valid = np.expand_dims(y_valid, -1)
y_train.shape, y_valid.shape

In [None]:
# Plain arrays of the text and sentiment columns for both splits
train_text = X_train['clean_text'].values
valid_text = X_valid['clean_text'].values
train_sentiment = X_train['sentiment'].values
valid_sentiment = X_valid['sentiment'].values

#### Tokenizing

In [None]:
# Tokenizer for the tweet text; `text` here is the keras preprocessing
# module, and the vocabulary size is unlimited (num_words=None).
token1 = text.Tokenizer(num_words=None)
# Targets carry one extra position, so the text length is their width minus 1
max_len_text = y_train.shape[1]-1

# The vocabulary is fitted on the training split only
token1.fit_on_texts(list(train_text))
train_text = token1.texts_to_sequences(train_text)
valid_text = token1.texts_to_sequences(valid_text)


# zero pad the sequences at the end, up to max_len_text
train_text = sequence.pad_sequences(
    train_text, maxlen=max_len_text, padding='post')
valid_text = sequence.pad_sequences(
    valid_text, maxlen=max_len_text, padding='post')

word_index_text = token1.word_index
# print(word_index_text)
print(train_text.shape, valid_text.shape)

# Separate tokenizer for the sentiment label, encoded as a sequence of
# length 1
token2 = text.Tokenizer(num_words=None)
max_len_sentiment = 1

token2.fit_on_texts(list(train_sentiment))
train_sentiment = token2.texts_to_sequences(train_sentiment)
valid_sentiment = token2.texts_to_sequences(valid_sentiment)


# zero pad the sequences at the end, up to max_len_sentiment
train_sentiment = sequence.pad_sequences(
    train_sentiment, maxlen=max_len_sentiment, padding='post')
valid_sentiment = sequence.pad_sequences(
    valid_sentiment, maxlen=max_len_sentiment, padding='post')

word_index_sentiment = token2.word_index
print(word_index_sentiment)
print(train_sentiment.shape, valid_sentiment.shape)

#### Prepare embeddings

In [None]:
# load the GloVe vectors in a dictionary: word -> vector
embeddings_index = {}
# Be explicit about the encoding so loading does not depend on the
# platform's default.
with open('/kaggle/input/glove840b300dtxt/glove.840B.300d.txt',
          encoding='utf-8') as f:
    for line in tqdm(f):
        # Split on the literal space only; a bare split() would also break
        # on any other whitespace character inside a token.
        values = line.split(' ')
        word = values[0]
        # float32 halves the memory of the table; the embedding matrices
        # built from it are unaffected by the narrower storage type.
        coefs = np.asarray([float(val) for val in values[1:]],
                           dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

In [None]:
def _embedding_matrix(word_index):
    """Build a (vocabulary size + 1, 300) matrix of GloVe vectors.

    Row ``i`` holds the vector of the word with tokenizer index ``i``;
    row 0 and the rows of words without a GloVe entry stay all-zero.
    """
    matrix = np.zeros((len(word_index) + 1, 300))
    for token, row in tqdm(word_index.items()):
        vector = embeddings_index.get(token)
        if vector is not None:
            matrix[row] = vector
    return matrix


# embedding matrix for the tweet vocabulary
embedding_matrix_text = _embedding_matrix(word_index_text)
print(embedding_matrix_text.shape)

# embedding matrix for the sentiment labels
embedding_matrix_sentiment = _embedding_matrix(word_index_sentiment)
print(embedding_matrix_sentiment.shape)

### Model

In [None]:
# Text branch: frozen GloVe embeddings, with padding (index 0) masked out
text_input = Input(shape=(max_len_text,), name='text_input')
embd_text = Embedding(len(word_index_text)+1,  # embedding layer with glove vectors as embeddings
                      300,
                      weights=[embedding_matrix_text],
                      input_length=max_len_text,
                      trainable=False, mask_zero=True, name='embedding_text')(text_input)  # masking the input values with mask_zero= True


# Sentiment branch: a length-1 sequence embedded the same way.
# input_length must describe THIS input (max_len_sentiment); it used to be
# max_len_text, which contradicts the declared input shape.
sentiment_input = Input(shape=(max_len_sentiment,), name='sentiment_input')
embd_sentiment = Embedding(len(word_index_sentiment)+1,  # embedding layer with glove vectors as embeddings
                           300,
                           weights=[embedding_matrix_sentiment],
                           input_length=max_len_sentiment,
                           trainable=False, mask_zero=True, name='embedding_sentiment')(sentiment_input)  # masking the input values with mask_zero= True


# Append the sentiment step after the text steps along the time axis, giving
# max_len_text + max_len_sentiment positions.
con = Concatenate(axis=1)([embd_text, embd_sentiment])

lstm = Bidirectional(LSTM(128, return_sequences=True,
                          dropout=0.15, name='LSTM'))(con)  # lstm

# dense layers with drop outs and batch normalization
m = Dense(128, activation="relu",
          kernel_regularizer=regularizers.l2(0.0001))(lstm)
m = Dropout(0.3)(m)
m = BatchNormalization()(m)
# One sigmoid unit per position: probability that the token is selected
output = Dense(1, activation='sigmoid', name='output')(m)

model = Model(inputs=[text_input, sentiment_input], outputs=[output])

In [None]:
# Write a diagram of the model, with shapes and layer names, to Model.png
tf.keras.utils.plot_model(
    model, 'Model.png', show_shapes=True, show_layer_names=True)

In [None]:
# Print the layer-by-layer summary of the model
model.summary()

In [None]:
# One TensorBoard log directory per run, named after the start time
log_dir = os.path.join(
    "logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
# `write_grads` is no longer passed: the TF2 callback ignores it with a
# warning and newer Keras releases do not accept the argument at all.
tensorboard = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir, histogram_freq=1, write_graph=True)

# Keep only the weights of the epoch with the lowest validation loss
checkpoint_filepath = 'LSTM_model.h5'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath, monitor='val_loss', save_best_only=True, verbose=1)
adam = optimizers.Adam(0.001)

# Per-position binary classification: token inside the selection or not
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
callback = [model_checkpoint_callback, tensorboard]

history = model.fit([train_text, train_sentiment],
                    y_train,
                    epochs=10,
                    batch_size=64,
                    validation_data=([valid_text, valid_sentiment], [y_valid]),
                    verbose=1,
                    callbacks=callback)

In [None]:
# Training vs. validation loss per epoch
plt.figure(figsize=(7, 4))
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'])
plt.grid()
plt.show()

In [None]:
# Training vs. validation accuracy per epoch
plt.figure(figsize=(7, 4))
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.ylabel('Accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'])
plt.grid()
plt.show()

In [None]:
# Predict on the training split, drop size-1 axes and round each position's
# probability to 0/1.
train_pred = np.round(
    np.squeeze(model.predict([train_text, train_sentiment])))
train_pred.shape

In [None]:
# For every tweet, collect the positions the model marked as selected
pred = [
    np.array([position for position, flag in enumerate(row) if flag == 1])
    for row in train_pred
]
print(len(pred))

In [None]:
# Attach the predicted positions, map them back to words of the cleaned
# text and store the result as a space-joined string.
X_train['prediction'] = pred
pred_text = X_train[['clean_text', 'prediction']].apply(get_text, axis=1)
X_train['pred_text'] = pred_text
X_train['pred_text'] = X_train['pred_text'].apply(' '.join)
X_train.head()

In [None]:
# Word-level Jaccard between the cleaned selection and the predicted text,
# reported overall and per sentiment class.
X_train['jaccard'] = X_train.apply(
    lambda x: jaccard(x.clean_selected, x.pred_text), axis=1)
print('Mean training Jaccard score:', np.mean(X_train['jaccard']))
print("="*150)
# Message fixed: it used to start with a stray "n" ('nMean ...')
print('Mean jaccard score for positive sentiment tweets:', np.mean(
    X_train[X_train['sentiment'] == 'positive']['jaccard']))
print("="*150)
print('Mean jaccard score for negative sentiment tweets', np.mean(
    X_train[X_train['sentiment'] == 'negative']['jaccard']))
print("="*150)
print('Mean jaccard score for neutral sentiment tweets', np.mean(
    X_train[X_train['sentiment'] == 'neutral']['jaccard']))

In [None]:
# Predict on the validation split, drop size-1 axes and round each
# position's probability to 0/1.
valid_pred = np.round(
    np.squeeze(model.predict([valid_text, valid_sentiment])))
valid_pred.shape

In [None]:
# For every validation tweet, collect the positions marked as selected
pred = [
    np.array([position for position, flag in enumerate(row) if flag == 1])
    for row in valid_pred
]
print(len(pred))

In [None]:
# Attach the predicted positions, map them back to words of the cleaned
# text and store the result as a space-joined string.
X_valid['prediction'] = pred
pred_text = X_valid[['clean_text', 'prediction']].apply(get_text, axis=1)
X_valid['pred_text'] = pred_text
X_valid['pred_text'] = X_valid['pred_text'].apply(' '.join)
X_valid.head()

In [None]:
# Word-level Jaccard between the cleaned selection and the predicted text on
# the validation split, reported overall and per sentiment class.
X_valid['jaccard'] = X_valid.apply(
    lambda x: jaccard(x.clean_selected, x.pred_text), axis=1)
# Message fixed: this is the validation score, not the training one
print('Mean validation Jaccard score:', np.mean(X_valid['jaccard']))
print("="*150)
print('Mean jaccard score for positive sentiment tweets:', np.mean(
    X_valid[X_valid['sentiment'] == 'positive']['jaccard']))
print("="*150)
print('Mean jaccard score for negative sentiment tweets', np.mean(
    X_valid[X_valid['sentiment'] == 'negative']['jaccard']))
print("="*150)
print('Mean jaccard score for neutral sentiment tweets', np.mean(
    X_valid[X_valid['sentiment'] == 'neutral']['jaccard']))

In [None]:
# Reload the checkpoint written during training (lowest validation loss)
model = load_model('LSTM_model.h5')
test_text = test['clean_text'].values
test_sentiment = test['sentiment'].values
# Encode and pad the test data with the tokenizers fitted on the training
# split, using the same lengths as for training.
test_text = token1.texts_to_sequences(test_text)
test_text = sequence.pad_sequences(
    test_text, maxlen=max_len_text, padding='post')
test_sentiment = token2.texts_to_sequences(test_sentiment)
test_sentiment = sequence.pad_sequences(
    test_sentiment, maxlen=max_len_sentiment, padding='post')

# Predict, drop size-1 axes and round each position's probability to 0/1
test_pred = model.predict([test_text, test_sentiment])
test_pred = np.squeeze(test_pred)
test_pred = np.round(test_pred)
test_pred.shape

In [None]:
# For every test tweet, collect the positions the model marked as selected
pred = []
for vector in test_pred:
    index = []
    for i, value in enumerate(vector):
        if value == 1:
            index.append(i)
    pred.append(np.array(index))
print(len(pred))

# Map the positions back to words of the cleaned text; the submitted
# selected_text is therefore built from clean_text tokens, not the raw tweet.
test['prediction'] = pred
pred_text = test[['clean_text', 'prediction']].apply(lambda x: get_text(x), axis=1)
test['selected_text'] = pred_text
test['selected_text'] = test['selected_text'].apply(lambda x: ' '.join(x))
# Keep only the two columns written to the submission file
test.drop(['text', 'sentiment', 'prediction'], axis=1, inplace=True)
test = test[['textID', 'selected_text']]
test.head()

In [None]:
# Write the submission file without the index column
test.to_csv("submission.csv", index=False)