In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under /kaggle/input so the attached
# datasets are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Seed NumPy's global random generator.
# NOTE(review): this does not seed Python's `random` module or TensorFlow/Keras;
# confirm whether those need seeding for reproducible training runs.
np.random.seed(42)
# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding

from keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply
from keras.layers import RepeatVector, Dense, Activation, Lambda, Dropout, BatchNormalization
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.models import load_model, Model
import keras.backend as K
import numpy as np

from sklearn.model_selection import train_test_split

import random
from tqdm import tqdm
from babel.dates import format_date
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Root folders of the Kaggle filesystem; every dataset path below is derived
# from them so the location only has to be changed in one place.
ROOT = '/kaggle/'
INPUT_ROOT = ROOT + 'input/'
COMPETITION_DIR = INPUT_ROOT + 'tweet-sentiment-extraction/'

# Competition files: labelled training tweets, unlabelled test tweets and the
# submission template.
train_data = pd.read_csv(COMPETITION_DIR + 'train.csv')
test_data = pd.read_csv(COMPETITION_DIR + 'test.csv')
submission_data = pd.read_csv(COMPETITION_DIR + 'sample_submission.csv')

# Do not truncate long text columns when a DataFrame is displayed.
pd.set_option('display.max_colwidth', None)

In [None]:
def phrase_start_finder(s1, s2):
    """
    Returns the 0-based word position in s1 at which the phrase s2 starts.

    The position is the number of whitespace-separated words found before the
    first occurrence of s2 in s1. Words are counted with str.split() (no
    separator), so runs of spaces are treated as a single delimiter and a
    phrase located at the very beginning of s1 gets position 0.

    s1: full text to search in
    s2: phrase that must occur in s1

    Raises ValueError when s2 is not a substring of s1.
    """
    start = s1.find(s2)
    if start == -1:
        raise ValueError('s2 not substring of s1')
    # split() on an empty prefix gives [], i.e. position 0; split(' ') would
    # give [''] and wrongly report position 1.
    return len(s1[:start].split())

def phrase_end_finder(s1, s2):
    """
    Returns the 0-based word position in s1 of the last word of the phrase s2.

    Computed as the start position reported by phrase_start_finder plus the
    number of whitespace-separated words in s2, minus one.

    s1: full text to search in
    s2: phrase that must occur in s1

    Raises ValueError when s2 is not a substring of s1.
    """
    if s2 not in s1:
        raise ValueError('s2 not substring of s1')
    # split() ignores repeated spaces inside the phrase; an empty or blank
    # phrase still counts as one word so that end == start, as before.
    phrase_word_count = max(len(s2.split()), 1)
    return phrase_start_finder(s1, s2) + phrase_word_count - 1

def start_finder(row):
    """
    Row-wise adapter for DataFrame.apply: word position at which the row's
    'selected_text' begins inside its 'prepended_text'.
    """
    full_text = row['prepended_text']
    phrase = row['selected_text']
    return phrase_start_finder(full_text, phrase)
    

def end_finder(row):
    """
    Row-wise adapter for DataFrame.apply: word position at which the row's
    'selected_text' ends inside its 'prepended_text'.
    """
    full_text = row['prepended_text']
    phrase = row['selected_text']
    return phrase_end_finder(full_text, phrase)

# Quick manual check of both helpers on a toy sentence.
# The phrase starts at word position 2 ("awesome") ...
print(phrase_start_finder("bear is awesome but monkey is cool too", "awesome but monkey"))
# ... and ends at word position 4 ("monkey"); displayed as the cell's output value.
phrase_end_finder("bear is awesome but monkey is cool too", "awesome but monkey")

# Uncomment to see the ValueError raised when the phrase is not part of the text:
# phrase_end_finder("bear is awesome but monkey is cool too", "not found")


In [None]:
# Drop rows containing missing values (in place), then build the model input by
# prefixing every tweet with its sentiment label: "<sentiment>: <text>".
train_data.dropna(inplace=True)
train_data['prepended_text'] = train_data['sentiment'] + ': ' + train_data['text']
train_data['prepended_text'] = train_data['prepended_text'].astype('str')
# Word positions of selected_text inside prepended_text, computed row by row.
train_data['start'] = train_data.apply(start_finder, axis=1)
train_data['end'] = train_data.apply(end_finder, axis=1)
# Tx: the largest whitespace-split word count of any prepended tweet, plus 10 extra positions.
Tx = max(train_data.apply(lambda row : len(row['prepended_text'].split()), axis=1)) + 10# start and end of sentence words
print(Tx)
print(train_data.dtypes)
# NOTE(review): an earlier note here said the largest tweet has 34 words; trust the Tx
# printed above instead. Nothing in this cell truncates or pads the tweets; the note
# also said longer tweets would be truncated and shorter ones padded - confirm where that happens.
train_data.head(10)


In [None]:
def find_unique_words(data, column_name='prepended_text'):
    """
    Finds and returns a list of the unique words of one text column.

    Every value of data[column_name] is converted to str and split on
    whitespace. Each distinct word appears once in the result, in the order
    in which it was first seen.

    data: DataFrame holding the text
    column_name: name of the column to read the text from
    """
    # A dict keeps insertion order and discards repeats in one pass.
    seen = {}
    for text in data[column_name]:
        for word in str(text).split():
            seen.setdefault(word, None)
    return list(seen)

# Unit test: punctuation stays attached to the words because the text is only
# split on whitespace.
df = pd.DataFrame(data={'prepended_text': [
    'positive: this world; was great', 
    'negative: until COVID-19 happened',
    'neutral: now it is so-so'
    ]})
assert find_unique_words(df) == [
    'positive:', 'this', 'world;', 'was', 'great',
    'negative:', 'until', 'COVID-19', 'happened',
    'neutral:', 'now', 'it', 'is', 'so-so',
]
find_unique_words(df)

In [None]:
def load_embeddings(filter_words=None, path=None):
    """
    Loads glove embeddings into a dict mapping word -> float32 vector.

    filter_words: optional iterable of words. When given, only those words
        (plus 'unk', which the conversion helpers use for unknown words) are
        kept, which keeps the dict small. When None, every vector in the file
        is loaded.
    path: optional path of the embedding text file (one word followed by its
        coefficients per line). Defaults to the 50-dimensional twitter glove
        file under INPUT_ROOT.

    Prints the number of vectors kept and returns the dict.
    """
    if path is None:
        path = INPUT_ROOT + 'glove-twitter/glove.twitter.27B.50d.txt'

    wanted = None
    if filter_words is not None:
        # A set gives constant-time membership checks for every line of the file.
        wanted = set(filter_words)
        wanted.add('unk')

    embeddings_index = dict()
    # The context manager closes the file even if a line fails to parse.
    with open(path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # skip blank lines
            word = values[0]
            if wanted is not None and word not in wanted:
                continue
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')
    print('Loaded %s word vectors.' % len(embeddings_index))
    return embeddings_index

# Count logged by an earlier run of this cell: "Loaded 1193514 word vectors."
# The words of the training data are passed as the filter_words argument.
embeddings = load_embeddings(find_unique_words(train_data))
# Display one vector as a spot check (raises KeyError if 'morning' was not loaded).
embeddings['morning']

In [None]:
def convert_sentence_to_embedding(sentence, embeddings, max_sentence_len=Tx, embedding_dim=50):
    """
    Converts one sentence into a flat vector of word embeddings.

    The sentence is split on whitespace (the same tokenisation that
    convert_input_to_embeddings applies), truncated to max_sentence_len words
    and each word is replaced by its embedding. Words missing from
    `embeddings` use the 'unk' vector; unused trailing positions stay zero.

    sentence: text to convert
    embeddings: dict mapping word -> vector of length embedding_dim
    max_sentence_len: number of word positions in the output
    embedding_dim: length of one embedding vector

    Returns a 1-D array of length max_sentence_len * embedding_dim.
    """
    # TODO: Confirm that 'unk' is the unknown word
    unknown_word = 'unk'
    output = np.zeros((max_sentence_len, embedding_dim))
    # Truncate so that an over-long sentence cannot index past the output.
    words = sentence.split()[:max_sentence_len]
    for j, word in enumerate(words):
        output[j] = embeddings[word] if word in embeddings else embeddings[unknown_word]
    return output.reshape([max_sentence_len * embedding_dim])

def convert_input_to_embeddings(data, embeddings, label='prepended_text', max_sentence_len=Tx, embedding_dim=50):
    """
    Converts the text column of a DataFrame into a 3-D array of word embeddings.

    Every sentence is split on whitespace, truncated to max_sentence_len words
    and each word is replaced by its embedding. Words missing from
    `embeddings` use the 'unk' vector; unused trailing positions stay zero.

    data: DataFrame holding the sentences
    embeddings: dict mapping word -> vector of length embedding_dim
    label: name of the text column
    max_sentence_len: number of word positions per sentence
    embedding_dim: length of one embedding vector

    Returns an array of shape (len(data), max_sentence_len, embedding_dim).
    """
    unknown_word = 'unk'
    output = np.zeros((data.shape[0], max_sentence_len, embedding_dim))
    for i, text in enumerate(data[label]):
        # Truncate so that a sentence longer than max_sentence_len (possible
        # for data that was not used to derive it) cannot raise IndexError.
        words = text.split()[:max_sentence_len]
        for j, word in enumerate(words):
            output[i, j] = embeddings[word] if word in embeddings else embeddings[unknown_word]
    return output

def convert_to_vector(data, max_sentence_len=Tx):
    """
    Builds the per-word target mask for every example.

    data: 2-D array whose first two columns are the start and end word
        positions (both inclusive) of the selected phrase
    max_sentence_len: number of word positions per example

    Returns an array of shape (len(data), max_sentence_len) holding 1 at every
    position between start and end and 0 elsewhere.
    """
    label_vector = np.zeros((data.shape[0], max_sentence_len))
    positions = np.arange(max_sentence_len)
    for row_index, example in enumerate(data):
        first, last = example[0], example[1]
        inside_phrase = (positions >= first) & (positions <= last)
        label_vector[row_index][inside_phrase] = 1
    return label_vector

                
# Hold out 15% of the labelled tweets for validation. The label array keeps the
# start/end word positions and the raw selected_text of every example.
features = train_data[['prepended_text']]
targets = train_data[['start', 'end', 'selected_text']].values
X_train, X_test, Y_train, Y_test = train_test_split(
    features, targets, test_size=0.15, random_state=42
)

# Word embeddings for the inputs, flattened to one row of Tx*50 values per tweet.
train_text = convert_input_to_embeddings(X_train, embeddings)
train_text = train_text.reshape((len(train_text), Tx*50))
test_text = convert_input_to_embeddings(X_test, embeddings)
test_text = test_text.reshape((len(test_text), Tx*50))

# Per-word 0/1 masks for the targets.
train_labels = convert_to_vector(Y_train)
test_labels = convert_to_vector(Y_test)

for prepared in (train_text, test_text, train_labels, test_labels):
    print(prepared.shape)
# m samples, Tx denotes max size of a sentence, 50 denotes embeddings.
# Note that it is common for last few embeddings to be 0.

In [None]:
def dnn(input_shape, output_shape=Tx):
    """
    Assembles the fully connected network.

    input_shape: shape of one input sample, handed to keras Input
    output_shape: number of sigmoid units in the last layer

    Layout: fc1 (128, relu), then three Dense -> Dropout -> BatchNormalization
    groups (fc2, fc3, fc4) and the sigmoid output layer fc_last.
    Returns the (uncompiled) keras Model.
    """
    X_input = Input(input_shape)

    # The first hidden layer has no dropout / batch normalisation attached.
    X = Dense(128, activation='relu', name='fc1')(X_input)

    hidden_groups = (
        ('fc2', 128, 0.3),
        ('fc3', 64, 0.3),
        ('fc4', 64, 0.15),
    )
    for layer_name, units, drop_rate in hidden_groups:
        X = Dense(units, activation='relu', name=layer_name)(X)
        X = Dropout(drop_rate)(X)
        X = BatchNormalization()(X)

    X = Dense(output_shape , activation='sigmoid', name='fc_last')(X)

    return Model(inputs=X_input, outputs=X, name="DNN for sentimend extraction")

# One flattened sample holds Tx word positions times 50 embedding values.
model = dnn(input_shape=[Tx*50])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 8 epochs; the held-out split is used as validation data.
history = model.fit(
    x=train_text,
    y=train_labels,
    validation_data=(test_text, test_labels),
    batch_size=64,
    epochs=8,
)

# Log kept from an earlier run (it shows "Epoch 100/100", so it does not come
# from the 8-epoch setting above):
# Epoch 100/100
# 23358/23358 [==============================] - 2s 99us/step - loss: 0.0704 - accuracy: 0.9701 - val_loss: 0.5517 
#             - val_accuracy: 0.8561

In [None]:
# One figure per tracked quantity: training curve first, validation curve second.
curves = (
    ('accuracy', 'val_accuracy', 'model accuracy'),
    ('loss', 'val_loss', 'model loss'),
)
for train_key, validation_key, figure_title in curves:
    plt.plot(history.history[train_key])
    plt.plot(history.history[validation_key])
    plt.title(figure_title)
    plt.ylabel(train_key)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

In [None]:
# Sanity check on one hand-written sentence that carries the same "<sentiment>: "
# prefix as the training inputs.
test_input = "positive: today is a wonderful day but not so cold"
test_embedding = convert_sentence_to_embedding(test_input, embeddings)
print(test_embedding.shape)
# Add a leading batch axis of size 1 before calling predict.
test_embedding = test_embedding.reshape(1, -1)
ans = model.predict(test_embedding)
print(ans)

# Loss and accuracy (the metric chosen at compile time) on the held-out split;
# displayed as the cell's output value.
model.evaluate(test_text, test_labels)

In [None]:
def find_sentiment_segment(sentence, arr, threshold=0.05):
    """
    Takes a sentence and an array of per-word scores and returns the longest
    run of consecutive words whose scores are all above the threshold.

    sentence: Original full sentence; it is split on whitespace, which matches
        the tokenisation used when the batch inputs are converted to embeddings
    arr: An array containing scores for each word
    threshold: a word belongs to the segment only if its score is strictly
        greater than this value

    The word at position 0 (the sentiment prefix) is never part of the answer.
    When several runs share the maximum length the first one wins. Words
    without a score (arr shorter than the sentence) are ignored. Returns an
    empty string when no word scores above the threshold.
    """
    words = sentence.split()
    scored_words = min(len(words), len(arr))

    best_start = 0
    best_length = 0
    run_start = None  # first position of the run currently being extended
    # Single pass: extend the current run while scores stay above the
    # threshold, remember the longest run seen so far.
    for position in range(1, scored_words):
        if arr[position] > threshold:
            if run_start is None:
                run_start = position
            run_length = position - run_start + 1
            if run_length > best_length:
                best_start = run_start
                best_length = run_length
        else:
            run_start = None
    return ' '.join(words[best_start:best_start + best_length])
                
# Toy example: only the words at positions 3-5 score above 0.2, so the
# displayed result is "an amazingly amazing".
find_sentiment_segment(
    "positive: This is an amazingly amazing day for skiing",
    [0.002, 0.02, 0.011, 0.32, 0.323, 0.21, 0.15, 0.02, 0.002, 0.003], threshold=0.2)

In [None]:
# Model outputs for the held-out split; later cells index this array row by row
# and pass each row to find_sentiment_segment as the per-word scores.
test_output = model.predict(test_text)

In [None]:
# Evaluate one single example.
print(test_output.shape)
print(test_labels.shape)
print(test_output[0])
print(test_labels[0])
print(X_test.iloc[0].prepended_text)
print('Y_test')
print(Y_test[0])
sentiment_output = find_sentiment_segment(X_test.iloc[0].prepended_text, test_output[0], 0.000000000000001)
sentiment_output

In [None]:
def evaluate_iou(test_outputs, X_test, Y_test, threshold=1e-06):
    """
    Scores predicted phrases against the labelled phrases with word-level Jaccard.

    test_outputs: per-example arrays of word scores
    X_test: DataFrame with a 'prepended_text' column (one row per example)
    Y_test: array whose third column (index 2) holds the labelled phrase
    threshold: score threshold handed to find_sentiment_segment

    Prints the threshold, Y_test.shape and the resulting mean score.
    Returns (mean Jaccard score, list of labels, list of predictions).
    """
    print("For threshold: " + str(threshold))
    print(Y_test.shape)
    # All three inputs must describe the same examples.
    assert len(test_outputs) == len(Y_test)
    assert len(test_outputs) == len(X_test)

    def jaccard(str1, str2):
        """Jaccard similarity of the lower-cased word sets of two strings."""
        a = set(str1.lower().split())
        b = set(str2.lower().split())
        c = a.intersection(b)
        union_size = len(a) + len(b) - len(c)
        if union_size == 0:
            # Both strings are empty: treated as identical instead of dividing by zero.
            return 1.0
        return float(len(c)) / union_size

    total = 0.
    labels_list = []
    predictions_list = []
    for i in range(len(test_outputs)):
        sentiment_output = find_sentiment_segment(X_test.iloc[i].prepended_text, test_outputs[i], threshold)
        label = Y_test[i][2]
        total = total + jaccard(label, sentiment_output)
        labels_list.append(label)
        predictions_list.append(sentiment_output)

    # An empty evaluation set scores 0 rather than raising ZeroDivisionError.
    ans = total / len(test_outputs) if len(test_outputs) else 0.
    print("Resulting iou: " + str(ans))
    return ans, labels_list, predictions_list

def find_optimum_threshold():
    """
    Tries a fixed list of thresholds on the held-out predictions and reports the best.

    Reads the notebook-level test_output, X_test and Y_test. For every
    threshold the labels and predictions are written to
    '<threshold>latest.csv' in the working directory.

    Returns (best score, threshold that produced it, list of (threshold, score)).
    """
    best_iou = 0
    best_threshold = 0
    responses = []
    for threshold in [0.000000005, 0.0000005, 0.0005, 0.005, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6]:
        iou, l_list, p_list = evaluate_iou(test_output, X_test, Y_test, threshold)
        labels_predictions = pd.DataFrame({'labels': l_list, 'predictions': p_list})
        labels_predictions.to_csv(str(threshold) + 'latest.csv', index=False)
        responses.append((threshold, iou))
        if iou > best_iou:
            best_iou = iou
            best_threshold = threshold
    return best_iou, best_threshold, responses
            
# Threshold search is disabled; uncomment the three lines below to run it.
# ans, _, responses = find_optimum_threshold()
# print(ans)
# print(responses)


# Output kept from an earlier run of the search.
# NOTE(review): 5e-08 is not one of the thresholds in the list currently tried by
# find_optimum_threshold - confirm which run this figure belongs to.
# For threshold: 5e-08
# Resulting iou: 0.5925133028085846

print('completed')

In [None]:

# Shapes of the held-out inputs, predictions and labels, followed by the first
# prediction and the first label row.
print(X_test.shape)
print(test_output.shape)
print(Y_test.shape)
print(test_output[0])
print(Y_test[0])
# Displays the held-out input frame as the cell's output value.
X_test

In [None]:
# Evaluate on test data

test_data['prepended_text'] = test_data['sentiment'] + ': ' + test_data['text']
test_data['prepended_text'] = test_data['prepended_text'].astype('str')

test_unique_words = find_unique_words(test_data)
test_embeddings = load_embeddings(test_unique_words)

In [None]:
# Word embeddings for every test tweet; max_sentence_len keeps its default (Tx).
test_data_embeddings = convert_input_to_embeddings(test_data, test_embeddings, label='prepended_text')

In [None]:
# Flatten each tweet to one row of Tx*50 values, the input size the model was built with.
test_data_embeddings = test_data_embeddings.reshape(-1, Tx*50)
test_predictions = model.predict(test_data_embeddings)
# Display the scores of the first test tweet.
test_predictions[0]

In [None]:
def convert_output_to_sentences(test_predictions, test_data, threshold=0.005):
    """
    Turns per-word scores into the extracted phrase of every tweet.

    test_predictions: per-example arrays of word scores, indexed in row order
    test_data: DataFrame with a 'prepended_text' column
    threshold: score threshold handed to find_sentiment_segment

    Returns a numpy array with one phrase per row of test_data.
    """
    output = []
    for i, text in enumerate(test_data['prepended_text']):
        output.append(find_sentiment_segment(text, test_predictions[i], threshold))
    return np.array(output)

# Extract one phrase per test tweet and wrap the array in a DataFrame; its
# single column is named 0.
output = convert_output_to_sentences(test_predictions, test_data)
output = pd.DataFrame(output)
output.head()

In [None]:
import csv
# Attach the predicted phrase (column 0 of `output`) to the test rows by index,
# keep only the id and the phrase, and give the phrase its submission name.
result = (
    test_data.join(output)[['textID', 0]]
    .rename(columns={0: 'selected_text'})
)
# Alternative export with manual quoting, kept for reference:
# result['selected_text'] = '"' + result['selected_text'] + '"'
# result.to_csv('submission.csv',quoting = csv.QUOTE_NONE,quotechar="",escapechar = ',', index=False)
result.head()

In [None]:
# Index labels of the rows whose predicted phrase is an empty string.
result[result['selected_text'] == ''].index
# General pattern: df[df['column_name'] == ''].index

In [None]:
# Plain Python list of the predicted phrases, in row order. tolist() replaces
# the row-by-row iloc loop (and the discarded to_numpy() call) with one
# vectorised conversion.
final = result['selected_text'].tolist()
final[0]

In [None]:
def _simplify_single_word(text):
    """Collapse runs of '!' / '.' in a one-word prediction; other texts are returned unchanged."""
    if len(text.split()) != 1:
        return text
    # '...' has to be handled before '..': replacing '..' first would break
    # every '...' apart so that the longer pattern could never match.
    for run, replacement in (('!!!!', '!'), ('...', '.'), ('..', '.')):
        text = text.replace(run, replacement)
    return text


# Start from the submission template so that the ids and column order match,
# then fill in the predicted phrases.
df = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/sample_submission.csv")
df['selected_text'] = final
df['selected_text'] = df['selected_text'].apply(_simplify_single_word)

df.to_csv('submission.csv', index=False)
df.head()