In [None]:
!ls /kaggle/input/tweet-sentiment-extraction
input_dir = '/kaggle/input/tweet-sentiment-extraction'

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os

In [None]:
# Read the training split into a DataFrame and preview its first rows.
train_path = os.path.join(input_dir, 'train.csv')
train_csv = pd.read_csv(train_path)
train_csv.head()

In [None]:
# Pull the raw tweet text and the sentiment label column out of the frame.
# The built-in `str` is used instead of the `np.str` alias, which was
# deprecated in NumPy 1.20 and removed in 1.24 (it was only ever an alias for
# `str`, so the resulting array is the same).
# Note: astype(str) also turns any missing text value into the literal 'nan'.
texts = train_csv['text'].values.astype(str)
str_labels = train_csv['sentiment'].values

texts, str_labels

In [None]:
#  ONEHOT ENCODE LABELS
from sklearn.preprocessing import LabelEncoder

# LabelEncoder assigns integer ids following the sorted order of the label
# values (see le.classes_ for the actual mapping).
le = LabelEncoder()
encoded_labels = le.fit_transform(str_labels)

# One column per class id; assuming the sentiment values are
# 'negative' / 'neutral' / 'positive', the columns are in that (sorted) order.
labels = tf.one_hot(indices=encoded_labels, depth=3).numpy()

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Tokenise the training tweets and pad them into a fixed-width integer matrix.
max_words = 10000          # vocabulary cap passed to the Tokenizer
validation_samples = 7000  # rows held out for validation
# All remaining rows are used for training (20481 when the file has 27481 rows).
training_samples = len(texts) - validation_samples

tokenizer = Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Pad to the longest *tokenised* sequence. The previous formula took the tweet
# with the most characters and counted its whitespace-separated words, which is
# not necessarily the tweet with the most tokens, so some sequences could be
# truncated (or every row over-padded).
maxlen = max(len(seq) for seq in sequences)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen = maxlen)

print("Shape of data tensor: ", data.shape)
print("Shape of label tensor: ", labels.shape)

In [None]:
# Shuffle features and labels with one shared random permutation so that each
# row stays aligned with its label, then cut the train / validation slices.
indices = np.random.permutation(data.shape[0])
data, labels = data[indices], labels[indices]

val_stop = training_samples + validation_samples
x_train, y_train = data[:training_samples], labels[:training_samples]
x_val, y_val = data[training_samples:val_stop], labels[training_samples:val_stop]

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, SimpleRNN

# Embedding -> SimpleRNN -> Dense(relu) -> 3-way softmax classifier.
model = Sequential([
    Embedding(max_words, 16, input_length=maxlen),
    SimpleRNN(32),
    Dense(32, activation = 'relu'),
    Dense(3, activation = 'softmax'),
])
model.summary()


In [None]:
# Checkpoint callback: after each epoch, overwrite 'model_save.h5' with the
# weights only when the validation loss has improved.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'model_save.h5',
    monitor="val_loss",
    verbose=0,
    save_best_only=True,
    save_weights_only=True,
    mode="auto",
    save_freq="epoch",
)

model.compile(
    optimizer = 'rmsprop',
    loss = 'categorical_crossentropy',
    metrics = ['acc'],
)

# Train for 10 epochs, validating on the held-out slice after every epoch.
history = model.fit(
    x_train, y_train,
    epochs = 10,
    batch_size = 32,
    validation_data = (x_val, y_val),
    callbacks = [checkpoint],
)

# TEST

In [None]:
# Read the test split into a DataFrame and preview its first rows.
test_path = os.path.join(input_dir, 'test.csv')
test_csv = pd.read_csv(test_path)
test_csv.head()

In [None]:
# Pull the tweet text and the sentiment label column out of the test frame.
# The built-in `str` is used instead of the `np.str` alias, which was
# deprecated in NumPy 1.20 and removed in 1.24 (it was only ever an alias for
# `str`, so the resulting array is the same).
# Note: astype(str) also turns any missing text value into the literal 'nan'.
test_texts = test_csv['text'].values.astype(str)
test_str_labels = test_csv['sentiment'].values

test_texts, test_str_labels

In [None]:
#  ONEHOT ENCODE LABELS
# Reuse the encoder fitted on the training labels so every class gets the same
# integer id (and therefore the same one-hot column) the model was trained on.
# Fitting a second encoder on the test labels would silently shift the ids if
# the test set's label values differed; `transform` raises on an unseen label.
test_le = le  # old name kept as an alias in case other cells refer to it

test_encoded_labels = le.transform(test_str_labels)

# Columns follow le.classes_, i.e. the sorted order of the label values.
test_labels = tf.one_hot(test_encoded_labels, 3).numpy()

In [None]:
# Encode the test tweets with the tokenizer that was fitted on the training
# texts. Fitting a fresh Tokenizer here would assign word ids by test-set
# frequency, so they would no longer match the ids the Embedding layer was
# trained on and the evaluation below would be meaningless.
# Words the training tokenizer does not know are dropped from the sequences.
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Pad (or truncate) to the same width the model was built with.
test_data = pad_sequences(test_sequences, maxlen = maxlen)

print("Shape of data tensor: ", test_data.shape)
print("Shape of label tensor: ", test_labels.shape)

In [None]:
# Restore the weights written by the checkpoint callback, then score the
# model on the test set; the cell displays the value returned by evaluate().
model.load_weights('./model_save.h5')
test_metrics = model.evaluate(x=test_data, y=test_labels)
test_metrics