In [None]:
import tensorflow.keras as keras
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Load the raw airline-tweet CSV; the file is decoded as latin-1.
df = pd.read_csv('../input/twitter-airline-sentiment/Tweets.csv', encoding='latin-1')

In [None]:
# Keep only the label and the tweet text, renamed to the column names
# that the rest of the notebook uses.
df = (
    df[['airline_sentiment', 'text']]
    .copy()
    .rename(columns={'airline_sentiment': 'Sentiment', 'text': 'SentimentText'})
)

In [None]:
# Class balance: number of tweets per sentiment label.
df['Sentiment'].value_counts()

In [None]:
# Integer code for each sentiment label (class name -> class id).
mapper = dict(negative=0, neutral=1, positive=2)

In [None]:
# Replace the string labels with their integer codes from `mapper`.
df['Sentiment'] = df['Sentiment'].map(mapper)

In [None]:
# Missing values per column (labels absent from `mapper` would show up here as NaN).
df.isna().sum()

In [None]:
# (rows, columns) of the working frame.
df.shape

In [None]:
# Preview the first rows of the working frame.
df.head()

In [None]:
# Pattern for everything that gets blanked out of a tweet: user mentions,
# links, and any run of non-alphanumeric characters.
#
# Written as a raw string so `\S` is not an invalid string escape.
# '@' is excluded from the catch-all character class: otherwise the
# whitespace/punctuation in front of a mention is matched together with the
# '@' itself, the `@\S+` alternative never fires, and the user name survives
# as an ordinary word. A bare '@' alternative at the end still removes an
# '@' that is not followed by a handle.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9@]+|@"

In [None]:
def preprocess(text):
    """Normalise one tweet before tokenisation.

    The input is coerced to ``str`` and lower-cased, every match of
    ``TEXT_CLEANING_RE`` (links, user mentions, special characters) is
    replaced by a single space, and leading/trailing whitespace is stripped.

    Returns the cleaned string.
    """
    lowered = str(text).lower()
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', lowered)
    return cleaned.strip()

In [None]:
# Clean every tweet with `preprocess` and make sure the labels are plain int64.
df['SentimentText'] = df['SentimentText'].apply(preprocess)
df['Sentiment'] = df['Sentiment'].astype('int64')

In [None]:
# Re-check for missing values after cleaning.
df.isna().sum()

In [None]:
# Hold out a validation set; 0.25 is the library default when no size is
# given, spelled out here. The fixed seed makes the shuffle reproducible.
train, valid = train_test_split(df, test_size=0.25, random_state=56)

In [None]:
# One-hot encode the training labels. The class count is pinned to the size
# of `mapper`, so the one-hot width does not depend on which classes happen
# to be present in this split.
labels = keras.utils.to_categorical(
    train['Sentiment'].astype('int64'), num_classes=len(mapper))
# Training tweets as a NumPy array of strings (np.array already copies the list).
train_text = np.array(train['SentimentText'].tolist())

In [None]:
# One-hot encode the validation labels. The class count is pinned to the size
# of `mapper`, so the one-hot width does not depend on which classes happen
# to be present in this split.
labels_valid = keras.utils.to_categorical(
    valid['Sentiment'].astype('int64'), num_classes=len(mapper))
# Validation tweets as a NumPy array of strings (np.array already copies the list).
valid_text = np.array(valid['SentimentText'].tolist())


In [None]:
# Tokenisation / padding settings (vocab_size, embedding_dim and max_length
# are read again when the model is built).
vocab_size, embedding_dim, max_length = 1000, 16, 142
trunc_type = padding_type = 'post'
oov_tok = "<OOV>"

# Fit the vocabulary on the training tweets only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_text)
word_index = tokenizer.word_index

# Both splits are padded/truncated with exactly the same settings.
_pad_kwargs = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

sequences = tokenizer.texts_to_sequences(train_text)
padded = pad_sequences(sequences, **_pad_kwargs)

testing_sequences = tokenizer.texts_to_sequences(valid_text)
testing_padded = pad_sequences(testing_sequences, **_pad_kwargs)

In [None]:
# Inspect the padded validation sequences produced by pad_sequences.
testing_padded

In [None]:
# Inverse vocabulary lookup: token id -> word.
reverse_word_index = {token_id: word for word, token_id in word_index.items()}

def decode_review(text):
    """Turn a sequence of token ids back into a space-separated string.

    Ids that are missing from ``reverse_word_index`` are rendered as '?'.
    """
    words = (reverse_word_index.get(token_id, '?') for token_id in text)
    return ' '.join(words)

# Sanity check: decoded first training sequence, a blank line, then the
# cleaned text it was built from.
print(decode_review(padded[0]), end='\n\n')
print(train_text[0])

In [None]:
# Build a basic sentiment network, one layer at a time.
# The embedding layer comes first; the output is a single 3-unit softmax
# layer, one unit per class (0 = negative, 1 = neutral, 2 = positive).
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(3, activation='softmax'))

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Early stopping to limit overfitting: watches validation accuracy with a
# patience of 5 epochs and a 0.01 minimum delta, and restores the best weights.
monitor = keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    min_delta=0.01,
    patience=5,
    restore_best_weights=True,
)

In [None]:
# Train for at most `num_epochs` epochs; the early-stopping callback may end
# the run sooner. The trailing ';' hides the cell's return value.
num_epochs = 30
history = model.fit(
    x=padded,
    y=labels,
    epochs=num_epochs,
    validation_data=(testing_padded, labels_valid),
    callbacks=[monitor],
);

In [None]:
# Per-epoch curves recorded by model.fit.
_curves = history.history
acc, val_acc = _curves['accuracy'], _curves['val_accuracy']
loss, val_loss = _curves['loss'], _curves['val_loss']

# x-axis: one point per epoch that actually ran.
epochs = range(max(history.epoch) + 1)

# One figure per metric: accuracy first (on the implicit figure), then loss.
_panels = [
    ('acc', 'accuracy', acc, val_acc),
    ('loss', 'loss', loss, val_loss),
]
for _panel_no, (_short, _long, _train_curve, _valid_curve) in enumerate(_panels):
    if _panel_no:
        plt.figure()
    plt.plot(epochs, _train_curve, label=f'Training {_short}')
    plt.plot(epochs, _valid_curve, label=f'Validation {_short}')
    plt.title(f'Training and validation {_long}')
    plt.legend()

plt.show()

In [None]:
# Predict on the held-out split and keep the most probable class id per row.
predictions = model.predict(testing_padded).argmax(axis=-1)

In [None]:
# Invert the label encoding (class id -> class name) so numeric classes can be
# mapped back to readable labels. The dict is inverted from its own items
# rather than by zipping the keys with range(3), so it stays correct whatever
# the ordering or number of entries in `mapper`.
mapper_inverse = {code: name for name, code in mapper.items()}

# Predicted vs. true label names, aligned row by row with the held-out split.
df_comp = pd.DataFrame({
    'Pred': pd.Series(predictions).map(mapper_inverse),
    'True': valid['Sentiment'].map(mapper_inverse).values,
})
df_comp

In [None]:
# Accuracy on the held-out split. accuracy_score takes (y_true, y_pred), so
# the true labels go first; the value is the same either way, but this order
# matches the API and the other sklearn metrics.
accuracy_score(valid['Sentiment'].values, predictions)