# Twitter Sentiment Analysis

## 1. Import necessary libraries:

In [None]:
import os
import re
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import tensorflow.keras as keras 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout 
print('You are using TensorFlow version: ',tf.__version__)

In [None]:
# use this to stretch the dataframe view
# (a max_colwidth of None lifts the column-width limit, so long tweet text is shown in full)
pd.set_option('display.max_colwidth', None)

## 2. Prepare the data:

The `keyword` and `location` columns are not needed for this model, so we drop them once the tweet text has been cleaned.

In [None]:
# Kaggle input directory with train.csv and test.csv. Keep the trailing slash:
# the file names are appended directly to this string.
root_folder = '/kaggle/input/nlp-getting-started/'

In [None]:
# Load both Kaggle splits with identical parser settings; the file name is
# appended directly to root_folder.
csv_options = dict(sep=',', encoding='ISO-8859-1')
df_train = pd.read_csv(f'{root_folder}train.csv', **csv_options)
df_test = pd.read_csv(f'{root_folder}test.csv', **csv_options)
df_train.head(10)

In [None]:
# Count the training rows that have a missing value in at least one column.
rows_with_missing = df_train.isnull().any(axis=1)
print("Number of empty entries: ",np.sum(rows_with_missing))

In [None]:
# Lower-case English stop words to strip from the tweets.
# Contractions are stored without apostrophes ("shes", "youll", ...).
# Negations such as 'no' and 'not' are not in the list, so they survive cleaning.
stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
             'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
             'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do',
             'does', 'doing', 'down', 'during', 'each','few', 'for', 'from',
             'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
             'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
             'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
             'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
             'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're','s', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
             't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
             'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
             'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was',
             'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
             'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre",
             "youve", 'your', 'yours', 'yourself', 'yourselves']

# Set form of the list, used for the membership tests in cleaning_stopwords.
STOPWORDS = set(stopwordlist)

In [None]:
#clean data by removing special symbols (like # and others), URL's and stop-words:

def cleaning_stopwords(text):
    """Return ``text`` without the tokens that appear in ``STOPWORDS``.

    The input is converted with ``str()`` and split on whitespace. Tokens are
    compared exactly as written (no case conversion happens here), and the
    survivors are re-joined with single spaces.
    """
    kept_tokens = filter(lambda token: token not in STOPWORDS, str(text).split())
    return " ".join(kept_tokens)

def remove_urls(text):
    """Strip URLs and @-mentions from ``text`` and return it case-folded.

    Only the URL token itself (``http://...`` or ``https://...`` up to the
    next whitespace) is removed, so words that follow a link are kept.
    Whitespace-separated tokens starting with ``@`` are dropped, and the
    remaining tokens are re-joined with single spaces.

    Args:
        text: The raw tweet text.

    Returns:
        The cleaned, case-folded text; an empty string if nothing remains.
    """
    # \S+ stops at the first whitespace, so text after a link is kept. The
    # link is replaced by a space so neighbouring words are not glued together.
    without_urls = re.sub(r'https?://\S+', ' ', text)
    kept_tokens = [token for token in without_urls.split() if not token.startswith('@')]
    return ' '.join(kept_tokens).casefold().strip()

def remove_specials(text):
    """Remove every character that is not an ASCII letter or whitespace.

    Digits, punctuation and symbols such as ``#`` are deleted. Letters of
    either case are kept, and interior whitespace is left untouched.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # '#' falls outside the kept character class, so no separate replace is needed.
    return re.sub(r"[^a-zA-Z\s]", '', text).strip()

In [None]:
# Strip URLs and @-mentions from both splits (remove_urls also case-folds the text).
# remove_urls returns a string for every row, so no missing values need dropping here.
df_train['text'] = df_train['text'].apply(remove_urls)
df_test['text'] = df_test['text'].apply(remove_urls)
df_train.sample(10)

In [None]:
# Run remove_specials over the tweet text of both splits, in place.
for frame in (df_train, df_test):
    frame['text'] = frame['text'].apply(remove_specials)
df_train.sample(10)

In [None]:
# Remove stop words from the text, then discard the two columns the model does not use.
unused_columns = ['keyword', 'location']
df_train['text'] = df_train['text'].apply(cleaning_stopwords)
df_test['text'] = df_test['text'].apply(cleaning_stopwords)
df_train = df_train.drop(columns=unused_columns)
df_test = df_test.drop(columns=unused_columns)
df_train.sample(10)

## 3. Setting the training and test variables:

In [None]:
# Order both splits by tweet id (ascending is the sort_values default).
x = df_train.sort_values(by='id')
x_test = df_test.sort_values(by='id')

In [None]:
#Retrieve the text from the dataframe as a numpy array
# One 1-D array per split, copied out of the id-sorted frames.
twitts = x['text'].to_numpy(copy=True)
twitts_test = x_test['text'].to_numpy(copy=True)

In [None]:
# see a sample of the collected twitts:
# (the first ten cleaned training tweets, in id order)
print(twitts[:10])

## 4. Applying NLP:
We need to use the usual tokenization techniques in order to translate the text data to numerical vectors that we can feed to a neural network.

In [None]:
# Shared tokenizer: fitted once on the training tweets, then reused to encode every split.
t = Tokenizer()


def preprocess(text):
    """Fit the module-level tokenizer ``t`` on ``text``.

    The fitted vocabulary lives on ``t`` itself. The return value is whatever
    ``Tokenizer.fit_on_texts`` returns, which appears to be ``None``, so
    callers should not expect sequences back.
    """
    return t.fit_on_texts(text)

In [None]:
#fit tokenizer on training set:
# NOTE(review): preprocess hands back the result of Tokenizer.fit_on_texts, which looks to be
# None; `tokens` is not read again in the visible cells, the fitted state lives in `t`.
tokens=preprocess(twitts)

In [None]:
#text to sequences
# Both splits are encoded with the tokenizer that was fitted on the training tweets only.
# NOTE(review): `t` is built without an oov_token, so words unseen during fitting are
# presumably dropped from the test sequences; confirm against the Keras Tokenizer docs.
X_train = t.texts_to_sequences(twitts)
X_test = t.texts_to_sequences(twitts_test)

In [None]:
#pad sequences
# pad_sequences accepts the ragged lists directly, so no object-array conversion is needed.
# The training set is padded to its own longest tweet. The test set reuses that length,
# so both splits have the same input width and the model sees the same amount of
# padding at prediction time as it did during training.
X_train = keras.preprocessing.sequence.pad_sequences(X_train)
X_test = keras.preprocessing.sequence.pad_sequences(X_test, maxlen=X_train.shape[1])

In [None]:
# this is a single encoded twitt:
# (first row of the padded training matrix: the word indices of the first tweet)
print(X_train[0])

In [None]:
#size of the vocabulary
# Number of distinct words the tokenizer indexed while being fitted.
# NOTE(review): Keras word indices appear to start at 1 (0 is the padding value), so an
# Embedding layer needs input_dim >= vocab_size + 1; confirm.
vocab_size = len(t.word_index)
print('Size of vocabulary:', vocab_size)

In [None]:
# target training variable:
# Take the labels from `x`, the id-sorted frame the training tweets were read from,
# so that row i of X_train and entry i of y_train always describe the same tweet,
# even if df_train itself is not ordered by id.
y_train = x['target'].values

In [None]:
# the y variable represents the labels (1 for true disaster, 0 otherwise):
# (prints the label of the first training row)
print(y_train[0])

In [None]:
# validation split
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X_train, y_train, random_state=42, test_size=0.2)
x_train, x_val, y, y_val = split_parts
for caption, part in (("Training data shape: ", x_train), ("Validation data shape: ", x_val)):
    print(caption, part.shape)

## 5. Training the RNN model:

In [None]:
#create the model (try experimenting with your own architectures )
# Architecture: Embedding -> two stacked LSTMs -> one linear output unit.
model = Sequential()
model.add(Embedding(input_dim = vocab_size+2, output_dim = 128, name='Embedding'))
# return_sequences=True hands the full sequence to the second LSTM.
model.add(tf.keras.layers.LSTM(128, return_sequences=True))
model.add(tf.keras.layers.LSTM(128))
# Optional extra layers to experiment with:
#model.add(Dense(128, activation = 'relu'))
#model.add(LSTM(64))
#model.add(Dropout(0.3))
#model.add(Dense(32, activation = 'relu'))
# No activation: the model emits raw logits, matching from_logits=True in the loss.
model.add(Dense(1))
# The "accuracy" shortcut thresholds the outputs at 0.5, which is the wrong cutoff for
# logits (logit 0 is probability 0.5). The metric is kept under the name "accuracy" so the
# history keys 'accuracy' / 'val_accuracy' stay the same.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(1e-4),
    metrics=[tf.keras.metrics.BinaryAccuracy(name="accuracy", threshold=0.0)],
)
model.summary()

In [None]:
# Stop once validation accuracy has failed to improve for `patience` consecutive epochs.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5)
# The callback only takes effect when it is passed to fit(). With epochs=5 and
# patience=5 it cannot trigger yet; raise `epochs` to actually benefit from it.
history = model.fit(
    x_train,
    y,
    validation_data=(x_val,y_val),
    shuffle=True,
    epochs=5,
    batch_size=64,
    callbacks=[callback],
)

In [None]:
# Evaluate on the held-out validation split: despite its name, train_score holds the
# validation loss and metrics, not training-set figures.
train_score = model.evaluate(x_val, y_val)

In [None]:
# Plot history: Binary Cross-entropy
# One curve per recorded loss series (training and validation), per epoch.
loss_curves = (
    ('loss', 'BCE (training data)'),
    ('val_loss', 'BCE (validation data)'),
)
for history_key, curve_label in loss_curves:
    plt.plot(history.history[history_key], label=curve_label)
plt.title('BCE for disaster Twitts')
plt.xlabel('No. epoch')
plt.ylabel('BCE value')
plt.legend(loc="upper left")

plt.show()

In [None]:
# Plot history: Accuracy
# One curve per recorded accuracy series (training and validation), per epoch.
accuracy_curves = (
    ('accuracy', 'acc (training data)'),
    ('val_accuracy', 'acc (validation data)'),
)
for history_key, curve_label in accuracy_curves:
    plt.plot(history.history[history_key], label=curve_label)
plt.title('Accuracy for disaster Twitts')
plt.xlabel('No. epoch')
plt.ylabel('Accuracy value')
plt.legend(loc="upper left")
plt.show()

## 6. Calculating the F1 Score on validation set: 

In [None]:
#calculate F1 score
from sklearn.metrics import f1_score

# The output layer has no activation, so predict() returns logits. A logit above 0
# corresponds to a probability above 0.5, so 0 is the decision boundary, not 0.5.
y_pred = (model.predict(x_val) > 0.0).astype('int32').flatten()
y_true = y_val
f1_score(y_true, y_pred)

## 6. Making predictions:
The results are collected in a dataframe holding the cleaned tweets and their predicted labels.

In [None]:
# The output layer has no activation, so predict() returns logits. A logit above 0
# corresponds to a probability above 0.5, so 0 is the decision boundary, not 0.5.
preds = (model.predict(X_test) > 0.0).astype('int32').flatten()

In [None]:
# Pair every cleaned test tweet with its predicted label for a quick visual check.
results = dict(text=twitts_test, target=preds)
df_results = pd.DataFrame(data=results)
df_results.sample(20)

## 7. Compiling the submission file:
We need to do a bit of manipulation, since the submission file must contain only each tweet's id and its predicted target.

In [None]:
# `preds` is row-aligned with `x_test`: the tweets fed to the model were read from that
# id-sorted frame, so predictions can be attached positionally. Joining on the tweet text
# instead multiplies rows whenever several tweets share the same cleaned text.
submission = x_test.assign(target=preds)
submission.drop(['text'],axis=1).to_csv('submission.csv', index=False)