In [None]:
# Mount Google Drive into the Colab VM; the dataset and embedding paths used
# further down all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import keras
import csv
import pandas as pd
import random
import numpy as np
from keras import regularizers
import re
from tqdm import tqdm_notebook
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
import nltk
nltk.download('wordnet')
import itertools
from string import ascii_lowercase
from functools import reduce
import seaborn as sns
from sklearn.model_selection import train_test_split

In [None]:
# Read the pre-processed tweet table and discard the 'Unnamed: 0' column
# (presumably an index column written by an earlier to_csv -- confirm).
path = '/content/drive/MyDrive/projects/Twitter Sentiment Analysis/final_dataframe.csv'
df = pd.read_csv(path, encoding='latin').drop(columns=['Unnamed: 0'])


In [None]:
# Remove every row that has at least one missing value
# (dropna defaults: axis=0, how='any').
df = df.dropna()

In [None]:
# Preview the first rows of the frame after incomplete rows were dropped.
df.head()

In [None]:
# (rows, columns) of the frame after incomplete rows were dropped.
df.shape

In [None]:
# Target vector: the 'sentiment' column as a NumPy array.
# NOTE(review): the model below uses a sigmoid + binary cross-entropy, so the
# labels are presumably 0/1 -- confirm against the printed sample.
y = df['sentiment'].values
print(y[:5])

In [None]:
# processed_train_data=df.processed_tweets.values.tolist()

In [None]:
# processed_train_data[:5]

In [None]:
# Slang / typo normalization table consumed by clean_text().
# key   -> canonical replacement phrase
# value -> list of variants that are rewritten to the key
# Every entry is wrapped in single spaces so only whole tokens are matched by
# the plain substring replacement.
RE_PATTERNS = {
    " sorry ": [" soo rry "],
    " working ": [" workingg "],
    " working ipod ": [" workingipod "],
    " tonight ": [" tonightt "],
    " fuck ": [" fahk ", " fcking "],
    " thank ": [" thanx "],
    " come ": [" comw "],
    " yuck ": [" yuk "],
    " conversation ": [" convo "],
    " i do not know ": [" idunno "],
    " do not know ": [" dunno "],
}

In [None]:
# Text Normalization

def clean_text(text,remove_repeat_text=True, remove_patterns_text=True, is_lower=True):
  """Normalize slang/typo variants in a tweet using the RE_PATTERNS table.

  Args:
    text: the tweet; any object is accepted and converted with ``str()``.
    remove_repeat_text: accepted for interface compatibility; not used.
    remove_patterns_text: when True, rewrite every variant listed in
      RE_PATTERNS to its canonical phrase; when False only the ``str()``
      conversion is applied.
    is_lower: accepted for interface compatibility; not used.

  Returns:
    The normalized tweet, always as a ``str``.
  """
  text = str(text)
  if not remove_patterns_text:
    return text

  # The variants in RE_PATTERNS are delimited by spaces, so a token sitting at
  # the very start or end of the tweet would never match. Pad with one space
  # on each side for matching and take the padding off again afterwards.
  padded = ' ' + text + ' '
  for target, patterns in RE_PATTERNS.items():
    for pat in patterns:
      padded = padded.replace(pat, target)

  # Only strip what is still a space: a replacement whose target is not
  # space-delimited may have consumed the padding.
  if padded.startswith(' '):
    padded = padded[1:]
  if padded.endswith(' '):
    padded = padded[:-1]
  return padded

In [None]:
# Normalize every tweet. The progress-bar length is taken from the frame
# itself so it stays correct when the dataset (or the dropna result) changes.
final_tweets = [
    clean_text(line)
    for line in tqdm_notebook(df['processed_tweets'], total=len(df))
]

In [None]:
# Spot-check the first five normalized tweets.
final_tweets[:5]

In [None]:
# Tokenizer / model hyper-parameters.
max_features = 222_342          # vocabulary cap handed to Tokenizer(num_words=...)
maxpadlen = 170                 # fixed sequence length after padding
val_split = 0.05                # intended hold-out fraction for validation
embedding_dim_fasttext = 300    # width of each pre-trained fastText vector

In [None]:
# Tokenization: fit the vocabulary on the normalized tweets, then encode each
# tweet as a list of integer word ids.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(final_tweets)
list_tokenized_train = tokenizer.texts_to_sequences(final_tweets)

In [None]:
#Indexing
# Word -> integer-id mapping built by the tokenizer; print how many distinct
# tokens it holds (this size is reused for the embedding matrix below).
word_index=tokenizer.word_index
print(len(word_index))

In [None]:
# Padding: bring every encoded tweet to exactly `maxpadlen` ids, appending
# zeros at the end of shorter sequences.
training_padded = pad_sequences(
    list_tokenized_train,
    maxlen=maxpadlen,
    padding='post',
)

In [None]:
#Splitting data into Training and Validation Set
# Use the `val_split` constant from the configuration cell instead of a
# second hard-coded 0.05, so the hold-out fraction is defined in one place.
x_train, x_val, y_train, y_val = train_test_split(
    training_padded, y, test_size=val_split, random_state=42
)

In [None]:
# Load the pre-trained fastText vectors into a {word: vector} lookup.
embeddings_index_fasttext = {}
with open('/content/drive/MyDrive/projects/Kaggle competition - jigsaw/wiki-news-300d-1M.vec', encoding='utf8') as f:
    for line in f:
        values = line.split()
        # fastText .vec files begin with a "<vocab_size> <dim>" header line.
        # Skip it (and any malformed row): a short vector would otherwise be
        # broadcast across an entire row of the embedding matrix.
        if len(values) != embedding_dim_fasttext + 1:
            continue
        embeddings_index_fasttext[values[0]] = np.asarray(values[1:], dtype='float32')

# One row per tokenizer id (id 0 is reserved for padding, hence the +1).
# Rows start as uniform random values and are overwritten for every word that
# has a pre-trained vector; out-of-vocabulary words keep the random init.
embedding_matrix_fasttext = np.random.random((len(word_index) + 1, embedding_dim_fasttext))
for word, i in word_index.items():
    embedding_vector = embeddings_index_fasttext.get(word)
    if embedding_vector is not None:
        embedding_matrix_fasttext[i] = embedding_vector

**LSTM CNN MODEL**

In [None]:
# Frozen fastText embeddings -> 2x bidirectional LSTM -> Conv1D -> global max
# pooling -> small dense head with a sigmoid output for binary sentiment.
model_1 = tf.keras.Sequential([
    # The input spec must be the first entry of a Sequential model; it was
    # previously listed after the Embedding layer.
    tf.keras.layers.Input(shape=(maxpadlen, ), dtype='int32'),
    tf.keras.layers.Embedding(len(word_index) + 1,
                              embedding_dim_fasttext,
                              weights=[embedding_matrix_fasttext],
                              input_length=maxpadlen,
                              trainable=False,  # keep the pre-trained vectors fixed
                              name='embeddings'),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, dropout=0.2, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, dropout=0.1, return_sequences=True)),
    tf.keras.layers.Conv1D(100, 5, activation='relu'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dropout(.2),
    tf.keras.layers.Dense(30, activation='relu', kernel_initializer='he_uniform'),
    tf.keras.layers.Dropout(.1),
    tf.keras.layers.Dense(1, activation='sigmoid', kernel_initializer='glorot_uniform')
])
model_1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_1.summary()

In [None]:
# Train for 10 epochs, evaluating on the held-out split after each epoch.
history = model_1.fit(
    x=x_train,
    y=y_train,
    epochs=10,
    batch_size=1024,
    validation_data=(x_val, y_val),
)

In [None]:
# Pull the per-epoch curves out of the Keras History object.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(len(acc))

# One figure per metric: training curve in blue, validation curve in red.
curves = [
    (acc, val_acc, 'Training acc', 'Validation acc', 'Training and validation accuracy'),
    (loss, val_loss, 'Training loss', 'Validation loss', 'Training and validation loss'),
]
for position, (train_curve, val_curve, train_label, val_label, title) in enumerate(curves):
    if position > 0:
        # Start a fresh figure for every metric after the first.
        plt.figure()
    plt.plot(epochs, train_curve, 'b', label=train_label)
    plt.plot(epochs, val_curve, 'r', label=val_label)
    plt.title(title)
    plt.legend()

plt.show()

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

def ConfusionMatrix(y_pred, y_test):
    """Plot a 2x2 confusion-matrix heatmap for a binary sentiment classifier.

    Args:
        y_pred: predicted class labels. Assumed to be 0 (negative) and
            1 (positive), matching the thresholded sigmoid output.
        y_test: ground-truth labels using the same 0/1 encoding.

    Each cell is annotated with its group name and its share of all samples.
    Note the argument order: predictions first, ground truth second.
    """
    categories  = ['Negative','Positive']
    group_names = ['True Neg','False Pos', 'False Neg','True Pos']

    # Pin the label set so the matrix is always 2x2, even when one class is
    # missing from the data; otherwise the reshape(2, 2) below would fail.
    cf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

    # Guard the division so an empty evaluation set yields 0.00% instead of NaN.
    total = max(np.sum(cf_matrix), 1)
    group_percentages = ['{0:.2%}'.format(value) for value in cf_matrix.flatten() / total]

    labels = [f'{v1}\n{v2}' for v1, v2 in zip(group_names,group_percentages)]
    labels = np.asarray(labels).reshape(2,2)

    sns.heatmap(cf_matrix, annot = labels, cmap = 'Blues',fmt = '',
                xticklabels = categories, yticklabels = categories)

    plt.xlabel("Predicted values", fontdict = {'size':14}, labelpad = 10)
    plt.ylabel("Actual values"   , fontdict = {'size':14}, labelpad = 10)
    plt.title ("Confusion Matrix", fontdict = {'size':18}, pad = 20)

In [None]:
# Score the held-out validation split (there is no separate test set here).
val_scores = model_1.predict(x_val)

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 sentiment labels.
y_pred = np.where(val_scores >= 0.5, 1, 0)

# Visualize how the predictions compare with the true validation labels.
ConfusionMatrix(y_pred, y_val)

In [None]:
# Per-class precision / recall / F1 for the validation predictions.
print(classification_report(y_val, y_pred))

In [None]:
import pickle

# Persist the trained network and the fitted tokenizer; both are needed to
# score new text later.
tf.keras.models.save_model(model_1, 'fast_text_model')

# Use a context manager so the file handle is flushed and closed
# deterministically (a bare open() passed to pickle.dump is never closed).
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [None]:
# Write the trained network to the 'fast_text_model' path.
tf.keras.models.save_model(model=model_1, filepath='fast_text_model')


In [None]:
# Bundle the saved model directory into a single archive for download.
# NOTE(review): assumes the notebook's working directory is /content, so that
# the relative 'fast_text_model' save path resolves to /content/fast_text_model.
!zip -r /content/file.zip /content/fast_text_model