<a href="https://colab.research.google.com/github/sauravsingla/Multi-Class-Sentiment-Classification-/blob/main/twitter_bilstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
 from google.colab import drive
# Mount Google Drive at /content/drive; the dataset and the GloVe vectors read
# by later cells live under "/content/drive/My Drive/NLP/".
# NOTE(review): the import line above starts with a stray space. Presumably
# IPython tolerates it, but it would be an IndentationError if this cell were
# run as plain Python - confirm and remove.
drive.mount('/content/drive')

In [None]:
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import os
from keras.models import Model
from keras.layers import Dense, Embedding, Input, Flatten
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# ---------------------------------------------------------------------------
# Preprocessing: load the raw tweets and encode the sentiment names as ints
# ---------------------------------------------------------------------------
# Identifier columns are discarded immediately after loading.
df = pd.read_csv("/content/drive/My Drive/NLP/text_emotion.csv").drop(
    columns=["tweet_id", "author"]
)
print(df.head())

# Integer id for each sentiment name; a name missing from this table maps to NaN.
sentiment_to_id = {
    'sadness': 0,
    'boredom': 1,
    'neutral': 2,
    'worry': 3,
    'surprise': 4,
    'love': 5,
    'fun': 6,
    'hate': 7,
    'happiness': 8,
    'anger': 9,
    'relief': 10,
    'enthusiasm': 11,
    'empty': 12,
}
df['target'] = df['sentiment'].map(sentiment_to_id)
print(df.head())

# The string label is redundant once the integer 'target' column exists.
df = df.drop(columns=["sentiment"])
print(df)

In [None]:
import re
def clean_text(text):
    """Lower-case ``text`` and expand a fixed set of English contractions.

    The expansions are plain regex substitutions applied one after another,
    in the order listed below, with no word-boundary anchoring.

    Args:
        text: The string to normalise.

    Returns:
        The lower-cased string with the listed contractions expanded.
    """
    # (pattern, replacement) pairs; order matters because each substitution
    # runs on the output of the previous one.
    expansions = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"it's", "it is"),
        (r"she's", "she is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"don't", "do not"),
        (r"can't", "cannot"),
        (r"didn't", "did not"),
    )
    result = text.lower()
    for pattern, replacement in expansions:
        result = re.sub(pattern, replacement, result)
    return result

In [None]:
# Normalise every raw tweet with clean_text and store the result in a new
# 'clean' column next to the original 'content'.
cleaned_text = [clean_text(raw_tweet) for raw_tweet in df['content']]
df['clean'] = cleaned_text
df.head()

In [None]:
# Strip Twitter noise from the normalised tweets. Three things are replaced by
# a space: @mentions, any character that is not an ASCII letter, digit, space
# or tab, and URLs of the form scheme://...
#
# The pattern is a raw string so that \w, \/ and \S reach the regex engine
# without being treated as (invalid) Python string escapes. The mention
# alternative includes "_" so that a handle such as @john_doe is removed as a
# whole instead of leaving "doe" behind in the text.
noise_pattern = r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"

tw = []
for j in df['clean']:
  # split() + join collapses the runs of whitespace left by the substitutions.
  tweets = ' '.join(re.sub(noise_pattern, " ", str(j)).split())
  tw.append(tweets)
df['clean'] = tw
df.head()

In [None]:
nltk.download('stopwords')
nltk.download('wordnet')

# The corpus is read through nltk.corpus so this line does not depend on the
# imported name `stopwords`, which is rebound to a plain set right here.
stopwords = set(nltk.corpus.stopwords.words('english'))

# Drop short words and English stop words from every cleaned tweet.
output = []
for sentence in df["clean"]:
    # Both conditions must be spelled out: the previous form
    # `len(word) > 2 not in stopwords` is a chained comparison that evaluates
    # `len(word) > 2 and 2 not in stopwords`, so no stop word was ever removed.
    kept_words = [
        word
        for word in sentence.split()
        if len(word) > 2 and word not in stopwords
    ]
    output.append(' '.join(kept_words))

df["texts"] = output
# Only the filtered text is carried forward.
df = df.drop(['content','clean'], axis = 1)
df.head()

In [None]:
lemmatizer = WordNetLemmatizer()

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()

def lemmatize_text(texts):
    """Split ``texts`` on whitespace and lemmatize each resulting token.

    Args:
        texts: A single string of whitespace-separated words.

    Returns:
        A list holding the lemmatized form of each token, in input order.
    """
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(texts)]


# The token lists get their own column; a later cell drops it by name.
df['text_lemmatized'] = df.texts.apply(lemmatize_text)

# Re-join each token list into one space-separated string. This is done
# directly, without wrapping every string in a one-element list first.
lis = [' '.join(tokens) for tokens in df['text_lemmatized']]

df['lem'] = lis
df.head()

In [None]:
# The intermediate text columns are no longer needed once 'lem' exists.
df = df.drop(columns=['text_lemmatized', 'texts'])
df.head()

In [None]:
# Remove, in place, every row labelled 12 ('empty') or 11 ('enthusiasm'),
# leaving label ids 0-10.
is_unwanted_label = (df['target'] == 12) | (df['target'] == 11)
df.drop(df[is_unwanted_label].index, inplace = True)
df.head()

In [None]:
# Model input (lemmatized tweet text) and the integer class label.
X, Y = df["lem"], df["target"]

In [None]:
# some configuration
MAX_SEQUENCE_LENGTH = 48   # length every tweet is padded/truncated to
MAX_VOCAB_SIZE = 20000     # cap on the number of distinct words kept by the tokenizer
EMBEDDING_DIM = 100        # must match the dimension of the GloVe file loaded below
BATCH_SIZE = 64
EPOCHS = 100
NUM_CLASSES = 11           # label ids 0-10 remain after ids 11 and 12 were dropped
SEED = 42                  # pins the train/test partition so evaluation is repeatable

# load in pre-trained word vectors
print('Loading word vectors...')
word2vec = {}
glove_path = '/content/drive/My Drive/NLP/glove.6B.%sd.txt' % EMBEDDING_DIM
with open(glove_path, encoding='utf-8') as f:
  # the file is space-separated text, one word per line:
  # word vec[0] vec[1] vec[2] ...
  for line in f:
    values = line.split()
    word = values[0]
    vec = np.asarray(values[1:], dtype='float32')
    word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))

# convert the sentences (strings) into integers
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

word2idx = tokenizer.word_index
print('Found %s unique tokens.' % len(word2idx))


# pad sequences so that we get a N x T matrix
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)

# A fixed random_state makes the 70/30 split identical on every run, so test
# metrics from different runs are comparable.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    data, Y, test_size = 0.3, random_state=SEED
)
# Pass the class count explicitly: left to infer it, to_categorical sizes the
# one-hot matrix from the largest label present in this particular split.
Yt = np_utils.to_categorical(Ytrain, num_classes=NUM_CLASSES)

In [None]:
print('Filling pre-trained embeddings...')
# One row per word index that can reach the model. Presumably the +1 leaves
# room for index 0, which the tokenizer does not assign to a word - confirm.
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx.items():
  if i < MAX_VOCAB_SIZE:
    embedding_vector = word2vec.get(word)
    if embedding_vector is not None:
      # words without a pre-trained vector keep their all-zero row
      embedding_matrix[i] = embedding_vector

# The embedding is initialised from the pre-trained vectors and frozen.
embedding_layer = Embedding(
  num_words,
  EMBEDDING_DIM,
  weights=[embedding_matrix],
  input_length=MAX_SEQUENCE_LENGTH,
  trainable=False
)

print('Building model...')

# Two stacked bidirectional LSTM layers with dropout in between, max-pooled
# over the time axis and fed into a softmax classifier.

input_ = Input(shape=(MAX_SEQUENCE_LENGTH,))
x = embedding_layer(input_)
x = Bidirectional(LSTM(50, return_sequences=True))(x)
x = Dropout(0.2)(x)
x = Bidirectional(LSTM(30, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
# 11 output units: the 13 mapped sentiments minus the two dropped label ids.
output = Dense(11, activation="softmax")(x)

model = Model(input_, output)
model.compile(
  loss='categorical_crossentropy',
  optimizer='adam',
  metrics=['accuracy'],
)

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would add a stray "None" line).
model.summary()

In [None]:
print('Training model...')
# 40% of the training rows are held out by Keras for per-epoch validation.
fit_options = dict(batch_size=BATCH_SIZE, epochs=10, validation_split=0.4)
r = model.fit(Xtrain, Yt, **fit_options)

# One figure per metric, each overlaying the training and validation curves.
# Every entry is (history key, legend label).
history_figures = (
    (('loss', 'loss'), ('val_loss', 'val_loss')),
    (('accuracy', 'acc'), ('val_accuracy', 'val_acc')),
)
for figure_curves in history_figures:
    for history_key, legend_label in figure_curves:
        plt.plot(r.history[history_key], label=legend_label)
    plt.legend()
    plt.show()

In [None]:
# Softmax scores for every held-out tweet, one column per class.
class_scores = model.predict(Xtest)
print(class_scores)

# The predicted label is the index of the highest-scoring class.
p = np.argmax(class_scores, axis=1)
print(p)

# Compare the predictions with the true labels of the test split.
cnfv = confusion_matrix(Ytest, p)
print(cnfv)

report = classification_report(Ytest, p)
print(report)