In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SimpleRNN, SpatialDropout1D

from sklearn.model_selection import train_test_split
from sklearn import metrics

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Read the airline tweets CSV from the Kaggle input mount.
tweets_csv = "/kaggle/input/twitter-airline-sentiment/Tweets.csv"
data = pd.read_csv(tweets_csv)

In [None]:
# Preview the first rows of the raw tweets table.
data.head()

In [None]:
# (rows, columns) of the raw table, before the confidence filter is applied.
data.shape

In [None]:
# Keep only the tweets whose sentiment label has confidence above 0.6.
confident_rows = data['airline_sentiment_confidence'].gt(0.6)
data = data[confident_rows]

In [None]:
# (rows, columns) after the confidence filter.
data.shape

In [None]:
# Keep only the tweet text and its sentiment label.
# .copy() detaches the result from the filtered parent frame, so the later
# in-place assignment to data['text'] writes to an independent frame instead
# of triggering pandas' SettingWithCopyWarning.
data = data[['text', 'airline_sentiment']].copy()
data.head()

In [None]:
def clean_train_data(x):
    """Normalize one raw tweet before tokenization.

    Steps, in order: lowercase the text, drop any ``[...]`` bracketed span,
    strip punctuation (every character that is neither a word character nor
    whitespace), drop tokens that contain a digit, and turn newlines into
    spaces.

    Args:
        x: Raw tweet text (``str``).

    Returns:
        The cleaned text as a ``str``. Removed tokens leave their surrounding
        whitespace behind, so runs of spaces can remain.
    """
    text = x.lower()
    # Raw strings keep the regex escapes (\[ \w \d) from being treated as
    # invalid Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)   # remove square-bracketed spans
    text = re.sub(r'[^\w\s]', '', text)   # remove punctuation
    text = re.sub(r'\w*\d\w*', '', text)  # remove words containing numbers
    # Replace (rather than delete) newlines so the words on either side of a
    # line break are not fused into a single token.
    text = re.sub(r'\n', ' ', text)
    return text

In [None]:
# Run the cleaning routine over every tweet.
data['text'] = data['text'].apply(clean_train_data)
data.head()

In [None]:
# Snapshot the cleaned frame while it still holds every sentiment class;
# `data` itself is narrowed to two classes in the next section.
all_cat_data = data.copy()

# Two-class sentiment analysis (positive vs. negative)

In [None]:
# Discard neutral tweets so that only the other sentiment classes remain.
is_neutral = data['airline_sentiment'] == 'neutral'
data = data[~is_neutral]
data.head()

In [None]:
# Row count per sentiment class; neutral was filtered out just above.
for sentiment in ('positive', 'negative', 'neutral'):
    print(len(data[data['airline_sentiment'] == sentiment]))


In [None]:
# Copy of the two-class frame. NOTE(review): not referenced again in the cells shown here.
model1_data = data.copy()

In [None]:
# Fit a word-level tokenizer (vocabulary capped via num_words) and turn each
# tweet into a padded sequence of integer word ids.
max_features = 2000
token = Tokenizer(num_words=max_features, split=' ')
tweet_texts = data['text'].values
token.fit_on_texts(tweet_texts)

X = pad_sequences(token.texts_to_sequences(tweet_texts))

In [None]:
# (number of tweets, padded sequence length); the second value sizes the model input below.
X.shape

In [None]:
# Two-class classifier: embedding -> spatial dropout -> LSTM -> softmax.
embed_dim = 128
lstm_out = 196

model = Sequential([
    Embedding(max_features, embed_dim, input_length=X.shape[1]),
    SpatialDropout1D(0.4),
    LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2),
    Dense(2, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model.summary()

In [None]:
# One indicator column per remaining sentiment class. The prediction cell
# further down reads column 0 as negative and column 1 as positive.
sentiment_dummies = pd.get_dummies(data['airline_sentiment'])
Y = sentiment_dummies.values

In [None]:
# Hold out 33% of the samples for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Train the two-class model for ten epochs (verbose=2: one log line per epoch).
batch_size = 32
history = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=10,
    verbose=2,
)

In [None]:
# Loss and accuracy of the two-class model on the held-out split.
score, acc = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=2)
for metric_name, metric_value in (('score', score), ('accuracy', acc)):
    print(metric_name, metric_value)

In [None]:
# Classify one hand-written sentence with the two-class model.
text = ['i would recommend it if you have no other options']
# Run the same cleaning used on the training tweets before tokenizing.
text = [clean_train_data(t) for t in text]
text = token.texts_to_sequences(text)
# Pad to the width the model was trained on rather than a hard-coded 28,
# which only matches by coincidence and breaks when the data changes.
text = pad_sequences(text, maxlen=X.shape[1], dtype='int32', value=0)
res = model.predict(text, batch_size=1, verbose=2)
res

In [None]:
# Map the highest-probability output column to a printed label.
predicted = np.argmax(res[0])
if predicted == 0:
    print("Negetive Comment")
elif predicted == 1:
    print("Psetive Comment")

# Multiclass Sentiment Analysis

## Data preprocessing

In [None]:
# Start the multiclass experiment from the cleaned snapshot that still includes neutral tweets.
ms_data = all_cat_data.copy()

In [None]:
ms_data.head()

In [None]:
# Build the multiclass training frame with at most `num_of_rows` tweets per
# sentiment class. Both shuffles are seeded so the selected subset (and every
# metric computed from it) is the same on each run; the original unseeded
# np.random.permutation produced a different subset every time.
num_of_rows = 4000
seed = 42

shuffled = ms_data.sample(frac=1, random_state=seed)
# Slicing takes up to num_of_rows rows; a class with fewer rows contributes
# all of them.
nt = shuffled[shuffled['airline_sentiment'] == 'neutral'][:num_of_rows]
ng = shuffled[shuffled['airline_sentiment'] == 'negative'][:num_of_rows]
ps = shuffled[shuffled['airline_sentiment'] == 'positive'][:num_of_rows]

combine_data = pd.concat([nt, ng, ps], ignore_index=True)
# Shuffle again so the three classes are interleaved rather than stacked.
combine_data = combine_data.sample(frac=1, random_state=seed)
# Placeholder; the real class ids are filled in a later cell.
combine_data['label'] = 0
combine_data.head()

In [None]:
print(len(combine_data[combine_data['airline_sentiment'] == 'neutral']))
print(len(combine_data[combine_data['airline_sentiment'] == 'negative']))
print(len(combine_data[combine_data['airline_sentiment'] == 'positive']))

In [None]:
ms_data = combine_data.copy()

In [None]:
print(len(ms_data[ms_data['airline_sentiment'] == 'neutral']))
print(len(ms_data[ms_data['airline_sentiment'] == 'negative']))
print(len(ms_data[ms_data['airline_sentiment'] == 'positive']))

In [None]:
ms_data.loc[ms_data['airline_sentiment'] == 'neutral', 'label'] = 0
ms_data.loc[ms_data['airline_sentiment'] == 'negative', 'label'] = 1
ms_data.loc[ms_data['airline_sentiment'] == 'positive', 'label'] = 2

In [None]:
ms_data.head(10)

In [None]:
from keras.utils import to_categorical

In [None]:
# One-hot encode the integer class ids into three columns.
labels = to_categorical(ms_data['label'], num_classes=3)

In [None]:
# Shape of the one-hot label array.
labels.shape

In [None]:
# First ten one-hot rows, as a quick sanity check.
labels[:10]

## Tokenization

In [None]:
# Fit a dedicated tokenizer for the multiclass data and pad every tweet to
# max_len word ids.
max_features = 3000
max_len = 130
ms_token = Tokenizer(num_words=max_features)
ms_texts = ms_data['text'].values
ms_token.fit_on_texts(ms_texts)
ms_sequences = ms_token.texts_to_sequences(ms_texts)
X = pad_sequences(ms_sequences, maxlen=max_len)

In [None]:
# Report how many distinct tokens the multiclass tokenizer saw.
word_index = ms_token.word_index
vocab_size = len(word_index)
print('Found %s unique tokens.' % vocab_size)

In [None]:
# Use the one-hot labels as the training target.
y = labels

In [None]:
# Hold out 25% of the samples for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=42)

In [None]:
# Width of each learned word-embedding vector.
embed_dim = 128
# NOTE(review): lstm_out is not used by the model cell that follows, which
# passes 64 to LSTM directly — confirm which size is intended.
lstm_out = 96

In [None]:
# Three-class classifier: embedding -> spatial dropout -> LSTM -> softmax.
ms_model = Sequential([
    Embedding(max_features, embed_dim, input_length=X.shape[1]),
    SpatialDropout1D(0.7),
    LSTM(64, dropout=0.7, recurrent_dropout=0.7),
    Dense(3, activation='softmax'),
])
ms_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

ms_model.summary()

In [None]:
from keras.callbacks import EarlyStopping

In [None]:
# Train with 20% of the training split held out for validation; stop early
# once val_loss has not improved by min_delta for 7 consecutive epochs.
batch_size = 50
early_stop = EarlyStopping(monitor='val_loss', patience=7, min_delta=0.0001)
ms_history = ms_model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[early_stop],
)

In [None]:
# Loss and accuracy of the multiclass model on the held-out split.
loss, accuracy = ms_model.evaluate(X_test, y_test)
for metric_name, metric_value in (("loss", loss), ("accuracy", accuracy)):
    print(metric_name, metric_value)

In [None]:
# Classify one hand-written sentence with the multiclass model.
text = ['i would recommend it if you have no other options']
# Run the same cleaning used on the training tweets before tokenizing.
text = [clean_train_data(t) for t in text]
# Must use ms_token, the tokenizer ms_model was trained with. The two-class
# `token` assigns different ids to the same words, so its sequences would be
# looked up against the wrong embedding rows.
text = ms_token.texts_to_sequences(text)
text = pad_sequences(text, maxlen=max_len)
res = ms_model.predict(text)
res

In [None]:
# Map the highest-probability class id to a printed label
# (0 = neutral, 1 = negative, 2 = positive, matching the label encoding above).
predicted = np.argmax(res)
if predicted == 0:
    print("neutral Comment")
elif predicted == 1:
    print("Negetive Comment")
elif predicted == 2:
    print("positive Comment")

# Another solution: pre-trained GloVe embeddings

In [None]:
# Start the GloVe experiment from the cleaned snapshot that still includes neutral tweets.
em_data = all_cat_data.copy()

In [None]:
em_data.head()

In [None]:
# em_data = em_data[em_data['airline_sentiment'] != 'neutral']
# em_data.head()

In [None]:
# Build the GloVe-experiment frame with at most `num_of_rows` tweets per
# sentiment class. Both shuffles are seeded so the selected subset (and every
# metric computed from it) is the same on each run; the original unseeded
# np.random.permutation produced a different subset every time.
num_of_rows = 4000
seed = 42

shuffled = em_data.sample(frac=1, random_state=seed)
# Slicing takes up to num_of_rows rows; a class with fewer rows contributes
# all of them.
nt = shuffled[shuffled['airline_sentiment'] == 'neutral'][:num_of_rows]
ng = shuffled[shuffled['airline_sentiment'] == 'negative'][:num_of_rows]
ps = shuffled[shuffled['airline_sentiment'] == 'positive'][:num_of_rows]

combine_data = pd.concat([nt, ng, ps], ignore_index=True)
# Shuffle again so the three classes are interleaved rather than stacked.
combine_data = combine_data.sample(frac=1, random_state=seed)
# Placeholder; the real class ids are filled in a later cell.
combine_data['label'] = 0
combine_data.head()

In [None]:
print(len(combine_data[combine_data['airline_sentiment'] == 'neutral']))
print(len(combine_data[combine_data['airline_sentiment'] == 'negative']))
print(len(combine_data[combine_data['airline_sentiment'] == 'positive']))

In [None]:
ms_data = combine_data.copy()

In [None]:
print(len(ms_data[ms_data['airline_sentiment'] == 'neutral']))
print(len(ms_data[ms_data['airline_sentiment'] == 'negative']))
print(len(ms_data[ms_data['airline_sentiment'] == 'positive']))

In [None]:
ms_data.loc[ms_data['airline_sentiment'] == 'neutral', 'label'] = 0
ms_data.loc[ms_data['airline_sentiment'] == 'negative', 'label'] = 1
ms_data.loc[ms_data['airline_sentiment'] == 'positive', 'label'] = 2

In [None]:
ms_data.head(10)

In [None]:
from keras.utils import to_categorical

In [None]:
# One-hot encode the integer class ids into three columns.
labels = to_categorical(ms_data['label'], num_classes=3)

In [None]:
# Shape of the one-hot label array.
labels.shape

In [None]:
# First ten one-hot rows, as a quick sanity check.
labels[:10]

In [None]:
# Y = pd.get_dummies(em_data['airline_sentiment']).values

In [None]:
# Stratified 80/20 split of the raw text, so both parts keep the same class
# mix; the texts are tokenized afterwards.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    ms_data['text'].values,
    labels,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=labels,
)

In [None]:
from tqdm import tqdm

In [None]:
# Load the pre-trained GloVe vectors into a {word: vector} lookup.
embeddings_index = {}
# `with` guarantees the file is closed even if a line fails to parse.
with open('/kaggle/input/glove840b300dtxt/glove.840B.300d.txt', 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        # Split from the right into at most 300 numeric fields: some tokens in
        # glove.840B.300d contain spaces themselves, and a plain split(' ')
        # would push part of the token into the float conversion.
        values = line.rstrip().rsplit(' ', 300)
        word = values[0]
        # float32 halves the memory of the default float64, which matters for
        # a vocabulary of this size.
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# token = Tokenizer(num_words=None)
# word_index = token.word_index

In [None]:
# Tokenize with an uncapped vocabulary (num_words=None); every word gets an
# id so it can be looked up in the GloVe table.
token = Tokenizer(num_words=None)

token.fit_on_texts(list(xtrain) + list(xvalid))
xtrain_seq = token.texts_to_sequences(xtrain)
xvalid_seq = token.texts_to_sequences(xvalid)

# Pad to the longest tweet actually present. The previous fixed 1500 made the
# LSTM step through well over a thousand padding positions per sample.
max_len = max((len(seq) for seq in xtrain_seq + xvalid_seq), default=1)

# zero pad the sequences
xtrain_pad = pad_sequences(xtrain_seq, maxlen=max_len)
xvalid_pad = pad_sequences(xvalid_seq, maxlen=max_len)

word_index = token.word_index

In [None]:
# word_index

In [None]:
# One 300-d row per word id (row 0 is left for padding). Words missing from
# the GloVe lookup keep an all-zero row.
vocab_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocab_rows, 300))
for term, row in tqdm(word_index.items()):
    glove_vector = embeddings_index.get(term)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

In [None]:
# embedding_matrix

In [None]:
from keras.layers import Bidirectional

In [None]:
# Three-class classifier on frozen GloVe embeddings.
# The embedding width comes from the pre-trained matrix, not a separate constant.
embed_dim = embedding_matrix.shape[1]
model = Sequential()
model.add(Embedding(
    len(word_index) + 1,
    embed_dim,
    weights=[embedding_matrix],
    # Must equal the width of the padded inputs; it was previously set to an
    # unrelated constant (128) that did not match xtrain_pad.
    input_length=xtrain_pad.shape[1],
    trainable=False
))
model.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3))
# Each tweet has exactly one class, so use softmax with categorical
# cross-entropy. Sigmoid with binary cross-entropy scores the three outputs
# independently and reports a misleading per-output accuracy.
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Train the GloVe model for five epochs.
batch_size = 32
history = model.fit(
    xtrain_pad,
    ytrain,
    batch_size=batch_size,
    epochs=5,
)

In [None]:
# Loss and accuracy of the GloVe model on the validation split.
score, acc = model.evaluate(xvalid_pad, yvalid, batch_size=batch_size)
for metric_name, metric_value in (('score', score), ('accuracy', acc)):
    print(metric_name, metric_value)

In [None]:
# Classify one hand-written sentence with the GloVe model.
text = ['i would recommend it if you have no other options']
# Run the same cleaning used on the training tweets before tokenizing.
text = [clean_train_data(t) for t in text]
text = token.texts_to_sequences(text)
# Pad to the width of the training inputs rather than a hard-coded 28 left
# over from the two-class section.
text = pad_sequences(text, maxlen=xtrain_pad.shape[1], dtype='int32', value=0)
res = model.predict(text, batch_size=1, verbose=2)
res

In [None]:
# Map the highest-scoring class id to a printed label
# (0 = neutral, 1 = negative, 2 = positive, matching the label encoding above).
predicted = np.argmax(res)
if predicted == 0:
    print("neutral Comment")
elif predicted == 1:
    print("Negetive Comment")
elif predicted == 2:
    print("positive Comment")

# Acknowledgements
* https://www.kaggle.com/ngyptr/multi-class-classification-with-lstm
* https://www.kaggle.com/tanulsingh077/deep-learning-for-nlp-zero-to-transformers-bert
* https://www.kaggle.com/ngyptr/lstm-sentiment-analysis-keras