# **Data Pre-Processing, NLP Techniques, and Applications of LSTM & CNN Models**


## **Data Cleaning**
---

In [0]:
  # Read file from gdrive
import pandas as pd
import numpy as np
# Load the raw tweets from the mounted Google Drive folder.
df=pd.read_csv('drive/My Drive/August01_Tweets_Final.csv')
# Drop exact duplicate rows. reset_index() is called without drop=True, so the
# previous index is kept as a new 'index' column (a later cell deletes it).
df=df.drop_duplicates().reset_index()

In [0]:
# Preview the first five de-duplicated rows.
df.head(5)

In [0]:
# Remove html
from bs4 import BeautifulSoup
def remove_html(text):
  """Return the visible text of ``text`` with all HTML markup removed.

  The input is parsed with BeautifulSoup using the 'lxml' parser and the
  text content of the parsed tree is returned.
  """
  return BeautifulSoup(text, 'lxml').get_text()

In [0]:
# Remove Punctuation
import string
def remove_punctuation(text):
  """Return ``text`` without any character listed in ``string.punctuation``."""
  kept_chars = filter(lambda ch: ch not in string.punctuation, text)
  return "".join(kept_chars)

In [0]:
# Clean the raw tweet text: strip HTML, flatten newlines, then drop punctuation.
# HTML has to be removed first: remove_punctuation deletes '<', '>', '&' and
# ';', after which BeautifulSoup can no longer recognise tags or decode
# entities such as '&gt;'.
df['tweet'] = (
    df['tweet']
    .apply(remove_html)
    .apply(lambda x: x.replace('\n', ' '))
    .apply(remove_punctuation)
)

In [0]:
# Instantiate Tokenizer
from nltk.tokenize import RegexpTokenizer
# r'\w+' makes every maximal run of word characters one token.
tokenizer = RegexpTokenizer(r'\w+')

In [0]:
# Lower-case each tweet and split it into a list of word tokens.
df['tweet']= df['tweet'].apply(lambda x:tokenizer.tokenize(x.lower()))

In [0]:
import nltk
# Fetch the NLTK resources requested by this notebook: the stop-word lists,
# the WordNet corpus and the 'punkt' tokenizer models.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

In [0]:
# Remove Stop words
from nltk.corpus import stopwords
from functools import lru_cache

@lru_cache(maxsize=None)
def _english_stopwords():
  """Load NLTK's English stop-word list once and cache it as a frozenset."""
  return frozenset(stopwords.words('english'))

def remove_stopwords(text):
  """Return the tokens of ``text`` that are not English stop words.

  ``text`` is an iterable of word tokens; a new list is returned and the
  input is left untouched. The stop-word list is loaded a single time and
  held in a set, instead of being re-read from the corpus for every token.
  """
  stop_set = _english_stopwords()
  words = [w for w in text if w not in stop_set]
  return words

In [0]:
# Filter English stop words out of every tokenised tweet.
df['tweet']=df['tweet'].apply(lambda x:remove_stopwords(x))

In [0]:
# Lemmatizer
from nltk.stem import WordNetLemmatizer
# Single shared instance used by word_lemmatizer below.
lemmatizer = WordNetLemmatizer()

In [0]:
def word_lemmatizer(text):
  """Lemmatize every token of ``text`` and return the results as a list."""
  return list(map(lemmatizer.lemmatize, text))

In [0]:
# Lemmatize the tokens of every tweet.
df['tweet']=df['tweet'].apply(lambda x:word_lemmatizer(x))

In [0]:
# stemming
from  nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def word_stemmer(text):
  """Stem each token of ``text`` and join the stems into one space-separated string."""
  stemmed_tokens = map(stemmer.stem, text)
  return " ".join(stemmed_tokens)

In [0]:
# Stem every tweet; from here on each tweet is a single string again, not a token list.
df['tweet']=df['tweet'].apply(lambda x:word_stemmer(x))

In [0]:
#!pip install emoji
#!pip install nrclex

In [0]:
# removing '&gt;'
# NOTE(review): punctuation was already stripped from the tweets in an earlier
# cell, so the literal '&gt;' can no longer occur here; kept as a harmless guard.
df['tweet'] = df['tweet'].apply(lambda x: x.replace('&gt;', ''))
# Remove Hyperlinks
# The pattern is a raw string so that '\S' is not an invalid escape, and
# regex=True is passed explicitly: newer pandas treats the pattern as a literal
# by default and rejects `case=` unless regex matching is enabled.
df['tweet']= df['tweet'].str.replace(r'http\S+|www.\S+', '', case=False, regex=True)
# remove ' s ' that was created after removing punctuations
df['tweet'] = df['tweet'].apply(lambda x: str(x).replace(" s ", " "))

In [0]:
# Emoji removal
import emoji
#remove the emoji
def deEmojify(inputString):
    """Return ``inputString`` with every non-ASCII character (emoji included) dropped."""
    ascii_bytes = inputString.encode('ascii', errors='ignore')
    return ascii_bytes.decode('ascii')

In [0]:
# Strip emoji and any other non-ASCII characters from every tweet.
df['tweet'] = df['tweet'].apply(lambda x: deEmojify(x))

In [0]:
from nrclex import NRCLex
# Raw NRC emotion counts for every cleaned tweet, in row order.
senti_scores_list = [
    NRCLex(tweet_text).raw_emotion_scores for tweet_text in df['tweet']
]

# Dominant emotion per tweet: the key with the highest raw count, or
# 'no_sentiment' when the lexicon matched nothing.
max_key = [
    max(scores, key=scores.get) if scores != {} else 'no_sentiment'
    for scores in senti_scores_list
]

In [0]:
# Attach the dominant emotion of each tweet as the label column.
df['sentiment'] = max_key

In [0]:
# Store cleaned_data in drive
# Written to the local working directory first; index=False is not passed, so
# the DataFrame index is written as an extra leading column.
df.to_csv('cleaned_data.csv')
# Copy the local file into the mounted Drive folder.
!cp cleaned_data.csv "drive/My Drive/"

In [0]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(font_scale=1.4)
# Bar chart of how many tweets fall in each sentiment category.
sentiment_counts = df['sentiment'].value_counts()
ax = sentiment_counts.plot(kind='bar', figsize=(20, 6), rot=0)
ax.set_xlabel("sentiment", labelpad=14)
ax.set_ylabel("Count of Sentiment", labelpad=14)
ax.set_title("Count of Sentiments by Category", y=1.02);

In [0]:
# Keep every row whose label is not 'anticip', renumber the rows from zero and
# drop the leftover 'index' column carried over from the earlier reset_index().
not_anticip = df['sentiment'] != 'anticip'
cleaned_data = (
    df[not_anticip]
    .reset_index(drop=True)
    .drop(columns=['index'])
)

In [0]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(font_scale=1.4)
# Same bar chart as before, now on the filtered frame.
cleaned_counts = cleaned_data['sentiment'].value_counts()
ax = cleaned_counts.plot(kind='bar', figsize=(20, 6), rot=0)
ax.set_xlabel("sentiment", labelpad=14)
ax.set_ylabel("Count of Sentiment", labelpad=14)
ax.set_title("Count of Sentiments by Category", y=1.02);

In [0]:
# Store cleaned_data in drive
# index=False keeps the row index out of the file, so it round-trips cleanly
# through read_csv in the modelling sections.
cleaned_data.to_csv('final_cleaned_data.csv', index=False)
# Copy the local file into the mounted Drive folder.
!cp final_cleaned_data.csv "drive/My Drive/"

## **Building LSTM Model**

---



In [0]:
# Read file from gdrive
import pandas as pd
import numpy as np
# Load the cleaned, labelled tweets written by the data-cleaning section.
data=pd.read_csv('drive/My Drive/final_cleaned_data.csv')
# A tweet that cleaning reduced to an empty string is stored as an empty CSV
# field, which read_csv parses as NaN (a float). Restore the empty string so
# the text tokenizer downstream only ever receives str values.
data['tweet'] = data['tweet'].fillna('')

In [0]:
# Vocabulary cap: passed to the Keras Tokenizer as num_words and used as the
# input dimension of the Embedding layer.
MAX_NB_WORDS = 50000
# Length every tweet's token sequence is padded/truncated to by pad_sequences.
MAX_SEQUENCE_LENGTH = 250
# Size of each word-embedding vector in the Embedding layer.
EMBEDDING_DIM = 100

In [0]:
from keras.preprocessing.text import Tokenizer
# Word-level tokenizer limited to the MAX_NB_WORDS most frequent words.
# The filter list is a raw string so that '\]' is not an invalid escape
# sequence; the resulting characters are exactly the same as before.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(data['tweet'].values)
# word_index maps each word seen during fitting to its integer id.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [0]:
from keras.preprocessing.sequence import pad_sequences
# Convert every tweet to a list of word ids, then pad/truncate each list to
# MAX_SEQUENCE_LENGTH so the result is one rectangular array.
X = tokenizer.texts_to_sequences(data['tweet'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

In [0]:
#Y = pd.get_dummies(data['sentiment']).values
from sklearn.preprocessing import LabelEncoder
# Encode the sentiment strings as integer class ids; `mlb` keeps the fitted
# mapping so predictions can be decoded back to strings later.
mlb = LabelEncoder()
sentiment = data['sentiment'].to_numpy()
Y = mlb.fit_transform(sentiment)
print('Shape of label tensor:', Y.shape)

In [0]:
# DataSplit
from sklearn.model_selection import train_test_split
# Hold out 10% of the tweets for testing. stratify=Y keeps the class
# proportions the same in both parts; random_state fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, stratify=Y, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

In [0]:
# Build Model
from keras.models import Sequential
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding,SpatialDropout1D
# LSTM is imported from keras.layers: the keras.layers.recurrent module path
# does not exist in newer Keras releases, while keras.layers exposes LSTM in
# both old and new versions.
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint, EarlyStopping

# One softmax unit per encoded sentiment class, taken from the fitted label
# encoder instead of a hard-coded count.
num_classes = len(mlb.classes_)

model = Sequential()
# Embedding -> spatial dropout -> single LSTM layer -> softmax classifier.
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.5, recurrent_dropout=0.5))
model.add(Dense(num_classes, activation='softmax'))
# Labels are integer class ids, hence the sparse categorical loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 5
batch_size = 64

# 10% of the training data is held out for validation; training stops early
# once val_loss has not improved by min_delta for `patience` epochs.
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

In [0]:
# Evaluate on the held-out test split; accr[0] is the loss and accr[1] the
# accuracy metric the model was compiled with.
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

In [0]:
import matplotlib.pyplot as plt
# Training vs. validation loss per epoch.
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
# val_loss is measured on the validation_split hold-out carved out of the
# training data, not on X_test, so the curve is labelled accordingly.
plt.plot(history.history['val_loss'], label='validation')
plt.legend()
plt.show();

In [0]:
# Training vs. validation accuracy per epoch.
plt.title('Accuracy')
plt.plot(history.history['accuracy'], label='train')
# val_accuracy comes from the validation_split hold-out, not from X_test.
plt.plot(history.history['val_accuracy'], label='validation')
plt.legend()
plt.show();

In [0]:
# predictions for test data
# Sequential.predict_classes is not available in recent TensorFlow/Keras
# releases; taking the argmax over the softmax output gives the same class ids
# and works on old and new versions alike.
class_probabilities = model.predict(X_test,
                            batch_size=100,
                            verbose=1)
predictions = np.argmax(class_probabilities, axis=-1)

In [0]:
#confusion matrix for the test data
def plot_confusion_matrix(cm, classes,
                          normalize=True,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion-matrix counts. Rows are labelled 'True label' and columns
        'Predicted label' on the plot.
    classes : sequence
        Tick labels, one per class, used on both axes.
    normalize : bool, default True
        When true, every row is rescaled to percentages of that row's total
        and cells are printed with two decimals; otherwise the raw integer
        counts are printed.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.
    """
    import itertools
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A row with no samples would divide 0 by 0 and fill the row with NaN;
        # dividing by 1 instead leaves such rows at 0%.
        safe_totals = np.where(row_totals == 0, 1, row_totals)
        cm = (cm.astype('float') / safe_totals)*100

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

# confusion matrix creation
# (An earlier sklearn-based variant is kept below, commented out, for reference.)
#from sklearn.metrics import confusion_matrix
#labels = mlb.classes_
#cnf_matrix = confusion_matrix(y_true, y_pred,labels=labels)
#np.set_printoptions(precision=2)
# Decode the integer class ids back to sentiment strings for readable axes.
y_true = mlb.inverse_transform(Y_test)
y_pred = mlb.inverse_transform(predictions)

import scikitplot as skplt
# Plot confusion matrix
#plt.figure()
# The plot is drawn by scikit-plot directly from the label arrays; the
# plot_confusion_matrix helper defined above is not called here.
skplt.metrics.plot_confusion_matrix(y_true, y_pred,
                     title='Confusion matrix', figsize = (20,15), text_fontsize='medium', cmap='Reds')

In [0]:
#! pip install scikit-plot

## **Building CNN Model**

---


In [0]:
# Read file from gdrive
import pandas as pd
import numpy as np
# Load the cleaned, labelled tweets written by the data-cleaning section.
data = pd.read_csv('drive/My Drive/final_cleaned_data.csv')
# A tweet that cleaning reduced to an empty string is stored as an empty CSV
# field, which read_csv parses as NaN (a float). Restore the empty string so
# the vocabulary and tokenizer cells only ever receive str values.
data['tweet'] = data['tweet'].fillna('')

In [0]:
# Train-Test Split
from sklearn.model_selection import train_test_split
# Hold out 10% of the tweets for testing. The split is stratified on the
# sentiment label, matching the LSTM section, so rare classes are represented
# in both parts in the same proportions.
training_bs, test = train_test_split(data,
                                         test_size=0.10,
                                         stratify=data['sentiment'],
                                         random_state=42)

In [0]:
# Numeric score assigned to each sentiment label.
SENTIMENT_TO_SCORE = {
    'positive': 1,
    'negative': 2,
    'no_sentiment': 3,
    'sadness': 4,
    'fear': 5,
    'trust': 6,
    'anger': 7,
    'surprise': 8,
    'joy': 9,
    'disgust': 10,
}
# One vectorised lookup replaces ten masked assignments. .assign builds a new
# frame, so the slice returned by train_test_split is not written to in place
# (which triggers pandas' SettingWithCopyWarning). Labels missing from the
# mapping become NaN and the column stays float, exactly as before.
training_bs = training_bs.assign(
    sentiment_score=training_bs['sentiment'].map(SENTIMENT_TO_SCORE).astype('float')
)

In [0]:
## Build the training vocabulary and the per-tweet word counts.
# Each tweet is stored as one space-separated string, so it has to be split
# into words first; iterating over the string directly yields single
# characters, which would make the "vocabulary" a set of letters.
training_tokens = training_bs["tweet"].fillna('').astype(str).str.split()
all_training_words = [word for tokens in training_tokens for word in tokens]
training_sentence_lengths = [len(tokens) for tokens in training_tokens]
TRAINING_VOCAB = sorted(set(all_training_words))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))

In [0]:
## Build the testing vocabulary, the per-tweet word counts and the maximum tweet length.

# Each tweet is stored as one space-separated string, so it has to be split
# into words first; iterating over the string directly yields single
# characters instead of words.
test_tokens = test["tweet"].fillna('').astype(str).str.split()
all_test_words = [word for tokens in test_tokens for word in tokens]
test_sentence_lengths = [len(tokens) for tokens in test_tokens]
TEST_VOCAB = sorted(set(all_test_words))
print("%s words total, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))

In [0]:
## Loading Google News Word2Vec model
# Download the pretrained vectors into the working directory; -c lets wget
# resume a partially downloaded file instead of starting over.
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
from gensim.models import KeyedVectors
word2vec_path = 'GoogleNews-vectors-negative300.bin.gz'
# binary=True: the file is in the binary word2vec format.
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [0]:
from keras.preprocessing.text import Tokenizer
# Word-level tokenizer whose vocabulary is capped at len(TRAINING_VOCAB).
# NOTE(review): this cap is only meaningful if TRAINING_VOCAB is a word-level
# vocabulary -- confirm against the cell that builds it.
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)

In [0]:
# Tokenization
# Fit the tokenizer on the training tweets only, then convert those tweets to
# lists of integer word ids.
tokenizer.fit_on_texts(training_bs["tweet"].tolist())
training_sequences = tokenizer.texts_to_sequences(training_bs["tweet"].tolist())
# Full word -> id mapping learned from the training tweets.
train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

In [0]:
# Length every tweet's id sequence is padded/truncated to for the CNN.
MAX_SEQUENCE_LENGTH = 500
# Width of each row of the embedding matrix; presumably chosen to match the
# 300-dimensional vectors implied by the word2vec file name -- confirm.
EMBEDDING_DIM = 300

In [0]:
from keras.preprocessing.sequence import pad_sequences
# Pad/truncate every training sequence to MAX_SEQUENCE_LENGTH ids.
train_cnn_data = pad_sequences(training_sequences, 
                               maxlen=MAX_SEQUENCE_LENGTH)

In [0]:
# One row per word id plus one extra row: ids in train_word_index are used as
# row indices below, and row 0 is never assigned there, so it stays all-zero.
train_embedding_weights = np.zeros((len(train_word_index)+1, 
 EMBEDDING_DIM))

In [0]:
# Word2Vec word Embedding
# Words missing from word2vec get a random vector. A dedicated, seeded
# generator makes those vectors identical on every run, so results are
# reproducible; the values are still uniform in [0, 1) as before.
oov_rng = np.random.RandomState(42)
for word,index in train_word_index.items():
 train_embedding_weights[index,:] = word2vec[word] if word in word2vec else oov_rng.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

In [0]:
# Build Model
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    """Build and compile a multi-kernel 1-D CNN text classifier.

    Parameters
    ----------
    embeddings : array of shape (num_words, embedding_dim)
        Pretrained embedding matrix; loaded into a frozen Embedding layer.
    max_sequence_length : int
        Length of every (padded) input id sequence.
    num_words : int
        Number of rows of the embedding matrix.
    embedding_dim : int
        Width of each embedding vector.
    labels_index : int
        Number of output classes (units of the final softmax layer).

    Returns
    -------
    The compiled Keras Model. Its summary is printed as a side effect.
    """
    embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embeddings],
                            input_length=max_sequence_length,
                            trainable=False)
    
    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    # Parallel convolutions over windows of 2-6 tokens, each reduced to its
    # strongest activation per filter, then concatenated.
    convs = []
    filter_sizes = [2,3,4,5,6]
    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=300, 
                        kernel_size=filter_size, 
                        activation='relu')(embedded_sequences)
        l_pool = GlobalMaxPooling1D()(l_conv)
        convs.append(l_pool)
    l_merge = concatenate(convs, axis=1)
    x = Dropout(0.1)(l_merge)  
    # Hidden layer uses relu: a softmax here would squash the 128 features
    # into a probability distribution and starve the classifier of signal.
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)
    preds = Dense(labels_index, activation='softmax')(x)
    model = Model(sequence_input, preds)
    # Targets are integer class ids, hence the sparse categorical loss.
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()
    return model

In [0]:
# The ten sentiment scores, as floats; its length is used as the number of
# output classes when the CNN is built.
label_names = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]

In [0]:
x_train = train_cnn_data
# sentiment_score runs from 1 to 10, but sparse_categorical_crossentropy with
# a 10-unit softmax expects class ids 0..9. Shift by one so that output unit i
# stands for score i+1, which is exactly how the predictions are decoded later
# (labels[np.argmax(p)]). The integer cast fails loudly if any row has no
# score instead of silently training on NaN labels.
y_tr = training_bs['sentiment_score'].astype('int64').to_numpy() - 1

In [0]:
# Model Function
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from keras.models import Model
# Instantiate the CNN: one embedding row per word id (plus row 0) and one
# output unit per entry of label_names.
model = ConvNet(
    embeddings=train_embedding_weights,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
    num_words=len(train_word_index)+1,
    embedding_dim=EMBEDDING_DIM,
    labels_index=len(label_names),
)

In [0]:
# Training hyper-parameters for the CNN fit below.
num_epochs = 5
batch_size = 64

In [0]:
# Train the CNN; 10% of the training data is held out for validation.
hist = model.fit(x_train, y_tr, epochs=num_epochs, validation_split=0.1, shuffle=True, batch_size=batch_size)

In [0]:
# Numeric score assigned to each sentiment label.
SENTIMENT_TO_SCORE = {
    'positive': 1,
    'negative': 2,
    'no_sentiment': 3,
    'sadness': 4,
    'fear': 5,
    'trust': 6,
    'anger': 7,
    'surprise': 8,
    'joy': 9,
    'disgust': 10,
}
# One vectorised lookup replaces ten masked assignments. .assign builds a new
# frame, so the slice returned by train_test_split is not written to in place
# (which triggers pandas' SettingWithCopyWarning). Labels missing from the
# mapping become NaN and the column stays float, exactly as before.
test = test.assign(
    sentiment_score=test['sentiment'].map(SENTIMENT_TO_SCORE).astype('float')
)

In [0]:
# Encode the test tweets with the tokenizer fitted on the training tweets and
# pad/truncate them to the same length as the training data.
test_sequences = tokenizer.texts_to_sequences(test["tweet"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [0]:
# Class probabilities for every test tweet.
predictions = model.predict(test_cnn_data, batch_size=1024, verbose=1)
# Output unit i is decoded as sentiment score labels[i].
labels = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
prediction_labels = [labels[np.argmax(class_probs)] for class_probs in predictions]
# Fraction of test tweets whose decoded prediction equals the true score
# (shown as the cell output).
sum(test.sentiment_score==prediction_labels)/len(prediction_labels)

In [0]:
#confusion matrix for the test data
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
# NOTE(review): this rebinds `mlb` to a new, unfitted encoder; nothing in this
# cell uses it.
mlb = LabelEncoder()
def plot_confusion_matrix(cm, classes,
                          normalize=True,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion-matrix counts. Rows are labelled 'True label' and columns
        'Predicted label' on the plot.
    classes : sequence
        Tick labels, one per class, used on both axes.
    normalize : bool, default True
        When true, every row is rescaled to percentages of that row's total
        and cells are printed with two decimals; otherwise the raw integer
        counts are printed.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.
    """
    import itertools
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A row with no samples would divide 0 by 0 and fill the row with NaN;
        # dividing by 1 instead leaves such rows at 0%.
        safe_totals = np.where(row_totals == 0, 1, row_totals)
        cm = (cm.astype('float') / safe_totals)*100

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

# confusion matrix creation
# (Decoding through the label encoder is left commented out; the numeric
# sentiment scores are plotted directly instead.)
#y_true = mlb.inverse_transform(y_tr)
#y_pred = mlb.inverse_transform(predictions)

import scikitplot as skplt
# Plot confusion matrix
#plt.figure()
# True test scores vs. decoded predictions, drawn by scikit-plot; the
# plot_confusion_matrix helper defined above is not called here.
skplt.metrics.plot_confusion_matrix(test['sentiment_score'], prediction_labels,
                     title='Confusion matrix', figsize = (20,15), text_fontsize='medium', cmap='Reds')