In [1]:
import numpy as np
import pandas as pd
import keras
from keras.preprocessing.sequence import pad_sequences
import random
import math
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
from matplotlib import pyplot as plt

Using TensorFlow backend.


In [None]:
# Load the labelled data; later cells read its 'headline' column (renamed to
# 'text') and its 'emotions' column.
df = pd.read_csv("hybridclean.csv")

In [None]:
# Give the text column the name the cleaning pipeline below expects.
df = df.rename(columns={'headline': 'text'})

In [None]:
def stem_text(word_list, stemmer=None):
    """Reduce every token in ``word_list`` to its stem.

    Args:
        word_list: iterable of token strings.
        stemmer: optional object exposing ``stem(word)``. When omitted, one
            ``PorterStemmer`` is created and shared by all tokens of the call.

    Returns:
        A list with the stemmed tokens, in input order.
    """
    if stemmer is None:
        # Build the stemmer once per call instead of once per token.
        stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in word_list]


def remove_stopwords(row):
    """Tokenise ``row``, drop stop words and stem what is left.

    Reads the module-level ``stop_words`` collection, so that name must be
    defined before the first call.

    Returns:
        The list of stemmed, non-stop-word tokens.
    """
    kept = []
    for token in word_tokenize(row):
        if token in stop_words:
            continue
        kept.append(token)
    return stem_text(kept)


def make_string(row):
    """Concatenate the tokens of ``row`` into one string.

    Every token is followed by a single space, so a non-empty result ends
    with a trailing space and an empty ``row`` yields ``''``.
    """
    return ''.join(word + ' ' for word in row)


# Ordered (pattern, replacement) rules applied by clean_text(). Order matters:
# contractions must be expanded before the bare apostrophe is removed, and the
# whitespace collapse must run last.
_CLEAN_TEXT_RULES = (
    # Keep letters, digits and ^ , ! . / ' + - = : ; everything else becomes a space.
    # The hyphen is escaped so that "+-=" is three literal characters rather than
    # the range '+'..'=' (which also let ';' and '<' through by accident).
    (r"[^A-Za-z0-9^,!.\/'+\-=:]", " "),
    (r"what's", "what is "),
    (r"'s", " "),
    (r"'ve", " have "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"'re", " are "),
    (r"'d", " would "),
    (r"'ll", " will "),
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    # "5k" -> "5000"; the word boundary keeps "5km" untouched.
    (r"(\d+)k\b", r"\g<1>000"),
    (r":", " : "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # "1990s" -> "1990". The previous pattern r"\0s" was a NUL character
    # followed by "s" and therefore never matched ordinary text.
    (r"0s\b", "0"),
    # Keep the surrounding spaces so the neighbouring words are not glued on.
    (r" 9 11 ", " 911 "),
    # Word boundaries stop these from firing inside longer words
    # ("the - mail", "raj kumar").
    (r"\be - mail\b", "email"),
    (r"\bj k\b", "jk"),
    (r"\s{2,}", " "),
)


def clean_text(text):
    """Normalise a piece of text with a fixed sequence of regex substitutions.

    The rules expand common English contractions, pad or drop punctuation,
    rewrite a few abbreviations and finally collapse runs of whitespace. The
    literal patterns are lowercase; the callers in this notebook lowercase the
    text before passing it in.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    for pattern, replacement in _CLEAN_TEXT_RULES:
        text = re.sub(pattern, replacement, text)
    return text

# Build df['statement_clean'] from df['text']:
# lowercase -> clean_text -> stop-word removal + stemming -> plain-word string.
stop_words = set(stopwords.words('english'))  # read by remove_stopwords()

df['statement_clean'] = df['text'].apply(lambda x: x.lower())
df['statement_clean'] = df['statement_clean'].map(clean_text)
df['statement_clean'] = df['statement_clean'].apply(remove_stopwords)
df['statement_clean'] = df['statement_clean'].apply(make_string)
# Assign the result back instead of calling .replace(..., inplace=True) on a
# column selection: that is chained assignment and leaves the frame unchanged
# when pandas Copy-on-Write is active. Raw strings avoid invalid-escape warnings.
df['statement_clean'] = df['statement_clean'].replace(to_replace=r'[^\w\s]', value='', regex=True)  # Remove punctuation
df['statement_clean'] = df['statement_clean'].replace(to_replace=r'[\d]', value='', regex=True)     # Remove digits
df['statement_clean'] = df['statement_clean'].apply(lambda x: " ".join(x.split()))                  # Remove duplicate white spaces
df['statement_clean'] = df['statement_clean'].apply(
    lambda x: " ".join(word for word in x.split(' ') if len(word) > 3)
)  # Keep only words longer than 3 characters
print(df['statement_clean'])

In [None]:
# Row count of the frame after cleaning.
len(df)

In [None]:
# Removed `len(ff)`: `ff` is first created by the read_csv("../../dataset.csv")
# cell further down, so evaluating it here raises NameError on a fresh
# Restart & Run All. The same check is repeated after the predictions are made.

In [None]:
# Removed `df = ff`: `ff` does not exist yet at this point (NameError on a
# fresh kernel), and when run out of order it replaced the frame loaded from
# hybridclean.csv, which the following cells read 'statement_clean' and
# 'emotions' from.

In [None]:
# Discard rows that have no cleaned statement.
df = df.dropna(subset=['statement_clean'])

In [None]:
# Split every cleaned statement on single spaces and collect the matching labels.
input_sentences = [statement.split(" ") for statement in df["statement_clean"].tolist()]
labels = df["emotions"].tolist()
labels

In [None]:
# Build the vocabulary (word -> integer id) and record the longest sentence.
# Ids start at 1: pad_sequences() fills with 0, so 0 must stay reserved for
# padding. The embedding layer is sized len(word2id) + 1, which covers ids
# 0..len(word2id).
word2id = dict()
max_words = 0

for sentence in input_sentences:
    for word in sentence:
        if word not in word2id:
            word2id[word] = len(word2id) + 1
    max_words = max(max_words, len(sentence))

# Sort the distinct labels so the label <-> id mapping is the same on every
# run; iterating a bare set of strings can change order between sessions.
# key=str keeps the sort well-defined even if the labels are of mixed types.
label2id = {label: idx for idx, label in enumerate(sorted(set(labels), key=str))}
id2label = {idx: label for label, idx in label2id.items()}
id2label

In [None]:

# Encode the sentences as padded id sequences of length max_words.
X = pad_sequences(
    [[word2id[token] for token in tokens] for tokens in input_sentences],
    maxlen=max_words,
)

# One-hot encode the labels.
Y = keras.utils.to_categorical(
    [label2id[name] for name in labels],
    num_classes=len(label2id),
    dtype='float32',
)

# Print shapes
for axis_name, array in (("X", X), ("Y", Y)):
    print("Shape of {}: {}".format(axis_name, array.shape))

In [None]:
# Show the padded id matrix and the one-hot label matrix.
print(X)
print(Y)

In [None]:
# Length of the longest tokenised sentence; used as the padded sequence length.
print(max_words)

In [None]:
# Mapping from class index back to the emotion label.
id2label

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the encoded data with a fixed seed.
# NOTE(review): the fit cell below trains on the full X / Y, and X_test is
# reassigned in the prediction section, so this split is not used by the
# cells shown in this notebook -- confirm whether that is intended.
X_train,X_test, Y_train, Y_test =  train_test_split(X, Y,test_size =0.20,random_state= 4 )

# The cell below requires an NVIDIA GPU with CUDA support to run.

In [None]:
# Bidirectional LSTM classifier with an attention layer over the time steps.
embedding_dim = 100  # width of the word vectors; also the units per LSTM direction

sequence_input = keras.Input(shape=(max_words,), dtype='int32')

# The embedding table has one more row than there are vocabulary entries.
embedded_inputs = keras.layers.Embedding(len(word2id) + 1,
                                        embedding_dim,
                                        input_length=max_words)(sequence_input)
embedded_inputs = keras.layers.Dropout(0.2)(embedded_inputs)

# Bidirectional is taken from the public keras.layers namespace instead of the
# keras.layers.wrappers submodule. return_sequences=True keeps one output per
# time step so attention can weight them.
lstm_outs = keras.layers.Bidirectional(
    keras.layers.CuDNNLSTM(embedding_dim, return_sequences=True)
)(embedded_inputs)
lstm_outs = keras.layers.Dropout(0.2)(lstm_outs)

# Attention: score each time step with a Dense(1), softmax the scores across
# the max_words steps, then take the weighted sum of the LSTM outputs.
# (The unused `input_dim` and `permuted_inputs` values were dropped; neither
# fed into the model output.)
attention_vector = keras.layers.TimeDistributed(keras.layers.Dense(1))(lstm_outs)
attention_vector = keras.layers.Reshape((max_words,))(attention_vector)
attention_vector = keras.layers.Activation('softmax', name='attention_vec')(attention_vector)
attention_output = keras.layers.Dot(axes=1)([lstm_outs, attention_vector])

# Classification head: one hidden layer, then a softmax over the label set.
fc = keras.layers.Dense(embedding_dim, activation='relu')(attention_output)
output = keras.layers.Dense(len(label2id), activation='softmax')(fc)

model = keras.Model(inputs=[sequence_input], outputs=output)
model.compile(loss="categorical_crossentropy", metrics=["accuracy"], optimizer='adam')

In [None]:
# Inspect the symbolic tensors produced by the input, embedding and LSTM stages.
print(sequence_input)
print(embedded_inputs)
print(lstm_outs)

In [None]:
# Layer-by-layer overview of the compiled model.
model.summary()

In [None]:
# Write the model to an HDF5 file.
# NOTE(review): in cell order this runs before model.fit(), so the file written
# here holds the model as it is before that training call.
model.save("emotionmodel.h5")
print("Model saved to disk")

In [None]:
import os
# Restrict CUDA to the device with index 3.
# NOTE(review): this is set after keras was imported and the model was built;
# the variable may need to be set before the GPU is first initialised to have
# any effect -- confirm.
os.environ["CUDA_VISIBLE_DEVICES"]="3"

In [None]:
# Train on the full encoded data set; Keras holds out 10% of it for validation.
model.fit(X, Y, epochs=4, batch_size=64, validation_split=0.1, shuffle=True)

# Save again now that training has finished: the earlier save cell runs before
# fit(), so without this the file on disk would contain the untrained model.
model.save("emotionmodel.h5")

# We use the Bi-LSTM model saved above to predict the emotion expressed in each text.
# The next cell reads the dataset that is to be labelled.

In [None]:

# Load the data set to label; the cleaning cell below reads its 'review' column.
ff = pd.read_csv("../../dataset.csv")

In [None]:
# stem_text(), remove_stopwords(), make_string() and clean_text() are defined
# once in the training section above and are reused here unchanged. The copies
# that used to live in this cell were redefinitions of the same four functions;
# keeping a single definition guarantees that training and prediction text go
# through exactly the same preprocessing.

# Build ff['statement_clean'] from ff['review']:
# lowercase -> clean_text -> stop-word removal + stemming -> plain-word string.
stop_words = set(stopwords.words('english'))  # read by remove_stopwords()

ff['statement_clean'] = ff['review'].apply(lambda x: x.lower())
ff['statement_clean'] = ff['statement_clean'].map(clean_text)
ff['statement_clean'] = ff['statement_clean'].apply(remove_stopwords)
ff['statement_clean'] = ff['statement_clean'].apply(make_string)
# Assign the result back instead of calling .replace(..., inplace=True) on a
# column selection: that is chained assignment and leaves the frame unchanged
# when pandas Copy-on-Write is active. Raw strings avoid invalid-escape warnings.
ff['statement_clean'] = ff['statement_clean'].replace(to_replace=r'[^\w\s]', value='', regex=True)  # Remove punctuation
ff['statement_clean'] = ff['statement_clean'].replace(to_replace=r'[\d]', value='', regex=True)     # Remove digits
ff['statement_clean'] = ff['statement_clean'].apply(lambda x: " ".join(x.split()))                  # Remove duplicate white spaces
ff['statement_clean'] = ff['statement_clean'].apply(
    lambda x: " ".join(word for word in x.split(' ') if len(word) > 3)
)  # Keep only words longer than 3 characters
print(ff['statement_clean'])

In [None]:
input_sentences = [text.split(" ") for text in ff["statement_clean"].values.tolist()]

# Encode with the existing vocabulary only. The embedding layer was sized from
# word2id when the model was built, so minting new ids here for unseen words
# would produce indices outside the embedding table. Words the model has never
# seen are skipped instead; pad_sequences() fills the freed positions.
X_test = [[word2id[word] for word in sentence if word in word2id]
          for sentence in input_sentences]

# Pad (or truncate) to the sequence length the model was built with.
X_test = pad_sequences(X_test, max_words)


In [None]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
# Run the model on the encoded prediction set; one row of class scores per input.
preds = model.predict(X_test)

In [None]:
# Collapse each row of class scores to the index of the highest score.
# NOTE(review): this overwrites `preds`, so the cell is presumably not
# re-runnable without first re-running the predict cell -- confirm.
preds = np.argmax(preds, axis=1)
preds

In [None]:
# Plain-text view of the predicted class indices.
print(preds)

In [None]:
# Number of rows in the data set being labelled (compare with len(preds) below).
len(ff)

In [None]:
# Number of predictions produced (compare with len(ff) above).
len(preds)

In [None]:
# Translate each predicted class index back to its label via id2label.
final_preds = [id2label[pred] for pred in preds ]

In [None]:
# Replace the index with a fresh 0..n-1 range.
# NOTE(review): without drop=True the old index is kept as a new 'index'
# column, which will also appear in the CSV written at the end -- confirm
# that this extra column is wanted.
ff.reset_index(inplace=True)

In [None]:
# Attach the predicted labels row by row. Assigning the list directly is
# positional; wrapping it in pd.Series() would align on index labels instead
# and silently leave NaN in any row whose index is not in 0..n-1.
ff['emotion_new'] = final_preds

In [None]:
# Display the labelled frame.
ff

In [None]:
# Write the labelled frame to disk without the DataFrame index.
ff.to_csv("finaldataset.csv",index=False)