In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
data_train = pd.read_csv("train.csv")
data_test = pd.read_csv("test.csv")

In [3]:
def clear_text(text):
    text = str(text)
    text = text.lower()
    text = re.sub(r'[^ a-z]', '', text)
    while text.find('  ') != -1:
        text = text.replace('  ', ' ')
    return text

In [4]:
data_train['clean_text']= data_train['Text'].apply(lambda x:clear_text(x))
data_test['clean_text']= data_test['Text'].apply(lambda x:clear_text(x))

In [5]:
data_train['clean_text'] = data_train['clean_text'].apply(lambda x:word_tokenize(x))
data_test['clean_text'] = data_test['clean_text'].apply(lambda x:word_tokenize(x))

In [6]:
def remove_stop_words(text):
    stop_words = stopwords.words('english')
    filtered_text = []
    for w in text:
        if w not in stop_words:
            filtered_text.append(w)
    return filtered_text

In [7]:
data_train['clean_text'] = data_train['clean_text'].apply(lambda x:remove_stop_words(x))
data_test['clean_text'] = data_test['clean_text'].apply(lambda x:remove_stop_words(x))

In [8]:
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatizer(text):
    return [wordnet_lemmatizer.lemmatize(w) for w in text]

data_train['clean_text'] = data_train['clean_text'].apply(lambda x:lemmatizer(x))
data_test['clean_text'] = data_test['clean_text'].apply(lambda x:lemmatizer(x))

In [9]:
def to_text(data):
    text = []
    for i in data:
        for word in i:
            text.append(word)
    return text

text = to_text(data_train['clean_text'])
text += to_text(data_test['clean_text'])
text.sort()
dictionary = []
for i in text:
    if i not in dictionary:
        dictionary.append(i)

In [10]:
def convert_to_numbers(text):
    res = []
    for word in text:
        res.append(dictionary.index(word))
    return res

def convert_to_text(numbers):
    res = []
    for num in numbers:
        res.append(dictionary[num])
    return res

In [11]:
data_train['numbers'] = data_train['clean_text'].apply(lambda x:convert_to_numbers(x))
data_test['numbers'] = data_test['clean_text'].apply(lambda x:convert_to_numbers(x))

In [12]:
#good: happy, love, surprise
#bad: sadness, anger, fear

def t_or_f(emotion):
    if emotion == 'happy' or emotion == 'love' or emotion == 'surprise':
        return 1        
    else:        #elif emotioin == 'sadness' or 'anger' or 'fear': return 0
        return 0
    
data_train['Emotion_in_digit'] = data_train['Emotion'].apply(lambda x:t_or_f(x))
data_test['Emotion_in_digit'] = data_test['Emotion'].apply(lambda x:t_or_f(x))

# data_train[['Emotion','Emotion_in_digit']]

In [13]:
data_train = data_train[['Emotion_in_digit', 'numbers']]
data_test = data_test[['Emotion_in_digit', 'numbers']]

In [14]:
def vectorize_sequences(sequences, dimension=30000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        for j in sequence:
            results[i, j] = 1.
    return results

In [15]:
x_train = vectorize_sequences(data_train['numbers'])
x_test = vectorize_sequences(data_test['numbers'])
y_train = np.asarray(data_train['Emotion_in_digit']).astype("float32")
y_test = np.asarray(data_test['Emotion_in_digit']).astype("float32")

In [16]:
max_features = 20000  # Only consider the top 20k words
maxlen = 200  # Only consider the first 200 words of each movie review

In [17]:
# Input for variable-length sequences of integers
inputs = keras.Input(shape=(None,), dtype="int32")
# Embed each integer in a 128-dimensional vector
x = layers.Embedding(max_features, 128)(inputs)
# Add 2 bidirectional LSTMs
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(64))(x)
# Add a classifier
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 128)         2560000   
                                                                 
 bidirectional (Bidirectiona  (None, None, 128)        98816     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 2,757,761
Trainable params: 2,757,761
Non-train

In [18]:
(x_train, y_train), (x_val, y_val) = keras.datasets.imdb.load_data(
    num_words=max_features
)
print(len(x_train), "Training sequences")
print(len(x_val), "Validation sequences")
x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_val = keras.preprocessing.sequence.pad_sequences(x_val, maxlen=maxlen)

25000 Training sequences
25000 Validation sequences


In [20]:
model.compile("adam", "binary_crossentropy", metrics=["accuracy"])
model.fit(x_train, y_train, batch_size=32, epochs=5, validation_data=(x_val, y_val))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x214029696c8>

In [22]:
score = model.evaluate(x_test, y_test, verbose=1) 
print('Test score:', score[0]) 
print('Test accuracy:', score[1])

Test score: 0.8032044172286987
Test accuracy: 0.42318257689476013
