# Validate API Data

Validate the data collected from the API and create an emotion-labeled dataset.

In [1]:
import json
from pathlib import Path

## Load Relations

Load the relations between queries and emotions

In [2]:
# Absolute location of the query -> emotion mapping file, found one level
# above the current working directory.
relations_path = (Path('..') / 'query_relations.json').resolve()

In [3]:
# Read the whole mapping file as bytes and decode it as JSON in one step.
relations = json.loads(relations_path.read_bytes())

## Load Tokenizer

Load the tokenizer that was created during model training

In [4]:
import pickle

In [5]:
# NOTE: unpickling can execute arbitrary code, so only load a tokenizer file
# that comes from a trusted source.
tokenizer_path = (Path('..') / 'datasets' / 'sentiment140' / 'tokenizer.pickle').resolve()
tokenizer = pickle.loads(tokenizer_path.read_bytes())

## Load Model

Rebuild the model architecture and load the saved weights into it

In [6]:
from tensorflow.keras.layers import Input, Embedding, GRU
from tensorflow.keras.layers import Dropout, GlobalMaxPooling1D
from tensorflow.keras.layers import Bidirectional, Dense
from tensorflow.keras.models import Sequential

In [7]:
# Number of rows of the embedding matrix: the tokenizer's vocabulary cap,
# bounded by the number of words it actually knows (+1 because the word
# indices in `word_index` start at 1 — see the Keras Tokenizer docs).
# `num_words` may be None (no cap), in which case `min()` would raise a
# TypeError, so fall back to the full vocabulary size.
vocabulary_size = len(tokenizer.word_index) + 1
if tokenizer.num_words is None:
    input_dim = vocabulary_size
else:
    input_dim = min(tokenizer.num_words, vocabulary_size)

embedding_dim = 200      # size of each embedding vector
input_length = 100       # number of token ids per input sequence
gru_units = 128          # units of each GRU direction
gru_dropout = 0.1        # dropout applied to the GRU inputs
recurrent_dropout = 0.1  # dropout applied to the GRU recurrent state
dropout = 0.1            # dropout applied after the hidden dense layer

In [8]:
# Layer stack, in order: embedding -> bidirectional GRU -> global max pooling
# -> dense (ReLU) -> dropout -> single sigmoid output.
# The saved weights are loaded into this model in a later cell.
model = Sequential([
    Embedding(
        input_dim=input_dim,
        output_dim=embedding_dim,
        input_shape=(input_length,),
    ),
    Bidirectional(
        GRU(
            gru_units,
            return_sequences=True,
            dropout=gru_dropout,
            recurrent_dropout=recurrent_dropout,
        )
    ),
    GlobalMaxPooling1D(),
    Dense(32, activation='relu'),
    Dropout(dropout),
    Dense(1, activation='sigmoid'),
])

W0716 13:57:36.488991 140540894373696 deprecation.py:506] From /home/rmohashi/anaconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0716 13:57:36.503060 140540894373696 deprecation.py:506] From /home/rmohashi/anaconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0716 13:57:36.506539 140540894373696 deprecation.py:506] From /home/rmohashi/anaconda3/envs/emodata/lib/python3.6/site-packages/tensorflow/python/op

In [9]:
# `summary()` prints the table itself and returns None, so wrapping it in
# print() would append a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 200)          2000000   
_________________________________________________________________
bidirectional (Bidirectional (None, 100, 256)          252672    
_________________________________________________________________
global_max_pooling1d (Global (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 32)                8224      
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 2,260,929
Trainable params: 2,260,929
Non-trainable params: 0
______________________________________________

In [10]:
# Load the saved weights file into the model built above.
weights_path = (Path('..') / 'models' / 'sentiment_analysis' / 'gru_model.h5').resolve()
weights_file = weights_path.as_posix()
model.load_weights(weights_file)

## Group data by emotion

Use the emotions to group the data

In [11]:
import os
import re
import pandas as pd

In [12]:
# Absolute path of the folder whose files are read and grouped by emotion.
files_dir = Path('..', 'datasets', 'tweepy').resolve()

In [13]:
# Group the content of every file in `files_dir` by emotion.
# The query (a "#..." or ":...:" token) is taken from the file name and
# `relations` maps it to the emotion used as the grouping key.
query_pattern = re.compile(r'(#[^.]+|:.+:)')
frames_by_emotion = {}

# Sorted so the row order of the grouped data is the same on every run
# (os.listdir returns entries in arbitrary order).
for filename in sorted(os.listdir(files_dir)):
    query_match = query_pattern.search(filename)
    if query_match is None:
        # No query token in the name (e.g. a hidden/system file): skip it
        # instead of failing with an IndexError.
        print('Skipping file: "' + filename + '"')
        continue

    print('Reading file: "' + filename + '"')

    emotion = relations[query_match.group(0)]
    file_data = pd.read_csv(os.path.join(files_dir, filename))
    frames_by_emotion.setdefault(emotion, []).append(file_data)

# Concatenate once per emotion instead of re-copying the accumulated frame
# for every file that is read.
emotion_data_dict = {
    emotion: pd.concat(frames)
    for emotion, frames in frames_by_emotion.items()
}

Reading file: "#angry.csv"


## Predict emotion and filter data

Predict emotion and filter rows for each group created in the step above

In [14]:
import re
import nltk
import numpy as np
from emoji import demojize
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Make sure the NLTK "stopwords" corpus is available locally; it is read later
# when stop words are removed from the texts.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rmohashi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
def get_score_range(mean):
    """Return the (low, high) score interval on the side of 0.5 where `mean` lies.

    A mean below 0.5 gives ``(0.0, mean)``; any other mean gives
    ``(mean, 1.0)``.
    """
    return (0.0, mean) if mean < 0.5 else (mean, 1.0)

In [16]:
def preprocess(texts):
    """Clean a pandas Series of raw texts before tokenization.

    Steps, in order: lowercase, strip links/mentions, convert emojis with
    `demojize`, drop every character outside ``a-z ' : _``, collapse runs of
    three or more identical characters, expand short negations and remove
    English stop words (keeping "not", "nor" and "no").

    Args:
        texts: pandas Series of texts. Missing values are treated as empty
            strings.

    Returns:
        A Series with the same index holding the cleaned texts.
    """
    # Missing texts (NaN) would break the string operations below.
    texts = texts.fillna("").astype(str)

    # Lowercasing
    texts = texts.str.lower()

    # Remove special chars.
    # `regex` is passed explicitly: pandas >= 2.0 defaults to literal
    # matching, which would silently turn the patterns below into no-ops.
    texts = texts.str.replace(r"(http|@)\S+", "", regex=True)
    texts = texts.apply(demojize)
    # Put a space between two adjacent colons (presumably to separate
    # consecutive emoji codes produced by demojize).
    texts = texts.str.replace("::", ": :", regex=False)
    # Normalize the curly apostrophe, both as the real character and in its
    # mis-decoded (mojibake) form, so the negation rules below can match.
    texts = texts.str.replace(r"’|â€™", "'", regex=True)
    texts = texts.str.replace(r"[^a-z\':_]", " ", regex=True)

    # Remove repetitions: 3+ identical consecutive characters become one
    pattern = re.compile(r"(.)\1{2,}", re.DOTALL)
    texts = texts.str.replace(pattern, r"\1", regex=True)

    # Transform short negation form
    texts = texts.str.replace(r"(can't|cannot)", 'can not', regex=True)
    texts = texts.str.replace("n't", ' not', regex=False)

    # Remove stop words, keeping the negations. A set gives constant-time
    # membership checks for every word.
    stopwords = set(nltk.corpus.stopwords.words('english')) - {'not', 'nor', 'no'}
    texts = texts.apply(
        lambda x: ' '.join(word for word in x.split() if word not in stopwords)
    )

    return texts

In [17]:
# For every emotion group: score each text with the model, keep only the rows
# whose score lies in the range returned by `get_score_range` for the group's
# mean score, and label the kept rows with the emotion.
result_data = []

for emotion, dataset in emotion_data_dict.items():
    print('Processing "' + emotion + '" data...')

    if dataset.empty:
        # Nothing to score; the mean of an empty prediction would be NaN.
        print("\tNo rows to process, skipping")
        continue

    cleaned_texts = preprocess(dataset.text)
    predict_sequences = [text.split() for text in cleaned_texts]
    list_tokenized_predict = tokenizer.texts_to_sequences(predict_sequences)
    # Pad to the same sequence length the model input was declared with.
    x_predict = pad_sequences(list_tokenized_predict, maxlen=input_length)

    # The model has a single output unit, so predictions have shape
    # (n_rows, 1); flatten them to get a 1-D row mask below.
    scores = model.predict(x_predict).ravel()
    low, high = get_score_range(np.mean(scores))
    print("\tScore Range: {:4f} - {:4f}".format(low, high))

    in_range = (scores >= low) & (scores <= high)
    labeled = dataset[in_range]
    labeled.insert(0, 'label', emotion)

    result_data.append(labeled)

Processing "angry" data...
	Score Range: 0.000000 - 0.320651


## Save dataset

Save the resulting data

In [18]:
if len(result_data) > 0:
    # `result_data` is the list of per-emotion frames on the first run and
    # the combined DataFrame afterwards; concatenating only when needed keeps
    # this cell safe to re-run.
    if not isinstance(result_data, pd.DataFrame):
        result_data = pd.concat(result_data)

    path = Path('../datasets/sentiment_analysis/dataset.csv').resolve()
    # Create the output folder if it does not exist yet.
    path.parent.mkdir(parents=True, exist_ok=True)
    # The row index is not written to the file.
    result_data.to_csv(path, index=False)

    print('Files saved under "' + path.as_posix() + '"')

Files saved under "/home/rmohashi/Workspace/emotion-from-tweets/datasets/sentiment_analysis/dataset.csv"
