In [None]:
!pip install transformers >> /dev/null
!pip install datasets >> /dev/null
!pip3 install torch >> /dev/null

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import sys
import string
import json
import spacy
# Load the `en_core_web_sm` spaCy pipeline once; `sp` is the shared global used
# by remove_stop_words() and lemmatize() further down.
sp = spacy.load('en_core_web_sm')

In [2]:
import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input,Embedding,Dense,Flatten
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score,classification_report

In [3]:
# Directory holding the ISEAR splits, relative to this notebook.
DATA_DIR = './../datasets/Emotion_classification_ISEAR'
train_path = DATA_DIR + '/corrected_isear-train.csv'
val_path = DATA_DIR + '/corrected_isear-val.csv'
test_path = DATA_DIR + '/corrected_isear-test.csv'

In [4]:
def read_data(data):
    """Load one dataset split from a header-less CSV.

    The columns are named ``Label``, ``Text`` and ``NaN``; the ``NaN`` column
    is dropped. Rows with too many fields are skipped (with a warning)
    instead of aborting the whole read.

    Args:
        data: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        DataFrame with the columns ``Label`` and ``Text``.
    """
    columns = ['Label', 'Text', 'NaN']
    try:
        # pandas >= 1.3 spelling. 'warn' matches the old
        # error_bad_lines=False behaviour (skip the row, report it).
        df = pd.read_csv(data, names=columns, on_bad_lines='warn')
    except TypeError:
        # Older pandas that does not know `on_bad_lines`; the keyword error is
        # raised before anything is read, so retrying is safe.
        df = pd.read_csv(data, names=columns, error_bad_lines=False)
    return df.drop(columns=['NaN'])

In [5]:
# Read the train / validation / test splits, in that order.
train, val, test = (read_data(path) for path in (train_path, val_path, test_path))

In [6]:
def transform_label(data):
    """Map an emotion name to its integer class id.

    Args:
        data: Emotion label such as ``'joy'``.

    Returns:
        The class id (0-6), or ``-1`` when ``data`` is not one of the seven
        known emotion names.
    """
    mapping = {'joy': 0, 'fear': 1, 'shame': 2, 'disgust': 3, 'sadness': 4, 'anger': 5, 'guilt': 6}
    # Direct dictionary lookup instead of scanning every (key, value) pair.
    return mapping.get(data, -1)

In [7]:
def remove_puncts(data):
    """Return ``data`` without any character that is neither a word character nor whitespace.

    Underscores count as word characters (``\\w``) and are therefore kept.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', data)

In [8]:
def remove_nums(data):
    """Return ``data`` with every ASCII digit (0-9) deleted."""
    digit = re.compile(r'[0-9]')
    return digit.sub('', data)

In [9]:
def remove_stop_words(data):
    """Drop the loaded spaCy pipeline's default stop words from a space-separated string.

    The text is split on single spaces only, so runs of spaces produce empty
    tokens; those are kept and reappear as repeated spaces in the result.
    """
    stop_words = sp.Defaults.stop_words
    kept = filter(lambda word: word not in stop_words, data.split(" "))
    return " ".join(kept)

In [10]:
def lemmatize(data):
    """Replace every token of ``data`` with its spaCy lemma.

    Each lemma is prefixed with one space, so a non-empty result always starts
    with a space.
    """
    return "".join(" " + str(token.lemma_) for token in sp(data))

In [11]:
def remove_PRON(data):
    """Delete every literal ``-PRON-`` marker from ``data``.

    NOTE(review): presumably this is the pronoun placeholder produced by the
    lemmatizer - confirm against the installed spaCy version.
    """
    return re.sub(r'-PRON-', '', data)

In [12]:
def clean_data(data):
    """Normalise the ``Text`` column and encode ``Label`` as integer class ids.

    ``Text`` goes through, in order: lower-casing, digit removal, punctuation
    removal, stop-word removal, lemmatization and ``-PRON-`` removal.
    ``Label`` is mapped through ``transform_label``.

    Args:
        data: DataFrame with the columns ``Text`` and ``Label``.

    Returns:
        A new DataFrame with both columns transformed; the frame passed in is
        left unmodified.
    """
    # Work on a copy so the caller's raw frame survives the cleaning step.
    cleaned = data.copy()
    text = cleaned['Text'].str.lower()
    # Column-wise Series.apply instead of row-wise DataFrame.apply(axis=1):
    # no per-row Series objects are built, and an empty column simply yields
    # an empty column.
    for step in (remove_nums, remove_puncts, remove_stop_words, lemmatize, remove_PRON):
        text = text.apply(step)
    cleaned['Text'] = text
    cleaned['Label'] = cleaned['Label'].apply(transform_label)
    return cleaned

In [13]:
# Push every split through the same cleaning pipeline.
train, val, test = map(clean_data, (train, val, test))

In [14]:
# Row counts of the train / validation / test splits.
tuple(len(split) for split in (train, val, test))

(5357, 1148, 1148)

In [15]:
# Peek at the first rows of the cleaned training split.
train.head(n=5)

Unnamed: 0,Label,Text
0,0,understood admit university
1,1,broke window neighbouring house fear mothers ...
2,0,get big fish fishing
3,1,dark room walk street sleep room night partly...
4,2,buy possible answer homework problem complete...


In [16]:
# Separate each cleaned split into its texts (x_*) and integer labels (y_*).
# All six are plain NumPy arrays, so downstream code sees one consistent type
# for every split (including y_val).
x_train, y_train = train['Text'].values, train['Label'].values
x_val, y_val = val['Text'].values, val['Label'].values
x_test, y_test = test['Text'].values, test['Label'].values

In [17]:
# Learn the TF-IDF vocabulary and weights on the training texts only, then
# reuse the fitted vectorizer for validation and test so that no information
# from those splits leaks into the features.
vectorizer = TfidfVectorizer()
# fit_transform fits and vectorizes in one call; X_train is never bound to
# the vectorizer object itself.
X_train = vectorizer.fit_transform(x_train)
X_val = vectorizer.transform(x_val)
X_test = vectorizer.transform(x_test)

In [18]:
# (rows, vocabulary size) of each TF-IDF matrix.
tuple(matrix.shape for matrix in (X_train, X_val, X_test))

((5357, 6115), (1148, 6115), (1148, 6115))

In [21]:
def update_data(data):
    """Convert a SciPy sparse matrix into a ``tf.SparseTensor``.

    Args:
        data: Sparse matrix exposing ``tocoo()``.

    Returns:
        ``tf.SparseTensor`` built from the matrix's COO rows, columns, values
        and shape. The indices are passed through in COO order; the caller in
        this notebook runs ``tf.sparse.reorder`` on the result before
        densifying.
    """
    coo = data.tocoo()
    # (nnz, 2) array of [row, col] pairs. np.column_stack replaces
    # np.mat(...).transpose(): np.mat is deprecated and no longer exists in
    # NumPy 2.0. int64 is the index dtype tf.SparseTensor works with.
    indices = np.column_stack((coo.row, coo.col)).astype(np.int64)
    return tf.SparseTensor(indices, coo.data, coo.shape)

In [23]:
# Densify each TF-IDF matrix: sparse matrix -> SparseTensor -> reordered
# indices -> dense tensor.
X_train, X_val, X_test = (
    tf.sparse.to_dense(tf.sparse.reorder(update_data(matrix)))
    for matrix in (X_train, X_val, X_test)
)

In [24]:
model = tf.keras.Sequential()
model.add(layers.Embedding(input_dim=1000, output_dim=128))
model.add(layers.GRU(256, return_sequences=True))
model.add(layers.SimpleRNN(128))
model.add(layers.Dense(1))

In [25]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         128000    
_________________________________________________________________
gru (GRU)                    (None, None, 256)         296448    
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 128)               49280     
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 473,857
Trainable params: 473,857
Non-trainable params: 0
_________________________________________________________________


In [27]:
y_train_oh = tf.keras.utils.to_categorical(y_train)
y_val_oh = tf.keras.utils.to_categorical(y_val)

In [28]:
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [29]:
model.fit(X_train, y_train, epochs=2, batch_size=16, validation_data=(X_val, y_val))

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f550b2dcb10>