In [1]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Activation, SimpleRNN
from sklearn import feature_extraction
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from keras import backend as K
from keras.preprocessing import sequence
from keras import regularizers, optimizers
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
import numpy as np
import csv
import matplotlib.pyplot as plt
import itertools
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from keras.utils import np_utils, to_categorical

Using TensorFlow backend.


In [2]:
# Shared text-processing helpers used by the data-loading and vocabulary cells.
tokenizer = RegexpTokenizer(r'\w+')  # keeps runs of word characters, drops punctuation
lemmatizer = WordNetLemmatizer()
fdist = FreqDist()  # token frequency counts, filled in by createVocab()

In [3]:

def load_file(filename, tokenize=None):
    """Read a ``score,sentence`` file and build the 5-class and 2-class datasets.

    Every non-blank line is split at its first comma. The text before the
    comma is parsed as a float score; the text after it is tokenized and the
    tokens are re-joined with single spaces.

    Fine-grained label (assigned to every line):
        score > 0.8 -> 4, > 0.6 -> 3, > 0.4 -> 2, > 0.2 -> 1, otherwise 0.
    Coarse-grained label:
        score > 0.6 -> 1, score <= 0.4 -> 0. Lines with 0.4 < score <= 0.6
        are left out of the 2-class set.

    Args:
        filename: path of the file to read.
        tokenize: optional callable mapping a string to a list of tokens.
            Defaults to the module-level ``tokenizer.tokenize``.

    Returns:
        ``(X_2, X_5, y_2, y_5)`` where ``X_*`` are lists of space-joined
        token strings and ``y_*`` are the matching lists of integer labels.

    Raises:
        ValueError: if the score field of a non-blank line is not a float.
    """
    if tokenize is None:
        tokenize = tokenizer.tokenize

    X_2, y_2 = [], []
    X_5, y_5 = [], []
    with open(filename) as f:
        for line in f:
            # A blank line has no score to parse; skip it instead of crashing.
            if not line.strip():
                continue

            # Only the first comma separates score from text; later commas
            # belong to the sentence itself.
            raw_score, _, text = line.partition(',')
            score = float(raw_score)
            sentence = " ".join(tokenize(text))

            if score > 0.8:
                fine_tag = 4
            elif score > 0.6:
                fine_tag = 3
            elif score > 0.4:
                fine_tag = 2
            elif score > 0.2:
                fine_tag = 1
            else:
                fine_tag = 0
            y_5.append(fine_tag)
            X_5.append(sentence)

            # Neutral sentences are excluded from the binary task.
            if 0.4 < score <= 0.6:
                continue

            y_2.append(1 if score > 0.6 else 0)
            X_2.append(sentence)

    return X_2, X_5, y_2, y_5


In [4]:
# Token -> integer id lookup; populated by loadVocab() and read by sentence_to_ids().
wordVocab = {}

def _count_file_tokens(path):
    """Add every lower-cased, lemmatized token of one ``score,sentence`` file to ``fdist``."""
    with open(path) as f:
        for line in f:
            # Everything after the first comma is the sentence; the prefix is the score.
            _, _, text = line.partition(',')
            for word in tokenizer.tokenize(text):
                fdist[lemmatizer.lemmatize(word.lower())] += 1


def createVocab(sources=("sst_train_sentences.csv", "sst_dev.csv", "sst_test.csv"),
                vocab_path="wordVocab.txt"):
    """Count tokens over the given files and write a frequency-ranked vocabulary.

    Token counts are accumulated in the module-level ``fdist`` (so repeated
    calls keep adding to the same counts). The vocabulary file gets one
    ``word:id`` line per distinct token, in ``fdist.most_common()`` order,
    with ids starting at 1.

    Args:
        sources: paths of the ``score,sentence`` files to scan.
        vocab_path: path of the vocabulary file to (over)write.
    """
    for path in sources:
        _count_file_tokens(path)

    # `with` guarantees the vocabulary file is flushed and closed even if a
    # write fails part-way through.
    with open(vocab_path, 'w') as out:
        for word_id, (word, _) in enumerate(fdist.most_common(), start=1):
            out.write(str(word) + ":" + str(word_id) + "\n")

def loadVocab(path="wordVocab.txt"):
    """Fill the module-level ``wordVocab`` from a ``word:id`` vocabulary file.

    Args:
        path: vocabulary file to read, one ``word:id`` entry per line.

    Raises:
        ValueError: if the id part of a non-blank line is not an integer.
    """
    # `with` closes the handle, which the previous bare open() never did.
    with open(path) as vocab_file:
        for line in vocab_file:
            if not line.strip():
                continue
            # Split on the last colon so the id is always the final field.
            word, _, word_id = line.rpartition(":")
            wordVocab[word] = int(word_id.strip())

def sentence_to_ids(sentence):
    """Convert a space-separated sentence into a list of vocabulary ids.

    Each piece is lower-cased and lemmatized before being looked up in
    ``wordVocab``; empty pieces (e.g. from repeated spaces) are dropped.
    A token missing from ``wordVocab`` raises ``KeyError``.
    """
    lemmas = (lemmatizer.lemmatize(piece.lower()) for piece in sentence.split(" "))
    return [wordVocab[lemma] for lemma in lemmas if lemma != ""]

In [5]:
# Write the frequency-ranked vocabulary file from the train/dev/test files,
# then read it back into wordVocab. Order matters: the file must exist
# before loadVocab() runs, and wordVocab must be filled before it is sized.
print("creating")
createVocab()
print("loading")
loadVocab()
# Width of each word vector; passed to the Embedding layer in fit_lstm.
embedding_size = 100
# Ids written by createVocab start at 1, so one extra slot is reserved for id 0
# (presumably the padding value produced by pad_sequences -- TODO confirm).
vocabulary_size = len(wordVocab) +1

creating
loading


In [23]:

def _first_history_value(history, *keys):
    """Return the first-epoch value stored under whichever of ``keys`` exists in ``history``."""
    for key in keys:
        if key in history:
            return history[key][0]
    raise KeyError("none of %r found in training history" % (keys,))


def fit_lstm(X_train, y_train, X_valid, y_valid, X_test, y_test, batch_size, neurons, dropout, reg_value,
             nb_class, writer, nb_layer, rec_drop, max_length, nb_epoch=2):
    """Build, train and evaluate a stacked-LSTM classifier, logging one CSV row per epoch.

    The model is Embedding -> ``nb_layer`` LSTM layers (unit count halved at
    each layer, never below 1) -> Dropout -> Dense. With ``nb_class == 2`` the
    head is a single sigmoid unit trained with mean squared error; otherwise
    it is a softmax over ``nb_class`` units trained with categorical
    cross-entropy.

    Args:
        X_train, y_train: training inputs (padded id sequences) and targets.
        X_valid, y_valid: data passed to ``model.fit`` as ``validation_data``.
        X_test, y_test: data scored with ``model.evaluate`` after every epoch.
        batch_size: mini-batch size.
        neurons: number of units in the first LSTM layer.
        dropout: rate of the Dropout layer placed before the output layer.
        reg_value: only printed and logged; it is not applied to any layer.
        nb_class: number of target classes (2 selects the sigmoid head).
        writer: ``csv.writer``-like object receiving one row per epoch.
        nb_layer: number of stacked LSTM layers.
        rec_drop: value used for both ``dropout`` and ``recurrent_dropout``
            of every LSTM layer.
        max_length: padded sequence length (Embedding ``input_length``).
        nb_epoch: number of training epochs (default 2).

    Returns:
        The best validation accuracy observed over all epochs.
    """
    print([batch_size, neurons, dropout, reg_value, nb_layer, rec_drop])
    model = Sequential()
    model.add(Embedding(vocabulary_size, embedding_size, input_length=max_length))

    units = neurons
    for layer_idx in range(nb_layer):
        # Only the last LSTM collapses the sequence; earlier ones must hand a
        # full sequence to the next recurrent layer.
        is_last = layer_idx == nb_layer - 1
        model.add(LSTM(units, return_sequences=not is_last, dropout=rec_drop, recurrent_dropout=rec_drop))
        units = max(int(units / 2), 1)

    model.add(Dropout(dropout))
    if nb_class == 2:
        model.add(Dense(1, activation="sigmoid"))
        model.compile(loss='mean_squared_error',
                      optimizer='adam',
                      metrics=['accuracy'])
    else:
        model.add(Dense(nb_class, activation="softmax"))
        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])

    best_valid_acc = 0
    for epoch_nb in range(nb_epoch):
        # One epoch per fit() call so that test metrics can be logged after each epoch.
        train_history = model.fit(X_train, y_train, validation_data=(X_valid, y_valid),
                                  batch_size=batch_size, epochs=1, shuffle=True)
        model.reset_states()
        print("epoch " + str(epoch_nb))

        scores = model.evaluate(X_test, y_test, verbose=0)
        history = train_history.history
        valid_loss = history["val_loss"][0]
        # Older Keras reports "val_acc", newer releases "val_accuracy".
        valid_acc = _first_history_value(history, "val_acc", "val_accuracy")
        print('Test accuracy:', scores[1])
        print("Test loss: ", scores[0])

        best_valid_acc = max(best_valid_acc, valid_acc)
        writer.writerow([epoch_nb, batch_size, neurons, dropout, reg_value, rec_drop, nb_layer,
                         valid_loss, valid_acc, scores[0], scores[1], max_length])

    # Returned only after every epoch has run (returning inside the loop
    # would stop training after the first epoch).
    return best_valid_acc

In [17]:
## to train 2 classification

# Loading and id-encoding do not depend on any hyper-parameter, so both are
# done once instead of on every iteration of the search loops.
train_sentences_2, _, y_train_2, _ = load_file('sst_train_sentences.csv')
dev_sentences_2, _, y_dev_2, _ = load_file('sst_dev.csv')
test_sentences_2, _, y_test_2, _ = load_file('sst_test.csv')

# Kept as plain lists of id lists: pad_sequences accepts them directly, and
# wrapping ragged lists in np.array is rejected by recent NumPy versions.
train_ids_2 = [sentence_to_ids(sentence) for sentence in train_sentences_2]
dev_ids_2 = [sentence_to_ids(sentence) for sentence in dev_sentences_2]
test_ids_2 = [sentence_to_ids(sentence) for sentence in test_sentences_2]

y_train_2 = np.array(y_train_2)
y_dev_2 = np.array(y_dev_2)
y_test_2 = np.array(y_test_2)

# The dev split is folded into the training data for the final fit.
y_train_final_2 = np.concatenate([y_train_2, y_dev_2]).reshape(-1, 1)

# Same iteration order as the equivalent nested for-loops (last list varies fastest).
search_grid = itertools.product(
    [30],    # batch_size
    [0.6],   # dropout_value
    [1],     # nb_layer
    [1e-5],  # reg_value
    [0.2],   # rec_drop
    [25],    # neuron_nb
    [35],    # max_length
)

# `with` closes the results file even if training raises; newline='' is what
# the csv module expects for files it writes.
with open("rnn_results_coarse_grained.csv", 'w', newline='') as file:
    writer = csv.writer(file)
    # One header per column that fit_lstm writes (max_length was missing).
    writer.writerow(["epoch_nb", "batch_size", "neurons_nb", "dropout_value", "reg_value", "recurrent_reg present",
                     "nb_layer", "current_loss", "current_accuracy", "test_loss", "test_accuracy", "max_length"])

    for batch_size, dropout_value, nb_layer, reg_value, rec_drop, neuron_nb, max_length in search_grid:
        X_train_ids_2 = sequence.pad_sequences(train_ids_2, maxlen=max_length)
        X_test_ids_2 = sequence.pad_sequences(test_ids_2, maxlen=max_length)
        X_dev_ids_2 = sequence.pad_sequences(dev_ids_2, maxlen=max_length)

        X_train_2 = np.concatenate([X_train_ids_2, X_dev_ids_2])

        # fit_lstm takes separate validation and test sets. Dev is already
        # part of the training data, so the test split fills both slots and
        # the logged "current_*" columns mirror the test metrics.
        # NOTE(review): keep dev out of training and pass it as validation if
        # these numbers are ever used for model selection.
        fit_lstm(X_train_2, y_train_final_2,
                 X_test_ids_2, y_test_2,
                 X_test_ids_2, y_test_2,
                 batch_size, neuron_nb, dropout_value, reg_value, 2, writer, nb_layer, rec_drop, max_length)

[30, 25, 0.6, 1e-05, 1, 0.2]
Epoch 1/1
epoch 0
Test accuracy: 0.7545304776612773
Test loss:  0.16455615019320394
Epoch 1/1
epoch 1
Test accuracy: 0.8226249314218612
Test loss:  0.13415773762804278


In [None]:

# Loading and id-encoding do not depend on max_length or any other
# hyper-parameter, so both are done once up front.
_, train_sentences_5, _, y_train_5 = load_file('sst_train_sentences.csv')
_, dev_sentences_5, _, y_dev_5 = load_file('sst_dev.csv')
_, test_sentences_5, _, y_test_labels_5 = load_file('sst_test.csv')

# Kept as plain lists of id lists: pad_sequences accepts them directly, and
# wrapping ragged lists in np.array is rejected by recent NumPy versions.
train_ids_5 = [sentence_to_ids(sentence) for sentence in train_sentences_5]
dev_ids_5 = [sentence_to_ids(sentence) for sentence in dev_sentences_5]
test_ids_5 = [sentence_to_ids(sentence) for sentence in test_sentences_5]

y_train_5 = np.array(y_train_5)
y_dev_5 = np.array(y_dev_5)

# The dev split is folded into the training data for the final fit.
# num_classes is fixed so train and test targets always have 5 columns, even
# if one split happens to lack the highest class.
y_train_final_5 = to_categorical(np.concatenate([y_train_5, y_dev_5]), num_classes=5)
y_test_5 = to_categorical(np.array(y_test_labels_5), num_classes=5)

# `with` closes the results file even if training raises; newline='' is what
# the csv module expects for files it writes.
with open("rnn_results_fine_grained.csv", 'w', newline='') as file:
    writer = csv.writer(file)
    # One header per column that fit_lstm writes (max_length was missing).
    writer.writerow(["epoch_nb", "batch_size", "neurons_nb", "dropout_value", "reg_value", "recurrent_reg present",
                     "nb_layer", "current_loss", "current_accuracy", "test_loss", "test_accuracy", "max_length"])

    for max_length in [45]:
        X_train_ids_5 = sequence.pad_sequences(train_ids_5, maxlen=max_length)
        X_test_ids_5 = sequence.pad_sequences(test_ids_5, maxlen=max_length)
        X_dev_ids_5 = sequence.pad_sequences(dev_ids_5, maxlen=max_length)

        X_train_5 = np.concatenate([X_train_ids_5, X_dev_ids_5])

        # Same iteration order as the equivalent nested for-loops.
        search_grid = itertools.product(
            [30],    # batch_size
            [0.7],   # dropout_value
            [1],     # nb_layer
            [1e-5],  # reg_value
            [0.2],   # rec_drop
            [30],    # neuron_nb
        )
        for batch_size, dropout_value, nb_layer, reg_value, rec_drop, neuron_nb in search_grid:
            # fit_lstm takes separate validation and test sets. Dev is already
            # part of the training data, so the test split fills both slots and
            # the logged "current_*" columns mirror the test metrics.
            # NOTE(review): keep dev out of training and pass it as validation
            # if these numbers are ever used for model selection.
            fit_lstm(X_train_5, y_train_final_5,
                     X_test_ids_5, y_test_5,
                     X_test_ids_5, y_test_5,
                     batch_size, neuron_nb, dropout_value, reg_value, 5, writer, nb_layer, rec_drop, max_length)

[30, 30, 0.7, 1e-05, 1, 0.2]
Epoch 1/1
epoch 0
Test accuracy: 0.35429864253393667
Test loss:  1.4844735269632814
Epoch 1/1
 210/9634 [..............................] - ETA: 32s - loss: 1.4702 - acc: 0.3762