# RNN models for sentence classification 

* Simple LSTM for Sequence Classification
* LSTM For Sequence Classification With Dropout
* LSTM and Convolutional Neural Network For Sequence Classification

## load data 

In [1]:
import pickle 
with open('/Users/apple/Documents/GitHub/Argument-Scoring-System/1. pretrained_model/labeled_essay_dics.pickle', 'rb') as handle:
    label_sents = pickle.load(handle)

In [2]:
# compute and store and the {sents: label} into one dic
all_label_sents = {}
for key in label_sents.keys():
    this_essay = label_sents[key]
    for s in this_essay.keys():
        all_label_sents[s] = this_essay[s]

## data preprocessing 

In [3]:
import numpy as np
import re
import itertools
from collections import Counter

In [4]:
"""
Original taken from https://github.com/dennybritz/cnn-text-classification-tf
"""


def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [5]:
def clear_split_str(s):
    strip_s = s.strip()
    clear_s = clean_str(strip_s)
    s_text = clear_s.split(" ")
    return s_text

In [6]:
texts = []
for key in all_label_sents.keys():
    this_value = all_label_sents[key]
    clear_key = clear_split_str(key)
    texts = texts + clear_key

In [7]:
print (len(set(texts)))

7701


In [8]:
## >> ? need to choose the top 5000 words here 
# from nltk import FreqDist
# fdist1 = FreqDist(texts)
# texts = [w for w in fdist1 if fdist1[w] > 10]
# # print (texts)
# print (len(texts))

In [9]:
vocab = sorted(set(texts))
vocab_to_int = {c: i for i, c in enumerate(vocab)}
int_to_vocab = dict(enumerate(vocab))

In [10]:
def encoded_txt(text):
    return np.array([vocab_to_int[c] for c in text], dtype=np.int32)

In [11]:
## >> remove low frequency words 
# def encoded_txt(text):
#     test = []
#     for item in text:
#         if item in vocab_to_int:
#             test.append(vocab_to_int[item])
#     return np.array(test)

In [12]:
# clear data and store in one dictionary
X = []
encode_X = []
y = []
for key in all_label_sents.keys():
    this_value = all_label_sents[key]
    clear_key = clear_split_str(key)
    X.append(clear_key)
    encode_X.append(encoded_txt(clear_key))
    y.append(this_value)

In [13]:
print (y[1:100])

['Empty', 'Premise', 'Claim', 'MajorClaim', 'Empty', 'Empty', 'Premise', 'Empty', 'Premise', 'Claim', 'Claim', 'Empty', 'Claim', 'Premise', 'Premise', 'Premise', 'Empty', 'Premise', 'MajorClaim', 'Claim', 'Premise', 'Claim', 'Claim', 'Claim', 'Empty', 'Premise', 'Premise', 'Empty', 'Premise', 'Claim', 'Empty', 'Premise', 'Premise', 'Premise', 'Empty', 'Claim', 'Empty', 'Empty', 'Empty', 'Empty', 'Claim', 'Premise', 'Empty', 'Premise', 'Claim', 'Empty', 'Premise', 'MajorClaim', 'Premise', 'MajorClaim', 'Empty', 'Empty', 'MajorClaim', 'MajorClaim', 'Premise', 'Premise', 'Empty', 'Empty', 'Premise', 'Empty', 'Premise', 'Premise', 'Premise', 'Empty', 'MajorClaim', 'Premise', 'Premise', 'Premise', 'Premise', 'Premise', 'MajorClaim', 'Empty', 'Premise', 'Premise', 'Empty', 'Premise', 'Premise', 'Claim', 'Premise', 'Empty', 'Premise', 'Empty', 'MajorClaim', 'Premise', 'Claim', 'Premise', 'Claim', 'Premise', 'Premise', 'Empty', 'Premise', 'Premise', 'Claim', 'Premise', 'Premise', 'MajorClaim',

In [14]:
encode_y = []
for s in y:
    if s == 'Claim':
        encode_y.append([1,0,0,0])
    elif s == 'Premise':
        encode_y.append([0,1,0,0])
    elif s == 'Empty':
        encode_y.append([0,0,1,0])
    else: 
        encode_y.append([0,0,0,1])

In [15]:
length = int(len(X) * 0.9)
X_train = encode_X[0:length]
y_train = encode_y[0:length]
X_test = encode_X[length:]
y_test = encode_y[length:]

In [16]:
print (len(X_train), len(y_train), len(X_test), len(y_test))

6369 6369 708 708


In [17]:
print (encode_X[1000], encode_y[1000])
print (X_train[1000], y_train[1000])

[3530   74 4727   19  363 2786  880 4051 1809 4519 3634 4052   19 6364  485
  830 6894 6593 3530 6894 1177  400 7493 6894 6594 6976] [0, 1, 0, 0]
[3530   74 4727   19  363 2786  880 4051 1809 4519 3634 4052   19 6364  485
  830 6894 6593 3530 6894 1177  400 7493 6894 6594 6976] [0, 1, 0, 0]


In [18]:
max_lenth = 0
for item in X:
    if len(item) > max_lenth:
        max_lenth = len(item)

print ("max sentence length is: " + str(max_lenth))

max sentence length is: 72


## Simple LSTM for Sequence Classification

In [19]:
import numpy
# from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# fix random seed for reproducibility
numpy.random.seed(7)

Using TensorFlow backend.


In [20]:
# truncate and pad input sequences
max_sent_length = 72
X_train = sequence.pad_sequences(X_train, maxlen=max_sent_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_sent_length)

In [None]:
# load the dataset but only keep the top n words, zero the rest
# top_words = 7701
top_words = len(vocab)
# (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

In [None]:
# create the model
embedding_vecor_length = 32
model1 = Sequential()
model1.add(Embedding(top_words, embedding_vecor_length, input_length=max_sent_length))
model1.add(LSTM(100))
model1.add(Dense(4, activation='sigmoid'))
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model1.summary())
model1.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 72, 32)            246432    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 404       
Total params: 300,036
Trainable params: 300,036
Non-trainable params: 0
_________________________________________________________________
None
Train on 6369 samples, validate on 708 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

In [None]:
# Final evaluation of the model
scores = model1.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

## 2. LSTM For Sequence Classification With Dropout

In [None]:
from keras.layers import Dropout
model2 = Sequential()
model2.add(Embedding(top_words, embedding_vecor_length, input_length=max_sent_length))
model2.add(Dropout(0.2))
model2.add(LSTM(100))
model2.add(Dropout(0.2))
model2.add(Dense(4, activation='sigmoid'))
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model2.summary())
model2.fit(X_train, y_train, epochs=3, batch_size=64)
# Final evaluation of the model
scores2 = model2.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores2[1]*100))

## LSTM and Convolutional Neural Network For Sequence Classification

In [None]:
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

In [None]:
# create the model
embedding_vecor_length = 32
model3 = Sequential()
model3.add(Embedding(top_words, embedding_vecor_length, input_length=max_sent_length))
model3.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model3.add(MaxPooling1D(pool_size=2))
model3.add(LSTM(100))
model3.add(Dense(4, activation='sigmoid'))
model3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model3.summary())
model3.fit(X_train, y_train, epochs=3, batch_size=64)
# Final evaluation of the model
scores3 = model3.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores3[1]*100))

## CNN model

In [None]:
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding
from keras.layers.merge import Concatenate
from keras.models import Sequential, Model

In [None]:
# Model Hyperparameters
embedding_dim = 300
filter_sizes = (3, 8)
num_filters = 10
dropout_prob = (0.5, 0.8)
hidden_dims = 50

# Training parameters
batch_size = 64
num_epochs = 10

# Prepossessing parameters
sequence_length = 72
max_words = 5000

In [None]:

input_shape = (sequence_length,)
model_input = Input(shape=input_shape)

In [None]:
z = Embedding(top_words, embedding_vecor_length, input_length=max_sent_length)(model_input)

In [None]:
z = Dropout(dropout_prob[0])(z)

In [None]:
# Convolutional block
conv_blocks = []
for sz in filter_sizes:
    conv = Convolution1D(filters=num_filters,
                         kernel_size=sz,
                         padding="valid",
                         activation="relu",
                         strides=1)(z)
    conv = MaxPooling1D(pool_size=2)(conv)
    conv = Flatten()(conv)
    conv_blocks.append(conv)

In [None]:
z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]

z = Dropout(dropout_prob[1])(z)
z = Dense(hidden_dims, activation="relu")(z)
model_output = Dense(4, activation="sigmoid")(z)

model4 = Model(model_input, model_output)
model4.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Train the model
model4.fit(X_train, y_train, batch_size=batch_size, epochs=num_epochs,
          validation_data=(X_test, y_test), verbose=2)

In [None]:
scores4 = model4.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores4[1]*100))

## comment data preprocessing

* clear labeled data into sentence vectors 
* clear comment data into sentence vectors

In [None]:
# build a preprocessing function 

In [None]:
def sent2vec(sent):
    text = clear_split_str(sent)
    return np.array([vocab_to_int[c] for c in text if c in vocab_to_int], dtype=np.int32)

In [None]:
def sent_labels(s):
    this_label = []
    if s == 'Claim':
        this_label = [1,0,0,0]
    elif s == 'Premise':
        this_label = [0,1,0,0]
    elif s == 'Empty':
        this_label = [0,0,1,0]
    else: 
        this_label = [0,0,0,1]
    return this_label

## load comment data 

In [None]:
# load labeled comment data
import pandas as pd

In [None]:
comment_path = 'comment_sent.csv'
label_comments_data = pd.read_csv(comment_path,encoding = "ISO-8859-1")
label_sents = label_comments_data['sentence']
label_components = label_comments_data['label']

In [None]:
path = '/Users/apple/Documents/GitHub/Argument-Scoring-System/comment_data/comments.csv'
data = pd.read_csv(path)

In [None]:
texts = data['comment_text']
comment_scores = data['mean_evaluation']

In [None]:
print (len(comment_scores))

## fit the model on the labeled data 

In [None]:
# generating vectors 
new_X = [sent2vec(item) for item in label_sents]
new_X = sequence.pad_sequences(new_X, maxlen=max_sent_length)
new_y = [sent_labels(item) for item in label_components]

In [None]:
new_X_train = new_X[0:50]
new_y_train = new_y[0:50]
new_X_test = new_X[50:]
new_y_test = new_y[50:]

## predict the component of sentences in comments 

In [None]:
# predict with the original model
# calculate the accuracy
# Final evaluation of the model
scores = model1.evaluate(new_X_test, new_y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

In [None]:
# predit with the fitted model 
# calculate the accuracy 
model1.fit(new_X_train, new_y_train, validation_data=(new_X_test, new_y_test), epochs=3, batch_size=64)

# Final evaluation of the model
scores = model1.evaluate(new_X_test, new_y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

## generate a vector of components for each comments

In [None]:
import operator
def get_max_index(vector):
    max_value = max(vector)
    for i in range(4):
        if max_value == vector[i]:
            return i

In [None]:
# write a function to generate the vector for each components of comments 
from nltk.tokenize import sent_tokenize
def components_text(text):
    sents = sent_tokenize(text)
    sents_vector = [sent2vec(s) for s in sents]
    sents_vector = sequence.pad_sequences(sents_vector, maxlen=max_sent_length)
#     change the deep learning model here
    components = model4.predict(sents_vector)
    return components

In [None]:
# return the argument components of a comment 
def get_components(text):
    vectors = components_text(text)
    components = [get_max_index(v) for v in vectors]
    return components

In [None]:
# return the number of words for every sentence 
def sent_words(text):
    sents = sent_tokenize(text)
    word_count = []
    for s in sents:
        word_count.append(len(s))
    return word_count

In [None]:
i = 22
# print (texts[i])
# print (comment_scores[i])
# print (components_text(texts[i]))
print (get_components(texts[i]))
print (sent_words(texts[i]))

In [None]:
# natio of claim and other components
def component_num(text):
    components = get_components(text)
    Claim_num = 0
    Premise_num = 0
    Empty_num = 0
    major_claim = 0 
    
    for item in components:
        if item == 0:
            Claim_num += 1
        elif item == 1:
            Premise_num += 1
        elif item == 2:
            Empty_num +=1 
        else:
            major_claim += 1
            
    return [Claim_num, Premise_num, Empty_num, major_claim]

In [None]:
print (component_num(texts[i]))

*** natio for components ***

In [None]:
# return the ratio for every element
def component_ratio(text):
    components = component_num(text)
    com_sum = sum(components)
    
    natio_list = []
    for item in components:
        temp = float(item)/com_sum
        natio_list.append(round(temp,2))
    return natio_list

In [None]:
print (component_ratio(texts[i]))

In [None]:
# words count of these components 
def component_word_count(text):
    # the component list and corresponding sentence word count 
    comp_list = get_components(text)
    words_num_list = sent_words(text)
    
    Claim_num = 0
    Premise_num = 0
    Empty_num = 0
    major_claim = 0 
    
    for i in range(len(comp_list)):
        temp = comp_list[i]
        if temp == 0:
            Claim_num += words_num_list[i]
        elif temp == 1:
            Premise_num += words_num_list[i]
        elif temp == 2:
            Empty_num += words_num_list[i]
        else:
            major_claim += words_num_list[i]

    return [Claim_num, Premise_num, Empty_num, major_claim]
    

In [None]:
print (component_word_count(texts[i]))

 ***natio of words number for every component***

In [None]:
# word natio for every component
def component_word_natio(text):
    components_words_list = component_word_count(text)
    word_sum = sum(components_words_list)
    
    natio_list = []
    for item in components_words_list:
        temp = float(item)/word_sum
        natio_list.append(round(temp,2))
    return natio_list

In [None]:
print (component_word_natio(texts[i]))

In [None]:

# texts = data['comment_text']
# comment_scores = data['mean_evaluation']

In [None]:
# for i in range(100):
# #     print (component_num(texts[i])
#     print (texts[i])
#     print (component_num(texts[i]))
#     print (component_ratio(texts[i]))
#     print (component_word_count(texts[i]))
#     print (component_word_natio(texts[i]))
#     print (comment_scores[i])
#     print ('-----')
#     print 

In [None]:
# store the result into a pickle file 


## visualize the number of components

In [None]:
component_nums = []
for i in range(len(texts)):
    print (i)

 ***the number of component*** VS ***score***

In [None]:
# the ratio of 