In [15]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding

import nltk
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

from sklearn.manifold import TSNE
import re
from nltk.stem import SnowballStemmer

In [20]:
def clean_text(text):
    
    ## Remove puncuation
    text = text.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    text = text.lower().split()
    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops and len(w) >= 3]
    
    text = " ".join(text)
    ## Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", "  ", text)
    text = re.sub(r"\-", "  ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    ## Stemming
    text = text.split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)
    return text 

In [21]:
import sklearn.datasets as skd

In [22]:
categories = ['0', '1']
ds = skd.load_files("./dataset/", categories=categories, encoding="UTF-8")
length = len(ds.data)

In [23]:
final_data = []
for ix in range(length):
    text_str = clean_text(ds.data[ix])
    final_data.append(text_str)

In [24]:
final_data[0]

'ex 10 iii h a exhibit 10 iii h a amend amend restat employ agreement this amend the amend made sterl bancorp the compani and loui cappelli execut effect decemb 29 2008 wherea compani execut parti amend restat employ agreement date march 22 2002 last amend march 13 2008 the agreement ; wherea compani execut desir amend certain provis agreement in order exempt compli section 409a intern revenu code 1986 as amend section 409a ; and now therefor agreement herebi amend follow : 1 section agreement herebi amend ad new section 2 c follow : c permit continu engag activ direct relat the busi compani execut permit engag prior chang in control as defin schedul hereto 2 section 4 b agreement herebi delet entireti 3 section 5 c agreement herebi replac entireti follow : c disabl event termin execut s employ due to execut s disabl compani pay execut three month execut s base salari lump sum 4 section 5 d agreement herebi replac entireti follow : d death event termin execut s employ due to execut s d

In [40]:
### Create sequence
vocabulary_size = 20000
tokenizer = Tokenizer(num_words= vocabulary_size)
tokenizer.fit_on_texts(final_data)
sequences = tokenizer.texts_to_sequences(final_data)
data = pad_sequences(sequences, maxlen=500)

In [45]:
model = Sequential()
model.add(Embedding(20000, 100, input_length=500))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [46]:
from sklearn.model_selection import train_test_split

In [47]:
X_train, X_test, Y_train, Y_test = train_test_split(data, ds.target, test_size=0.2)

In [48]:
model.fit(X_train, Y_train, validation_split=0.3, epochs=10)

Train on 560 samples, validate on 240 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1c95e6aa390>

In [49]:
pred = model.predict(X_test)

In [50]:
from sklearn.metrics import accuracy_score, classification_report
print(accuracy_score(Y_test, pred.round()))

0.81
