In [1]:
import pandas as pd
import csv
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from nltk import word_tokenize     
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsRestClassifier
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.neural_network import MLPClassifier
from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import confusion_matrix, f1_score
import matplotlib.pyplot as plt
import keras.backend
from keras.layers import Dropout
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, jaccard_similarity_score, accuracy_score, \
		f1_score, precision_score, recall_score

Using TensorFlow backend.


In [34]:
class Metrics(Callback):
    """Keras callback that scores the model on a held-out set after each epoch.

    After every epoch the model predicts on ``x_test``, the predictions are
    thresholded at 0.5, and weighted F1 / recall / precision, accuracy and the
    Jaccard similarity score are computed against ``y_test``. Each value is
    appended to the matching ``val_*`` list and a summary line is printed.

    Parameters
    ----------
    x_test : array-like
        Inputs handed unchanged to ``self.model.predict``.
    y_test : pandas.DataFrame or array-like
        Ground-truth label indicators. When it exposes ``.columns`` (a
        DataFrame) the column index is printed each epoch, as before.
    """

    def __init__(self, x_test, y_test):
        # Initialise the Keras base class first so whatever state it keeps on
        # the instance exists before Keras starts driving the callback.
        super(Metrics, self).__init__()
        self.x_test = x_test
        self.y_test = y_test
        # Create the history lists up front so they can be read even when
        # training has not started yet.
        self._reset_history()

    def _reset_history(self):
        """Start every per-epoch metric history from an empty list."""
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []
        self.val_accuracies = []
        self.val_jaccards = []

    def on_train_begin(self, logs=None):
        """Clear the histories so repeated ``fit`` calls do not accumulate."""
        self._reset_history()

    def on_batch_end(self, batch, logs=None):
        """Print a progress line after each batch."""
        print("finished batch {}".format(batch))

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out set, then compute, store and print metrics."""
        # np.asarray accepts both a DataFrame and a plain array of labels.
        y_true = np.asarray(self.y_test)
        val_predict = (np.asarray(self.model.predict(self.x_test)))

        print(val_predict)

        # Binarise the sigmoid outputs in place: >= 0.5 -> 1, otherwise 0.
        val_predict[val_predict >= 0.5] = 1
        val_predict[val_predict < 0.5] = 0
        if hasattr(self.y_test, "columns"):
            print(self.y_test.columns)
        print("*****")
        print(val_predict)
        print("*****")
        print(y_true)
        print("*****")
        _val_f1 = f1_score(y_true, val_predict, average='weighted')
        _val_recall = recall_score(y_true, val_predict, average='weighted')
        _val_precision = precision_score(y_true, val_predict, average='weighted')
        _val_accuracy = accuracy_score(y_true, val_predict)
        _val_jaccard = jaccard_similarity_score(y_true, val_predict)

        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        self.val_accuracies.append(_val_accuracy)
        self.val_jaccards.append(_val_jaccard)

        print("— val_f1: {} — val_precision: {} — val_recall {} — val_accuracy: {} — val_jaccard {}" \
                .format(_val_f1, _val_precision, _val_recall, _val_accuracy, _val_jaccard))

In [35]:
def load_data(subset=None, path="./dataset_20000.csv"):
    """Read the dataset CSV into a DataFrame.

    Parameters
    ----------
    subset : int or None, optional
        When given, only ``data.head(subset)`` is returned; ``None`` (the
        default) returns every row.
    path : str, optional
        Location of the CSV file. Defaults to the file this notebook was
        written against, so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
    """
    data = pd.read_csv(path)
    if subset is None:
        return data
    return data.head(subset)

In [37]:
def build_rnn(num_words, input_length, num_labels=14, embed_dim=128, lstm_out=600):
    """Build and compile the Embedding -> Dropout -> LSTM -> Dense classifier.

    Parameters
    ----------
    num_words : int
        Size of the embedding vocabulary (first argument of ``Embedding``).
    input_length : int
        Length of each padded input sequence.
    num_labels : int, optional
        Number of sigmoid output units. Defaults to 14, the value that was
        previously hard-coded.
    embed_dim : int, optional
        Embedding vector size (previously hard-coded to 128).
    lstm_out : int, optional
        Number of LSTM units (previously hard-coded to 600).

    Returns
    -------
    keras.models.Sequential
        Model compiled with binary cross-entropy loss, the Adam optimizer and
        the ``accuracy`` metric.
    """
    from keras.models import Sequential
    # Embedding is imported from the public ``keras.layers`` namespace rather
    # than the ``keras.layers.embeddings`` submodule.
    from keras.layers import Dense, Dropout, Embedding, LSTM

    # Seeds NumPy's global RNG, exactly as the original did.
    np.random.seed(7)

    model = Sequential()
    model.add(Embedding(num_words, embed_dim, input_length=input_length))
    model.add(Dropout(0.2))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    # One independent sigmoid per label, paired with binary cross-entropy.
    model.add(Dense(num_labels, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [38]:
# Every distinct word kept by remove_stopwords so far (grows across calls).
word_dict = set()


def remove_stopwords(sent):
    """Tokenize ``sent`` and return its lower-cased, filtered tokens.

    A token is dropped when its lower-cased form is in the module-level
    ``stop_words`` collection or when the token is at most two characters
    long. Every kept word is also recorded in the module-level ``word_dict``.
    """
    kept = []
    for raw in nltk.word_tokenize(sent):
        lowered = raw.lower()
        if lowered in stop_words or len(raw) <= 2:
            continue
        kept.append(lowered)

    # set.update ignores duplicates, so no membership test is needed.
    word_dict.update(kept)
    return kept

In [39]:
# English stop-word list as a set; remove_stopwords tests membership against it.
stop_words = set(stopwords.words("english"))

In [None]:
# Work on the first 500 rows only.
dataset = load_data(subset=500)
# "summary" is the text input; every remaining column is used as a label.
x = dataset["summary"]
y = dataset.drop(columns=["summary"])

In [45]:
# Text -> filtered token lists -> integer sequences -> zero-padded matrix.
# Each stage has its own name so re-running the cell never feeds an
# already-transformed X back into an earlier step.
tokens = [remove_stopwords(sent) for sent in x.values]
# The backslash is escaped explicitly: the original '\]' was an invalid escape
# sequence. The resulting filter characters are unchanged.
tokenizer = Tokenizer(filters='"#$%&()*+,-./:;<=>?@[\\]^_`{|}~ ')
tokenizer.fit_on_texts(tokens)
sequences = tokenizer.texts_to_sequences(tokens)
# padding='post' appends the zeros after each sequence.
X = pad_sequences(sequences, padding='post')

In [46]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the split
# identical on every run; 7 matches the seed used in build_rnn.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=7)
print(X_train)

[[ 1808  1146  2380 ...     0     0     0]
 [  141 12561  2105 ...     0     0     0]
 [   66  2641  1493 ...     0     0     0]
 ...
 [ 3436    11   444 ...     0     0     0]
 [  788  5533  2454 ...     0     0     0]
 [48906   550   697 ...     0     0     0]]


In [47]:
metrics = Metrics(X_test, Y_test)
# Size the embedding from the tokenizer that produced the indices in X, not
# from the global word_dict, which depends on which cells ran and how often.
# The +1 is for index 0, which pad_sequences uses as padding.
model = build_rnn(len(tokenizer.word_index) + 1, X.shape[1])

In [None]:
# Train for 10 epochs; the Metrics callback reports held-out scores per epoch.
# NOTE(review): verbose=5 is kept as-is - confirm it is a supported level.
h = model.fit(
    X_train,
    Y_train,
    batch_size=32,
    epochs=10,
    verbose=5,
    callbacks=[metrics],
)

Epoch 1/10
finished batch 0
finished batch 1
finished batch 2
finished batch 3
finished batch 4
finished batch 5
finished batch 6
finished batch 7
finished batch 8
finished batch 9
finished batch 10
finished batch 11
finished batch 12
finished batch 13
finished batch 14
finished batch 15
finished batch 16
finished batch 17
finished batch 18
finished batch 19
finished batch 20
finished batch 21
finished batch 22
finished batch 23
finished batch 24
finished batch 25
finished batch 26
finished batch 27
finished batch 28
finished batch 29
finished batch 30
finished batch 31
finished batch 32
finished batch 33
finished batch 34
finished batch 35
finished batch 36
finished batch 37


In [None]:
# The accuracy history key is 'acc' or 'accuracy' depending on the Keras
# version, so use whichever is present.
acc_key = 'acc' if 'acc' in h.history else 'accuracy'
plt.plot(h.history[acc_key])
plt.xlabel('epoch')
plt.ylabel('training accuracy')

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line.
model.summary()

score, acc = model.evaluate(X_test, Y_test, verbose=2, batch_size=32)

print(score)
print(acc)