In [1]:
import pandas as pd
import nltk.data
from bs4 import BeautifulSoup  
import re
from nltk.corpus import stopwords
import numpy as np
from gensim.models import word2vec
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()
from sklearn.ensemble import RandomForestClassifier

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout
from keras.models import Model
from keras.utils.np_utils import to_categorical

import os

from sklearn.preprocessing import LabelEncoder

Using TensorFlow backend.


In [2]:
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

In [3]:
train = pd.read_csv( "stories.tsv", header=0, delimiter="\t", quoting=3 )
train = train[train['summary'].notnull()]
train['categories'].fillna('None', inplace=True)
le = LabelEncoder()
train['enc_category'] = le.fit_transform(train['categories'])
len(train['categories'].unique())

25

In [4]:
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\\", "", string)    
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()

texts = []
labels = []
classes = []


In [5]:
for index, row in train.iterrows():
    w = BeautifulSoup(row['summary'], "lxml")
    texts.append(clean_str(w.get_text()))
    labels.append(row['enc_category'])
    classes.append(row['categories'])
    
labels = to_categorical(np.asarray(labels))

In [6]:
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)



Found 15075 unique tokens.


In [7]:
labels = np.array(labels)

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])


x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

print(x_train.shape)
print(y_train.shape)

(3880, 1000)
(3880, 25)


In [8]:
GLOVE_DIR = ""
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [9]:
print('Total %s word vectors in Glove 6B 100d.' % len(embeddings_index))

Total 400000 word vectors in Glove 6B 100d.


In [10]:
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

In [11]:
convs = []
filter_sizes = [3,4,5]

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

for fsz in filter_sizes:
    l_conv = Conv1D(nb_filter=128,filter_length=fsz,activation='relu')(embedded_sequences)
    l_pool = MaxPooling1D(5)(l_conv)
    convs.append(l_pool)
    
l_merge = Merge(mode='concat', concat_axis=1)(convs)
l_cov1= Conv1D(128, 5, activation='relu')(l_merge)
l_pool1 = MaxPooling1D(5)(l_cov1)
l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(30)(l_cov2)
l_drop = Dropout((0.25))(l_pool2)
l_flat = Flatten()(l_drop)
l_dense = Dense(128, activation='relu')(l_flat)
preds = Dense(25, activation='softmax')(l_dense)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])




In [12]:
print("model fitting - simplified convolutional neural network")
model.summary()
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          nb_epoch=20, batch_size=50)

model fitting - simplified convolutional neural network
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 1000)          0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 1000, 100)     1507600     input_1[0][0]                    
____________________________________________________________________________________________________
conv1d_1 (Conv1D)                (None, 998, 128)      38528       embedding_1[0][0]                
____________________________________________________________________________________________________
conv1d_2 (Conv1D)                (None, 997, 128)      51328       embedding_1[0][0]                
___________________________________



Train on 3880 samples, validate on 969 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1422ed748>

In [13]:
raw_text = "A breakdown of US Customs and Borders Protection computer systems caused chaos at airports around the United States on January 1."
raw_text2 = "One tough 12-year-old girl braved icy cold conditions to participate in her local polar plunge on January 1 in Hillsboro, Virginia."

In [14]:

input = tokenizer.texts_to_sequences([raw_text])
input2 = pad_sequences(input, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(input2)

In [15]:
print("%s cat; %f%% confidence" % (classes[np.argmax(pred)], pred[0][np.argmax(pred)] * 100))

weather cat; 99.977869% confidence


In [16]:
input3 = tokenizer.texts_to_sequences([raw_text2])
input4 = pad_sequences(input3, maxlen=MAX_SEQUENCE_LENGTH)
pred2 = model.predict(input4)
print("%s cat; %f%% confidence" % (classes[np.argmax(pred2)], pred2[0][np.argmax(pred2)] * 100))

unrest, conflicts and war cat; 99.998641% confidence
