# Building a Topic Classification Model with Keras

In [136]:
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import pandas as pd
import os
import time

In [2]:
# DEFINE DIRECTORY PATH
path_to_json = 'capstone/TrainingData2014/'

# CREATE LIST OF FILES FROM THE DIRECTORY
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]

# DEFINE PANDAS DATAFRAME
df = pd.DataFrame()

# LOOP THROUGH FILES, READ IN JSON AND BUILD DATAFRAME
for index, js in enumerate(json_files):
    json_data = pd.read_json(os.path.join(path_to_json, js), 'r')
    df = df.append(json_data)

# LOOK AT TOPIC DICTIONARY AND GET A TOPIC COUNT
topic_file = open('capstone/topicDictionary.txt', 'r')
topics = topic_file.read().split('\r\n')

# SPLITTING THE ELEMENTS OF THE JSON INTO TEXT, PUBLICATION DATE AND TOPICS
df['text'] = df.TrainingData.apply(lambda x: x['bodyText'])
df['pubdate'] = df.TrainingData.apply(lambda x: x['webPublicationDate'])
df['topics'] = df.TrainingData.apply(lambda x: x['topics'])

# DROP FIRST TWO COLUMNS
df.reset_index(inplace=True, drop=True)
df.drop('TrainingData', axis=1, inplace=True)

# DEFINE FUNCTION TO CREATE OUR DATAFRAME
def topic_col(x):
    a = 0
    for elem in x:
        if elem == topic:
            a = 1
    return a

# RUN TOPIC COL FUNCTION ON ALL DATA
time1 = time.time()
for topic in topics:
    df[topic] = df['topics'].map(topic_col)
time2 = time.time()
time_in_s = (time2-time1)
print 'Function takes around %0.3f seconds to run' % (time_in_s)

X = df['text']
y = df['afghanistan']

# CREATE A TRAIN AND TEST SPLIT FOR THE DATA WITH A TEST SIZE OF 20%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

Function takes around 30.321 seconds to run


Taking the topic of Afghanistan with approximately 400 topic labelled articles out of 111,200 articles, we will see how well our model can predict turning the text of the articles into a sparse matrix.

In [144]:
def sampler(topic, df):
    
    # SELECT INDICES OF TOPIC ARTICLES
    topicindexes = df[df[topic] == 1].index.tolist()
    
    # FIND COUNT OF TOPIC ARTICLES
    articlecount = len(topicindexes)
    
    if articlecount < 10:
        # SELECT NON TOPIC INDICES
        nontopicarticlesindexes = df[df[topic] == 0].sample(articlecount*27, random_state=42).index.tolist()
        # CREATE LIST OF COMBINED INDICES
        sampleindex = topicindexes + nontopicarticlesindexes 
        # CREATE NEW DATAFRAME
        X = df.iloc[sampleindex]['text'].reset_index(drop=True)
        y = df.iloc[sampleindex][topic].reset_index(drop=True)
        X = X.append(df.iloc[topicindexes]['text'])
        y = y.append(df.iloc[topicindexes][topic])
        X = X.append(df.iloc[topicindexes]['text'])
        y = y.append(df.iloc[topicindexes][topic])
        X = X.append(df.iloc[topicindexes]['text']).reset_index(drop=True)
        y = y.append(df.iloc[topicindexes][topic]).reset_index(drop=True)
        
    elif articlecount < 100:
        # SELECT NON TOPIC INDICES
        nontopicarticlesindexes = df[df[topic] == 0].sample(articlecount*9, random_state=42).index.tolist()
        # CREATE LIST OF COMBINED INDICES
        sampleindex = topicindexes + nontopicarticlesindexes 
        # CREATE NEW DATAFRAME
        X = df.iloc[sampleindex]['text'].reset_index(drop=True)
        y = df.iloc[sampleindex][topic].reset_index(drop=True)
        X = X.append(df.iloc[topicindexes]['text']).reset_index(drop=True)
        y = y.append(df.iloc[topicindexes][topic]).reset_index(drop=True)
        
    else:
        # SELECT NON TOPIC INDICES
        topicindexes = df[df[topic] == 1].sample(100).index.tolist()
        
        nontopicarticlesindexes = df[df[topic] == 0].sample(len(topicindexes)*9, random_state=42).index.tolist()
        nonarticlecount = len(nontopicarticlesindexes)
        # CREATE LIST OF COMBINED INDICES
        sampleindex = topicindexes + nontopicarticlesindexes
        # CREATE NEW DATAFRAME
        X = df.iloc[sampleindex]['text'].reset_index(drop=True)
        y = df.iloc[sampleindex][topic].reset_index(drop=True)
    
    return X, y

In [3]:
df['afghanistan'].value_counts()

0    110799
1       399
Name: afghanistan, dtype: int64

In [145]:
# SET X AND Y
X, y = sampler('afghanistan', df)

In [146]:
# CHECK MAIN TEXT
X.head()

0    The Taliban have released a video of the momen...
1    Two years ago, I came to New Hampshire to redi...
2    When Darwish looked out of his new living room...
3    The bus that rumbles through the Kandahar afte...
4    After an exhausting and contentious election p...
Name: text, dtype: object

In [148]:
# CHECK TARGET VALUE COUNTS
y.value_counts()

0    900
1    100
Name: afghanistan, dtype: int64

In [151]:
# CHECK TARGET
y.head()

0    1
1    1
2    1
3    1
4    1
Name: afghanistan, dtype: int64

In [165]:
# CREATE A TRAIN AND TEST SPLIT FOR THE DATA WITH A TEST SIZE OF 20%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Setting up the Keras Model

In [166]:
max_words = 10000
batch_size = 32
epochs = 5

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

num_classes = np.max(y_train) + 1
print(num_classes, 'classes')

print('Vectorizing sequence data...')
tokenizer = Tokenizer(num_words=max_words)
x_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
print('x_train shape:', X_train.shape)
print('x_test shape:', X_test.shape)

print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

print('Building model...')
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

(800, 'train sequences')
(200, 'test sequences')
(2, 'classes')
Vectorizing sequence data...
('x_train shape:', (800,))
('x_test shape:', (200,))
Convert class vector to binary class matrix (for use with categorical_crossentropy)
('y_train shape:', (800, 2))
('y_test shape:', (200, 2))
Building model...
Train on 720 samples, validate on 80 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
('Test accuracy:', 0.90000000000000002)
