In [1]:
import collections
import pickle

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from keras.layers import Dense, Dropout, SpatialDropout1D
from keras.layers.convolutional import Conv1D
from keras.layers.embeddings import Embedding
from keras.layers.pooling import GlobalMaxPooling1D
from keras.models import Sequential
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG so NumPy-driven randomness repeats between runs.
# NOTE(review): nothing in this notebook seeds the TensorFlow backend, so model
# initialisation/training is presumably still non-deterministic -- confirm.
np.random.seed(42)

Using TensorFlow backend.


In [2]:
# Load the raw CSV from the working directory; the cells below use only its
# CATEGORY and TITLE columns.
news = pd.read_csv("uci-news-aggregator.csv")

In [5]:
# Work on a copy of just the label and headline columns so `news` stays untouched.
dfnews = news[['CATEGORY', 'TITLE']].copy()

# Category letter -> integer class id, applied in a single pass. Mapping the
# whole column at once (instead of four successive .loc writes) yields a real
# integer column rather than an object column mixing strings and ints.
CATEGORY_TO_ID = {"e": 0, "b": 1, "t": 2, "m": 3}
category_ids = dfnews["CATEGORY"].map(CATEGORY_TO_ID)

# Fail here, with the offending values, rather than later when a label is
# converted with int().
unmapped = category_ids.isna()
if unmapped.any():
    unknown = sorted(set(dfnews.loc[unmapped, "CATEGORY"].astype(str)))
    raise ValueError("Unexpected CATEGORY values: {}".format(unknown))

dfnews["CATEGORY"] = category_ids.astype(int)

In [6]:
# news.head()

In [7]:
# Count lower-cased token frequencies over every headline and track the
# length (in tokens) of the longest headline.
counter = collections.Counter()
maxlen = 0
# Iterate the TITLE column directly: `dfnews.iloc[i]` materialises a full row
# Series on every iteration, which is needlessly slow for a corpus this size.
for sent in dfnews['TITLE']:
    words = [x.lower() for x in nltk.word_tokenize(sent)]
    maxlen = max(maxlen, len(words))
    counter.update(words)


In [9]:
# Vocabulary cap: only the VOCAB_SIZE most frequent tokens receive their own index.
VOCAB_SIZE=10000

In [10]:
# Token -> id lookup for the most frequent tokens. Ids start at 1 in frequency
# order; 0 is what the defaultdict hands back for any token not in the table.
top_tokens = counter.most_common(VOCAB_SIZE)
word2index = collections.defaultdict(int)
word2index.update(
    (token, rank) for rank, (token, _count) in enumerate(top_tokens, start=1)
)
# One extra slot accounts for id 0.
vocab_sz = 1 + len(word2index)
# Reverse lookup: id -> token.
index2word = dict((idx, token) for token, idx in word2index.items())

In [11]:
# Persist word2index to a pickle file. The `with` block guarantees the file is
# flushed and closed even if pickling raises.
with open('word2index.pkl', 'wb') as output:
    pickle.dump(word2index, output)

In [12]:
# Reload word2index from its pickle file; `with` closes the handle even if
# unpickling fails.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('word2index.pkl', 'rb') as pkl_file:
    word2index = pickle.load(pkl_file)

In [13]:
# Prepare data for the CNN model: `xs` collects one list of word ids per
# headline, `ys` the matching integer class label.
xs, ys = [], []
# Walk both columns in lock-step instead of two `iloc` row lookups per
# iteration, each of which builds a full row Series.
for sent, label in zip(dfnews['TITLE'], dfnews['CATEGORY']):
    ys.append(int(label))
    words = [x.lower() for x in nltk.word_tokenize(sent)]
    # .get(word, 0) returns the same 0 for unknown tokens as indexing the
    # defaultdict would, but without inserting every unseen token into
    # word2index as a side effect.
    wids = [word2index.get(word, 0) for word in words]
    xs.append(wids)

In [14]:
# Report the current value of maxlen before it is replaced.
print(maxlen)
# From here on, use a fixed sequence length of 64 instead of that value.
maxlen = 64

2422


In [15]:
# Bring every id list to exactly `maxlen` entries and one-hot encode the four
# class labels.
X = pad_sequences(xs, maxlen=maxlen)
y = np_utils.to_categorical(ys, num_classes=4)

# Hold out 30% of the rows; the fixed random_state keeps the split repeatable.
split = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split
# Shapes in the order: X_train, X_test, y_train, y_test.
print(*(part.shape for part in split))

(295693, 64) (126726, 64) (295693, 4) (126726, 4)


### CNN model

In [16]:
# Model parameters
# Output dimension of the Embedding layer.
EMBED_SIZE=100
# Conv1D kernel size, i.e. how many consecutive tokens each filter spans.
NUM_WORDS=5
# Number of Conv1D filters.
NUM_FILTERS=256
# Number of training epochs passed to model.fit.
NUM_EPOCHS=1
# Mini-batch size passed to model.fit.
BATCH_SIZE=300


In [17]:
# Text CNN: embedding -> spatial dropout -> 1D convolution -> global max pool
# -> 4-way softmax. In Keras 2, SpatialDropout1D takes the drop rate directly
# (wrapping a Dropout layer is not supported).
model = Sequential([
    Embedding(vocab_sz, EMBED_SIZE, input_length=maxlen),
    SpatialDropout1D(0.2),
    Conv1D(filters=NUM_FILTERS, kernel_size=NUM_WORDS, activation="relu"),
    GlobalMaxPooling1D(),
    Dense(4, activation="softmax"),
])

In [22]:
# Compile and train the classifier.
# The output layer is a 4-way softmax over one-hot labels, so the matching
# loss is categorical cross-entropy. With "binary_crossentropy" Keras treats
# each of the four outputs as an independent binary problem and resolves
# metrics=["accuracy"] to binary accuracy, which over-states multi-class
# accuracy.
model.compile(optimizer="adam", loss="categorical_crossentropy",
              metrics=["accuracy"])
# NOTE(review): the held-out test split doubles as validation data here; that
# is harmless for monitoring but should not be used for model selection.
history = model.fit(X_train, y_train, batch_size=BATCH_SIZE, verbose=True,
                    epochs=NUM_EPOCHS,
                    validation_data=(X_test, y_test))

Train on 295693 samples, validate on 126726 samples
Epoch 1/1


In [23]:
# Score the trained model on the held-out split (verbose=True shows progress).
score = model.evaluate(X_test, y_test, verbose=True)
# The first entry is the loss; the second is the metric requested at compile time.
test_loss, test_accuracy = score[0], score[1]
print("Test score: {:.3f}, accuracy: {:.3f}".format(test_loss, test_accuracy))



In [26]:
# Save the trained model (architecture, weights and optimizer state) so the
# notebook can reload 'news_cnn.mdl' on a fresh Restart & Run All instead of
# depending on a file left over from an earlier session.
model.save('news_cnn.mdl', overwrite=True, include_optimizer=True)

In [25]:
# Load the saved model back from disk; 'news_cnn.mdl' must exist in the
# working directory. `load_model` is imported in the notebook's import cell.
mymodel = load_model('news_cnn.mdl')