# A Binary Book Title Classifier
## Based on LSTM Neural Network

In [1]:
from keras.models import Sequential
from keras import metrics
from keras.layers import *
from keras.optimizers import *
from keras import callbacks as cbk

import pandas as pd
import numpy as np

import collections

Using TensorFlow backend.


In [2]:
# Read the dataset into a pandas dataframe
df_raw = pd.read_csv('./dataset_full.csv',encoding='utf-8',header=0)

In [3]:
# Preview the data
print(df_raw[:5])
sample_n = len(df_raw)

     features CLASS_NO1
0       海洋与海岸  sci_tech
1   头痛151个怎么办  sci_tech
2       丘陵与平原  sci_tech
3  颈椎病132个怎么办  sci_tech
4       山地与高原  sci_tech


In [4]:
# Function to drop Arabic numbers, English letters and punctuations
def drop_noise(x):
    noise = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '，', '：', '。', '“', '”', '１','２','３','４','５','６','７','８','９','０', ':', ' ', '的']
    result = x.copy()
    for i, ii in enumerate(x):
        for j, jj in enumerate(noise):
            if jj == ii:
                result.remove(ii)
                
    return result

In [5]:
# Drop noises and convert string into a list
listizer = lambda x: list(x)
setizer = lambda x: set(x)

df_list=df_raw.copy()
df_list['features'] = df_list['features'].apply(listizer)
df_list['features'] = df_list['features'].apply(drop_noise)

In [6]:
# Preview the result
df_list[:5]

Unnamed: 0,features,CLASS_NO1
0,"[海, 洋, 与, 海, 岸]",sci_tech
1,"[头, 痛, 个, 怎, 么, 办]",sci_tech
2,"[丘, 陵, 与, 平, 原]",sci_tech
3,"[颈, 椎, 病, 个, 怎, 么, 办]",sci_tech
4,"[山, 地, 与, 高, 原]",sci_tech


Bag of words

In [7]:
# This Cell create the word to index dictionary

vocab = []
for words in df_list['features']:
    for word in words:
        vocab.append(word)
vocab_list = vocab
vocab = set(vocab)
#print(vocab)
freq_dict = collections.Counter(vocab_list)

# Create dict
word2index = {}
stopwords = []
ii = 1
for i,word in enumerate(vocab):
    if freq_dict[word] >= 10:
        # This if-else statement drop the least frequent words from the word2index dictionary
        word2index[word] = ii
        ii += 1
    else: 
        stopwords.append(word)

#print(len(word2index))
#print(stopwords[:10])

In [8]:
# This function will drop the least frequent words from the dataset

def drop_stop_words(x):
    result = x.copy()
    for i, ii in enumerate(x):
        for j, jj in enumerate(stopwords):
            if jj == ii:
                result.remove(ii)
    return result

In [9]:
# Use the "drop_stop_words()" function to drop least frequent words

df_list['features'][:10000] = df_list['features'][:10000].apply(drop_stop_words)
print(1)
df_list['features'][10000:20000] = df_list['features'][10000:20000].apply(drop_stop_words)
print(2)
df_list['features'][20000:30000] = df_list['features'][20000:30000].apply(drop_stop_words)
print(3)
df_list['features'][30000:40000] = df_list['features'][30000:40000].apply(drop_stop_words)
print(4)
df_list['features'][40000:50000] = df_list['features'][40000:50000].apply(drop_stop_words)
print(5)
df_list['features'][50000:60000] = df_list['features'][50000:60000].apply(drop_stop_words)
print(6)
df_list['features'][60000:] = df_list['features'][60000:].apply(drop_stop_words)

1
2
3
4
5
6


In [10]:
# Re-create the vocabulary list, which does not contain the least frequent words
vocab = []
for words in df_list['features']:
    for word in words:
        vocab.append(word)
vocab_list = vocab
vocab = set(vocab)
freq_dict = collections.Counter(vocab_list)

In [11]:
# Another preview
df_list[:10]

Unnamed: 0,features,CLASS_NO1
0,"[海, 洋, 与, 海, 岸]",sci_tech
1,"[头, 痛, 个, 怎, 么, 办]",sci_tech
2,"[丘, 陵, 与, 平, 原]",sci_tech
3,"[颈, 病, 个, 怎, 么, 办]",sci_tech
4,"[山, 地, 与, 高, 原]",sci_tech
5,"[贫, 血, 个, 怎, 么, 办]",sci_tech
6,"[地, 球, 变, 动]",sci_tech
7,"[地, 心, 游, 记]",lib_arts
8,"[柳, 林, 风, 声]",lib_arts
9,"[极, 地, 与, 沙, 漠]",sci_tech


In [12]:
# Some exploration of the dataset

freq_dict = collections.Counter(vocab_list)
print(freq_dict.most_common(50))

print(len(word2index))

[('国', 12641), ('中', 10596), ('文', 8114), ('学', 7149), ('与', 6941), ('研', 5438), ('究', 5271), ('人', 4390), ('大', 4217), ('集', 4166), ('史', 4129), ('代', 4037), ('民', 3741), ('年', 3433), ('生', 3310), ('法', 3060), ('理', 2989), ('化', 2968), ('论', 2851), ('一', 2848), ('全', 2797), ('编', 2783), ('书', 2697), ('家', 2682), ('事', 2633), ('新', 2616), ('小', 2406), ('经', 2377), ('本', 2364), ('世', 2352), ('物', 2174), ('古', 2155), ('会', 2096), ('地', 2088), ('之', 1976), ('发', 1967), ('教', 1925), ('传', 1911), ('战', 1892), ('业', 1854), ('我', 1837), ('子', 1827), ('和', 1823), ('术', 1815), ('图', 1790), ('四', 1782), ('美', 1768), ('界', 1738), ('记', 1725), ('实', 1703)]
2472


In [13]:
# More exploration
# Find out the leaset common words, which are not dropped and has a frequecy above 10

from operator import itemgetter
import heapq
import collections
def least_common_values(array, to_find=None):
    counter = collections.Counter(array)
    if to_find is None:
        return sorted(counter.items(), key=itemgetter(1), reverse=False)
    return heapq.nsmallest(to_find, counter.items(), key=itemgetter(1))

In [14]:
# A preview
least_common_values(freq_dict, 4)

[('劝', 10), ('叽', 10), ('〇', 10), ('钻', 10)]

In [15]:
# Create the index to words dictioanry
index2word = {ii: word for word, ii in word2index.items()}

In [16]:
# A preview
word2index["波"]

803

In [17]:
# Calculate the vocabulary size
vocab_size = len(word2index)

# Maximum length of a feature vector
max_len = 20

# Create a 2d zero vector for later use
features = np.zeros(shape=(sample_n, max_len), dtype=np.int32)

In [18]:
# Convert words to feature vectors
for ii, row in enumerate(df_list['features']):
    for jj, word in enumerate(row):
        if len(row) <= max_len:
            len_row = max_len
        else:
            len_row = len(row)
        
        try:
            features[ii][len_row - jj] = word2index[word]
        except:
            pass
    if ii % 20000 == 0:  # For preview only
        print(ii/sample_n*100)
        print(features.shape)

0.0
(66431, 20)
30.106426216675953
(66431, 20)
60.212852433351905
(66431, 20)
90.31927865002784
(66431, 20)


In [19]:
# Another preview
features[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0, 2211,  589, 2148, 1343, 1571])

In [20]:
# One hot encode the label starts here

# This cell removes some irregular and mistaken date
df_label=df_list.copy()
def NAN_remover(x):
    if str(x) != 'nan':
        return str(x)
    else:
        return 'lib_arts'

df_label['CLASS_NO1'] = df_label['CLASS_NO1'].apply(NAN_remover)

In [21]:
# Another preview

df_label=df_label.drop('features', axis=1)
df_label[:5]

Unnamed: 0,CLASS_NO1
0,sci_tech
1,sci_tech
2,sci_tech
3,sci_tech
4,sci_tech


In [22]:
# Show all the labels 

np_label = np.array(df_label['CLASS_NO1'])

set_label = set(np_label)
set_label

{'lib_arts', 'sci_tech'}

In [23]:
# Define a number code for each class
num2class = {'1': 'lib_arts',
'2': 'sci_tech'}

In [24]:
# Inverse that dictionary
class2num = {ii: word for word, ii in num2class.items()}
class2num

{'lib_arts': '1', 'sci_tech': '2'}

In [25]:
def class2ohfunc(classno):
    oh = [0,0]
    oh[int(class2num[classno])-1] = 1
    return np.array(oh)
# One hot encoding for labels ends here

In [26]:
# A preview
df_label['CLASS_NO1'] = df_label['CLASS_NO1'].apply(class2ohfunc)
df_label[:3]

Unnamed: 0,CLASS_NO1
0,"[0, 1]"
1,"[0, 1]"
2,"[0, 1]"


In [27]:
# Preview the shape of features
features.shape
print(vocab_size)

2472


In [28]:
# Devide the dataset into training and testing sets 
# Shuffle is not applied, since the original dataset is unordered
labels = np.array(df_label['CLASS_NO1'])
labels = np.array(labels.tolist())
trainX = features[:-6124]
trainY = labels[:-6124]
testX = features[-6124:]

testY = labels[-6124:]

In [29]:
# A preview
print(testY[0:2])

[[1 0]
 [1 0]]


In [30]:
# Reshape the label
trainY = np.reshape(trainY, newshape=(len(trainX),2))
testY = np.reshape(testY, newshape=(len(testX),2))

In [31]:
# Another preview
trainX.shape, trainY.shape

((60307, 20), (60307, 2))

TRAINING SOON STARTS

In [32]:
# More previews to check the status
print(features.shape)
print(vocab_size)

(66431, 20)
2472


In [33]:
# Build the model 
model = Sequential()

model.add(Embedding(vocab_size, 400, input_length=max_len))

model.add(LSTM(500))
model.add(Dropout(0.5))

#model.add(Dense(600, activation='relu'))
#model.add(Dropout(0.5))

model.add(Dense(120, activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(2))
model.add(Activation('softmax'))

adam_op = Adam(lr=0.004)
#adam_op = Adam()
#sgd_op = SGD(lr=0.06, decay=8e-6, momentum=1.0, nesterov=True)

model.compile(loss='categorical_crossentropy', optimizer=adam_op, metrics=[metrics.categorical_accuracy])

In [34]:
# Training starts
model.fit(trainX, trainY, batch_size=512, epochs=11, verbose=1, callbacks=[cbk.TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True, write_grads=True, write_images=True)])

Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11


<keras.callbacks.History at 0x1f256987828>

In [94]:
# Training-set accuracy
score = model.evaluate(trainX, trainY)
print("")
print(score[1])

# 5 epochs: 0.98365032251645745

0.988409305719


In [95]:
# Testing-set accuracy
score = model.evaluate(testX, testY)
print("")
print(score[1])

# 5 epochs: 0.9301110387315632
# another: 0.933050293926

0.918517308948


In [96]:
# Save the model
model.save("Final_Model.h5")

In [97]:
# Prediction starts here

pred_list = np.zeros(shape=(len(testX),2))
results = model.predict(testX)
for i, res in enumerate(results):
    result = res
    if i == 1:
        print(res) # Show the first prediction result
    for jj in range(2):
        pred_list[i][jj] = result[jj]

[  1.00000000e+00   1.93321442e-10]


In [98]:
# A preview
pred_list[:10]

array([[  9.99984741e-01,   1.52766697e-05],
       [  1.00000000e+00,   1.93321442e-10],
       [  1.00000000e+00,   1.93321442e-10],
       [  1.00000000e+00,   1.93321442e-10],
       [  1.00000000e+00,   1.93321442e-10],
       [  1.00000000e+00,   1.93321442e-10],
       [  1.00000000e+00,   1.93321442e-10],
       [  9.99956012e-01,   4.40421172e-05],
       [  9.99988675e-01,   1.13214219e-05],
       [  1.00000000e+00,   3.56028949e-08]])

In [100]:
# This cell maps the results to final classes

def toClassNo(npArray):
    i = 0
    val = npArray[0]
    for jj in range(len(npArray)-1):
        if npArray[jj+1] > val:
            val = npArray[jj+1]
            i = jj + 1
    return num2class[str(i+1)]

pred_result_list = [None] * len(testX)
for i, result in enumerate(pred_list):
    pred_result_list[i] = toClassNo(result)

print(pred_result_list[:5])
len(pred_result_list)

['lib_arts', 'lib_arts', 'lib_arts', 'lib_arts', 'lib_arts']


6124

In [101]:
# One more preview
df_raw[-6124:-6124+5]

Unnamed: 0,features,CLASS_NO1
60307,柏杨全集,lib_arts
60308,四部叢刊 :四編,lib_arts
60309,四部叢刊 :四編,lib_arts
60310,四部叢刊 :四編,lib_arts
60311,四部叢刊 :四編,lib_arts


In [103]:
# Output the testing data

df_pred = pd.DataFrame(pred_result_list)
df_pred.to_csv('output1.csv')
output.to_csv('output2.csv')