Unlike English, Chinese sentences consist of characters without blanks between words, for example, "I came to this restaurant this afternoon" (我今天下午来到了这个饭店。). Sometimes a single character represents a term, and sometimes two or more characters together represent a term. Humans can identify those terms from their knowledge and the context, but computers cannot. In order to do Chinese document classification, we first need to segment the Chinese texts into terms, then extract features as in English document classification, and finally classify the texts using machine learning or deep learning methods.

## Read Data

We need to read the original Chinese texts and segment them. The files must be opened with an encoding that matches the Chinese text (GBK here). After segmentation, we save the resulting terms in new files. The segmentation is done by the Jieba package, which builds a word graph from a prefix dictionary and uses a Hidden Markov Model to handle words that are not in the dictionary.

In [1]:
import os
import sys
import glob
import jieba
import codecs
class textReader(object):
    '''Read raw Chinese text files and save them as space-separated terms.

    The corpus folder is expected to contain one child directory per class,
    each holding the ``*.txt`` documents of that class.

    Args:
        corpus_path: path of the corpus folder.
    '''
    def __init__(self, corpus_path):
        self.corpus_path = corpus_path

    def __fetchFilenames(self):
        '''Return a dict mapping each class (child directory name) to the
        sorted list of ``*.txt`` paths found inside it.

        Raises:
            OSError: if ``corpus_path`` cannot be listed.
        '''
        file_label_dict = {}
        # Sorted so the result does not depend on the file system's order
        for d in sorted(os.listdir(self.corpus_path)):
            path = os.path.join(self.corpus_path, d)
            # Only child directories denote classes; skip stray files
            if not os.path.isdir(path):
                continue
            # os.path.join keeps this portable; glob.escape protects folder
            # names that contain glob metacharacters such as '['
            pattern = os.path.join(glob.escape(path), '*.txt')
            file_label_dict[d] = sorted(glob.glob(pattern))
        return file_label_dict

    def fetchAllFilePaths(self):
        '''Get all the file names and their labels.

        Returns:
            (filepaths, labels): two parallel lists; ``labels[i]`` is the
            name of the child directory that contains ``filepaths[i]``.
        '''
        file_label_dict = self.__fetchFilenames()
        filepaths = []
        labels = []
        for label, paths in file_label_dict.items():
            filepaths.extend(paths)
            labels.extend([label] * len(paths))
        return filepaths, labels

    def __loadChineseText(self, path):
        '''Read the GBK-encoded file at ``path`` and return its text with
        Windows line endings and spaces removed.'''
        # Bytes that are not valid GBK are dropped (errors='ignore').
        # codecs.open reads without newline translation, so "\r\n" is still
        # present in the decoded text and can be removed below.
        with codecs.open(path, encoding='gbk', errors='ignore') as fi:
            text = fi.read()
        text = text.replace("\r\n", "")
        text = text.replace(" ", "")
        return text

    def generateTextSegments(self, savePath):
        '''Create segmented texts.

        Every corpus file is cut into terms with jieba and written, with the
        terms joined by single spaces, to ``savePath`` under the same base
        name. Files that fail are reported and skipped.

        Args:
            savePath: output folder; created if it does not exist.
        '''
        filepaths, _ = self.fetchAllFilePaths()
        if not os.path.exists(savePath):
            os.makedirs(savePath)
            print('Saving Folder Created')
        #Traverse each text
        for f in filepaths:
            try:
                text = self.__loadChineseText(f)
                #Cut the text into words and join them with blanks
                content = " ".join(jieba.cut(text))
                # NOTE: the output is flat, so files that share a base name
                # across classes overwrite each other.
                savefile = os.path.join(savePath, os.path.basename(f))
                #Save the content in a new file (platform default encoding)
                with open(savefile, "w") as fp:
                    fp.write(content)
            except Exception as e:
                # Best effort: report the reason and continue with the rest
                print('File:', f, 'Loading Errors!', e)
        print('Segmentation FInished!')


In [12]:
# Segment the raw training corpus and write the result to 'Segmented\train'.
path_train = 'data\\train'
tr = textReader(path_train)
# The file list and labels are collected here for inspection only; the
# segmentation call below fetches them again internally.
filenames, labels = tr.fetchAllFilePaths()
tr.generateTextSegments('Segmented\\train')

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\richard\AppData\Local\Temp\jieba.cache
Loading model cost 1.268 seconds.
Prefix dict has been built succesfully.


Segmentation FInished!


In [13]:
# Segment the raw test corpus and write the result to 'Segmented\test'.
path_test = 'data\\test'
tr = textReader(path_test)
# The file list and labels are collected here for inspection only; the
# segmentation call below fetches them again internally.
filenames, labels = tr.fetchAllFilePaths()
tr.generateTextSegments('Segmented\\test')

Segmentation FInished!


Now we have segmented those Chinese texts into series of terms separated by blanks.

## Text Transformation

In this part, we are going to transform each term into an ID, and transform a text into a sequence of IDs.

In [5]:
import tensorflow as tf
import numpy as np
import re
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
class textTransform:
    '''Transform segmented texts into sequences of term IDs.

    Args:
        path_train: folder holding the segmented training ``*.txt`` files.
        path_test: folder holding the segmented test ``*.txt`` files.
        max_doc_len: document length passed to the vocabulary processor.
    '''
    def __init__(self, path_train, path_test, max_doc_len=300):
        self.path_train = path_train
        self.path_test = path_test
        self.max_doc_len = max_doc_len

    def __loadChineseText(self, path):
        '''Return the whole content of the segmented file at ``path``.'''
        # Read with the platform default encoding.
        # NOTE(review): this has to match the encoding used when the
        # segmented files were written - confirm.
        with open(path) as fi:
            text = fi.read()
        return text

    def __extractLabel(self, path):
        '''Derive the class label from a file name, e.g. 'tech12.txt' -> 'tech'.'''
        #Get the file name without its extension. str.strip('.txt') must not
        #be used here: it strips the characters '.', 't' and 'x' from both
        #ends, which turns 'tech12.txt' into 'ech12'.
        stem, _ = os.path.splitext(os.path.basename(path))
        #Remove digits
        return re.sub('[0-9]', '', stem)

    def __readFolder(self, folder):
        '''Read every ``*.txt`` file in ``folder``.

        Returns:
            (texts, labels): parallel lists. Files that cannot be read are
            reported and skipped, so the two lists always stay aligned.
        '''
        texts, labels = [], []
        pattern = os.path.join(glob.escape(folder), '*.txt')
        # Sorted so the document order is reproducible
        for f in sorted(glob.glob(pattern)):
            try:
                text = self.__loadChineseText(f)
            except (OSError, UnicodeError) as e:
                print('Error in ', f)
                print(e)
                continue
            texts.append(text)
            #The label is encoded in the file name
            labels.append(self.__extractLabel(f))
        return texts, labels

    def __readTextLabel(self):
        '''Load the texts and string labels of the train and test folders.

        Returns:
            (train_files, test_files, train_labels, test_labels)
        '''
        train_files, train_labels = self.__readFolder(self.path_train)
        test_files, test_labels = self.__readFolder(self.path_test)
        return train_files, test_files, train_labels, test_labels

    def loadTextLabel(self):
        '''Load the texts and integer-encode their labels.

        Returns:
            (train_files, test_files, train_labels, test_labels, labels),
            where ``labels`` holds the original class names, indexed by the
            encoded label value.
        '''
        #Get the texts and labels
        train_files, test_files, train_labels, test_labels = self.__readTextLabel()
        #Encode the labels; the encoder is fitted on the training labels only
        le = LabelEncoder()
        train_labels = le.fit_transform(train_labels)
        test_labels = le.transform(test_labels)
        #Save the original labels
        labels = le.classes_
        return train_files, test_files, train_labels, test_labels, labels

    def text2vec(self):
        '''Turn every text into an array of term IDs.

        Returns:
            (x_train, x_test, y_train, y_test): ID arrays for the texts and
            the integer-encoded labels.
        '''
        train_files, test_files, train_labels, test_labels, _ = self.loadTextLabel()
        #Transform texts; the vocabulary is built from the training texts only
        # NOTE(review): tf.contrib is only available in TensorFlow 1.x.
        vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(self.max_doc_len)
        x_train = np.array(list(vocab_processor.fit_transform(train_files)))
        x_test = np.array(list(vocab_processor.transform(test_files)))
        #Labels stay integer-encoded (no one-hot encoding is applied here)
        y_train = train_labels
        y_test = test_labels
        return x_train, x_test, y_train, y_test

In [13]:
# Folders that hold the segmented texts.
# NOTE(review): the segmentation cells above wrote to 'Segmented\...' relative
# to the working directory, while these paths point to '..\data\Segmented\...'
# - confirm they refer to the same files.
path_train_processed = '..\\data\\Segmented\\train'
path_test_processed = '..\\data\\Segmented\\test'
tt = textTransform(path_train=path_train_processed, path_test=path_test_processed)

In [14]:
# Term-ID sequences and integer-encoded labels for the train and test sets.
x_train, x_test, y_train, y_test = tt.text2vec()

In [15]:
# Raw segmented texts plus integer-encoded labels; `labels` holds the original
# class names. This reads the files again (text2vec already loaded them once).
train_files, test_files, train_labels, test_labels, labels = tt.loadTextLabel()

## Extract features from text

Here we use the TF-IDF method to extract features from the texts; this is a bag-of-words model. First we build a vocabulary, then we calculate the TF-IDF value of each word within each text, so that a text can be represented as a vector of TF-IDF values.

In [16]:
import numpy as np
import re
import string
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
class textHelpers:
    '''
    Clean texts and turn them into series of numbers
    (TF-IDF features or lists of word IDs).
    Args:
    train_data: list of training texts, each a string of space-separated terms
    test_data: list of test texts in the same format
    '''
    def __init__(self, train_data, test_data):
        self._train_data = train_data
        self._test_data = test_data
        # Both data sets are cleaned once, at construction time
        self._preprocess()

    def _preProcessor(self, s):
        '''Return ``s`` with punctuation, digits, ASCII letters and line ends
        replaced by blanks, runs of blanks collapsed and the ends trimmed.'''
        #remove punctuation
        # NOTE(review): inside this character class ",-、" is a RANGE
        # (U+002C..U+3001), not three literal characters. It therefore also
        # removes ASCII digits/letters and most Western and general
        # punctuation. The pattern is kept unchanged so the cleaning result
        # stays the same - confirm whether the range is intended.
        s = re.sub('[。，！”“：；？（）【】√⊥×,-、( )［ ］]', ' ', s)
        # The previous pattern '[+string.punctuation+]' was a plain string
        # literal, so characters such as '!' or '#' were never removed.
        s = re.sub('[' + re.escape(string.punctuation) + ']', ' ', s)
        #remove digits
        s = re.sub('[' + string.digits + ']', ' ', s)
        #remove foreign characters
        s = re.sub('[a-zA-Z]', ' ', s)
        #remove line ends
        s = re.sub('\n', ' ', s)
        #turn to lower case
        s = s.lower()
        #collapse runs of blanks and trim both ends
        s = re.sub('[ ]+', ' ', s)
        return s.strip()

    def _preprocess(self):
        '''Clean every training and test text in place.'''
        self._train_data = [self._preProcessor(item) for item in self._train_data]
        self._test_data = [self._preProcessor(item) for item in self._test_data]

    def tfidf_vectorizer(self):
        '''Vectorize texts.

        Returns:
            (X_train_tfidf, X_test_tfidf, vocab_index_dict): the TF-IDF
            matrices of the train and test texts and the term -> column
            index mapping. The vectorizer is fitted on the training texts.
        '''
        # NOTE(review): TfidfVectorizer's default token pattern only keeps
        # tokens of two or more word characters, so single-character terms
        # do not enter the vocabulary - confirm this is intended.
        tfidfVectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=10000,
                                         max_df=1000)
        X_train_tfidf = tfidfVectorizer.fit_transform(self._train_data)
        X_test_tfidf = tfidfVectorizer.transform(self._test_data)
        vocab_index_dict = tfidfVectorizer.vocabulary_
        return X_train_tfidf, X_test_tfidf, vocab_index_dict

    @staticmethod
    def _wordWeights(texts_words, tfidf, vocab_index_dict):
        '''Look up the TF-IDF weight of every word occurrence; words that are
        not in the vocabulary get weight 0.'''
        weights = []
        for i, words in enumerate(texts_words):
            word_weight = []
            for word in words:
                word_index = vocab_index_dict.get(word)
                if word_index is None:
                    word_weight.append(0)
                else:
                    word_weight.append(tfidf[i, word_index])
            weights.append(word_weight)
        return weights

    def tfidf_weight(self):
        '''Calculate TfIdf weights for each word within each news.

        Returns:
            (train_weights, test_weights): for every text, a list with one
            weight per word occurrence, in the order of the words.
        '''
        train_text_words, test_text_words = self._text2words()
        X_train_tfidf, X_test_tfidf, vocab_index_dict = self.tfidf_vectorizer()
        train_weights = self._wordWeights(train_text_words, X_train_tfidf,
                                          vocab_index_dict)
        test_weights = self._wordWeights(test_text_words, X_test_tfidf,
                                         vocab_index_dict)
        return train_weights, test_weights

    def _text2words(self):
        '''Split each cleaned text into its list of words.'''
        train_text_words = [text.split() for text in self._train_data]
        test_text_words = [text.split() for text in self._test_data]
        return train_text_words, test_text_words

    def buildVocab(self):
        '''Build the vocabulary from the training texts.

        Words that occur more than 3 times are kept; 'UNK' is appended as the
        placeholder for all other words.

        Returns:
            (vocab, word_id_map, id_word_map)
        '''
        #Calculate the frequency of each word over all training texts
        word_freq = Counter()
        for text in self._train_data:
            word_freq.update(text.split())
        #Filter out the low frequency words
        vocab = [word for word, count in word_freq.items() if count > 3]
        if 'UNK' not in vocab:
            vocab.append('UNK')
        #Map each word into an ID
        word_id_map = {word: idx for idx, word in enumerate(vocab)}
        #Map each ID into a word
        id_word_map = {idx: word for word, idx in word_id_map.items()}
        return vocab, word_id_map, id_word_map

    def text2vecs(self):
        '''Turn each text into a list of word IDs.

        Returns:
            (train_text_vecs, test_text_vecs); words outside the vocabulary
            are mapped to the ID of 'UNK'.
        '''
        train_text_words, test_text_words = self._text2words()
        _, word_id_map, _ = self.buildVocab()
        #Less frequent or unseen words fall back to UNK
        unk_id = word_id_map['UNK']
        train_text_vecs = [[word_id_map.get(w, unk_id) for w in words]
                           for words in train_text_words]
        test_text_vecs = [[word_id_map.get(w, unk_id) for w in words]
                          for words in test_text_words]
        return train_text_vecs, test_text_vecs

In [17]:
# Build the helper; both text lists are cleaned when it is constructed.
th = textHelpers(train_files, test_files)

In [18]:
# TF-IDF matrices for both sets (fitted on the training texts) and the
# term -> column index mapping.
X_train_tfidf, X_test_tfidf, vocab_index_dict = th.tfidf_vectorizer()

## Classify the texts

We start with some classical, simple models; later we can turn to more complex models and tune their parameters to improve the classification performance.

In [29]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
# Multinomial Naive Bayes on the TF-IDF features. The smoothing value is
# passed by keyword: estimator parameters are keyword-only in current
# scikit-learn releases, so MultinomialNB(0.001) raises a TypeError there.
nb = MultinomialNB(alpha=0.001)
nb.fit(X_train_tfidf, train_labels)
preds = nb.predict(X_test_tfidf)
# Micro-averaged F1 over all test documents
micro_f1 = f1_score(test_labels, preds, average='micro')

In [30]:
# The value printed under the label "Accuracy" is the micro-averaged F1 score.
print('Multinomial Naive Bayesian Accuracy:{:.3f}'.format(micro_f1))

Multinomial Naive Bayesian Accuracy:0.873


In [35]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 100 trees, trained with 4 parallel jobs. No random_state
# is set, so the score can vary from run to run.
rf =  RandomForestClassifier(n_estimators=100, n_jobs=4)
rf.fit(X_train_tfidf, train_labels)
preds = rf.predict(X_test_tfidf)
# Micro-averaged F1 over all test documents
micro_f1 = f1_score(test_labels, preds, average='micro')

In [36]:
# The value printed under the label "Accuracy" is the micro-averaged F1 score.
print('Random Forest Accuracy:{:.3f}'.format(micro_f1))

Random Forest Accuracy:0.883


In [32]:
from sklearn.neural_network import MLPClassifier
# Multi-layer perceptron with one hidden layer of 100 units, trained with SGD
# for at most 300 iterations; verbose=10 prints the loss of every iteration
# and random_state=1 fixes the seed.
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, alpha=1e-4,
                    solver='sgd', verbose=10, tol=1e-5, random_state=1,
                    learning_rate_init=.17)

In [33]:
# Train the perceptron on the TF-IDF features of the training texts.
mlp = mlp.fit(X_train_tfidf, train_labels)

Iteration 1, loss = 2.32788109
Iteration 2, loss = 1.38315838
Iteration 3, loss = 0.77474985
Iteration 4, loss = 0.58069145
Iteration 5, loss = 0.48653017
Iteration 6, loss = 0.41358977
Iteration 7, loss = 0.37262386
Iteration 8, loss = 0.32696181
Iteration 9, loss = 0.29636267
Iteration 10, loss = 0.25800192
Iteration 11, loss = 0.23235325
Iteration 12, loss = 0.21063823
Iteration 13, loss = 0.19426976
Iteration 14, loss = 0.18600798
Iteration 15, loss = 0.15733720
Iteration 16, loss = 0.14324843
Iteration 17, loss = 0.13656902
Iteration 18, loss = 0.11369000
Iteration 19, loss = 0.10470720
Iteration 20, loss = 0.09336378
Iteration 21, loss = 0.08530211
Iteration 22, loss = 0.07750077
Iteration 23, loss = 0.07104272
Iteration 24, loss = 0.06459405
Iteration 25, loss = 0.05928299
Iteration 26, loss = 0.05535413
Iteration 27, loss = 0.05212712
Iteration 28, loss = 0.04909783
Iteration 29, loss = 0.04278151
Iteration 30, loss = 0.03941097
Iteration 31, loss = 0.03855078
Iteration 32, los

In [34]:
# Evaluate the perceptron on the test set; the value printed under the label
# "Accuracy" is the micro-averaged F1 score.
preds = mlp.predict(X_test_tfidf)
micro_f1 = f1_score(test_labels, preds, average='micro')
print('Multi Linear Perceptron Accuracy:{:.3f}'.format(micro_f1))

Multi Linear Perceptron Accuracy:0.908
