reference 
- https://nbviewer.jupyter.org/github/nekoumei/Comparison-DocClassification/blob/master/src/Classification_News.ipynb

In [6]:
import glob
import os
from collections import defaultdict
from tqdm.notebook import tqdm
import time

import lightgbm as lgb
import numpy as np
import MeCab
import pandas as pd
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.models import word2vec
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import TruncatedSVD
from sklearn.mixture import GaussianMixture

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split

## step1. load text files

In [2]:
#preprocessing
dirlist = ["dokujo-tsushin","it-life-hack","kaden-channel","livedoor-homme",
           "movie-enter","peachy","smax","sports-watch","topic-news"]

df = pd.DataFrame(columns=["label","news"])

for i in tqdm(dirlist):
    path = "./data/"+i+"/*.txt"
    files = glob.glob(path)
    files.pop()
    for j in tqdm(files):
        f = open(j)
        data = f.read() 
        f.close()
        t = pd.Series([i,"".join(data.split("\n")[3:])],index = df.columns)
        df  = df.append(t,ignore_index=True)

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/870 [00:00<?, ?it/s]

  0%|          | 0/870 [00:00<?, ?it/s]

  0%|          | 0/864 [00:00<?, ?it/s]

  0%|          | 0/511 [00:00<?, ?it/s]

  0%|          | 0/870 [00:00<?, ?it/s]

  0%|          | 0/842 [00:00<?, ?it/s]

  0%|          | 0/870 [00:00<?, ?it/s]

  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/770 [00:00<?, ?it/s]

In [35]:
df = df.iloc[:4000]

## step2. preprocess

In [36]:
# Loop over each news article.
import re 
tokenizer =  MeCab.Tagger("-Owakati")  
text_list = []
for review in tqdm(df["news"]):
    
    try:
        # Split a review into parsed sentences.
        result = tokenizer.parse(review).replace("\u3000","").replace("\n","")
        result = re.sub(r'[0123456789０１２３４５６７８９！＠＃＄％＾＆\-|\\＊\“（）＿■×※⇒—●(：〜＋=)／*&^%$#@!~`){}…\[\]\"\'\”:;<>?＜＞？、。・,./『』【】「」→←○]+', "", result)
        h = result.split(" ")
        h = list(filter(("").__ne__, h))
        text_list.append(h)
    except:
        continue

  0%|          | 0/3000 [00:00<?, ?it/s]

## step3. embedding

3-1.bag-of-word

3-2.tf-idf

In [46]:
train_df, test_df = train_test_split(df, test_size=0.3, random_state=1011)

In [47]:
vectorizer = TfidfVectorizer(use_idf=True, token_pattern=u'(?u)\\b\\w+\\b',max_features=100)
X = vectorizer.fit_transform(train_df.news.values)
X = X.toarray()

In [48]:
X.shape

(2100, 100)

## step4. training

In [49]:
y = train_df.label.values

accs_dict = {}
elapsed_times_dict = {}

START_TIME = time.time()
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

f1macro_list = []
models = []

params = {
    'objective': 'multiclass',
    'num_class': df.label.nunique(),
    'n_estimators': 10000,
    'random_seed': 0
}

for i, (train_index, valid_index) in enumerate(skf.split(X, y)):
    
    print(f'Start: fold {i+1}')
    X_train, y_train = X[train_index, :], y[train_index]
    X_valid, y_valid = X[valid_index, :], y[valid_index]
    
    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=(X_valid, y_valid),
        early_stopping_rounds=2000,
        verbose=100
    )
    
    models.append(model)
    
    #inference
    y_pred = model.predict(X_valid)
    
    #score
    accuracy = accuracy_score(y_valid, y_pred)
    f1macro = f1_score(y_valid, y_pred, average='macro')
    recall = recall_score(y_valid, y_pred, average='macro')
    precision = precision_score(y_valid, y_pred, average='macro')
    
    f1macro_list.append(f1macro)
    
    print('----------------------------')
    print(f'Accuracy  : {accuracy}')
    print(f'F1-macro  : {f1macro}')
    print(f'recall    : {recall}')
    print(f'precision : {precision}')
    print('----------------------------')

elapsed_time = time.time() - START_TIME
print(f'Elapsed time is {elapsed_time}.')

Start: fold 1
Training until validation scores don't improve for 2000 rounds
[100]	valid_0's multi_logloss: 0.431934
[200]	valid_0's multi_logloss: 0.484479
[300]	valid_0's multi_logloss: 0.497334
[400]	valid_0's multi_logloss: 0.504869
[500]	valid_0's multi_logloss: 0.511152
[600]	valid_0's multi_logloss: 0.518057
[700]	valid_0's multi_logloss: 0.520183
[800]	valid_0's multi_logloss: 0.5223
[900]	valid_0's multi_logloss: 0.524642
[1000]	valid_0's multi_logloss: 0.528015
[1100]	valid_0's multi_logloss: 0.532225
[1200]	valid_0's multi_logloss: 0.534507
[1300]	valid_0's multi_logloss: 0.536036
[1400]	valid_0's multi_logloss: 0.538532
[1500]	valid_0's multi_logloss: 0.539447
[1600]	valid_0's multi_logloss: 0.540577
[1700]	valid_0's multi_logloss: 0.542116
[1800]	valid_0's multi_logloss: 0.544422
[1900]	valid_0's multi_logloss: 0.545128
[2000]	valid_0's multi_logloss: 0.546538
Early stopping, best iteration is:
[42]	valid_0's multi_logloss: 0.362861
----------------------------
Accuracy  :

## step5. test

In [43]:
vectorizer = TfidfVectorizer(use_idf=True, token_pattern=u'(?u)\\b\\w+\\b',max_features=100)
X = vectorizer.fit_transform(test_df.news.values)
X = X.toarray()
y_true = test_df.label.values

accs_dict = {}
elapsed_times_dict = {}

START_TIME = time.time()
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

f1macro_list = []

pred_list = []
for no, model in enumerate(models):
    
    y_pred = model.predict(X)
    pred_list.append(y_pred)
    
    #score
    accuracy = accuracy_score(y_true, y_pred)
    f1macro = f1_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    precision = precision_score(y_true, y_pred, average='macro')
    
    print('----------------------------')
    print('model No.%d' % no )
    print(f'Accuracy  : {accuracy}')
    print(f'F1-macro  : {f1macro}')
    print(f'recall    : {recall}')
    print(f'precision : {precision}')
    print('----------------------------')

----------------------------
model No.0
Accuracy  : 0.6222222222222222
F1-macro  : 0.5907510561317408
recall    : 0.5817493900383518
precision : 0.6225158707114151
----------------------------
----------------------------
model No.1
Accuracy  : 0.6133333333333333
F1-macro  : 0.5830891326655759
recall    : 0.5727536253084072
precision : 0.6193162045918249
----------------------------
----------------------------
model No.2
Accuracy  : 0.6455555555555555
F1-macro  : 0.6178328704204501
recall    : 0.6085748536342434
precision : 0.6559593564762994
----------------------------
----------------------------
model No.3
Accuracy  : 0.6366666666666667
F1-macro  : 0.6042497338682051
recall    : 0.5943623914133846
precision : 0.6417804040143322
----------------------------
----------------------------
model No.4
Accuracy  : 0.6088888888888889
F1-macro  : 0.5842878323205771
recall    : 0.5782797842750741
precision : 0.6120747226126307
----------------------------


In [31]:
pred_list[1]

array(['dokujo-tsushin', 'it-life-hack', 'it-life-hack', 'it-life-hack',
       'it-life-hack', 'kaden-channel', 'kaden-channel', 'it-life-hack',
       'kaden-channel', 'it-life-hack', 'it-life-hack', 'dokujo-tsushin',
       'dokujo-tsushin', 'dokujo-tsushin', 'dokujo-tsushin',
       'livedoor-homme', 'kaden-channel', 'it-life-hack', 'kaden-channel',
       'kaden-channel', 'dokujo-tsushin', 'it-life-hack',
       'dokujo-tsushin', 'kaden-channel', 'dokujo-tsushin',
       'it-life-hack', 'dokujo-tsushin', 'kaden-channel', 'it-life-hack',
       'dokujo-tsushin', 'livedoor-homme', 'it-life-hack', 'it-life-hack',
       'dokujo-tsushin', 'kaden-channel', 'dokujo-tsushin',
       'dokujo-tsushin', 'it-life-hack', 'kaden-channel', 'it-life-hack',
       'kaden-channel', 'kaden-channel', 'kaden-channel', 'it-life-hack',
       'it-life-hack', 'it-life-hack', 'dokujo-tsushin', 'it-life-hack',
       'it-life-hack', 'it-life-hack', 'livedoor-homme', 'kaden-channel',
       'it-life-hack',

In [16]:
## step6. evaluation

In [None]:

for i, (train_index, valid_index) in enumerate(skf.split(X, y)):
    
    print(f'Start: fold {i+1}')
    X_train, y_train = X[train_index, :], y[train_index]
    X_valid, y_valid = X[valid_index, :], y[valid_index]
    
    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=(X_valid, y_valid),
        early_stopping_rounds=2000,
        verbose=100
    )
    
    models.append(model)
    
    #inference
    y_pred = model.predict(X_valid)
    
    #score
    accuracy = accuracy_score(y_valid, y_pred)
    f1macro = f1_score(y_valid, y_pred, average='macro')
    recall = recall_score(y_valid, y_pred, average='macro')
    precision = precision_score(y_valid, y_pred, average='macro')
    
    f1macro_list.append(f1macro)
    
    print('----------------------------')
    print(f'Accuracy  : {accuracy}')
    print(f'F1-macro  : {f1macro}')
    print(f'recall    : {recall}')
    print(f'precision : {precision}')
    print('----------------------------')

elapsed_time = time.time() - START_TIME
print(f'Elapsed time is {elapsed_time}.')
