In [1]:
import pandas
from janome.tokenizer import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report

In [2]:
# データセット自体の再配布は禁止なので、Gitコミット時に出力を消す
def no_redistribution(data):
    pass
    # return data

## 学習コードの実装

In [3]:
def load_dataset(dataset_tsv_path):
    df = df = pandas.read_table(dataset_tsv_path)
    df = df.dropna(subset=['label'])
    print('length: ', len(df))
    return df['text'].values, df['label'].values

In [4]:
no_redistribution(load_dataset('/app/data/purin.tsv'))

length:  4125


In [5]:
t = Tokenizer()
def tokenize(text):
    return t.tokenize(text, wakati=True)

list(tokenize("今日のご飯は焼肉です"))

['今日', 'の', 'ご飯', 'は', '焼肉', 'です']

In [6]:
from numpy import count_nonzero


class Trainer():
    def __init__(self, x_train, y_train, x_test, y_test,
        vectorizer=CountVectorizer()) -> None:
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        self.vectorizer = vectorizer

    def train(self):
        x_train_vec = self.vectorizer.fit_transform(self.x_train)
        # 不均衡データセットなので重みをつける
        weights = {
            0: 1 / count_nonzero(y_train == 0),
            1: 1 / count_nonzero(y_train == 1),
        }
        self.model = LogisticRegression(solver='liblinear', class_weight=weights)
        self.model.fit(x_train_vec, self.y_train)
        return self.model

    def predict(self, x, proba = True):
        x_vec = self.vectorizer.transform(x)
        if proba:
            predictor = self.model.predict_proba
        else:
            predictor = self.model.predict
        return predictor(x_vec)

    def eval_and_print(self):
        y_pred = self.predict(self.x_test, proba=False)
        print('roc_auc_score (macro average)', roc_auc_score(self.y_test, y_pred, average='macro'))
        tn, fp, fn, tp = confusion_matrix(self.y_test, y_pred).ravel()
        print('tn, fp, fn, tp', tn, fp, fn, tp)
        print(classification_report(self.y_test, y_pred))
    
    def trailn_and_eval(self):
        self.train()
        self.eval_and_print()


In [7]:
x, y = load_dataset('/app/data/purin.tsv')
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
trainer = Trainer(x_train, y_train, x_test, y_test, vectorizer=CountVectorizer(tokenizer=tokenize))

model = trainer.train()

length:  4125


In [8]:
trainer.eval_and_print()

roc_auc_score (macro average) 0.7891548948751643
tn, fp, fn, tp 54 10 202 559
              precision    recall  f1-score   support

           0       0.21      0.84      0.34        64
           1       0.98      0.73      0.84       761

    accuracy                           0.74       825
   macro avg       0.60      0.79      0.59       825
weighted avg       0.92      0.74      0.80       825



In [9]:
trainer.predict(['ポケモンのプリンとニャースとイーブイかわいい'])

array([[0.46502289, 0.53497711]])

In [10]:
trainer.predict(['セブンのスイーツの焼きプリンおいしい'])

array([[0.50582301, 0.49417699]])

## 前処理の比較

In [11]:
Trainer(x_train, y_train, x_test, y_test, vectorizer=CountVectorizer(tokenizer=tokenize)).trailn_and_eval()

roc_auc_score (macro average) 0.7891548948751643
tn, fp, fn, tp 54 10 202 559
              precision    recall  f1-score   support

           0       0.21      0.84      0.34        64
           1       0.98      0.73      0.84       761

    accuracy                           0.74       825
   macro avg       0.60      0.79      0.59       825
weighted avg       0.92      0.74      0.80       825



### Twitter特有表現の除去

In [12]:
import re
def remove_anchor(text):
    return re.sub(r'\@[a-zA-Z0-9]+', '', text)

Trainer(x_train, y_train, x_test, y_test,
    vectorizer=CountVectorizer(tokenizer=tokenize, preprocessor=remove_anchor)).trailn_and_eval()

roc_auc_score (macro average) 0.7977681504599212
tn, fp, fn, tp 53 11 177 584
              precision    recall  f1-score   support

           0       0.23      0.83      0.36        64
           1       0.98      0.77      0.86       761

    accuracy                           0.77       825
   macro avg       0.61      0.80      0.61       825
weighted avg       0.92      0.77      0.82       825



### Min df

In [13]:
# minimum document frequency. see: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
Trainer(x_train, y_train, x_test, y_test,
    vectorizer=CountVectorizer(tokenizer=tokenize, min_df=3)).trailn_and_eval()

roc_auc_score (macro average) 0.785869743758213
tn, fp, fn, tp 54 10 207 554
              precision    recall  f1-score   support

           0       0.21      0.84      0.33        64
           1       0.98      0.73      0.84       761

    accuracy                           0.74       825
   macro avg       0.59      0.79      0.58       825
weighted avg       0.92      0.74      0.80       825



### Max df

In [22]:
Trainer(x_train, y_train, x_test, y_test,
    vectorizer=CountVectorizer(tokenizer=tokenize, max_df=0.7)).trailn_and_eval()

roc_auc_score (macro average) 0.800108820630749
tn, fp, fn, tp 57 7 221 540
              precision    recall  f1-score   support

           0       0.21      0.89      0.33        64
           1       0.99      0.71      0.83       761

    accuracy                           0.72       825
   macro avg       0.60      0.80      0.58       825
weighted avg       0.93      0.72      0.79       825



### ストップワード

In [14]:
# !curl -o /app/tmp/stopwords.txt http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt

In [15]:
df_stopwords = pandas.read_csv('/app/tmp/stopwords.txt', header=None)
stopwords = list(df_stopwords[0])
Trainer(x_train, y_train, x_test, y_test,
    vectorizer=CountVectorizer(tokenizer=tokenize, stop_words=stopwords)).trailn_and_eval()




roc_auc_score (macro average) 0.7878408344283838
tn, fp, fn, tp 54 10 204 557
              precision    recall  f1-score   support

           0       0.21      0.84      0.34        64
           1       0.98      0.73      0.84       761

    accuracy                           0.74       825
   macro avg       0.60      0.79      0.59       825
weighted avg       0.92      0.74      0.80       825



### TF-IDF

In [16]:
Trainer(x_train, y_train, x_test, y_test,
    vectorizer=TfidfVectorizer(tokenizer=tokenize)).trailn_and_eval()

roc_auc_score (macro average) 0.8133931504599212
tn, fp, fn, tp 55 9 177 584
              precision    recall  f1-score   support

           0       0.24      0.86      0.37        64
           1       0.98      0.77      0.86       761

    accuracy                           0.77       825
   macro avg       0.61      0.81      0.62       825
weighted avg       0.93      0.77      0.82       825



In [17]:
tfidf_trainer = Trainer(x_train, y_train, x_test, y_test, vectorizer=TfidfVectorizer(tokenizer=tokenize))
tfidf_trainer.train()

LogisticRegression(class_weight={0: 0.004484304932735426,
                                 1: 0.00032499187520311994},
                   solver='liblinear')

In [18]:
tfidf_trainer.predict(['プリンとイーブイかわいい'])

array([[0.49974796, 0.50025204]])

In [19]:
tfidf_trainer.predict(['焼きプリンおいしい'])

array([[0.50026531, 0.49973469]])