In [1]:
import pandas
from IPython.display import Markdown, display
from janome.tokenizer import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report

In [2]:
# データセット自体の再配布は禁止なので、Gitコミット時に出力を消す
def no_redistribution(data):
    pass
    # return data

## 学習コードの実装

In [3]:
def load_dataset(dataset_tsv_path):
    df = df = pandas.read_table(dataset_tsv_path)
    df = df.dropna(subset=['label'])
    print('length: ', len(df))
    return df['text'].values, df['label'].values

In [4]:
no_redistribution(load_dataset('/app/data/purin.tsv'))

length:  4125


In [5]:
t = Tokenizer()
def tokenize(text):
    return t.tokenize(text, wakati=True)

list(tokenize("今日のご飯は焼肉です"))

['今日', 'の', 'ご飯', 'は', '焼肉', 'です']

In [6]:
from numpy import count_nonzero


class BaseDisambiguator():
    def __init__(self, x_train, y_train, x_test, y_test) -> None:
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test

    def train(self):
        pass

    def predict(self, xs: list[str]) -> list[float]:
        return self.predict_proba(xs)

    def predict_proba(self, xs: list[str]) -> list[int]:
        return [ 1 for _x in xs ]

    def eval_print(self):
        y_pred = self.predict(self.x_test)
        display(Markdown('#### ROC AUC score (macro average)'))
        print(roc_auc_score(self.y_test, y_pred, average='macro'))

        display(Markdown('#### confusion matrix'))
        labels = [0,1]
        cm = confusion_matrix(self.y_test, y_pred, labels=labels)
        display(pandas.DataFrame(cm,
            columns=[["Predicted"] * len(labels), labels],
            index=[["Actual"] * len(labels), labels])
        )

        display(Markdown('#### classification report'))
        print(classification_report(self.y_test, y_pred))
    
    def train_and_eval(self):
        self.train()
        self.eval_print()


In [8]:
x, y = load_dataset('/app/data/purin.tsv')
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

length:  4125


In [9]:
base_trainer = BaseDisambiguator(x_train, y_train, x_test, y_test)

In [10]:
base_trainer.train_and_eval()

#### ROC AUC score (macro average)

0.5


#### confusion matrix

Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1
Actual,0,0,64
Actual,1,0,761


#### classification report

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        64
           1       0.92      1.00      0.96       761

    accuracy                           0.92       825
   macro avg       0.46      0.50      0.48       825
weighted avg       0.85      0.92      0.89       825



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
from numpy import count_nonzero


class CountDisambiguator(BaseDisambiguator):
    def __init__(self, x_train, y_train, x_test, y_test, vectorizer=CountVectorizer()) -> None:
        super().__init__(x_train, y_train, x_test, y_test)
        self.vectorizer = vectorizer

    def train(self):
        x_train_vec = self.vectorizer.fit_transform(self.x_train)
        # 不均衡データセットなので重みをつける
        weights = {
            0: 1 / count_nonzero(y_train == 0),
            1: 1 / count_nonzero(y_train == 1),
        }
        self.model = LogisticRegression(solver='liblinear', class_weight=weights)
        self.model.fit(x_train_vec, self.y_train)
        return self.model

    def predict(self, xs):
        xs = self.vectorizer.transform(xs)
        return self.model.predict(xs)

    def predict_proba(self, xs: list[str]):
        xs = self.vectorizer.transform(xs)
        return [ x[1] for x in self.model.predict_proba(xs)]


In [12]:
trainer = CountDisambiguator(x_train, y_train, x_test, y_test, vectorizer=CountVectorizer(tokenizer=tokenize))

model = trainer.train()

In [13]:
trainer.eval_print()

#### ROC AUC score (macro average)

0.7891548948751643


#### confusion matrix

Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1
Actual,0,54,10
Actual,1,202,559


#### classification report

              precision    recall  f1-score   support

           0       0.21      0.84      0.34        64
           1       0.98      0.73      0.84       761

    accuracy                           0.74       825
   macro avg       0.60      0.79      0.59       825
weighted avg       0.92      0.74      0.80       825



In [14]:
trainer.predict(['ポケモンのプリンとニャースとイーブイかわいい'])

array([1])

In [15]:
trainer.predict(['セブンのスイーツの焼きプリンおいしい'])

array([0])

## 前処理の比較

In [16]:
CountDisambiguator(x_train, y_train, x_test, y_test, vectorizer=CountVectorizer(tokenizer=tokenize)).train_and_eval()

#### ROC AUC score (macro average)

0.7891548948751643


#### confusion matrix

Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1
Actual,0,54,10
Actual,1,202,559


#### classification report

              precision    recall  f1-score   support

           0       0.21      0.84      0.34        64
           1       0.98      0.73      0.84       761

    accuracy                           0.74       825
   macro avg       0.60      0.79      0.59       825
weighted avg       0.92      0.74      0.80       825



### Twitter特有表現の除去

In [17]:
import re
def remove_anchor(text):
    return re.sub(r'\@[a-zA-Z0-9]+', '', text)

CountDisambiguator(x_train, y_train, x_test, y_test,
    vectorizer=CountVectorizer(tokenizer=tokenize, preprocessor=remove_anchor)).train_and_eval()

#### ROC AUC score (macro average)

0.7977681504599212


#### confusion matrix

Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1
Actual,0,53,11
Actual,1,177,584


#### classification report

              precision    recall  f1-score   support

           0       0.23      0.83      0.36        64
           1       0.98      0.77      0.86       761

    accuracy                           0.77       825
   macro avg       0.61      0.80      0.61       825
weighted avg       0.92      0.77      0.82       825



### Min df

In [18]:
# minimum document frequency. see: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
CountDisambiguator(x_train, y_train, x_test, y_test,
    vectorizer=CountVectorizer(tokenizer=tokenize, min_df=3)).train_and_eval()

#### ROC AUC score (macro average)

0.785869743758213


#### confusion matrix

Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1
Actual,0,54,10
Actual,1,207,554


#### classification report

              precision    recall  f1-score   support

           0       0.21      0.84      0.33        64
           1       0.98      0.73      0.84       761

    accuracy                           0.74       825
   macro avg       0.59      0.79      0.58       825
weighted avg       0.92      0.74      0.80       825



### Max df

In [19]:
CountDisambiguator(x_train, y_train, x_test, y_test,
    vectorizer=CountVectorizer(tokenizer=tokenize, max_df=0.7)).train_and_eval()

#### ROC AUC score (macro average)

0.800108820630749


#### confusion matrix

Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1
Actual,0,57,7
Actual,1,221,540


#### classification report

              precision    recall  f1-score   support

           0       0.21      0.89      0.33        64
           1       0.99      0.71      0.83       761

    accuracy                           0.72       825
   macro avg       0.60      0.80      0.58       825
weighted avg       0.93      0.72      0.79       825



### ストップワード

In [20]:
# !curl -o /app/tmp/stopwords.txt http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt

In [21]:
df_stopwords = pandas.read_csv('/app/tmp/stopwords.txt', header=None)
stopwords = list(df_stopwords[0])
CountDisambiguator(x_train, y_train, x_test, y_test,
    vectorizer=CountVectorizer(tokenizer=tokenize, stop_words=stopwords)).train_and_eval()




#### ROC AUC score (macro average)

0.7878408344283838


#### confusion matrix

Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1
Actual,0,54,10
Actual,1,204,557


#### classification report

              precision    recall  f1-score   support

           0       0.21      0.84      0.34        64
           1       0.98      0.73      0.84       761

    accuracy                           0.74       825
   macro avg       0.60      0.79      0.59       825
weighted avg       0.92      0.74      0.80       825



### TF-IDF

In [22]:
CountDisambiguator(x_train, y_train, x_test, y_test,
    vectorizer=TfidfVectorizer(tokenizer=tokenize)).train_and_eval()

#### ROC AUC score (macro average)

0.8133931504599212


#### confusion matrix

Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted,Predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1
Actual,0,55,9
Actual,1,177,584


#### classification report

              precision    recall  f1-score   support

           0       0.24      0.86      0.37        64
           1       0.98      0.77      0.86       761

    accuracy                           0.77       825
   macro avg       0.61      0.81      0.62       825
weighted avg       0.93      0.77      0.82       825



In [23]:
tfidf_trainer = CountDisambiguator(x_train, y_train, x_test, y_test, vectorizer=TfidfVectorizer(tokenizer=tokenize))
tfidf_trainer.train()

LogisticRegression(class_weight={0: 0.004484304932735426,
                                 1: 0.00032499187520311994},
                   solver='liblinear')

In [24]:
tfidf_trainer.predict_proba(['プリンとイーブイかわいい'])

[0.5002520376790113]

In [25]:
tfidf_trainer.predict_proba(['焼きプリンおいしい'])

[0.49973469342104543]