In [1]:
import pandas
from janome.tokenizer import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix

In [2]:
# データセット自体の再配布は禁止なので、Gitコミット時に出力を消す
def no_redistribution(data):
    pass
    # return data

In [3]:
df = pandas.read_csv('/app/data/purin-tsv.tsv', sep='\t', skiprows=1, names=('id', 'tweet_id', 'text', 'url', 'label', 'keyword'))
no_redistribution(df.head())

In [4]:
def load_dataset():
    df = df = pandas.read_csv('/app/data/purin-tsv.tsv', sep='\t', skiprows=1, names=('id', 'tweet_id', 'text', 'url', 'label', 'keyword'))
    df = df.dropna(subset=['label'])
    print('length: ', len(df))
    return df['text'].values, df['label'].values

In [5]:
no_redistribution(load_dataset())

length:  303


In [6]:
t = Tokenizer()
def tokenize(text):
    return t.tokenize(text, wakati=True)

list(tokenize("今日のご飯は焼肉です"))

['今日', 'の', 'ご飯', 'は', '焼肉', 'です']

In [7]:
class Trainer():
    def __init__(self, x_train, y_train, x_test, y_test,
        lowercase=False, tokenize=None, preprocessor=None) -> None:
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        self.lowercase = lowercase
        self.tokenize = tokenize
        self.preprocessor = preprocessor
        self.vectorizer = CountVectorizer(lowercase=self.lowercase,
            tokenizer=self.tokenize,
            preprocessor=self.preprocessor)

    def train(self):
        x_train_vec = self.vectorizer.fit_transform(self.x_train)
        # Positiveが極端に少ない不均衡データセットなので重みをつける。 TODO: ちゃんと計算する
        weights = {
            0: 1 / 287,
            1: 1 / 16,
        }
        self.model = LogisticRegression(solver='liblinear', class_weight=weights)
        self.model.fit(x_train_vec, self.y_train)
        return self.model

    def predict(self, x, proba = True):
        x_vec = self.vectorizer.transform(x)
        if proba:
            predictor = self.model.predict_proba
        else:
            predictor = self.model.predict
        return predictor(x_vec)

    def eval_and_print(self):
        y_pred = self.predict(self.x_test, proba=False)
        print('accuracy_score', accuracy_score(self.y_test, y_pred))
        print('precision_score', precision_score(self.y_test, y_pred))
        print('recall_score', recall_score(self.y_test, y_pred))
        print('roc_auc_score', roc_auc_score(self.y_test, y_pred))
        print('f1_score', f1_score(self.y_test, y_pred))
        tn, fp, fn, tp = confusion_matrix(self.y_test, y_pred).ravel()
        print('tn, fp, fn, tp', tn, fp, fn, tp)


In [8]:
x, y = load_dataset()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
trainer = Trainer(x_train, y_train, x_test, y_test, tokenize=tokenize)

model = trainer.train()

length:  303


In [9]:
trainer.eval_and_print()

accuracy_score 0.8524590163934426
precision_score 0.0
recall_score 0.0
roc_auc_score 0.4642857142857143
f1_score 0.0
tn, fp, fn, tp 52 4 5 0


In [10]:
trainer.predict(['ポケモンのプリンとニャースとイーブイかわいい'])

array([[0.49908017, 0.50091983]])

In [11]:
trainer.predict(['焼きプリンおいしい'])

array([[0.53211884, 0.46788116]])