In [1]:
# SNOW E19: topic-based word sense disambiguation evaluation set [言語商会 / GengoHouse] https://www.jnlp.org/GengoHouse/snow/e19
# The sense of 「ウイルス」 ("virus") is labelled as either 「コンピュータ」 (computer) or 「医療（その他）」 (medical / other).
# Download the TSV for that word and save it as /app/tmp/wsd-dataset.tsv.
!curl -o /app/tmp/wsd-dataset.tsv https://raw.githubusercontent.com/nut-jnlp/JapaneseTopicWSD/master/WSD_Dataset/%E3%82%A6%E3%82%A4%E3%83%AB%E3%82%B9.tsv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  132k  100  132k    0     0   437k      0 --:--:-- --:--:-- --:--:--  436k


In [2]:
import pandas
from janome.tokenizer import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
# Redistributing the dataset itself is prohibited, so cell output that would
# show it is suppressed before the notebook is committed to Git.
def no_redistribution(data):
    """Discard ``data`` and return None so the calling cell displays nothing.

    The argument is still evaluated by the caller; it is simply not echoed.
    For local inspection, temporarily change the body to ``return data``.
    """
    return None

In [4]:

# Read the downloaded TSV: the first line is skipped and the three columns
# are named explicitly as label / keyword / text.
df = pandas.read_csv('/app/tmp/wsd-dataset.tsv', sep='\t', skiprows=1, names=('label', 'keyword', 'text'))
# Peek at the first rows; no_redistribution() returns None, so nothing is displayed.
no_redistribution(df.head())

In [5]:
# Try out the binary label encoding: 1 when the label is "コンピュータ", 0 otherwise (display suppressed).
no_redistribution(df['label'].apply(lambda x: 1 if x == "コンピュータ" else 0))

In [6]:
# Trial call of train_test_split on the whole frame with default settings; the result is not kept.
no_redistribution(train_test_split(df))

In [7]:
def load_dataset(path='/app/tmp/wsd-dataset.tsv', positive_label="コンピュータ"):
    """Load the WSD TSV file and return texts with binary labels.

    Args:
        path: Tab-separated file to read. Its first line is skipped and the
            columns are read as ``label``, ``keyword`` and ``text``.
            Defaults to the location the notebook downloads the dataset to.
        positive_label: Value of the ``label`` column that is encoded as 1;
            every other value (including missing ones) is encoded as 0.

    Returns:
        A ``(texts, labels)`` tuple of NumPy arrays: the ``text`` column and
        the matching 0/1 integer labels.
    """
    df = pandas.read_csv(path, sep='\t', skiprows=1, names=('label', 'keyword', 'text'))
    # Vectorized comparison instead of a per-row Python lambda; int64 keeps
    # the label dtype identical on every platform.
    labels = df['label'].eq(positive_label).astype('int64')
    return df['text'].values, labels.values

In [8]:
# One tokenizer instance, created once and reused by every tokenize() call.
t = Tokenizer()


def tokenize(text):
    """Split ``text`` into a list of surface-form tokens using Janome.

    The wakati-mode result is materialized with ``list()`` because it may be
    a lazy generator, which can be consumed only once and has no ``len()``.
    Returning a real list keeps the function safe for callers that measure
    or re-iterate the tokens (for example a vectorizer building n-grams).

    Args:
        text: The string to tokenize.

    Returns:
        list: The tokens in order of appearance.
    """
    return list(t.tokenize(text, wakati=True))


tokenize("今日のご飯は焼肉です")

['今日', 'の', 'ご飯', 'は', '焼肉', 'です']

In [9]:
# x holds the raw texts, y the matching 0/1 labels returned by load_dataset().
x, y = load_dataset()

In [10]:
class Trainer:
    """Bag-of-words text classifier: CountVectorizer features + LogisticRegression.

    The instance keeps the train/test split and the vectorizer options it was
    given. ``self.model`` only exists after ``train()`` has been called.
    """

    def __init__(self, x_train, y_train, x_test, y_test,
                 lowercase=False, tokenize=None, preprocessor=None) -> None:
        """Store the data split and build the (not yet fitted) vectorizer.

        Args:
            x_train, y_train: Training texts and their labels.
            x_test, y_test: Held-out texts and labels used by eval_and_print().
            lowercase: Passed to CountVectorizer as ``lowercase``.
            tokenize: Passed to CountVectorizer as ``tokenizer``.
            preprocessor: Passed to CountVectorizer as ``preprocessor``.
        """
        self.x_train, self.y_train = x_train, y_train
        self.x_test, self.y_test = x_test, y_test
        self.lowercase = lowercase
        self.tokenize = tokenize
        self.preprocessor = preprocessor
        self.vectorizer = CountVectorizer(
            lowercase=lowercase,
            tokenizer=tokenize,
            preprocessor=preprocessor,
        )

    def train(self):
        """Fit the vectorizer and the classifier on the training split.

        Returns:
            The LogisticRegression instance, also stored as ``self.model``.
        """
        features = self.vectorizer.fit_transform(self.x_train)
        classifier = LogisticRegression(solver='liblinear')
        # Publish the model before fitting, exactly as the attribute was set before.
        self.model = classifier
        classifier.fit(features, self.y_train)
        return classifier

    def predict(self, x, proba = True):
        """Vectorize ``x`` and run the trained model on it.

        Args:
            x: Iterable of raw texts.
            proba: When true, call the model's ``predict_proba``; otherwise
                call its ``predict``.

        Returns:
            Whatever the selected model method returns for the vectorized input.
        """
        features = self.vectorizer.transform(x)
        run = self.model.predict_proba if proba else self.model.predict
        return run(features)

    def eval_and_print(self):
        """Print the accuracy on the test split, formatted to four decimals."""
        predicted = self.predict(self.x_test, proba=False)
        print('{:.4f}'.format(accuracy_score(self.y_test, predicted)))


In [11]:
# Hold out 20% of the samples for testing; random_state=0 fixes the shuffle so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [12]:
# Build a Trainer whose vectorizer tokenizes with the notebook's tokenize() function.
trainer = Trainer(x_train, y_train, x_test, y_test, tokenize=tokenize)

# Fit on the training split; train() returns the fitted classifier.
model = trainer.train()

In [13]:
# Print the accuracy on the held-out test split.
trainer.eval_and_print()

0.8681


In [14]:
# Probabilities for a computer-context sentence ("This PC has been infected with a virus").
# NOTE(review): columns presumably follow the classifier's class order, i.e. [P(label 0), P(label 1)] — confirm.
trainer.predict(['このパソコンはウイルスに感染しました'])

array([[0.44765467, 0.55234533]])

In [15]:
# Probabilities for a non-computer sentence ("The virus is being dispersed").
# NOTE(review): columns presumably follow the classifier's class order, i.e. [P(label 0), P(label 1)] — confirm.
trainer.predict(['ウイルスが飛散しています'])

array([[0.64684021, 0.35315979]])