In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

from multi_label_classification_model_wrapper import MultiLabelClassificationModelWrapper
from mbti_util import MbtiUtil
from denoicer import Denoicer

In [None]:
# Module-level helper instances shared by the cells below.
# NOTE(review): mbti_util is not referenced in the visible cells — confirm it is still needed.
mbti_util = MbtiUtil()
# Text normalizer: estimate_tweets_and_output() calls denoice.normalize_text() on every tweet.
denoice = Denoicer()

In [None]:
def read_tweets_with_before_after(isReply=False, path='./database/tweets_with_before_or_after.tsv'):
    """Load the tweet TSV and keep either the reply or the non-reply tweets.

    A tweet counts as a reply when its text starts with ``@`` followed by one
    or more word characters and a single space.

    Args:
        isReply: If True, return only reply tweets; otherwise return only the
            tweets that are not replies.
        path: Tab-separated file to read. It must contain a ``tweet_text``
            column. Defaults to the project's tweet database file.

    Returns:
        pandas.DataFrame holding the selected rows. Rows whose ``tweet_text``
        is missing are dropped in both modes.
    """
    df = pd.read_csv(path, sep='\t')
    # Raw strings keep ``\w`` from being treated as an (invalid) string escape.
    pattern = r'@(\w+) ' if isReply else r'(?!@(\w+) )'
    # na=False turns missing text into False; a mask containing NaN would make
    # the boolean indexing below raise.
    mask = df['tweet_text'].str.match(pattern, na=False)
    return df[mask]

# Load the tweets that are not replies (isReply=False) and preview the first
# five rows. `df` is the frame passed to estimate_tweets_and_output() below.
df = read_tweets_with_before_after(isReply=False)
df.head(5)

In [None]:
def map_emo_result(predictions) -> list:
    """Turn per-tweet prediction rows into '|'-joined emotion label strings.

    Position ``k`` of a row corresponds to the ``k``-th name in
    Joy, Sadness, Anticipation, Surprise, Anger, Fear, Disgust, Trust.
    Every position whose value is greater than 0.0 contributes its name.

    Args:
        predictions: Iterable of rows, each an iterable of numeric values.

    Returns:
        A list with one string per row: the selected names joined by ``'|'``,
        or ``'NULL'`` when no value in the row is greater than 0.0.
    """
    labels = 'Joy Sadness Anticipation Surprise Anger Fear Disgust Trust'.split(' ')
    mapped = []
    for row in predictions:
        # Index into `labels` (rather than zip) so an over-long row still raises.
        active = [labels[pos] for pos, score in enumerate(row) if score > 0.0]
        mapped.append('|'.join(active) if active else 'NULL')
    return mapped

def outputs_emotion_estimate(path, emo_results, raw_outputs, tweet_ids, empty_tweet_index):
    """Write per-tweet emotion estimates to a tab-separated file.

    The file starts with the header ``tweet_id<TAB>emo<TAB>vec`` and then has
    one line per written tweet, where ``vec`` is the comma-separated list of
    that tweet's raw output values.

    Args:
        path: Destination file; it is opened in write mode (UTF-8), so any
            existing content is replaced.
        emo_results: Emotion label string for each tweet.
        raw_outputs: For each tweet, a sequence of raw output values.
        tweet_ids: Identifier for each tweet.
        empty_tweet_index: Positions (0-based, in the order of the three
            sequences above) of tweets that must not be written.

    The three per-tweet sequences are zipped, so iteration stops at the
    shortest one. A completion message is printed when the file is written.
    """
    # A set gives constant-time membership tests; scanning a list for every
    # row made the loop quadratic in the number of tweets.
    skipped = set(empty_tweet_index)
    with open(path, 'w', encoding='utf-8') as f:
        f.write('tweet_id\temo\tvec\n')
        for i, (emo, vec, tweet_id) in enumerate(zip(emo_results, raw_outputs, tweet_ids)):
            if i in skipped:
                continue
            # f-string formatting (not str()) keeps the per-value text identical
            # to the previous hand-written 8-component format.
            vec_text = ','.join(f'{value}' for value in vec)
            f.write(f'{tweet_id}\t{emo}\t{vec_text}\n')
    print(f'complete {path}')

# Run the trained model over the tweets and write the estimates to a TSV file.
def estimate_tweets_and_output(df, model_type, model_name, save_path):
    """Predict emotions for every tweet in ``df`` and save them to ``save_path``.

    Args:
        df: DataFrame providing the ``tweet_text`` and ``tweet_id`` columns.
        model_type: Model type passed to MultiLabelClassificationModelWrapper.
        model_name: Model name/path passed to MultiLabelClassificationModelWrapper.
        save_path: Output file handed to outputs_emotion_estimate().

    Tweets whose text is empty after normalization are still sent to the
    model, but their positions are passed on so they are left out of the file.
    """
    normalized_texts = [denoice.normalize_text(text) for text in df['tweet_text'].values]

    # Positions of the tweets that became empty after normalization
    empty_tweet_index = [pos for pos, text in enumerate(normalized_texts) if text == '']

    classifier = MultiLabelClassificationModelWrapper(
        model_type=model_type,
        model_name=model_name,
        num_labels=8,
        use_cuda=True,
    )
    predictions, raw_outputs = classifier.predict(normalized_texts)

    outputs_emotion_estimate(
        save_path,
        map_emo_result(predictions),
        raw_outputs,
        df['tweet_id'].values,
        empty_tweet_index,
    )

'''
以下のいずれかのコメントを外し推定するモデルを選択する
'''
# Uncomment exactly one of the following lines to choose the model used for estimation.
# model_type, model_name = ('roberta_waseda_ja', './model/trained/nlp-waseda-roberta-base-japanese/checkpoint-7555-epoch-5')
model_type, model_name = ('twhinbert', './model/trained/Twitter-twhin-bert-base/checkpoint-7555-epoch-5')
# model_type, model_name = ('bert', './model/trained/cl-tohoku-bert-base-japanese-whole-word-masking/checkpoint-7555-epoch-5')
# model_type, model_name = ('xlnet', './model/trained/hajime9652-xlnet-japanese/checkpoint-7555-epoch-5')
# model_type, model_name = ('xlmroberta', './model/trained/xlm-roberta-base/checkpoint-7555-epoch-5')

# The result file is named after the model directory, i.e. the parent of the
# checkpoint directory. Taking the parent (instead of a fixed split index)
# keeps the name correct when the path prefix has a different depth or lacks
# the leading './'.
name = Path(model_name).parent.name
save_path = f'./database/emotion/estimate_result/{name}.tsv'
estimate_tweets_and_output(df=df, model_type=model_type, model_name=model_name, save_path=save_path)