#### Twitter BERT と xlm-roberta-base でコロナ禍後のツイートにおいて Joy と Anticipation の割合が増加した。
#### コロナ禍前の Joy, Anticipation のツイートとコロナ禍後の Joy, Anticipation のツイートを抜き出し、そこから特徴語を抽出する

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
from tqdm import tqdm
from denoicer import Denoicer
from mecab_wakati import MecabWakati

In [None]:
# Shared text-processing helpers used by the feature-word cells below:
# `denoicer` supplies normalize_text() and stop_words, `mecab_wakati` supplies wakati_sentence().
denoicer = Denoicer()
mecab_wakati = MecabWakati()

In [None]:
def read_tweets_with_before_after(isReply=False, path='./database/tweets_with_before_or_after.tsv'):
    """Load the tweet TSV and keep either reply tweets or non-reply tweets.

    A tweet counts as a reply when its ``tweet_text`` starts with ``@`` followed
    by one or more word characters and a space.

    Args:
        isReply: When True return only replies; when False (default) return
            only tweets that are not replies.
        path: Tab-separated file to read. Defaults to the project database file.

    Returns:
        The filtered ``pd.DataFrame``. Rows whose ``tweet_text`` is missing are
        never returned (previously they made the boolean indexing raise).
    """
    df = pd.read_csv(path, sep='\t')
    text = df['tweet_text']
    # Raw string avoids the invalid "\w" escape warning; na=False keeps the mask
    # strictly boolean even when some tweets have no text.
    starts_with_mention = text.str.match(r'@\w+ ', na=False)
    if isReply:
        return df[starts_with_mention]
    return df[text.notna() & ~starts_with_mention]


# Load the tweets, keeping only non-reply tweets (isReply=False), and preview the first 5 rows.
df = read_tweets_with_before_after(isReply=False)
df.head(5)


In [None]:
# Return the tweets carrying the given emotion, split into before/after the pandemic, as a dict
def filter_tweets_by_emotion(emotion: str, df: pd.DataFrame, df_emo: pd.DataFrame) -> dict:
    """Collect the texts of tweets labelled with ``emotion``, grouped by period.

    Args:
        emotion: Emotion label to look for (e.g. ``'Joy'``).
        df: Tweets with ``tweet_id``, ``before_or_after`` and ``tweet_text`` columns.
        df_emo: Emotion estimates with ``tweet_id`` and ``emo`` columns, where
            ``emo`` is a ``'|'``-separated string of labels.

    Returns:
        ``{'before': [...], 'after': [...]}`` with tweet texts in ``df_emo`` order.

    Raises:
        KeyError: If a matched tweet's ``before_or_after`` value is neither
            ``'before'`` nor ``'after'``.
    """
    results = {
        'before': [],
        'after': []
    }

    # Index df once instead of scanning it for every row of df_emo.
    # setdefault keeps the first row for a duplicated tweet_id, like `.values[0]` did.
    tweet_by_id = {}
    for tw_id, period, text in zip(df['tweet_id'], df['before_or_after'], df['tweet_text']):
        tweet_by_id.setdefault(tw_id, (period, text))

    for tw_id, emo in zip(df_emo['tweet_id'], df_emo['emo']):
        # Missing estimates come through as NaN (a float); only strings carry labels.
        if not isinstance(emo, str):
            continue
        if emotion not in emo.strip().split('|'):
            continue
        found = tweet_by_id.get(tw_id)
        if found is None:
            # The tweet is not in df (e.g. filtered out as a reply); skip it
            # instead of failing with IndexError.
            continue
        period, text = found
        results[period].append(text)
    return results

# Write the collected tweets to a file
def output_tweets(path, save_folder_path, tweets: list):
    """Write every item of ``tweets`` on its own line to ``path``.

    ``save_folder_path`` is created first when it does not exist yet; the file
    at ``path`` is overwritten and encoded as UTF-8.
    """
    folder_missing = not os.path.exists(save_folder_path)
    if folder_missing:
        os.makedirs(save_folder_path)
    with open(path, 'w', encoding='utf8') as out_file:
        out_file.writelines(f'{text}\n' for text in tweets)


# Disabled driver, kept inside a string literal so it does not execute:
# for each model's estimate TSV and each emotion it splits the tweets into
# before/after and writes them to
# ./database/emotion/each_emotion_tweets/<model>/<before|after>/<emotion>_tweets.txt
'''
tsv_paths = ['./database/emotion/estimate_result/tweet/xlm-roberta-base.tsv',
         './database/emotion/estimate_result/tweet/Twitter-twhin-bert-base.tsv']
before_or_after = ['before', 'after']
emos = ['Joy', 'Anticipation']

for tsv_path in tsv_paths:
    df_emo = pd.read_csv(tsv_path, sep='\t')
    for emo in emos:
        dic = filter_tweets_by_emotion(emo, df=df, df_emo=df_emo)
        for ba in before_or_after:
            tweets = dic[ba]
            model_name = tsv_path.split('/')[-1][:-4]
            save_folder_path = f'./database/emotion/each_emotion_tweets/{model_name}/{ba}'
            path = f'{save_folder_path}/{emo}_tweets.txt'
            output_tweets(path=path,save_folder_path=save_folder_path,tweets=tweets)
'''


In [None]:
# Build Bag-of-Words counts and output the most frequent words
def create_bag_of_words_vector_and_output(path, output_path):
    """Count word occurrences over all tweets in ``path`` and write the top 20.

    Each line of ``path`` is one tweet. It is normalized with
    ``denoicer.normalize_text`` and tokenized with ``mecab_wakati.wakati_sentence``
    before being fed to ``CountVectorizer``. Words found in
    ``denoicer.stop_words`` are excluded from the ranking.

    Args:
        path: UTF-8 text file with one tweet per line.
        output_path: File that receives up to 20 ``word: count`` lines, most
            frequent first. Its parent directory is created when missing.
    """
    docs = []
    with open(path, 'r', encoding='utf8') as f:
        for tweet in f:
            normalized = denoicer.normalize_text(tweet.strip())
            docs.append(' '.join(mecab_wakati.wakati_sentence(normalized)))

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(docs).tocsr()
    # Sorted column indices keep the per-document visiting order identical to a
    # left-to-right scan of the dense row, so ties rank as before.
    X.sort_indices()
    index_to_word = {index: word for word, index in vectorizer.vocabulary_.items()}

    # Walk only the stored (non-zero) entries of the sparse matrix instead of
    # densifying it and looping over every document x vocabulary cell.
    totals = {}
    for row in tqdm(range(X.shape[0])):
        start, end = X.indptr[row], X.indptr[row + 1]
        for col, count in zip(X.indices[start:end], X.data[start:end]):
            if count > 0:
                word = index_to_word[int(col)]
                totals[word] = totals.get(word, 0) + int(count)

    top20 = []
    for word, count in sorted(totals.items(), key=lambda item: item[1], reverse=True):
        if len(top20) >= 20:
            break
        if word not in denoicer.stop_words:
            top20.append((word, count))

    # Create the directory the output file actually lives in (it used to be a
    # hard-coded folder unrelated to output_path).
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf8') as f:
        f.writelines(f'{word}: {count}\n' for word, count in top20)

# Disabled driver, kept inside a string literal so it does not execute:
# runs create_bag_of_words_vector_and_output for every model, period and emotion
# and writes to ./covid-19_feature_words/emotion/bag-of-words/<emo>_<period>_<model>.txt
'''
paths = ['./database/emotion/each_emotion_tweets/Twitter-twhin-bert-base',
         './database/emotion/each_emotion_tweets/xlm-roberta-base']
before_or_after = ['before', 'after']
emos = ['Joy', 'Anticipation']
for path in paths:
    for ba in before_or_after:
        for emo in emos:
            p = f'{path}/{ba}/{emo}_tweets.txt'
            model_name = path.split('/')[-1]
            output_path = f'./covid-19_feature_words/emotion/bag-of-words/{emo}_{ba}_{model_name}.txt'
            create_bag_of_words_vector_and_output(path=p, output_path=output_path)
'''

In [None]:
# Build a WordCloud from the Bag-of-Words feature words
def create_wordcloud_bag_of_words(path, save_path_folder, emo, before_or_after, model_name):
    """Render the ``word: count`` file at ``path`` as an SVG word cloud.

    Tabs and spaces are removed from each line before it is split on ``':'``;
    the second field is used as the word's frequency. The image is saved as
    ``<save_path_folder>/<emo>_<before_or_after>_<model_name>.svg`` and the
    folder is created when it does not exist.
    """
    cloud = WordCloud(width=480, height=320, background_color="white", font_path='/usr/share/fonts/truetype/takao-gothic/TakaoGothic.ttf')
    frequencies = {}
    with open(path, 'r', encoding='utf8') as src:
        for raw_line in src:
            fields = raw_line.strip().replace('\t', '').replace(' ', '').split(':')
            label, amount = fields[0], float(fields[1])
            frequencies[label] = amount
    if not os.path.exists(save_path_folder):
        os.makedirs(save_path_folder)
    rendered = cloud.generate_from_frequencies(frequencies)
    svg_path = f'{save_path_folder}/{emo}_{before_or_after}_{model_name}.svg'
    with open(svg_path, 'w', encoding='utf-8') as svg_file:
        svg_file.write(rendered.to_svg())

# Disabled driver, kept inside a string literal so it does not execute:
# renders a Bag-of-Words word cloud for every model, period and emotion.
'''
models = ['Twitter-twhin-bert-base', 'xlm-roberta-base']
before_or_after = ['before', 'after']
emos = ['Joy', 'Anticipation']

for model in models:
    for ba in before_or_after:
        for emo in emos:
            path = f'covid-19_feature_words/emotion/bag-of-words/{emo}_{ba}_{model}.txt'
            create_wordcloud_bag_of_words(path=path, save_path_folder='covid-19_feature_words/emotion/bag-of-words/word_cloud', emo=emo, before_or_after=ba, model_name=model)
'''

In [None]:
# Build TF-IDF vectors and output the feature words of the before/after periods
def create_tf_idf_vector_and_output(base_path, emo, model, save_folder_path, max_df=0.9):
    """Write the top-20 TF-IDF words of the "before" and "after" tweet sets.

    All tweets of one period (``<base_path>/<period>/<emo>_tweets.txt``) are
    normalized, tokenized and concatenated into a single document, so the
    vectorizer sees exactly two documents: one for ``before``, one for ``after``.

    Args:
        base_path: Folder containing the ``before`` and ``after`` sub-folders.
        emo: Emotion name used in the input and output file names.
        model: Model name used in the output file names.
        save_folder_path: Destination folder; created when missing.
        max_df: Passed to ``TfidfVectorizer``; terms whose document frequency
            exceeds this proportion are ignored.

    Output files are named
    ``<save_folder_path>/<emo>_<period>_<model>_feature_words_top_20.txt`` and
    hold ``word: score`` lines sorted by descending score.
    """
    before_or_after = ['before', 'after']
    docs = []
    for ba in before_or_after:
        path = f'{base_path}/{ba}/{emo}_tweets.txt'
        tokenized_tweets = []
        with open(path, 'r', encoding='utf8') as f:
            for tweet in tqdm(f.readlines()):
                normalized = denoicer.normalize_text(tweet.strip())
                tokenized_tweets.append(' '.join(mecab_wakati.wakati_sentence(normalized)))
        # One document per period built from ALL of its tweets. The append used
        # to sit outside the loop, so only the last tweet of each file was kept.
        docs.append(' '.join(tokenized_tweets))

    vectorizer = TfidfVectorizer(max_df=max_df)  # ignore terms whose document frequency exceeds max_df
    X = vectorizer.fit_transform(docs)
    values = X.toarray()
    # get_feature_names() was removed in scikit-learn 1.2; prefer the newer
    # accessor and fall back for old installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    os.makedirs(save_folder_path, exist_ok=True)
    for ba, vec in zip(before_or_after, values):
        top_terms = sorted(enumerate(vec), key=lambda x: x[1], reverse=True)[:20]
        with open(f'{save_folder_path}/{emo}_{ba}_{model}_feature_words_top_20.txt', 'w', encoding='utf8') as f:
            f.writelines(
                '{0:s}: {1:f}\n'.format(str(feature_names[w_id]), tfidf)
                for w_id, tfidf in top_terms
            )

# Disabled driver, kept inside a string literal so it does not execute:
# runs create_tf_idf_vector_and_output for every model and emotion and writes
# the results under ./covid-19_feature_words/emotion/tf-idf
'''
models = ['Twitter-twhin-bert-base', 'xlm-roberta-base']
emos = ['Joy', 'Anticipation']

for model in models:
    for emo in emos:
        base_path = f'./database/emotion/each_emotion_tweets/{model}'
        save_folder_path = './covid-19_feature_words/emotion/tf-idf'
        create_tf_idf_vector_and_output(base_path=base_path, emo=emo, model=model, save_folder_path=save_folder_path)
'''

In [None]:
# Build a WordCloud from the TF-IDF feature words
def create_wordcloud_tf_idf(path, save_path_folder, emo, before_or_after, model_name):
    """Render the ``word: score`` TF-IDF file at ``path`` as an SVG word cloud.

    Entries with a score of 0 are dropped. Every remaining score is divided by
    the last non-zero score in the file and truncated to an integer, which
    becomes the word's weight. The file is expected to be sorted by descending
    score (as written by ``create_tf_idf_vector_and_output``), so that last
    score is the smallest and every weight is at least 1.

    Args:
        path: UTF-8 file with one ``word: score`` entry per line.
        save_path_folder: Destination folder; created when missing.
        emo, before_or_after, model_name: Used to build the output file name
            ``<emo>_<before_or_after>_<model_name>.svg``.

    Raises:
        ValueError: If the file contains no entry with a non-zero score.
    """
    word_cloud = WordCloud(width=480, height=320, background_color="white", font_path='/usr/share/fonts/truetype/takao-gothic/TakaoGothic.ttf')

    entries = []
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            compact = line.strip().replace('\t', '').replace(' ', '')
            if not compact:
                continue  # tolerate blank / trailing empty lines
            # Split on the LAST colon so a word that itself contains ':' still parses.
            word, _, raw_score = compact.rpartition(':')
            score = float(raw_score)
            if score != 0:
                entries.append((word, score))

    if not entries:
        # wordcloud would raise its own ValueError later; fail early with the file name.
        raise ValueError(f'no non-zero TF-IDF entries found in {path}')

    # Scale by the last (smallest) non-zero score; computed directly instead of
    # incrementing a counter one unit at a time.
    unit = entries[-1][1]
    dic = {}
    for word, score in entries:
        weight = int(score / unit)
        if weight > 0:
            dic[word] = dic.get(word, 0) + weight

    os.makedirs(save_path_folder, exist_ok=True)
    result = word_cloud.generate_from_frequencies(dic)
    with open(f'{save_path_folder}/{emo}_{before_or_after}_{model_name}.svg', 'w', encoding='utf-8') as svg:
        svg.write(result.to_svg())


# Render a TF-IDF word cloud for every (model, before/after, emotion) combination,
# reading the top-20 feature-word files under covid-19_feature_words/emotion/tf-idf
# and saving the SVGs to its word_cloud sub-folder.
models = ['Twitter-twhin-bert-base', 'xlm-roberta-base']
before_or_after = ['before', 'after']
emos = ['Joy', 'Anticipation']

for model in models:
    for ba in before_or_after:
        for emo in emos:
            path = f'covid-19_feature_words/emotion/tf-idf/{emo}_{ba}_{model}_feature_words_top_20.txt'
            create_wordcloud_tf_idf(path=path, save_path_folder='covid-19_feature_words/emotion/tf-idf/word_cloud', emo=emo, before_or_after=ba, model_name=model)