In [15]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import  fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn import svm

In [2]:
from __future__ import unicode_literals
import re
import unicodedata

def unicode_normalize(cls, s):
    pt = re.compile('([{}]+)'.format(cls))

    def norm(c):
        return unicodedata.normalize('NFKC', c) if pt.match(c) else c

    s = ''.join(norm(x) for x in re.split(pt, s))
    s = re.sub('－', '-', s)
    return s

def remove_extra_spaces(s):
    s = re.sub('[ 　]+', ' ', s)
    blocks = ''.join(('\u4E00-\u9FFF',  # CJK UNIFIED IDEOGRAPHS
                      '\u3040-\u309F',  # HIRAGANA
                      '\u30A0-\u30FF',  # KATAKANA
                      '\u3000-\u303F',  # CJK SYMBOLS AND PUNCTUATION
                      '\uFF00-\uFFEF'   # HALFWIDTH AND FULLWIDTH FORMS
                      ))
    basic_latin = '\u0000-\u007F'

    def remove_space_between(cls1, cls2, s):
        p = re.compile('([{}]) ([{}])'.format(cls1, cls2))
        while p.search(s):
            s = p.sub(r'\1\2', s)
        return s

    s = remove_space_between(blocks, blocks, s)
    s = remove_space_between(blocks, basic_latin, s)
    s = remove_space_between(basic_latin, blocks, s)
    return s

def normalize_neologd(s):
    s = s.strip()
    s = unicode_normalize('０-９Ａ-Ｚａ-ｚ｡-ﾟ', s)

    def maketrans(f, t):
        return {ord(x): ord(y) for x, y in zip(f, t)}

    s = re.sub('[˗֊‐‑‒–⁃⁻₋−]+', '-', s)  # normalize hyphens
    s = re.sub('[﹣－ｰ—―─━ー]+', 'ー', s)  # normalize choonpus
    s = re.sub('[~∼∾〜〰～]+', '〜', s)  # normalize tildes (modified by Isao Sonobe)
    s = s.translate(
        maketrans('!"#$%&\'()*+,-./:;<=>?@[¥]^_`{|}~｡､･｢｣',
              '！”＃＄％＆’（）＊＋，－．／：；＜＝＞？＠［￥］＾＿｀｛｜｝〜。、・「」'))

    s = remove_extra_spaces(s)
    s = unicode_normalize('！”＃＄％＆’（）＊＋，－．／：；＜＞？＠［￥］＾＿｀｛｜｝〜', s)  # keep ＝,・,「,」
    s = re.sub('[’]', '\'', s)
    s = re.sub('[”]', '"', s)
    return s

In [3]:
import tarfile
import re

target_genres = ["dokujo-tsushin",
                 "it-life-hack",
                 "kaden-channel",
                 "livedoor-homme",
                 "movie-enter",
                 "peachy",
                 "smax",
                 "sports-watch",
                 "topic-news"]

def remove_brackets(text):
    text = re.sub(r"(^【[^】]*】)|(【[^】]*】$)", "", text)
    return text

def normalize_text(text):
    assert "\n" not in text and "\r" not in text
    text = text.replace("\t", " ")
    text = text.strip()
    text = normalize_neologd(text)
    text = text.lower()
    return text

def read_title_body(file):
    next(file)
    next(file)
    title = next(file).decode("utf-8").strip()
    title = normalize_text(remove_brackets(title))
    body = normalize_text(" ".join([line.decode("utf-8").strip() for line in file.readlines()]))
    return title, body

In [3]:
import MeCab
def wakati_pred(text):
    #tagger = MeCab.Tagger('')
    tagger = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd -u /mnt/data1/home/ooga/mydic/foo.dic')
    tagger.parse('')
    node = tagger.parseToNode(text)
    stopword = ["これ", "ちょ","さん"]
    through = ["好き", "格好","すき","嫌","かわい","かわえー","だいすきだー","大好き","イケメン","IKEMEN","大好きだー","きもい","きっも"]
    word_list = []
    meishi = 0
    other = 0
    while node:
        pos = node.feature.split(",")[0]
        stop = node.feature.split(",")[6]
        #print(pos,stop)
        #if pos in ["動詞", "形容詞","感動詞"]:
        if (pos in ["名詞","動詞", "形容詞","感動詞","助動詞","助詞","副詞","フィラー"]):
        #["動詞", "形容詞","感動詞","名詞", "副詞", "助詞", "接続詞", "助動詞", "連体詞", "感動詞"]:
            word = node.surface
            word_list.append(word)
            if(pos == "名詞" and (stop not in through)):
                meishi += 1
            else:
                other += 1
        node = node.next
    #print(meishi, other)
    if meishi == 1 and other == 0:
        word_list = []
        #print("消す")
    return " ".join(word_list)

def concat_df(main_df, sub_df):
    df_list = [main_df, sub_df]
    df_concat = pd.concat(df_list, ignore_index=True)
    return df_concat

In [5]:
def wakati_all(text):
    #tagger = MeCab.Tagger('')
    tagger = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd -u /mnt/data1/home/ooga/mydic/foo.dic')
    tagger.parse('')
    node = tagger.parseToNode(text)
    stopword = ["これ", "ちょ","さん"]
    word_list = []
    while node:
        pos = node.feature.split(",")[0]
        stop = node.feature.split(",")[6]
        if pos in ["動詞", "形容詞","感動詞","名詞", "副詞", "助詞", "接続詞", "助動詞", "連体詞", "感動詞","フィラー"]:
            word = node.surface
            word_list.append(word)
        node = node.next
    return " ".join(word_list)
    #return word_list

In [32]:
#ここから
df1 = pd.read_csv('ByT5/data/nico_20000_train.tsv', sep='\t', header=None)
df_1 = pd.read_csv('ByT5/data/nico_20000_dev.tsv', sep='\t', header=None)
df2 = pd.read_csv('ByT5/data/nico_20000_test.tsv', sep='\t', header=None)
df1 = concat_df(df1, df_1)

In [33]:
df1 = df1.rename(columns={0: "content", 1:"emotion"})
df2 = df2.rename(columns={0: "content", 1:"emotion"})

In [8]:
df1["wakati"] = df1["content"].apply(wakati_all)
df2["wakati"] = df2["content"].apply(wakati_all)

In [11]:
print(len(df1))
df1["emotion"].value_counts()

1    4272
0    4265
3    4255
2    4208
Name: emotion, dtype: int64

In [14]:
print(len(df2))
df2["emotion"].value_counts()

3000


2    792
3    745
0    735
1    728
Name: emotion, dtype: int64

In [10]:
#s_Train = df1.content.values #文章の抽出
s_Train = df1.wakati.values #わかち書き文章の抽出
l_Train = df1.emotion.values #ラベルの抽出
#s_Test = df2.content.values #文章の抽出
s_Test = df2.wakati.values #わかち書き後文章の抽出
l_Test = df2.emotion.values #ラベルの抽出

In [10]:
#コメントをbowでベクトル化し，学習するモデル
NB_model = make_pipeline(CountVectorizer(), MultinomialNB())
NB_model.fit(s_Train, l_Train)
#NB_model = make_pipeline(MultinomialNB())

Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('multinomialnb', MultinomialNB())])

In [11]:
##ここから word2vec + svm
import gensim
from gensim.models import word2vec 
import logging
import sys
import gensim.downloader as gendl
import pyemd
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
model = KeyedVectors.load_word2vec_format(
    datapath("/mnt/data1/home/ooga/models/jawiki.all_vectors.300d.txt"),
    binary=False
)

In [12]:
#ベクトル作成
def avg_feature_vector(comment, model, num_features=300):
    li = comment.split(" ")
    feature_vec = np.zeros((num_features,), dtype="float32") # 特徴ベクトルの入れ物を初期化
    feature_vec_ins = np.zeros(300, dtype="float32") # 特徴ベクトルの入れ物を初期化
    for word in li:
        if word in model:#辞書にない場合は省く
            feature_vec = np.add(feature_vec, model[word])
    if len(li) > 0:
        feature_vec = np.divide(feature_vec, len(li))
    return feature_vec

In [13]:
comm_train = []
for x in range(len(s_Train)):
    vec = avg_feature_vector(s_Train[x], model)
    comm_train.append(vec)
x_train = np.array(comm_train)

In [14]:
comm_train = []
for y in range(len(s_Test)):
    vec = avg_feature_vector(s_Test[y], model)
    comm_train.append(vec)
x_test = np.array(comm_train)

In [15]:
tuned_parameters = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    {'C': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.001, 0.0001]},
    ]

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
score = 'f1'
clf = GridSearchCV(
    SVC(), # 識別器 非線形SVM
    tuned_parameters, # 最適化したいパラメータセット 
    cv=5, # 交差検定の回数
    scoring='%s_weighted' % score ) # モデルの評価関数の指定

In [17]:
clf.fit(x_train, l_Train) #最適化実行

GridSearchCV(cv=5, estimator=SVC(),
             param_grid=[{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
                         {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                          'kernel': ['rbf']}],
             scoring='f1_weighted')

In [18]:
clf.best_params_

{'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}

In [21]:
clf = svm.SVC(gamma=0.001, C=1000.,kernel="rbf")
clf.fit(x_train, l_Train)
##ここまで word2vec + SVM

SVC(C=1000.0, gamma=0.001)

In [22]:
l_Pred = clf.predict(x_test)

In [25]:
from sklearn.metrics import classification_report

# 評価レポート word2vec + svm
print(classification_report(l_Test, l_Pred))

              precision    recall  f1-score   support

           0       0.71      0.76      0.74       735
           1       0.75      0.74      0.75       728
           2       0.75      0.72      0.74       792
           3       0.75      0.73      0.74       745

    accuracy                           0.74      3000
   macro avg       0.74      0.74      0.74      3000
weighted avg       0.74      0.74      0.74      3000



In [24]:
#ここからsvm + bow

In [31]:
count = CountVectorizer()
count.fit(df1['wakati'].values)
x_train = count.transform(df1['wakati'].values)
x_test = count.transform(df2['wakati'].values)

In [35]:
tuned_parameters = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    {'C': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.001, 0.0001]},
    ]

In [36]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
score = 'f1'
clf = GridSearchCV(
    SVC(), # 識別器
    tuned_parameters, # 最適化したいパラメータセット 
    cv=5, # 交差検定の回数
    scoring='%s_weighted' % score ) # モデルの評価関数の指定

In [37]:
clf.fit(x_train, l_Train) #最適化実行

GridSearchCV(cv=5, estimator=SVC(),
             param_grid=[{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
                         {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                          'kernel': ['rbf']}],
             scoring='f1_weighted')

In [39]:
clf.best_params_

{'C': 1, 'kernel': 'linear'}

In [42]:
clf = svm.SVC(kernel="linear", C=1.)
clf.fit(x_train, l_Train)

SVC(kernel='linear')

In [43]:
l_Pred = clf.predict(x_test)

In [44]:
from sklearn.metrics import classification_report

# 評価レポート #bow + svm
print(classification_report(l_Test, l_Pred))

              precision    recall  f1-score   support

           0       0.70      0.82      0.75       735
           1       0.89      0.87      0.88       728
           2       0.87      0.83      0.85       792
           3       0.85      0.77      0.81       745

    accuracy                           0.82      3000
   macro avg       0.83      0.82      0.82      3000
weighted avg       0.83      0.82      0.82      3000



In [11]:
# 多項ナイーブベイズ 予測
l_Pred = NB_model.predict(s_Test)

In [18]:
# word2vec + svm
l_Pred = clf.predict(x_test)

In [19]:
from sklearn.metrics import classification_report
print(classification_report(l_Test, l_Pred))

              precision    recall  f1-score   support

           0       0.73      0.75      0.74      2425
           1       0.79      0.78      0.79      2460
           2       0.76      0.75      0.75      2534
           3       0.74      0.75      0.75      2563

    accuracy                           0.76      9982
   macro avg       0.76      0.76      0.76      9982
weighted avg       0.76      0.76      0.76      9982

