In [1]:
import re
import numpy as np
import pandas as pd
from sudachipy import tokenizer
from sudachipy import dictionary
from scipy.sparse import vstack as svstack
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# Split mode passed to every tokenize() call in this notebook.
# NOTE(review): per the SudachiPy docs, mode C is the coarsest split unit — confirm
# this is the intended granularity for the TF-IDF features.
mode = tokenizer.Tokenizer.SplitMode.C

# Single Sudachi tokenizer instance, created once and reused by all cells.
tokenizer_obj = dictionary.Dictionary().create()

In [2]:
# Build the labelled corpus: one record per tweet, holding a boolean flag per
# emotion (True when that emotion's emoji occurs in the tokenized tweet) and the
# tokenized text with those emoji blanked out.
data = []
# Spans blanked out before tokenization: @mentions / #hashtags and URLs, each
# matched lazily up to (and including) the next space.
regs = [r"[@#].+? ", r"http.+? "]
labels = list(zip("happy,sad,disgust,angry,surprise,fear".split(","), "😊 😢 🤢 😠 😮 😨".split(" ")))
# Decode explicitly as UTF-8: the text is matched against emoji literals, and the
# platform default encoding is not UTF-8 everywhere.
# NOTE(review): assumes ../tweets.txt is UTF-8 encoded — confirm.
with open("../tweets.txt", "r", encoding="utf-8") as f:
    for line in f:
        # Keep only what precedes the "[:::]" separator.
        line = line.split("[:::]")[0]
        # Drop the first five whitespace-separated fields (presumably metadata —
        # TODO confirm). The trailing space lets the patterns in `regs`, which all
        # end in a space, also match a mention/URL at the very end of the line.
        x = ' '.join(line.strip().split()[5:])+" "
        for r in regs:
            x = re.sub(r, ' ', x)
        # Replace every morpheme by its dictionary form, space-separated.
        x = ' '.join([m.dictionary_form() for m in tokenizer_obj.tokenize(x, mode)]).strip()
        # Label from the emoji first, then remove them so they cannot leak into
        # the features.
        y = {name: (emoji in x) for name, emoji in labels}
        for _, emoji in labels:
            x = x.replace(emoji, " ")
        # Keep a tweet only if it carries at least one emotion emoji or the
        # neutral face (which yields an all-False row).
        if "😐" not in x and not any(y.values()):
            continue
        # "text" is inserted last, so it ends up as the last DataFrame column.
        y["text"] = x
        data.append(y)
df = pd.DataFrame(data)
# Release the list of dicts; only the DataFrame is used from here on.
del data

In [3]:
# Train one binary classifier per emotion column on a class-balanced subsample.
# A single seed drives negative sampling, the train/test split and liblinear so
# that Restart & Run All reproduces the same reports.
SEED = 42
rng = np.random.default_rng(SEED)

X = df["text"]
# NOTE(review): the vectorizer is fitted on every row, including rows that later
# end up in the held-out split, so vocabulary/IDF statistics see the test texts.
vect = TfidfVectorizer(min_df=3, max_df=0.3).fit(X)
X_vec = vect.transform(X)
models = []
# Select label columns by name instead of relying on "text" being the last column.
for c in [col for col in df.columns if col != "text"]:
    # Work on a plain boolean array so all indexing below is positional and does
    # not depend on the DataFrame index being a default RangeIndex.
    y = df[c].to_numpy(dtype=bool)
    inds1 = np.flatnonzero(y)
    neg_inds = np.flatnonzero(~y)
    # Undersample negatives WITHOUT replacement: sampling with replacement (the
    # np.random.choice default) can place copies of the same row in both the
    # train and the test split. Cap at the number of available negatives.
    inds2 = rng.choice(neg_inds, size=min(len(inds1), len(neg_inds)), replace=False)
    X_sel = svstack([X_vec[inds1], X_vec[inds2]])
    y_sel = np.concatenate([y[inds1], y[inds2]])
    X_train, X_test, y_train, y_test = train_test_split(
        X_sel, y_sel, shuffle=True, test_size=0.01, random_state=SEED
    )
    clf = LogisticRegression(solver="liblinear", penalty="l1", random_state=SEED).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))
    # Keep (label name, fitted classifier) pairs for persistence and inference.
    models.append((c, clf))

              precision    recall  f1-score   support

       False       0.79      0.80      0.79    112085
        True       0.80      0.78      0.79    111253

    accuracy                           0.79    223338
   macro avg       0.79      0.79      0.79    223338
weighted avg       0.79      0.79      0.79    223338

              precision    recall  f1-score   support

       False       0.76      0.78      0.77     23322
        True       0.78      0.76      0.77     23509

    accuracy                           0.77     46831
   macro avg       0.77      0.77      0.77     46831
weighted avg       0.77      0.77      0.77     46831

              precision    recall  f1-score   support

       False       0.76      0.73      0.75       720
        True       0.74      0.77      0.76       723

    accuracy                           0.75      1443
   macro avg       0.75      0.75      0.75      1443
weighted avg       0.75      0.75      0.75      1443

              preci

In [4]:
import pickle
# Persist the fitted vectorizer together with the (label, classifier) pairs as a
# single pickled tuple, so inference can restore both from one file.
bundle = (vect, tuple(models))
with open("model.pkl", "wb") as model_file:
    pickle.dump(bundle, model_file)

In [5]:
def tok(x):
    """Return ``x`` as a space-separated string of Sudachi dictionary forms.

    Tokenizes ``x`` with the notebook-level ``tokenizer_obj`` and ``mode``,
    replaces each morpheme by its dictionary form, joins them with single
    spaces and strips leading/trailing whitespace.
    """
    lemmas = [morpheme.dictionary_form() for morpheme in tokenizer_obj.tokenize(x, mode)]
    joined = ' '.join(lemmas)
    return joined.strip()

# Ad-hoc sanity check: score a few hand-written sentences with every
# per-emotion classifier trained above.
texts = [
    "ティファの技って面白いですね",
    "エアリスが死んで悲しいよ",
    "クラウドって自称ソルジャーとかキモいね",
    "バレットって叫んでばっかでうるさくてムカつく",
    "セフィロスチョー強くてびっくり",
    "タークスとかいう闇組織こわ",
]
# Tokenize each sentence the same way as the training text, then vectorize.
v = vect.transform([tok(sentence) for sentence in texts])
for label, model in models:
    # Column 1 of predict_proba: probability of the second entry of
    # model.classes_ (presumably True, given the boolean training labels).
    positive_proba = model.predict_proba(v)[:, 1]
    print(label, positive_proba)

happy [0.71784105 0.01217588 0.24688099 0.07045632 0.27956653 0.18828918]
sad [0.16119984 0.98623918 0.36672844 0.23190665 0.390584   0.53631942]
disgust [0.41737333 0.65189701 0.914085   0.80733884 0.62794515 0.65786511]
angry [0.42242915 0.287903   0.80176931 0.99917872 0.54210019 0.63226186]
surprise [0.57379286 0.21409382 0.69020821 0.66950584 0.86880499 0.64293575]
fear [0.35976251 0.52168146 0.82516612 0.40859211 0.89302586 0.89642556]
