In [1]:
#!head -n 1 tweets.txt

In [2]:
#!cat gather.py

In [3]:
import re
import numpy as np
import pandas as pd
from sudachipy import tokenizer
from sudachipy import dictionary
from scipy.sparse import vstack as svstack
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# Build a single Sudachi tokenizer (default Dictionary() settings); the same
# object is reused for every tokenize() call in the cells below.
tokenizer_obj = dictionary.Dictionary().create()
# Split mode handed to every tokenize() call.
# NOTE(review): SplitMode.C is understood to be Sudachi's coarsest
# (longest-unit) segmentation — confirm this is the intended granularity.
mode = tokenizer.Tokenizer.SplitMode.C

In [4]:
# Build the labelled dataset: one row per kept tweet, with one boolean column
# per emotion (True when that emotion's emoji occurs in the processed text)
# plus a "text" column holding the space-joined dictionary forms.
data = []
# Patterns blanked out before tokenizing: @mentions / #hashtags and URLs, each
# matched lazily up to the next space (a trailing space is appended to every
# tweet below so a match at end-of-line still terminates).
regs = [r"[@#].+? ", r"http.+? "]
# (column name, emoji) pairs; the emoji acts as a distant-supervision label.
labels = list(zip("happy,sad,disgust,angry,surprise,fear".split(","), "😊 😢 🤢 😠 😮 😨".split(" ")))
# The encoding is explicit because the file contains Japanese text and emoji;
# relying on the platform default breaks on non-UTF-8 locales.
with open("tweets.txt", "r", encoding="utf-8") as f:
    for line in f:
        # Keep only the part before the "[:::]" separator.
        line = line.split("[:::]")[0]
        # NOTE(review): presumably the first five whitespace-separated fields
        # are metadata written by gather.py — confirm against that script.
        x = ' '.join(line.strip().split()[5:])+" "
        for r in regs:
            x = re.sub(r, ' ', x)
        # Normalise every morpheme to its dictionary form and re-join with spaces.
        x = ' '.join([m.dictionary_form() for m in tokenizer_obj.tokenize(x, mode)]).strip()
        y = {name: (emoji in x) for name, emoji in labels}
        # Drop tweets that carry neither an emotion emoji nor the neutral one;
        # neutral tweets are kept as all-False rows.
        if "😐" not in x and not any(y.values()):
            continue
        y["text"] = x
        data.append(y)
df = pd.DataFrame(data)
# The list of dicts is no longer needed once the frame exists.
del data
#df.head()

In [6]:
# Train one balanced binary classifier per emotion column.
#
# For each label, every positive row is paired with an equally sized random
# sample of negative rows, the result is split into train/test, an
# L1-regularised logistic regression is fitted, and a classification report is
# printed. `models` ends up as a list of (label name, fitted classifier).

# Single seed for negative sampling, the train/test split and liblinear, so a
# "Restart & Run All" reproduces the same reports.
SEED = 42
rng = np.random.default_rng(SEED)

X = df["text"]
# NOTE(review): the vectorizer is fitted on all rows, including those that
# later land in the test split, so IDF statistics see test data.
vect = TfidfVectorizer(min_df=3, max_df=0.3)
X_vec = vect.fit_transform(X)
models = []
# Select label columns by name rather than assuming "text" is the last column.
for c in [col for col in df.columns if col != "text"]:
    y = df[c]
    # Work on a plain boolean array so all indexing below is positional and
    # does not depend on the DataFrame index being a default RangeIndex.
    is_pos = y.to_numpy(dtype=bool)
    inds1 = np.flatnonzero(is_pos)
    neg_pool = np.flatnonzero(~is_pos)
    if len(inds1) == 0 or len(neg_pool) == 0:
        # A one-class problem cannot be trained; report it instead of crashing.
        print(f"skipping {c!r}: needs both positive and negative rows")
        continue
    # Sample negatives without replacement whenever enough exist: duplicated
    # rows could otherwise appear in both train and test and inflate scores.
    inds2 = rng.choice(neg_pool, size=len(inds1), replace=len(neg_pool) < len(inds1))
    X_sel = svstack([X_vec[inds1], X_vec[inds2]])
    y_sel = np.concatenate([is_pos[inds1], is_pos[inds2]])
    X_train, X_test, y_train, y_test = train_test_split(
        X_sel, y_sel, shuffle=True, test_size=0.01, random_state=SEED
    )
    clf = LogisticRegression(solver="liblinear", penalty="l1", random_state=SEED).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))
    models.append((c, clf))

              precision    recall  f1-score   support

       False       0.79      0.81      0.80    111690
        True       0.80      0.78      0.79    111648

    accuracy                           0.80    223338
   macro avg       0.80      0.80      0.80    223338
weighted avg       0.80      0.80      0.80    223338

              precision    recall  f1-score   support

       False       0.76      0.78      0.77     23326
        True       0.78      0.76      0.77     23505

    accuracy                           0.77     46831
   macro avg       0.77      0.77      0.77     46831
weighted avg       0.77      0.77      0.77     46831

              precision    recall  f1-score   support

       False       0.79      0.74      0.77       720
        True       0.76      0.81      0.78       723

    accuracy                           0.77      1443
   macro avg       0.78      0.77      0.77      1443
weighted avg       0.78      0.77      0.77      1443

              preci

In [11]:
import pickle
# Persist the fitted vectorizer together with the (label, classifier) pairs
# as a single two-element tuple.
bundle = (vect, tuple(models))
with open("model.pkl", "wb") as out:
    pickle.dump(bundle, out)

In [18]:
def tok(x):
    """Return `x` tokenized, as a space-separated string of dictionary forms.

    Uses the notebook-level `tokenizer_obj` and `mode`; leading and trailing
    whitespace is stripped from the joined result.
    """
    lemmas = (morpheme.dictionary_form() for morpheme in tokenizer_obj.tokenize(x, mode))
    return ' '.join(lemmas).strip()

# Smoke test: run six hand-written sentences through every trained classifier.
texts = [
    "ティファってチョーかわいいなー",
    "エアリスが死んで悲しいよ",
    "クラウドって自称ソルジャーとかキモいね",
    "バレットって叫んでばっかでうるさくてムカつく",
    "セフィロスチョー強くてびっくり",
    "タークスとかいう闇組織こわ",
]
# Tokenize the sentences the same way as the training text, then vectorize.
v = vect.transform([tok(sentence) for sentence in texts])
for n, m in models:
    # Column 1 of predict_proba is the second class, i.e. True for these
    # boolean targets.
    positive_probs = m.predict_proba(v)[:, 1]
    print(n, positive_probs)

happy [0.4149349  0.01118172 0.26394603 0.08769217 0.27527495 0.17970833]
sad [0.35991318 0.98267576 0.38108967 0.25645773 0.56397691 0.50454481]
disgust [0.59940007 0.64030534 0.95440502 0.85891457 0.59698875 0.69558706]
angry [0.63244043 0.32554099 0.75003635 0.99825019 0.57824366 0.59089631]
surprise [0.73755418 0.23817178 0.71688511 0.68124337 0.90511871 0.68201801]
fear [0.60314765 0.56247    0.8494008  0.39528605 0.8639368  0.87915443]
