In [13]:
import pandas as pd

SEED = 42
wrime_df = pd.read_csv("wrime-ver2.tsv", sep="\t")

targets = (wrime_df['Avg. Readers_Joy'].values > 0).astype(int)
user_ids = wrime_df["UserID"].values
sentences = wrime_df["Sentence"].values


In [14]:
from sklearn.model_selection import train_test_split

idx_lst = list(range(len(targets)))

_train_idx, test_idx, _, _ = train_test_split(idx_lst, idx_lst, test_size=0.2, random_state=SEED, stratify=user_ids)
train_idx, valid_idx, _, _ = train_test_split(_train_idx, _train_idx, test_size=0.2, random_state=SEED, stratify=user_ids[_train_idx])

train_texts = sentences[train_idx]
valid_texts = sentences[valid_idx]
test_texts = sentences[test_idx]

y_train = targets[train_idx]
y_valid = targets[valid_idx]
y_test = targets[test_idx]


In [15]:
import MeCab


class WakatiMecab():

    def __init__(self):
        self.m = MeCab.Tagger("-Ochasen")

    def __call__(self, text):
        wakati = [w.split("\t") for w in self.m.parse(text).split("\n")[:-2]]
        return wakati

    def wakati(self, text):
        wakati = self.__call__(text)
        wakati = [w[0] for w in wakati]
        return " ".join(wakati)


In [16]:
import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

wakati_mecab = WakatiMecab()

train_corpus = [wakati_mecab.wakati(s) for s in train_texts]
valid_corpus = [wakati_mecab.wakati(s) for s in valid_texts]
test_corpus = [wakati_mecab.wakati(s) for s in test_texts]

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_corpus)
X_valid = vectorizer.transform(valid_corpus)
X_test = vectorizer.transform(test_corpus)


In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, classification_report

for c in [0.1, 0.3, 1.0, 3.0, 10.0]:
    lr = LogisticRegression(C=c, random_state=SEED, n_jobs=-1)
    lr.fit(X_train, y_train)
    y_pred_valid = lr.predict(X_valid)
    valid_acc = accuracy_score(y_valid, y_pred_valid)
    # a = precision_score(y_valid, y_pred_valid)
    # b = recall_score(y_valid, y_pred_valid)
    
    # c = classification_report(y_valid, y_pred_valid)
    
    # print(c)


In [22]:

# ロジスティック回帰での結果
lr = LogisticRegression(C=3, random_state=SEED, n_jobs=-1)
lr.fit(X_train, y_train)

y_pred_test = lr.predict(X_test)
test_acc = accuracy_score(y_test, y_pred_test)
c = classification_report(y_valid, y_pred_valid)
print("ロジスティック回帰での結果")
print(c)

print(f"Test Accuracy = {test_acc}")


ロジスティック回帰での結果
              precision    recall  f1-score   support

           0       0.81      0.89      0.85      3820
           1       0.70      0.54      0.61      1780

    accuracy                           0.78      5600
   macro avg       0.75      0.72      0.73      5600
weighted avg       0.77      0.78      0.77      5600

Test Accuracy = 0.7952857142857143


In [19]:
from sklearn.svm import SVC
# 線形SVMのインスタンスを生成
model = SVC(kernel='linear', random_state=None)
model.fit(X_train, y_train)

y_pred_test = model.predict(X_test)
test_acc = accuracy_score(y_test, y_pred_test)

print(f"Test Accuracy = {test_acc}")
c = classification_report(y_test, y_pred_test)
print("線形SVCでの結果")
print(c)




Test Accuracy = 0.7968571428571428
線形SVCでの結果
              precision    recall  f1-score   support

           0       0.81      0.92      0.86      4871
           1       0.75      0.50      0.60      2129

    accuracy                           0.80      7000
   macro avg       0.78      0.71      0.73      7000
weighted avg       0.79      0.80      0.78      7000



In [20]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=1234)
clf.fit(X_train, y_train)
print("score=", clf.score(X_test, y_test))

y_pred_test = clf.predict(X_test)
test_acc = accuracy_score(y_test, y_pred_test)

print(f"Test Accuracy = {test_acc}")
c = classification_report(y_test, y_pred_test)
print("ランダムフォレストでの結果")
print(c)


score= 0.779
Test Accuracy = 0.779
ランダムフォレストでの結果
              precision    recall  f1-score   support

           0       0.78      0.96      0.86      4871
           1       0.79      0.37      0.51      2129

    accuracy                           0.78      7000
   macro avg       0.78      0.67      0.68      7000
weighted avg       0.78      0.78      0.75      7000



In [21]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(X_train, y_train)
y_pred_test = clf.predict(X_test)
test_acc = accuracy_score(y_test, y_pred_test)

print(f"Test Accuracy = {test_acc}")
c = classification_report(y_test, y_pred_test)
print("ｋでの結果")
print(c)


Test Accuracy = 0.7082857142857143
ｋでの結果
              precision    recall  f1-score   support

           0       0.72      0.96      0.82      4871
           1       0.59      0.14      0.23      2129

    accuracy                           0.71      7000
   macro avg       0.65      0.55      0.52      7000
weighted avg       0.68      0.71      0.64      7000

