In [5]:
import numpy as np
import pandas as pd

from dotenv import load_dotenv
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from langchain.embeddings.huggingface import HuggingFaceEmbeddings


# Read key=value pairs from a local .env file into the process environment
# (python-dotenv); the recorded cell output shows the call returned True.
# NOTE(review): nothing visible in this notebook reads those variables
# directly -- confirm which downstream library needs them.
load_dotenv()

True

# 01 Prepare data

- NHK program list text and genres (too few)
- MARC-ja (not available)
- JCoLA (https://github.com/osekilab/JCoLA) — the dataset used in this notebook

# 02 Feature engineering

In [2]:
# Load the JCoLA in-domain training split (tab-separated file).
data_path = "../local/data/JCoLA-main/data/jcola-v1.0/in_domain_train-v1.0.tsv"
df = pd.read_csv(data_path, sep="\t")

# Text feature extraction: sentence-embedding model used as the encoder.
model_name = "oshizo/sbert-jsnli-luke-japanese-base-lite"
# Alternative encoder: model_name = "studio-ousia/luke-japanese-base-lite"
encoder = HuggingFaceEmbeddings(model_name=model_name)

# Text columns that get embedded and appended to the frame.
text_features = ["sentence"]

for column in text_features:
    # embed_documents is declared to take a list of strings, so hand it the
    # column's values as a plain list (same strings, same order).
    vectors = encoder.embed_documents(df[column].tolist())
    # One row per input row (index copied from df), one column per embedding
    # dimension, named embed_<column>_<dimension>.
    embeddings = pd.DataFrame(vectors, index=df.index)
    embeddings = embeddings.add_prefix(f"embed_{column}_")
    df = df.join(embeddings, how="left")


In [3]:
# Choose training features: every column whose name contains "embed_"
# (the embedding columns appended during feature engineering).
training_features = df.filter(like="embed_").columns.tolist()
target_label = "label"

# Narrow the frame to the model inputs plus the target; later cells read `df`.
df = df.loc[:, [*training_features, target_label]]
df.head(1)

Unnamed: 0,embed_sentence_0,embed_sentence_1,embed_sentence_2,embed_sentence_3,embed_sentence_4,embed_sentence_5,embed_sentence_6,embed_sentence_7,embed_sentence_8,embed_sentence_9,...,embed_sentence_759,embed_sentence_760,embed_sentence_761,embed_sentence_762,embed_sentence_763,embed_sentence_764,embed_sentence_765,embed_sentence_766,embed_sentence_767,label
0,0.779722,-0.159863,0.390812,0.163809,0.441645,-0.422382,0.294479,0.004554,-0.151662,0.318913,...,-0.282139,-0.167588,0.326852,-0.147425,0.16059,0.037417,-0.125623,0.472888,-0.110388,1


In [6]:
# Train a logistic-regression classifier on the embedding features.

# Feature matrix and target vector.
X = df.loc[:, training_features]
y = df.loc[:, target_label]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified and the recorded report shows label 0
# is the minority class (236 of 1384 test rows) with low recall -- consider
# `stratify=y` and/or `class_weight="balanced"` in a follow-up experiment.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# max_iter=1000 gives the solver more iterations than scikit-learn's default.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Mean accuracy on each split, then per-class metrics on the held-out rows.
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
y_pred = clf.predict(X_test)

print("train score ", train_accuracy)
print("test score ", test_accuracy)
print(classification_report(y_test, y_pred))


train score  0.8643179765130985
test score  0.8229768786127167
              precision    recall  f1-score   support

           0       0.45      0.18      0.26       236
           1       0.85      0.96      0.90      1148

    accuracy                           0.82      1384
   macro avg       0.65      0.57      0.58      1384
weighted avg       0.78      0.82      0.79      1384

