In [15]:
import json
import random
import os
import torch
import numpy as np
import polars as pl
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
# Directory holding the input files; rewritten later by resolve_path() to be
# relative to the current working directory.
DATA_PATH = "data"
# Seed passed to seed_everything().
SEED = 42
# Number of cross-validation folds iterated by the TF-IDF loop.
N_FOLD = 3

# Path settings
EXP_NAME = "e-tfidf-wo-pca"
MODEL_NAME = "tfidf"
COMPETITION_NAME = "automated_essay_scoring"
# Only the last "/"-separated segment of MODEL_NAME is used in the dataset name.
DATASET_NAME = f"{EXP_NAME}-{MODEL_NAME.split('/')[-1]}"
# Directory receiving the pickled vectorizers and the OOF feature CSV.
MODEL_OUTPUT_PATH = f"trained_models/{EXP_NAME}"

# When True, the final cell uploads MODEL_OUTPUT_PATH as a new Kaggle dataset.
UPLOAD_DATA_TO_KAGGLE = True

In [17]:
def validate_dataset_name(dataset_name: str) -> None:
    """Check that ``dataset_name`` satisfies this notebook's dataset naming rules.

    The name must be between 6 and 50 characters long (inclusive) and must not
    contain an underscore.

    Args:
        dataset_name: Candidate dataset name to validate.

    Raises:
        ValueError: If the length is outside 6..50 or the name contains "_".
            ``ValueError`` is a subclass of ``Exception``, so callers that
            catch ``Exception`` keep working.
    """
    name_length = len(dataset_name)
    if name_length < 6 or name_length > 50:
        # Report the length of the argument that was actually validated,
        # not of the module-level DATASET_NAME constant.
        raise ValueError(
            f"データセットの文字列は6~50文字にしてください。現在{name_length}文字"
        )
    if "_" in dataset_name:
        raise ValueError("datasetの名称に_の使用は禁止です")


validate_dataset_name(DATASET_NAME)

In [18]:
def resolve_path(base_path: str) -> str:
    """Translate a project-relative path into one valid for the current working directory.

    The current working directory is printed, then matched against the known
    launch locations; a short label naming the detected environment is printed
    as well.

    Args:
        base_path: Path relative to the project root (e.g. "data").

    Returns:
        ``base_path`` prefixed so that it resolves from the current directory.

    Raises:
        Exception: If the working directory matches none of the known layouts.
    """
    current_dir = os.getcwd()
    print(current_dir)

    project_root = f"/notebooks/{COMPETITION_NAME}"

    # Kernel started from /notebooks: address the project by absolute path.
    if current_dir == "/notebooks":
        print("Jupyter Kernel By VSCode!")
        return f"{project_root}/{base_path}"

    # Already at the project root: the relative path works unchanged.
    if current_dir == project_root:
        print("nohup!")
        return base_path

    # Both remaining layouts sit two directories below the project root.
    if current_dir == f"{project_root}/{COMPETITION_NAME}/exp":
        environment_label = "Jupyter Lab!"
    elif current_dir.startswith("/Users"):
        environment_label = "Local Mac!"
    else:
        raise Exception("Unknown environment")

    print(environment_label)
    return f"../../{base_path}"


# Rewrite both configured paths so they resolve from the current working directory.
# NOTE(review): each assignment overwrites its own input, so re-running this cell
# without re-running the config cell feeds an already-resolved path back into
# resolve_path() (the prefix is then applied twice in every branch except the
# one that returns base_path unchanged).
DATA_PATH = resolve_path(DATA_PATH)
print(DATA_PATH)
MODEL_OUTPUT_PATH = resolve_path(MODEL_OUTPUT_PATH)
print(MODEL_OUTPUT_PATH)

/Users/shinichiro.saito/automated_essay_scoring/automated_essay_scoring/eda
Local Mac!
../../data
/Users/shinichiro.saito/automated_essay_scoring/automated_essay_scoring/eda
Local Mac!
../../trained_models/e-tfidf-wo-pca


In [19]:
import os

# Create the output directory (including missing parents); exist_ok=True keeps
# re-runs from failing when it already exists.
os.makedirs(MODEL_OUTPUT_PATH, exist_ok=True)

In [20]:
def seed_everything(seed: int) -> None:
    """Seed every random number generator used in this notebook with ``seed``.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU and all CUDA devices), exports ``PYTHONHASHSEED``, and
    configures cuDNN for deterministic behaviour.

    Args:
        seed: Seed value applied to all generators.
    """
    random.seed(seed)
    # Only affects hash randomisation of Python processes started after this point.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Safe to call without CUDA; covers every visible GPU rather than only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN autotune and pick different kernels between
    # runs, which defeats the deterministic setting above.
    torch.backends.cudnn.benchmark = False


seed_everything(SEED)

In [21]:
# Read train.csv from the resolved data directory into a polars DataFrame.
train = pl.read_csv(f"{DATA_PATH}/train.csv")

In [22]:
# Load the precomputed mapping used to assign a fold to each essay.
# presumably keys are essay_id strings and values are fold indices — verify against the JSON file.
with open(f"{DATA_PATH}/essay_id_fold_by_s_sl_g_p_only_train_dict.json") as f:
    essay_id_fold_only_train = json.load(f)

# Map every essay_id through the dictionary and store the result as an Int64 "fold" column.
# NOTE(review): how ids missing from the mapping are handled depends on the
# polars version — confirm every essay_id is covered.
train = train.with_columns(
    pl.col("essay_id")
    .replace(essay_id_fold_only_train, return_dtype=pl.Int64)
    .alias("fold")
)

In [23]:
# Preview the first rows, including the newly attached "fold" column.
train.head()

essay_id,full_text,score,fold
str,str,i64,i64
"""000d118""","""Many people have car where the…",3,0
"""000fe60""","""I am a scientist at NASA that …",3,0
"""001ab80""","""People always wish they had th…",4,1
"""001bdc0""","""We all heard about Venus, the …",4,0
"""002ba53""","""Dear, State Senator This is a…",3,2


In [24]:
# Fit one vectorizer on every training essay to obtain the shared vocabulary
# that the per-fold vectorizers reuse.
tfidf_kwargs = {
    "strip_accents": "unicode",
    "analyzer": "word",
    "ngram_range": (1, 3),  # word uni-, bi- and tri-grams
    "min_df": 0.05,  # float => proportion of documents (lower bound)
    "max_df": 0.95,  # float => proportion of documents (upper bound)
    "sublinear_tf": True,
}
# fit() returns the vectorizer itself, so construction and fitting can be chained.
vectorizer = TfidfVectorizer(**tfidf_kwargs).fit(train["full_text"])
all_voc = vectorizer.vocabulary_

In [25]:
# Number of terms in the shared vocabulary (= number of TF-IDF feature columns).
len(all_voc)

1203

In [26]:
from sklearn.decomposition import PCA  # NOTE(review): not used in this cell
import pickle


# Out-of-fold TF-IDF features, one DataFrame per fold.
oofs: list[pd.DataFrame] = []
# Cross-validation: fit on the other folds, transform the held-out fold
for fold in range(N_FOLD):
    print(f"Start fold {fold}")

    # Split into the training part and the held-out part for this fold
    train_fold = train.filter(pl.col("fold") != fold)
    valid_fold = train.filter(pl.col("fold") == fold)

    # The vocabulary is pinned to all_voc so every fold produces the same
    # columns in the same order. With a fixed vocabulary scikit-learn ignores
    # min_df / max_df; they are kept so the pickled parameters mirror the
    # vectorizer that built the vocabulary.
    vectorizer = TfidfVectorizer(
        strip_accents="unicode",
        analyzer="word",
        ngram_range=(1, 3),
        min_df=0.05,
        max_df=0.95,
        sublinear_tf=True,
        vocabulary=all_voc,
    )

    # The IDF weights are learned from the training part only.
    vectorizer.fit(train_fold["full_text"])
    valid_tfid = vectorizer.transform(valid_fold["full_text"])

    dense_matrix = valid_tfid.toarray()

    df = pd.DataFrame(
        dense_matrix,
        columns=[f"tfidf_{i}" for i in range(len(all_voc))],
    )

    # Convert the polars Series to a plain list so pandas assigns the ids
    # positionally without relying on implicit cross-library conversion.
    df["essay_id"] = valid_fold["essay_id"].to_list()

    oofs.append(df)

    # Save the fitted vectorizer for this fold
    with open(
        f"{MODEL_OUTPUT_PATH}/tfidf_vectorizer_s_sl_g_p_fold{fold}.pkl", "wb"
    ) as file:
        pickle.dump(vectorizer, file)

# Every per-fold frame carries its own 0..n-1 index; ignore_index=True gives
# the combined frame unique row labels instead of repeating them per fold.
all_tfidf_res = pd.concat(oofs, ignore_index=True)

Start fold 0
Start fold 1
Start fold 2


In [27]:
# oof_tfidf = pd.DataFrame(
#     all_tfidf_res,
#     columns=[f"tfidf_{i}" for i in range(100)],
# )
# oof_tfidf["essay_id"] = all_tfidf_res["essay_id"].tolist()

In [28]:
# Preview the combined out-of-fold TF-IDF features.
all_tfidf_res.head()

Unnamed: 0,tfidf_0,tfidf_1,tfidf_2,tfidf_3,tfidf_4,tfidf_5,tfidf_6,tfidf_7,tfidf_8,tfidf_9,...,tfidf_1194,tfidf_1195,tfidf_1196,tfidf_1197,tfidf_1198,tfidf_1199,tfidf_1200,tfidf_1201,tfidf_1202,essay_id
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029255,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031682,000d118
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.035858,...,0.0,0.0,0.069853,0.081853,0.0,0.0,0.0,0.0,0.0,000fe60
2,0.0,0.0,0.150417,0.161345,0.0,0.0,0.095293,0.0,0.0,0.032677,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,001bdc0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038582,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0036253
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0047cb3


In [29]:
# Write the OOF features next to the pickled vectorizers; the row index is not saved.
all_tfidf_res.to_csv(f"{MODEL_OUTPUT_PATH}/oof_tfidf.csv", index=False)

# Kaggleへのアップロード

In [30]:
if UPLOAD_DATA_TO_KAGGLE:
    # Imported inside the guard, so the kaggle package is only needed when
    # uploading is enabled.
    from kaggle.api.kaggle_api_extended import KaggleApi

    def dataset_create_new(dataset_name: str, upload_dir: str):
        """Write dataset-metadata.json into ``upload_dir`` and create a new Kaggle dataset from it.

        Args:
            dataset_name: Used both as the dataset title and as the slug of the
                dataset id (under the hard-coded owner "sinchir0").
            upload_dir: Folder passed to the Kaggle API; the metadata file is
                written into this same folder.
        """
        # The dataset name is validated earlier in the notebook by validate_dataset_name().
        metadata = {
            "id": f"sinchir0/{dataset_name}",
            "licenses": [{"name": "CC0-1.0"}],
            "title": dataset_name,
        }
        metadata_path = os.path.join(upload_dir, "dataset-metadata.json")
        with open(metadata_path, "w") as f:
            json.dump(metadata, f, indent=4)

        client = KaggleApi()
        client.authenticate()
        client.dataset_create_new(
            folder=upload_dir, convert_to_csv=False, dir_mode="tar"
        )

    print(f"Create Dataset name:{DATASET_NAME}, output_dir:{MODEL_OUTPUT_PATH}")
    dataset_create_new(dataset_name=DATASET_NAME, upload_dir=MODEL_OUTPUT_PATH)

Create Dataset name:e-tfidf-wo-pca-tfidf, output_dir:../../trained_models/e-tfidf-wo-pca
Starting upload for file tfidf_vectorizer_s_sl_g_p_fold2.pkl


100%|██████████| 43.5k/43.5k [00:00<00:00, 58.2kB/s]


Upload successful: tfidf_vectorizer_s_sl_g_p_fold2.pkl (43KB)
Starting upload for file tfidf_vectorizer_s_sl_g_p_fold0.pkl


100%|██████████| 43.5k/43.5k [00:00<00:00, 58.8kB/s]


Upload successful: tfidf_vectorizer_s_sl_g_p_fold0.pkl (43KB)
Starting upload for file tfidf_vectorizer_s_sl_g_p_fold1.pkl


100%|██████████| 43.5k/43.5k [00:00<00:00, 57.5kB/s]


Upload successful: tfidf_vectorizer_s_sl_g_p_fold1.pkl (43KB)
Starting upload for file oof_tfidf.csv


100%|██████████| 122M/122M [00:10<00:00, 12.2MB/s] 


Upload successful: oof_tfidf.csv (122MB)
