In [56]:
import json
import random
import os
import torch
import numpy as np
import polars as pl
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

In [57]:
DATA_PATH = "data"
SEED = 42
N_FOLD = 3

# path setting
EXP_NAME = "e-tfidf"
MODEL_NAME = "tfidf"
COMPETITION_NAME = "automated_essay_scoring"
DATASET_NAME = f"{EXP_NAME}-{MODEL_NAME.split('/')[-1]}"
MODEL_OUTPUT_PATH = f"trained_models/{EXP_NAME}"

UPLOAD_DATA_TO_KAGGLE = True

In [58]:
def validate_dataset_name(dataset_name: str) -> None:
    if len(dataset_name) < 6 or len(dataset_name) > 50:
        raise Exception(
            f"データセットの文字列は6~50文字にしてください。現在{len(DATASET_NAME)}文字"
        )
    if "_" in dataset_name:
        raise Exception("datasetの名称に_の使用は禁止です")


validate_dataset_name(DATASET_NAME)

In [59]:
def resolve_path(base_path: str) -> str:
    import os

    cwd = os.getcwd()
    print(cwd)
    if cwd == f"/notebooks":
        print("Jupyter Kernel By VSCode!")
        return f"/notebooks/{COMPETITION_NAME}/{base_path}"
    elif cwd == f"/notebooks/{COMPETITION_NAME}":
        print("nohup!")
        return base_path
    elif cwd == f"/notebooks/{COMPETITION_NAME}/{COMPETITION_NAME}/exp":
        print("Jupyter Lab!")
        return f"../../{base_path}"
    elif cwd.startswith("/Users"):
        print("Local Mac!")
        return f"../../{base_path}"
    else:
        raise Exception("Unknown environment")


DATA_PATH = resolve_path(DATA_PATH)
print(DATA_PATH)
MODEL_OUTPUT_PATH = resolve_path(MODEL_OUTPUT_PATH)
print(MODEL_OUTPUT_PATH)

/Users/shinichiro.saito/automated_essay_scoring/automated_essay_scoring/eda
Local Mac!
../../data
/Users/shinichiro.saito/automated_essay_scoring/automated_essay_scoring/eda
Local Mac!
../../trained_models/e-tfidf


In [60]:
import os

os.makedirs(MODEL_OUTPUT_PATH, exist_ok=True)

In [30]:
# Seed the same seed to all
def seed_everything(seed: int) -> None:
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True


seed_everything(SEED)

In [31]:
train = pl.read_csv(f"{DATA_PATH}/train.csv")

In [32]:
with open(f"{DATA_PATH}/essay_id_fold_by_s_sl_g_p_only_train_dict.json") as f:
    essay_id_fold_only_train = json.load(f)

train = train.with_columns(
    pl.col("essay_id")
    .replace(essay_id_fold_only_train, return_dtype=pl.Int64)
    .alias("fold")
)

In [33]:
train.head()

essay_id,full_text,score,fold
str,str,i64,i64
"""000d118""","""Many people have car where the…",3,0
"""000fe60""","""I am a scientist at NASA that …",3,0
"""001ab80""","""People always wish they had th…",4,1
"""001bdc0""","""We all heard about Venus, the …",4,0
"""002ba53""","""Dear, State Senator This is a…",3,2


In [80]:
vectorizer = TfidfVectorizer(
    strip_accents="unicode",
    analyzer="word",
    ngram_range=(3, 6),
    min_df=0.05,
    max_df=0.95,
    sublinear_tf=True,
)
vectorizer.fit(train["full_text"])
all_voc = vectorizer.vocabulary_

In [81]:
len(all_voc)

68

In [82]:
from sklearn.decomposition import PCA
import pickle


oofs: list[pd.DataFrame] = []
# Cross Validationによる学習の実施
for fold in range(N_FOLD):
    print(f"Start fold {fold}")

    # foldごとにtrainとvalidに分ける
    train_fold = train.filter(pl.col("fold") != fold)
    valid_fold = train.filter(pl.col("fold") == fold)

    # TfidfVectorizer parameter
    vectorizer = TfidfVectorizer(
        strip_accents="unicode",
        analyzer="word",
        ngram_range=(1, 3),
        min_df=0.05,
        max_df=0.95,
        sublinear_tf=True,
        vocabulary=all_voc,
    )

    vectorizer.fit(train_fold["full_text"])
    valid_tfid = vectorizer.transform(valid_fold["full_text"])

    dense_matrix = valid_tfid.toarray()

    df = pd.DataFrame(
        dense_matrix,
        columns=[f"tfidf_{i}" for i in range(len(all_voc))],
    )

    df["essay_id"] = valid_fold["essay_id"]

    oofs.append(df)

    # save vectorizer
    with open(
        f"{MODEL_OUTPUT_PATH}/tfidf_vectorizer_s_sl_g_p_fold{fold}.pkl", "wb"
    ) as file:
        pickle.dump(vectorizer, file)

all_tfidf_res = pd.concat(oofs)
pca = PCA(n_components=100)
all_tfidf_reduced = pca.fit_transform(
    all_tfidf_res[[f"tfidf_{i}" for i in range(len(all_voc))]]
)

Start fold 0
Start fold 1
Start fold 2


In [50]:
oof_tfidf = pd.DataFrame(
    all_tfidf_reduced,
    columns=[f"tfidf_{i}" for i in range(100)],
)
oof_tfidf["essay_id"] = all_tfidf_res["essay_id"].tolist()

In [52]:
oof_tfidf.head()

Unnamed: 0,tfidf_0,tfidf_1,tfidf_2,tfidf_3,tfidf_4,tfidf_5,tfidf_6,tfidf_7,tfidf_8,tfidf_9,...,tfidf_91,tfidf_92,tfidf_93,tfidf_94,tfidf_95,tfidf_96,tfidf_97,tfidf_98,tfidf_99,essay_id
0,0.071303,-0.025168,0.026312,0.007992,0.144536,0.282579,-0.068989,-0.116552,-0.100728,-0.024354,...,0.046455,0.00793,0.014329,-0.029865,-0.013112,0.014016,-0.037306,-0.032094,0.001985,000d118
1,-0.213989,-0.098673,0.163686,0.542009,-0.137247,-0.045957,-0.005828,-0.009804,-0.020199,-0.03083,...,0.031225,0.022761,-0.00954,-0.051266,-0.026955,0.012087,0.019717,0.019481,-0.064898,000fe60
2,-0.46295,-0.268511,-0.28586,-0.171814,-0.050769,-0.026806,0.017007,0.015919,0.00795,-0.01629,...,-0.001518,0.006182,-0.000406,0.021521,0.010319,0.00214,-0.031396,-0.034834,-0.042466,001bdc0
3,-0.474875,-0.268998,-0.308659,-0.218077,-0.05614,-0.064449,0.03562,0.049717,0.10884,0.115138,...,-0.004571,-0.047324,-0.013683,-0.017371,0.047397,0.002733,0.043167,-0.007941,-0.02017,0036253
4,0.060296,-0.026,0.037222,0.010112,0.182746,0.57202,0.067592,0.079728,0.06142,0.027516,...,0.001411,0.02242,-0.009882,-0.013467,0.033988,-0.031986,0.017821,-0.005743,0.024899,0047cb3


In [53]:
oof_tfidf.to_csv(f"{MODEL_OUTPUT_PATH}/oof_tfidf.csv", index=False)

# Kaggleへのアップロード

In [61]:
if UPLOAD_DATA_TO_KAGGLE:
    import os
    import json

    from kaggle.api.kaggle_api_extended import KaggleApi

    def dataset_create_new(dataset_name: str, upload_dir: str):
        # if "_" in dataset_name:
        #     raise ValueError("datasetの名称に_の使用は禁止です")
        dataset_metadata = {}
        dataset_metadata["id"] = f"sinchir0/{dataset_name}"
        dataset_metadata["licenses"] = [{"name": "CC0-1.0"}]
        dataset_metadata["title"] = dataset_name
        with open(os.path.join(upload_dir, "dataset-metadata.json"), "w") as f:
            json.dump(dataset_metadata, f, indent=4)
        api = KaggleApi()
        api.authenticate()
        api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode="tar")

    print(f"Create Dataset name:{DATASET_NAME}, output_dir:{MODEL_OUTPUT_PATH}")
    dataset_create_new(dataset_name=DATASET_NAME, upload_dir=MODEL_OUTPUT_PATH)

Create Dataset name:e-tfidf-tfidf, output_dir:../../trained_models/e-tfidf
Starting upload for file tfidf_vectorizer_s_sl_g_p_fold2.pkl


100%|██████████| 43.5k/43.5k [00:00<00:00, 57.1kB/s]


Upload successful: tfidf_vectorizer_s_sl_g_p_fold2.pkl (43KB)
Starting upload for file tfidf_vectorizer_s_sl_g_p_fold0.pkl


100%|██████████| 43.5k/43.5k [00:00<00:00, 56.3kB/s]


Upload successful: tfidf_vectorizer_s_sl_g_p_fold0.pkl (43KB)
Starting upload for file tfidf_vectorizer_s_sl_g_p_fold1.pkl


100%|██████████| 43.5k/43.5k [00:00<00:00, 58.6kB/s]


Upload successful: tfidf_vectorizer_s_sl_g_p_fold1.pkl (43KB)
Starting upload for file oof_tfidf.csv


100%|██████████| 35.0M/35.0M [00:02<00:00, 12.6MB/s]


Upload successful: oof_tfidf.csv (35MB)
