In [1]:
EXP_NAME = "e-word-tfidf"
MODEL_NAME = "tfidf"

In [2]:
import json
import random
import os
import torch
import numpy as np
import polars as pl
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
DATA_PATH = "data"
SEED = 42
N_FOLD = 3

# path setting
COMPETITION_NAME = "automated_essay_scoring"
DATASET_NAME = f"{EXP_NAME}-{MODEL_NAME.split('/')[-1]}"
MODEL_OUTPUT_PATH = f"trained_models/{EXP_NAME}"

UPLOAD_DATA_TO_KAGGLE = True

In [4]:
def validate_dataset_name(dataset_name: str) -> None:
    if len(dataset_name) < 6 or len(dataset_name) > 50:
        raise Exception(
            f"データセットの文字列は6~50文字にしてください。現在{len(DATASET_NAME)}文字"
        )
    if "_" in dataset_name:
        raise Exception("datasetの名称に_の使用は禁止です")


validate_dataset_name(DATASET_NAME)

In [5]:
def resolve_path(base_path: str) -> str:
    import os

    cwd = os.getcwd()
    print(cwd)
    if cwd == f"/notebooks":
        print("Jupyter Kernel By VSCode!")
        return f"/notebooks/{COMPETITION_NAME}/{base_path}"
    elif cwd == f"/notebooks/{COMPETITION_NAME}":
        print("nohup!")
        return base_path
    elif cwd == f"/notebooks/{COMPETITION_NAME}/{COMPETITION_NAME}/exp":
        print("Jupyter Lab!")
        return f"../../{base_path}"
    elif cwd.startswith("/Users"):
        print("Local Mac!")
        return f"../../{base_path}"
    else:
        raise Exception("Unknown environment")


DATA_PATH = resolve_path(DATA_PATH)
print(DATA_PATH)
MODEL_OUTPUT_PATH = resolve_path(MODEL_OUTPUT_PATH)
print(MODEL_OUTPUT_PATH)

/Users/shinichiro.saito/automated_essay_scoring/automated_essay_scoring/eda
Local Mac!
../../data
/Users/shinichiro.saito/automated_essay_scoring/automated_essay_scoring/eda
Local Mac!
../../trained_models/e-word-tfidf


In [6]:
import os

os.makedirs(MODEL_OUTPUT_PATH, exist_ok=True)

In [7]:
# Seed the same seed to all
def seed_everything(seed: int) -> None:
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True


seed_everything(SEED)

In [8]:
train = pl.read_csv(f"{DATA_PATH}/train.csv")

In [9]:
with open(f"{DATA_PATH}/essay_id_fold_by_s_sl_g_p_only_train_dict.json") as f:
    essay_id_fold_only_train = json.load(f)

train = train.with_columns(
    pl.col("essay_id")
    .replace(essay_id_fold_only_train, return_dtype=pl.Int64)
    .alias("fold")
)

In [10]:
train.head()

essay_id,full_text,score,fold
str,str,i64,i64
"""000d118""","""Many people have car where the…",3,0
"""000fe60""","""I am a scientist at NASA that …",3,0
"""001ab80""","""People always wish they had th…",4,1
"""001bdc0""","""We all heard about Venus, the …",4,0
"""002ba53""","""Dear, State Senator This is a…",3,2


In [17]:
vectorizer = TfidfVectorizer(
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
    strip_accents="unicode",
    analyzer="word",
    ngram_range=(3, 6),
    min_df=0.05,
    max_df=0.95,
    sublinear_tf=True,
)
vectorizer.fit(train["full_text"])
all_voc = vectorizer.vocabulary_

In [19]:
len(all_voc)

19627

In [28]:
from sklearn.decomposition import PCA
import pickle


oofs: list[pd.DataFrame] = []
# Cross Validationによる学習の実施
for fold in range(N_FOLD):
    print(f"Start fold {fold}")

    # foldごとにtrainとvalidに分ける
    train_fold = train.filter(pl.col("fold") != fold)
    valid_fold = train.filter(pl.col("fold") == fold)

    def tfidf_tokenizer(x):
        return x

    def tfidf_preprocessor(x):
        return x

    # TfidfVectorizer parameter
    vectorizer = TfidfVectorizer(
        tokenizer=tfidf_tokenizer,
        preprocessor=tfidf_preprocessor,
        token_pattern=None,
        strip_accents="unicode",
        analyzer="word",
        ngram_range=(3, 6),
        min_df=0.05,
        max_df=0.95,
        sublinear_tf=True,
        vocabulary=all_voc,
    )

    vectorizer.fit(train_fold["full_text"])
    valid_tfid = vectorizer.transform(valid_fold["full_text"])

    dense_matrix = valid_tfid.toarray()

    df = pd.DataFrame(
        dense_matrix,
        columns=[f"tfidf_{i}" for i in range(len(all_voc))],
    )

    df["essay_id"] = valid_fold["essay_id"]

    oofs.append(df)

    # save vectorizer
    with open(
        f"{MODEL_OUTPUT_PATH}/tfidf_vectorizer_s_sl_g_p_fold{fold}.pkl", "wb"
    ) as file:
        pickle.dump(vectorizer, file)

all_tfidf_res = pd.concat(oofs)

# SVD : 次元削減
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=100)
all_tfidf_res_reduced = svd.fit_transform(
    all_tfidf_res[[f"tfidf_{i}" for i in range(len(all_voc))]]
)

Start fold 0




Start fold 1




Start fold 2




In [31]:
oof_tfidf = pd.DataFrame(
    all_tfidf_res_reduced,
    columns=[f"tfidf_{i}" for i in range(100)],
)
oof_tfidf["essay_id"] = all_tfidf_res["essay_id"].tolist()

In [32]:
oof_tfidf.head()

Unnamed: 0,tfidf_0,tfidf_1,tfidf_2,tfidf_3,tfidf_4,tfidf_5,tfidf_6,tfidf_7,tfidf_8,tfidf_9,...,tfidf_91,tfidf_92,tfidf_93,tfidf_94,tfidf_95,tfidf_96,tfidf_97,tfidf_98,tfidf_99,essay_id
0,0.42834,-0.039073,-0.040121,-0.04782,0.024961,0.053059,0.302992,-0.056375,0.063701,0.0168,...,-0.010025,0.014342,-0.00254,0.014464,0.012424,-0.014837,0.003919,-0.041749,0.040429,000d118
1,0.356387,0.159323,0.009374,0.007485,0.395414,-0.125089,-0.04973,-0.042228,0.030181,-0.008561,...,0.0137,0.03383,-0.000301,0.038543,-0.017799,-0.002648,0.017269,-0.018821,0.018258,000fe60
2,0.421714,0.556962,0.114038,-0.019595,-0.19506,0.011409,-0.023142,0.032614,-0.004093,0.147141,...,0.015431,0.042424,-0.017514,0.00848,-0.000546,0.011427,-0.038294,-0.02308,-0.012667,001bdc0
3,0.3773,0.51857,0.120942,-0.020053,-0.211645,0.022247,-0.047556,0.005338,-0.016251,-0.146453,...,-0.004665,0.00904,-0.021966,-0.035676,0.003183,0.004906,0.016478,-0.034712,0.015269,0036253
4,0.287526,-0.045324,-0.053092,-0.061103,0.024412,0.037702,0.495741,0.069833,-0.046911,-0.018658,...,0.064636,0.068888,-0.018376,0.007812,0.022137,-0.010695,-0.035521,-0.00732,0.035616,0047cb3


In [33]:
oof_tfidf.to_csv(f"{MODEL_OUTPUT_PATH}/oof_tfidf.csv", index=False)

# Kaggleへのアップロード

In [34]:
if UPLOAD_DATA_TO_KAGGLE:
    import os
    import json

    from kaggle.api.kaggle_api_extended import KaggleApi

    def dataset_create_new(dataset_name: str, upload_dir: str):
        # if "_" in dataset_name:
        #     raise ValueError("datasetの名称に_の使用は禁止です")
        dataset_metadata = {}
        dataset_metadata["id"] = f"sinchir0/{dataset_name}"
        dataset_metadata["licenses"] = [{"name": "CC0-1.0"}]
        dataset_metadata["title"] = dataset_name
        with open(os.path.join(upload_dir, "dataset-metadata.json"), "w") as f:
            json.dump(dataset_metadata, f, indent=4)
        api = KaggleApi()
        api.authenticate()
        api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode="tar")

    print(f"Create Dataset name:{DATASET_NAME}, output_dir:{MODEL_OUTPUT_PATH}")
    dataset_create_new(dataset_name=DATASET_NAME, upload_dir=MODEL_OUTPUT_PATH)

Create Dataset name:e-word-tfidf-tfidf, output_dir:../../trained_models/e-word-tfidf
Starting upload for file tfidf_vectorizer_s_sl_g_p_fold2.pkl


100%|██████████| 737k/737k [00:01<00:00, 573kB/s] 


Upload successful: tfidf_vectorizer_s_sl_g_p_fold2.pkl (737KB)
Starting upload for file tfidf_vectorizer_s_sl_g_p_fold0.pkl


100%|██████████| 737k/737k [00:01<00:00, 563kB/s] 


Upload successful: tfidf_vectorizer_s_sl_g_p_fold0.pkl (737KB)
Starting upload for file tfidf_vectorizer_s_sl_g_p_fold1.pkl


100%|██████████| 737k/737k [00:01<00:00, 590kB/s] 


Upload successful: tfidf_vectorizer_s_sl_g_p_fold1.pkl (737KB)
Starting upload for file oof_tfidf.csv


100%|██████████| 35.0M/35.0M [00:02<00:00, 12.8MB/s]


Upload successful: oof_tfidf.csv (35MB)
