In [1]:
EXP_NAME = "e-char-tfidf-slp"
MODEL_NAME = "tfidf"

In [2]:
import json
import random
import os
import torch
import numpy as np
import polars as pl
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
DATA_PATH = "data"
SEED = 42
N_FOLD = 3

# path setting
COMPETITION_NAME = "automated_essay_scoring"
DATASET_NAME = f"{EXP_NAME}-{MODEL_NAME.split('/')[-1]}"
MODEL_OUTPUT_PATH = f"trained_models/{EXP_NAME}"

UPLOAD_DATA_TO_KAGGLE = True

In [4]:
def validate_dataset_name(dataset_name: str) -> None:
    if len(dataset_name) < 6 or len(dataset_name) > 50:
        raise Exception(
            f"データセットの文字列は6~50文字にしてください。現在{len(DATASET_NAME)}文字"
        )
    if "_" in dataset_name:
        raise Exception("datasetの名称に_の使用は禁止です")


validate_dataset_name(DATASET_NAME)

In [5]:
def resolve_path(base_path: str) -> str:
    import os

    cwd = os.getcwd()
    print(cwd)
    if cwd == f"/notebooks":
        print("Jupyter Kernel By VSCode!")
        return f"/notebooks/{COMPETITION_NAME}/{base_path}"
    elif cwd == f"/notebooks/{COMPETITION_NAME}":
        print("nohup!")
        return base_path
    elif cwd == f"/notebooks/{COMPETITION_NAME}/{COMPETITION_NAME}/exp":
        print("Jupyter Lab!")
        return f"../../{base_path}"
    elif cwd.startswith("/Users"):
        print("Local Mac!")
        return f"../../{base_path}"
    else:
        raise Exception("Unknown environment")


DATA_PATH = resolve_path(DATA_PATH)
print(DATA_PATH)
MODEL_OUTPUT_PATH = resolve_path(MODEL_OUTPUT_PATH)
print(MODEL_OUTPUT_PATH)

/Users/shinichiro.saito/automated_essay_scoring/automated_essay_scoring/eda
Local Mac!
../../data
/Users/shinichiro.saito/automated_essay_scoring/automated_essay_scoring/eda
Local Mac!
../../trained_models/e-char-tfidf-slp


In [6]:
import os

os.makedirs(MODEL_OUTPUT_PATH, exist_ok=True)

In [7]:
# Seed the same seed to all
def seed_everything(seed: int) -> None:
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True


seed_everything(SEED)

In [8]:
train = pl.read_csv(f"{DATA_PATH}/train.csv")

In [9]:
with open(f"{DATA_PATH}/essay_id_fold_by_slp_only_train_dict.json") as f:
    essay_id_fold_only_train = json.load(f)

train = train.with_columns(
    pl.col("essay_id")
    .replace(essay_id_fold_only_train, return_dtype=pl.Int64)
    .alias("fold")
)

In [10]:
train.head()

essay_id,full_text,score,fold
str,str,i64,i64
"""000d118""","""Many people have car where the…",3,0
"""000fe60""","""I am a scientist at NASA that …",3,2
"""001ab80""","""People always wish they had th…",4,2
"""001bdc0""","""We all heard about Venus, the …",4,2
"""002ba53""","""Dear, State Senator This is a…",3,1


In [11]:
vectorizer = TfidfVectorizer(
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
    strip_accents="unicode",
    analyzer="word",
    ngram_range=(3, 6),
    min_df=0.05,
    max_df=0.95,
    sublinear_tf=True,
)
vectorizer.fit(train["full_text"])
all_voc = vectorizer.vocabulary_

In [12]:
len(all_voc)

19627

In [13]:
from sklearn.decomposition import PCA
import pickle


oofs: list[pd.DataFrame] = []
# Cross Validationによる学習の実施
for fold in range(N_FOLD):
    print(f"Start fold {fold}")

    # foldごとにtrainとvalidに分ける
    train_fold = train.filter(pl.col("fold") != fold)
    valid_fold = train.filter(pl.col("fold") == fold)

    def tfidf_tokenizer(x):
        return x

    def tfidf_preprocessor(x):
        return x

    # TfidfVectorizer parameter
    vectorizer = TfidfVectorizer(
        tokenizer=tfidf_tokenizer,
        preprocessor=tfidf_preprocessor,
        token_pattern=None,
        strip_accents="unicode",
        analyzer="word",
        ngram_range=(3, 6),
        min_df=0.05,
        max_df=0.95,
        sublinear_tf=True,
        vocabulary=all_voc,
    )

    vectorizer.fit(train_fold["full_text"])
    valid_tfid = vectorizer.transform(valid_fold["full_text"])

    dense_matrix = valid_tfid.toarray()

    df = pd.DataFrame(
        dense_matrix,
        columns=[f"tfidf_{i}" for i in range(len(all_voc))],
    )

    df["essay_id"] = valid_fold["essay_id"]

    oofs.append(df)

    # save vectorizer
    with open(
        f"{MODEL_OUTPUT_PATH}/tfidf_vectorizer_s_sl_g_p_fold{fold}.pkl", "wb"
    ) as file:
        pickle.dump(vectorizer, file)

all_tfidf_res = pd.concat(oofs)

# SVD : 次元削減
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=100)
all_tfidf_res_reduced = svd.fit_transform(
    all_tfidf_res[[f"tfidf_{i}" for i in range(len(all_voc))]]
)

Start fold 0




Start fold 1




Start fold 2




In [14]:
oof_tfidf = pd.DataFrame(
    all_tfidf_res_reduced,
    columns=[f"tfidf_{i}" for i in range(100)],
)
oof_tfidf["essay_id"] = all_tfidf_res["essay_id"].tolist()

In [15]:
oof_tfidf.head()

Unnamed: 0,tfidf_0,tfidf_1,tfidf_2,tfidf_3,tfidf_4,tfidf_5,tfidf_6,tfidf_7,tfidf_8,tfidf_9,...,tfidf_91,tfidf_92,tfidf_93,tfidf_94,tfidf_95,tfidf_96,tfidf_97,tfidf_98,tfidf_99,essay_id
0,0.480761,-0.069906,-0.029684,-0.124607,-0.025792,0.052792,0.20036,0.129328,0.035904,0.003206,...,0.027629,-0.07055,-0.024856,-0.00487,0.0298,-0.023274,0.024487,0.052111,-0.044755,000d118
1,0.444141,-0.222407,0.314096,0.053528,-0.018858,-0.040603,0.038249,-0.062817,-0.034133,-0.008763,...,-0.020301,0.003739,-0.043478,0.019699,0.009077,0.029467,-0.036117,-0.007297,-0.064364,0030e86
2,0.342362,-0.046622,-0.147968,0.244566,-0.012355,-0.011902,-0.012998,-0.006787,-0.2213,-0.024329,...,-0.008213,0.006503,-0.011601,-0.022842,0.007145,0.059649,-0.029824,-0.032984,-0.004266,0033037
3,0.384236,-0.070743,-0.070861,-0.077383,0.086623,0.516206,-0.114914,-0.098427,0.015945,-0.005567,...,0.000737,0.00936,-0.014728,-0.006816,0.008347,0.000849,-0.006443,-0.018661,-0.020186,0033bf4
4,0.379743,-0.048274,-0.117454,-0.088526,-0.040347,-0.076423,-0.109063,0.073225,0.028899,-0.038056,...,0.030268,-0.036217,0.027628,0.027753,0.012181,-0.019016,0.02367,-0.006825,-0.043504,004229b


In [16]:
oof_tfidf.to_csv(f"{MODEL_OUTPUT_PATH}/oof_tfidf.csv", index=False)

# Kaggleへのアップロード

In [17]:
if UPLOAD_DATA_TO_KAGGLE:
    import os
    import json

    from kaggle.api.kaggle_api_extended import KaggleApi

    def dataset_create_new(dataset_name: str, upload_dir: str):
        # if "_" in dataset_name:
        #     raise ValueError("datasetの名称に_の使用は禁止です")
        dataset_metadata = {}
        dataset_metadata["id"] = f"sinchir0/{dataset_name}"
        dataset_metadata["licenses"] = [{"name": "CC0-1.0"}]
        dataset_metadata["title"] = dataset_name
        with open(os.path.join(upload_dir, "dataset-metadata.json"), "w") as f:
            json.dump(dataset_metadata, f, indent=4)
        api = KaggleApi()
        api.authenticate()
        api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode="tar")

    print(f"Create Dataset name:{DATASET_NAME}, output_dir:{MODEL_OUTPUT_PATH}")
    dataset_create_new(dataset_name=DATASET_NAME, upload_dir=MODEL_OUTPUT_PATH)

Create Dataset name:e-char-tfidf-slp-tfidf, output_dir:../../trained_models/e-char-tfidf-slp
Starting upload for file tfidf_vectorizer_s_sl_g_p_fold2.pkl


100%|██████████| 737k/737k [00:01<00:00, 584kB/s] 


Upload successful: tfidf_vectorizer_s_sl_g_p_fold2.pkl (737KB)
Starting upload for file tfidf_vectorizer_s_sl_g_p_fold0.pkl


100%|██████████| 737k/737k [00:01<00:00, 629kB/s] 


Upload successful: tfidf_vectorizer_s_sl_g_p_fold0.pkl (737KB)
Starting upload for file tfidf_vectorizer_s_sl_g_p_fold1.pkl


100%|██████████| 737k/737k [00:01<00:00, 571kB/s] 


Upload successful: tfidf_vectorizer_s_sl_g_p_fold1.pkl (737KB)
Starting upload for file oof_tfidf.csv


100%|██████████| 34.9M/34.9M [00:02<00:00, 13.1MB/s]


Upload successful: oof_tfidf.csv (35MB)
