In [61]:
import json
import random
import os
import torch
import numpy as np
import polars as pl
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

In [62]:
DATA_PATH = "data"
SEED = 42
N_FOLD = 3

# path setting
EXP_NAME = "e-tfidf"
MODEL_NAME = "tfidf"
COMPETITION_NAME = "atmacup17"
DATASET_NAME = f"{EXP_NAME}-{MODEL_NAME.split('/')[-1]}"
MODEL_OUTPUT_PATH = f"trained_models/{EXP_NAME}"

TFIDF_COL = "Title"
ID_COL = "index"

In [63]:
def resolve_path(base_path: str) -> str:
    import os

    cwd = os.getcwd()
    print(cwd)
    if cwd == f"/notebooks":
        print("Jupyter Kernel By VSCode!")
        return "kernel", f"/notebooks/{COMPETITION_NAME}/{base_path}"
    elif cwd == f"/notebooks/{COMPETITION_NAME}":
        print("nohup!")
        return base_path
    elif cwd == f"/notebooks/{COMPETITION_NAME}/{COMPETITION_NAME}/exp":
        print("Jupyter Lab!")
        return "nohup", f"../../{base_path}"
    elif cwd == f"/content":
        print("Google Colab!")
        return "colab", f"/content/drive/MyDrive/Kaggle/{COMPETITION_NAME}/{base_path}"
    elif cwd.startswith("/home/shinichiro.saito"):
        print("GCP!")
        return "GCP", f"/home/shinichiro.saito/{COMPETITION_NAME}/{base_path}"
    else:
        raise Exception("Unknown environment")


ENV_NAME, DATA_PATH = resolve_path(DATA_PATH)
print(DATA_PATH)
_, MODEL_OUTPUT_PATH = resolve_path(MODEL_OUTPUT_PATH)
print(MODEL_OUTPUT_PATH)

/home/shinichiro.saito/atmacup17/eda
GCP!
/home/shinichiro.saito/atmacup17/data
/home/shinichiro.saito/atmacup17/eda
GCP!
/home/shinichiro.saito/atmacup17/trained_models/e-tfidf


In [64]:
import os

os.makedirs(MODEL_OUTPUT_PATH, exist_ok=True)

In [65]:
# Seed the same seed to all
def seed_everything(seed: int) -> None:
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True


seed_everything(SEED)

In [66]:
train = (
    pl.read_csv(f"{DATA_PATH}/train_with_index.csv")
    .with_columns(
        pl.col("Title").fill_null(""),
        pl.col("Review Text").fill_null(""),
    )
    .with_columns(
        pl.concat_str(pl.col("Title"), pl.col("Review Text"), separator=" ").alias(
            "full_text"
        )
    )
)

test = (
    pl.read_csv(f"{DATA_PATH}/test.csv")
    .with_columns(
        pl.col("Title").fill_null(""),
        pl.col("Review Text").fill_null(""),
    )
    .with_columns(
        pl.concat_str(pl.col("Title"), pl.col("Review Text"), separator=" ").alias(
            "full_text"
        )
    )
)

In [67]:
test = test.with_row_index()

In [68]:
with open(f"{DATA_PATH}/rec_stratified_fold.json") as f:
    id_fold_only_train = json.load(f)

train = train.with_columns(
    pl.col(ID_COL).replace(id_fold_only_train, return_dtype=pl.Int64).alias("fold")
)

  pl.col(ID_COL).replace(id_fold_only_train, return_dtype=pl.Int64).alias("fold")


In [69]:
train["Rating"].unique()

Rating
i64
1
2
3
4
5


In [70]:
# vectorizer = TfidfVectorizer(
#     strip_accents="unicode",
#     analyzer="word",
#     ngram_range=(3, 6),
#     min_df=0.05,
#     max_df=0.95,
#     sublinear_tf=True,
# )

vectorizer = TfidfVectorizer()
vectorizer.fit(train[TFIDF_COL])
all_voc = vectorizer.vocabulary_

In [71]:
len(all_voc)

2447

In [72]:
from sklearn.decomposition import PCA
import pickle


oofs: list[pd.DataFrame] = []
# Cross Validationによる学習の実施
for fold in range(N_FOLD):
    print(f"Start fold {fold}")

    # foldごとにtrainとvalidに分ける
    train_fold = train.filter(pl.col("fold") != fold)
    valid_fold = train.filter(pl.col("fold") == fold)

    # TfidfVectorizer parameter
    # vectorizer = TfidfVectorizer(
    #     strip_accents="unicode",
    #     analyzer="word",
    #     ngram_range=(1, 3),
    #     min_df=0.05,
    #     max_df=0.95,
    #     sublinear_tf=True,
    #     vocabulary=all_voc,
    # )
    vectorizer = TfidfVectorizer(vocabulary=all_voc)

    vectorizer.fit(train_fold[TFIDF_COL])
    valid_tfidf = vectorizer.transform(valid_fold[TFIDF_COL])

    dense_matrix = valid_tfidf.toarray()

    df = pd.DataFrame(
        dense_matrix,
        columns=[f"{TFIDF_COL}_tfidf_{i}" for i in range(len(all_voc))],
    )

    df[ID_COL] = valid_fold[ID_COL]

    oofs.append(df)

all_tfidf_res = pd.concat(oofs)
pca = PCA(n_components=100)
all_tfidf_reduced = pca.fit_transform(
    all_tfidf_res[[f"{TFIDF_COL}_tfidf_{i}" for i in range(len(all_voc))]]
)

Start fold 0
Start fold 1
Start fold 2


In [73]:
oof_tfidf = pd.DataFrame(
    all_tfidf_reduced,
    columns=[f"{TFIDF_COL}_tfidf_{i}" for i in range(100)],
)
oof_tfidf[ID_COL] = all_tfidf_res[ID_COL].tolist()

In [74]:
oof_tfidf.head()

Unnamed: 0,Title_tfidf_0,Title_tfidf_1,Title_tfidf_2,Title_tfidf_3,Title_tfidf_4,Title_tfidf_5,Title_tfidf_6,Title_tfidf_7,Title_tfidf_8,Title_tfidf_9,...,Title_tfidf_91,Title_tfidf_92,Title_tfidf_93,Title_tfidf_94,Title_tfidf_95,Title_tfidf_96,Title_tfidf_97,Title_tfidf_98,Title_tfidf_99,index
0,-0.02615,-0.022068,0.004145,-0.064562,-0.00208,0.00876,-0.008642,-0.007903,-0.029947,0.011317,...,-0.003279,-0.006077,-0.005026,-0.005633,-0.002952,0.000772,0.000348,0.002716,0.001637,3
1,-0.049247,0.237447,-0.075509,-0.033458,-0.039604,0.000719,-0.037847,0.027696,-0.045762,0.027471,...,-0.005474,0.010191,-0.015156,-0.004113,-0.004691,0.001203,-0.012979,-0.00238,-0.009562,5
2,-0.044785,-0.035575,-0.024698,-0.092679,-0.005374,0.022003,-0.138024,-0.111792,0.009467,-0.134744,...,0.013678,-0.061124,0.01283,-0.014399,-0.030239,-0.00649,-0.041569,0.028901,0.038426,6
3,-0.03165,-0.024324,0.018397,-0.06996,0.001137,0.008967,-0.006986,-0.011815,-0.039622,0.018099,...,0.026465,0.000577,-0.025492,0.015029,0.000679,-0.035093,-0.010017,-0.029224,0.009545,18
4,-0.052737,-0.042169,-0.060577,-0.059577,0.017644,-0.135556,0.166673,0.016514,-0.022593,-0.082288,...,0.000743,-0.001481,0.017361,0.022689,0.018064,0.036652,0.013,0.010501,-0.024459,20


In [75]:
oof_tfidf.sort_values("index").to_csv(
    f"{MODEL_OUTPUT_PATH}/oof_{TFIDF_COL}_tfidf_vec.csv", index=False
)

# Testへの適用

In [76]:
vectorizer = TfidfVectorizer(vocabulary=all_voc)
vectorizer.fit(train[TFIDF_COL])
test_tfidf = vectorizer.transform(test[TFIDF_COL])

dense_matrix = test_tfidf.toarray()

test_tfidf_df = pd.DataFrame(
    dense_matrix,
    columns=[f"{TFIDF_COL}_tfidf_{i}" for i in range(len(all_voc))],
)

In [77]:
test_tfidf_reduced = pca.fit_transform(
    test_tfidf_df[[f"{TFIDF_COL}_tfidf_{i}" for i in range(len(all_voc))]]
)

In [78]:
test_tfidf_reduced_df = pd.DataFrame(
    test_tfidf_reduced,
    columns=[f"{TFIDF_COL}_tfidf_{i}" for i in range(100)],
)
test_tfidf_reduced_df[ID_COL] = test[ID_COL].to_list()

In [79]:
test_tfidf_reduced_df.sort_values("index")

Unnamed: 0,Title_tfidf_0,Title_tfidf_1,Title_tfidf_2,Title_tfidf_3,Title_tfidf_4,Title_tfidf_5,Title_tfidf_6,Title_tfidf_7,Title_tfidf_8,Title_tfidf_9,...,Title_tfidf_91,Title_tfidf_92,Title_tfidf_93,Title_tfidf_94,Title_tfidf_95,Title_tfidf_96,Title_tfidf_97,Title_tfidf_98,Title_tfidf_99,index
0,0.061563,-0.026812,-0.003145,-0.009743,-0.019127,-0.004321,0.059271,-0.060410,-0.027314,-0.081706,...,-0.000832,0.006028,-0.009460,0.002737,0.003664,0.014778,-0.008164,-0.002213,-0.017313,0
1,-0.048787,-0.033996,-0.021912,-0.067961,0.001530,0.023522,-0.071578,-0.137025,-0.025577,0.187138,...,-0.005194,-0.004485,-0.012773,0.007213,0.007298,-0.018427,0.003341,0.004163,0.006051,1
2,0.256739,-0.037343,-0.014257,-0.050712,0.017601,0.003389,-0.039561,0.039672,-0.031075,-0.021484,...,0.015525,0.036804,0.043076,-0.001836,-0.042954,0.085284,-0.018259,0.028008,0.084800,2
3,-0.041304,-0.046674,-0.035419,-0.069591,-0.001434,0.023877,-0.057310,-0.076121,-0.049681,0.058085,...,-0.020490,-0.001468,0.019157,-0.026069,-0.062193,0.006187,-0.070695,0.016666,-0.026088,3
4,-0.026162,-0.036223,-0.035136,-0.042939,0.000381,0.016439,-0.006309,-0.020233,-0.051153,-0.007338,...,-0.003207,-0.000431,0.002242,0.000762,0.002021,-0.003176,-0.010356,-0.005052,-0.007481,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11150,-0.052688,0.208956,0.062601,-0.155796,0.060667,-0.005470,-0.057838,0.022820,-0.063957,-0.009795,...,0.016458,-0.013513,-0.015381,-0.021938,-0.028028,-0.012678,0.015845,-0.020727,0.015343,11150
11151,-0.020280,0.289400,-0.210726,0.276437,-0.440094,0.476223,0.240738,0.367576,0.114112,0.189809,...,0.002833,0.009850,0.004052,0.010755,0.007337,-0.006454,0.007340,-0.002033,0.005321,11151
11152,-0.020280,0.289400,-0.210726,0.276437,-0.440094,0.476223,0.240738,0.367576,0.114112,0.189809,...,0.002833,0.009850,0.004052,0.010755,0.007337,-0.006454,0.007340,-0.002033,0.005321,11152
11153,-0.063376,0.006872,-0.221745,0.249180,0.150470,-0.216104,-0.157934,0.100518,-0.019822,0.027244,...,0.000339,0.029033,-0.106021,-0.041764,0.139501,0.275883,0.048072,-0.182708,0.027091,11153


In [80]:
test_tfidf_reduced_df.sort_values("index").to_csv(
    f"{MODEL_OUTPUT_PATH}/test_{TFIDF_COL}_tfidf_vec.csv"
)