In [1]:
import polars as pl

# Directory holding the input CSVs, relative to this notebook. The trailing
# slash matters because file names are appended to it directly.
DATA_PATH = "../../data/"

# Scored essays (essay_id, full_text, score) and the per-essay prompt names.
train = pl.read_csv(f"{DATA_PATH}train.csv")
predicted_prompt = pl.read_csv(f"{DATA_PATH}predicted_prompt.csv")

In [2]:
# Show the loaded training frame as the cell output.
train

essay_id,full_text,score
str,str,i64
"""000d118""","""Many people have car where the…",3
"""000fe60""","""I am a scientist at NASA that …",3
"""001ab80""","""People always wish they had th…",4
"""001bdc0""","""We all heard about Venus, the …",4
"""002ba53""","""Dear, State Senator This is a…",3
…,…,…
"""ffd378d""","""the story "" The Challenge of E…",2
"""ffddf1f""","""Technology has changed a lot o…",4
"""fff016d""","""If you don't like sitting arou…",2
"""fffb49b""","""In ""The Challenge of Exporing …",1


In [3]:
# Attach prompt_name to every essay. A left join keeps all rows of train even
# when predicted_prompt has no matching essay_id (prompt_name is then null).
train = train.join(
    predicted_prompt.select("essay_id", "prompt_name"),
    on="essay_id",
    how="left",
)

In [4]:
from datasets import load_dataset

# Additional data: rows of the persuade CSV whose is_train_contains flag is
# falsy (presumably essays not already present in train -- confirm against the
# script that produced the CSV).
persuade_dataset = (
    load_dataset(
        "csv",
        # DATA_PATH already ends with "/", so the file name is appended
        # directly; inserting another separator would yield "data//persuade...".
        data_files={"train": DATA_PATH + "persuade_w_is_tr_con_as_num.csv"},
        split="train",
    )
    .filter(lambda x: not x["is_train_contains"])
    .select_columns(
        ["essay_id_comp", "full_text", "holistic_essay_score", "prompt_name"]
    )
    # Rename to the column names used by train so the frames can be stacked.
    .rename_columns({"essay_id_comp": "essay_id", "holistic_essay_score": "score"})
)

# Convert via pandas into a polars DataFrame.
persuade_df = pl.DataFrame(persuade_dataset.to_pandas())

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Stack the additional essays underneath the original rows.
train = pl.concat([train, persuade_df], how="vertical")

In [6]:
# Label-encode prompt_name: the physical integer code of the categorical
# becomes prompt_id.
train = train.with_columns(
    prompt_id=pl.col("prompt_name").cast(pl.Categorical).to_physical()
)

In [7]:
# Number of whitespace-separated tokens in each essay.
train = train.with_columns(
    word_length=pl.col("full_text").map_elements(
        lambda text: len(text.split()), return_dtype=pl.Int64
    )
)

In [8]:
# Bucket word_length into 200-word bins. Branches are evaluated top to bottom,
# so each upper bound implicitly excludes the ranges already matched above it.
# There is no otherwise(): a row matching no branch gets a null label.
train = train.with_columns(
    word_length_cat=pl.when(pl.col("word_length").le(200))
    .then(pl.lit("x<=200"))
    .when(pl.col("word_length").le(400))
    .then(pl.lit("200<x<=400"))
    .when(pl.col("word_length").le(600))
    .then(pl.lit("400<x<=600"))
    .when(pl.col("word_length").le(800))
    .then(pl.lit("600<x<=800"))
    .when(pl.col("word_length").le(1000))
    .then(pl.lit("800<x<=1000"))
    .when(pl.col("word_length").le(1200))
    .then(pl.lit("1000<x<=1200"))
    .when(pl.col("word_length").gt(1200))
    .then(pl.lit("x>1200"))
)

In [9]:
# Stratification key of the form "<score>_<word_length_cat>".
train = train.with_columns(
    score_word_length_cat=pl.concat_str(
        ["score", "word_length_cat"],
        separator="_",
    )
)
# ).with_columns(
#     pl.concat_str(
#         pl.col("score_word_length_cat"),
#         pl.col("prompt_id"),
#         separator="_",
#     ).alias("concat_score_word_length_prompt_id")
# )

In [10]:
# Preview the first ten rows with the derived columns.
train.head(n=10)

essay_id,full_text,score,prompt_name,prompt_id,word_length,word_length_cat,score_word_length_cat
str,str,i64,str,u32,i64,str,str
"""000d118""","""Many people have car where the…",3,"""Car-free cities""",0,498,"""400<x<=600""","""3_400<x<=600"""
"""000fe60""","""I am a scientist at NASA that …",3,"""The Face on Mars""",1,332,"""200<x<=400""","""3_200<x<=400"""
"""001ab80""","""People always wish they had th…",4,"""Driverless cars""",2,550,"""400<x<=600""","""4_400<x<=600"""
"""001bdc0""","""We all heard about Venus, the …",4,"""Exploring Venus""",3,451,"""400<x<=600""","""4_400<x<=600"""
"""002ba53""","""Dear, State Senator This is a…",3,"""Does the electoral college wor…",4,373,"""200<x<=400""","""3_200<x<=400"""
"""0030e86""","""If I were to choose between ke…",4,"""Does the electoral college wor…",4,400,"""200<x<=400""","""4_200<x<=400"""
"""0033037""","""The posibilty of a face reconi…",2,"""Facial action coding system""",5,179,"""x<=200""","""2_x<=200"""
"""0033bf4""","""What is the Seagoing Cowboys p…",3,"""""A Cowboy Who Rode the Waves""""",6,353,"""200<x<=400""","""3_200<x<=400"""
"""0036253""","""The challenge of exploring Ven…",2,"""Exploring Venus""",3,310,"""200<x<=400""","""2_200<x<=400"""
"""0040e27""","""There are many reasons why you…",3,"""""A Cowboy Who Rode the Waves""""",6,280,"""200<x<=400""","""3_200<x<=400"""


In [11]:
# (
#     train.select(
#         pl.col("concat_score_word_length_prompt_id").value_counts(),
#     )
#     .unnest("concat_score_word_length_prompt_id")
#     .with_columns((pl.col("count") / train.height).alias("percentage"))
#     .sort("count", descending=True)
# )

In [12]:
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

# Assign every row to one of three validation folds. Rows are stratified on
# score_word_length_cat and grouped by prompt_id, so all essays of a prompt
# end up in the same fold.
# -1 marks "not yet assigned" (an initial 0 would be indistinguishable from
# fold 0), and an integer array avoids a float64 round-trip before the column
# is created.
fold_arr = np.full(train.height, -1, dtype=np.int64)
sgkf = StratifiedGroupKFold(n_splits=3, random_state=42, shuffle=True)

for idx, (_, val_idx) in enumerate(
    sgkf.split(train, train["score_word_length_cat"], train["prompt_id"])
):
    # Only the validation indices are needed: they define the fold membership.
    fold_arr[val_idx] = idx

# Each row must have appeared in exactly one validation split.
assert (fold_arr >= 0).all(), "some rows were not assigned to a fold"

train = train.with_columns(pl.Series("fold", fold_arr))



In [13]:
# Preview the first ten rows including the new fold column.
train.head(n=10)

essay_id,full_text,score,prompt_name,prompt_id,word_length,word_length_cat,score_word_length_cat,fold
str,str,i64,str,u32,i64,str,str,i64
"""000d118""","""Many people have car where the…",3,"""Car-free cities""",0,498,"""400<x<=600""","""3_400<x<=600""",0
"""000fe60""","""I am a scientist at NASA that …",3,"""The Face on Mars""",1,332,"""200<x<=400""","""3_200<x<=400""",0
"""001ab80""","""People always wish they had th…",4,"""Driverless cars""",2,550,"""400<x<=600""","""4_400<x<=600""",0
"""001bdc0""","""We all heard about Venus, the …",4,"""Exploring Venus""",3,451,"""400<x<=600""","""4_400<x<=600""",1
"""002ba53""","""Dear, State Senator This is a…",3,"""Does the electoral college wor…",4,373,"""200<x<=400""","""3_200<x<=400""",1
"""0030e86""","""If I were to choose between ke…",4,"""Does the electoral college wor…",4,400,"""200<x<=400""","""4_200<x<=400""",1
"""0033037""","""The posibilty of a face reconi…",2,"""Facial action coding system""",5,179,"""x<=200""","""2_x<=200""",2
"""0033bf4""","""What is the Seagoing Cowboys p…",3,"""""A Cowboy Who Rode the Waves""""",6,353,"""200<x<=400""","""3_200<x<=400""",0
"""0036253""","""The challenge of exploring Ven…",2,"""Exploring Venus""",3,310,"""200<x<=400""","""2_200<x<=400""",1
"""0040e27""","""There are many reasons why you…",3,"""""A Cowboy Who Rode the Waves""""",6,280,"""200<x<=400""","""3_200<x<=400""",0


In [14]:
import json

# Map each essay_id to its fold number and write the mapping as JSON into the
# current working directory.
essay_id_fold_dict = dict(
    zip(train["essay_id"].to_list(), train["fold"].to_list())
)

with open("essay_id_fold_by_s_sl_g_p_dict.json", mode="w") as f:
    json.dump(essay_id_fold_dict, f)

# Check

In [15]:
# Ten most frequent score/length strata in fold 0. "percentage" is the count
# divided by the height of the whole frame, not by the size of the fold.
(
    train.filter(pl.col("fold").eq(0))
    .select(pl.col("score_word_length_cat").value_counts())
    .unnest("score_word_length_cat")
    .with_columns(percentage=pl.col("count") / train.height)
    .sort(by="count", descending=True)
    .head(10)
)

score_word_length_cat,count,percentage
str,u32,f64
"""3_200<x<=400""",2743,0.090135
"""4_400<x<=600""",1717,0.056421
"""2_200<x<=400""",1670,0.054876
"""3_400<x<=600""",897,0.029476
"""4_200<x<=400""",731,0.024021
"""2_x<=200""",699,0.022969
"""5_400<x<=600""",310,0.010187
"""5_600<x<=800""",307,0.010088
"""1_200<x<=400""",288,0.009464
"""4_600<x<=800""",251,0.008248


In [16]:
# Ten most frequent score/length strata in fold 1. "percentage" is the count
# divided by the height of the whole frame, not by the size of the fold.
(
    train.filter(pl.col("fold").eq(1))
    .select(pl.col("score_word_length_cat").value_counts())
    .unnest("score_word_length_cat")
    .with_columns(percentage=pl.col("count") / train.height)
    .sort(by="count", descending=True)
    .head(10)
)

score_word_length_cat,count,percentage
str,u32,f64
"""3_200<x<=400""",2499,0.082118
"""2_200<x<=400""",1578,0.051853
"""4_400<x<=600""",1397,0.045906
"""2_x<=200""",861,0.028293
"""4_200<x<=400""",772,0.025368
"""3_400<x<=600""",647,0.021261
"""1_200<x<=400""",340,0.011172
"""5_400<x<=600""",336,0.011041
"""5_600<x<=800""",269,0.008839
"""4_600<x<=800""",240,0.007886


In [17]:
# Prompts present in fold 0 with their row counts. "percentage" is the count
# divided by the height of the whole frame, not by the size of the fold.
(
    train.filter(pl.col("fold").eq(0))
    .select(pl.col("prompt_id").value_counts())
    .unnest("prompt_id")
    .with_columns(percentage=pl.col("count") / train.height)
    .sort(by="count", descending=True)
)

prompt_id,count,percentage
u32,u32,f64
2,3497,0.114912
1,2093,0.068776
0,1966,0.064603
6,1650,0.054219
7,1168,0.038381


In [18]:
# Prompts present in fold 1 with their row counts. "percentage" is the count
# divided by the height of the whole frame, not by the size of the fold.
(
    train.filter(pl.col("fold").eq(1))
    .select(pl.col("prompt_id").value_counts())
    .unnest("prompt_id")
    .with_columns(percentage=pl.col("count") / train.height)
    .sort(by="count", descending=True)
)

prompt_id,count,percentage
u32,u32,f64
3,3016,0.099106
4,2046,0.067232
11,1626,0.053431
14,1552,0.050999
10,1542,0.05067


In [19]:
# Prompts present in fold 2 with their row counts. "percentage" is the count
# divided by the height of the whole frame, not by the size of the fold.
(
    train.filter(pl.col("fold").eq(2))
    .select(pl.col("prompt_id").value_counts())
    .unnest("prompt_id")
    .with_columns(percentage=pl.col("count") / train.height)
    .sort(by="count", descending=True)
)

prompt_id,count,percentage
u32,u32,f64
5,3043,0.099993
13,2157,0.070879
8,1750,0.057505
9,1670,0.054876
12,1656,0.054416
