In [1]:
import polars as pl

# Directory holding the input CSV files; the trailing slash lets file names be
# appended directly.
DATA_PATH = "../../data/"

train_csv_path = DATA_PATH + "train.csv"
predicted_prompt_csv_path = DATA_PATH + "predicted_prompt.csv"

# train: essays with their scores; predicted_prompt: prompt_name per essay_id.
train = pl.read_csv(train_csv_path)
predicted_prompt = pl.read_csv(predicted_prompt_csv_path)

In [2]:
# Display the training frame as loaded from train.csv.
train

essay_id,full_text,score
str,str,i64
"""000d118""","""Many people have car where the…",3
"""000fe60""","""I am a scientist at NASA that …",3
"""001ab80""","""People always wish they had th…",4
"""001bdc0""","""We all heard about Venus, the …",4
"""002ba53""","""Dear, State Senator This is a…",3
…,…,…
"""ffd378d""","""the story "" The Challenge of E…",2
"""ffddf1f""","""Technology has changed a lot o…",4
"""fff016d""","""If you don't like sitting arou…",2
"""fffb49b""","""In ""The Challenge of Exporing …",1


In [3]:
# Attach prompt_name to every training essay with a left join on essay_id.
prompt_lookup = predicted_prompt.select(pl.col(["essay_id", "prompt_name"]))
train = train.join(prompt_lookup, on="essay_id", how="left")

In [4]:
from datasets import load_dataset

# Additional data: essays from the persuade CSV, keeping only the rows whose
# `is_train_contains` flag is falsy.
# DATA_PATH already ends with "/", so the file name is appended directly (the
# same convention as the read_csv calls) instead of inserting a second slash.
persuade_csv_path = DATA_PATH + "persuade_w_is_tr_con_as_num.csv"

persuade_dataset = (
    load_dataset(
        "csv",
        data_files={"train": persuade_csv_path},
        split="train",
    )
    .filter(lambda x: not x["is_train_contains"])
    .select_columns(
        ["essay_id_comp", "full_text", "holistic_essay_score", "prompt_name"]
    )
    # Align the column names with the train frame.
    .rename_columns({"essay_id_comp": "essay_id", "holistic_essay_score": "score"})
)

# Convert to polars (through pandas) so both frames share the same API below.
persuade_df = pl.DataFrame(persuade_dataset.to_pandas())

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Disabled: merging the additional persuade essays into train. With this left
# commented out, train and persuade_df are processed as two separate frames.
# train = pl.concat([train, persuade_df])

In [6]:
# Label-encode prompt_name: cast to Categorical and expose the physical integer
# codes as prompt_id. The expression is evaluated on each frame separately, so
# the ids are per-frame (the displayed frames show id 0 for different prompts).
prompt_id_expr = (
    pl.col("prompt_name").cast(pl.Categorical).to_physical().alias("prompt_id")
)

train = train.with_columns(prompt_id_expr)
persuade_df = persuade_df.with_columns(prompt_id_expr)

In [7]:
def _count_words(text: str) -> int:
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())


# Per-essay word count, computed row by row with a Python callable.
word_length_expr = (
    pl.col("full_text")
    .map_elements(_count_words, return_dtype=pl.Int64)
    .alias("word_length")
)

train = train.with_columns(word_length_expr)
persuade_df = persuade_df.with_columns(word_length_expr)

In [8]:
# Inclusive upper bounds and labels of the word-count buckets, in ascending
# order. The branches are evaluated in this order, so each label effectively
# covers the range written in its name.
word_length_buckets = [
    (200, "x<=200"),
    (400, "200<x<=400"),
    (600, "400<x<=600"),
    (800, "600<x<=800"),
    (1000, "800<x<=1000"),
    (1200, "1000<x<=1200"),
]

word_length = pl.col("word_length")

# Build the when/then chain once and reuse it for both frames.
first_upper, first_label = word_length_buckets[0]
bucket_expr = pl.when(word_length <= first_upper).then(pl.lit(first_label))
for upper, label in word_length_buckets[1:]:
    bucket_expr = bucket_expr.when(word_length <= upper).then(pl.lit(label))

# The last bucket is an explicit `> 1200` condition, not an `otherwise`.
word_length_cat_expr = (
    bucket_expr.when(word_length > 1200)
    .then(pl.lit("x>1200"))
    .alias("word_length_cat")
)

train = train.with_columns(word_length_cat_expr)
persuade_df = persuade_df.with_columns(word_length_cat_expr)

In [9]:
# Stratification key "<score>_<word_length_cat>", e.g. "3_400<x<=600".
score_word_length_expr = pl.concat_str(
    [pl.col("score"), pl.col("word_length_cat")],
    separator="_",
).alias("score_word_length_cat")

train = train.with_columns(score_word_length_expr)
# Disabled experiment: a second key that also appends prompt_id.
# train = train.with_columns(
#     pl.concat_str(
#         pl.col("score_word_length_cat"),
#         pl.col("prompt_id"),
#         separator="_",
#     ).alias("concat_score_word_length_prompt_id")
# )

persuade_df = persuade_df.with_columns(score_word_length_expr)

In [10]:
# Inspect the first rows together with the derived columns
# (prompt_id, word_length, word_length_cat, score_word_length_cat).
train.head(10)

essay_id,full_text,score,prompt_name,prompt_id,word_length,word_length_cat,score_word_length_cat
str,str,i64,str,u32,i64,str,str
"""000d118""","""Many people have car where the…",3,"""Car-free cities""",0,498,"""400<x<=600""","""3_400<x<=600"""
"""000fe60""","""I am a scientist at NASA that …",3,"""The Face on Mars""",1,332,"""200<x<=400""","""3_200<x<=400"""
"""001ab80""","""People always wish they had th…",4,"""Driverless cars""",2,550,"""400<x<=600""","""4_400<x<=600"""
"""001bdc0""","""We all heard about Venus, the …",4,"""Exploring Venus""",3,451,"""400<x<=600""","""4_400<x<=600"""
"""002ba53""","""Dear, State Senator This is a…",3,"""Does the electoral college wor…",4,373,"""200<x<=400""","""3_200<x<=400"""
"""0030e86""","""If I were to choose between ke…",4,"""Does the electoral college wor…",4,400,"""200<x<=400""","""4_200<x<=400"""
"""0033037""","""The posibilty of a face reconi…",2,"""Facial action coding system""",5,179,"""x<=200""","""2_x<=200"""
"""0033bf4""","""What is the Seagoing Cowboys p…",3,"""""A Cowboy Who Rode the Waves""""",6,353,"""200<x<=400""","""3_200<x<=400"""
"""0036253""","""The challenge of exploring Ven…",2,"""Exploring Venus""",3,310,"""200<x<=400""","""2_200<x<=400"""
"""0040e27""","""There are many reasons why you…",3,"""""A Cowboy Who Rode the Waves""""",6,280,"""200<x<=400""","""3_200<x<=400"""


In [11]:
# Disabled check: frequency table of the combined score / word-length / prompt
# key. It reads the `concat_score_word_length_prompt_id` column, whose creation
# is commented out as well, so both have to be re-enabled together.
# (
#     train.select(
#         pl.col("concat_score_word_length_prompt_id").value_counts(),
#     )
#     .unnest("concat_score_word_length_prompt_id")
#     .with_columns((pl.col("count") / train.height).alias("percentage"))
#     .sort("count", descending=True)
# )

In [12]:
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

# 3-fold split of train: stratified on score_word_length_cat, with prompt_id
# passed as the group label.
sgkf = StratifiedGroupKFold(n_splits=3, shuffle=True, random_state=42)
train_splits = sgkf.split(
    train,
    y=train["score_word_length_cat"],
    groups=train["prompt_id"],
)

# fold_arr[i] receives the index of the split whose validation part holds row i.
fold_arr = np.zeros(train.height)
for fold_number, split in enumerate(train_splits):
    validation_rows = split[1]
    fold_arr[validation_rows] = fold_number

fold_column = pl.Series(fold_arr).cast(pl.Int64).alias("fold")
train = train.with_columns(fold_column)



In [28]:
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

# Same 3-fold stratified group split as for train, applied to persuade_df.
sgkf = StratifiedGroupKFold(n_splits=3, shuffle=True, random_state=42)
persuade_splits = sgkf.split(
    persuade_df,
    y=persuade_df["score_word_length_cat"],
    groups=persuade_df["prompt_id"],
)

# fold_arr[i] receives the index of the split whose validation part holds row i.
fold_arr = np.zeros(persuade_df.height)
for fold_number, split in enumerate(persuade_splits):
    validation_rows = split[1]
    fold_arr[validation_rows] = fold_number

# Shift the fold numbers so they do not collide with the folds used for train.
persuade_fold_offset = 3
fold_arr = fold_arr + persuade_fold_offset

fold_column = pl.Series(fold_arr).cast(pl.Int64).alias("fold")
persuade_df = persuade_df.with_columns(fold_column)



In [30]:
# Spot-check that train now carries the fold column.
train.head(3)

essay_id,full_text,score,prompt_name,prompt_id,word_length,word_length_cat,score_word_length_cat,fold
str,str,i64,str,u32,i64,str,str,i64
"""000d118""","""Many people have car where the…",3,"""Car-free cities""",0,498,"""400<x<=600""","""3_400<x<=600""",0
"""000fe60""","""I am a scientist at NASA that …",3,"""The Face on Mars""",1,332,"""200<x<=400""","""3_200<x<=400""",0
"""001ab80""","""People always wish they had th…",4,"""Driverless cars""",2,550,"""400<x<=600""","""4_400<x<=600""",1


In [31]:
# Spot-check that persuade_df now carries the (offset) fold column.
persuade_df.head(3)

essay_id,full_text,score,prompt_name,prompt_id,word_length,word_length_cat,score_word_length_cat,fold
str,str,i64,str,u32,i64,str,str,i64
"""423A1CA112E2""","""Phones Modern humans today ar…",3,"""Phones and driving""",0,379,"""200<x<=400""","""3_200<x<=400""",3
"""BC75783F96E3""","""This essay will explain if dri…",4,"""Phones and driving""",0,366,"""200<x<=400""","""4_200<x<=400""",3
"""74C8BC7417DE""","""Driving while the use of cellu…",2,"""Phones and driving""",0,178,"""x<=200""","""2_x<=200""",3


In [20]:
import json

# Persist the essay_id -> fold mapping of the train essays as JSON.
essay_id_fold_dict = {
    essay_id: fold for essay_id, fold in zip(train["essay_id"], train["fold"])
}

with open("essay_id_fold_by_s_sl_g_p_only_train_dict.json", "w") as json_file:
    json.dump(essay_id_fold_dict, json_file)

In [21]:
import json

# Persist the essay_id -> fold mapping of the persuade essays as JSON.
essay_id_fold_dict = {
    essay_id: fold
    for essay_id, fold in zip(persuade_df["essay_id"], persuade_df["fold"])
}

with open("essay_id_fold_by_s_sl_g_p_only_persuade_dict.json", "w") as json_file:
    json.dump(essay_id_fold_dict, json_file)

# Check: per-fold distributions of the stratification key and of the prompt ids

In [22]:
# Ten most frequent score_word_length_cat values inside fold 0.
# Note: "percentage" divides by the height of the whole train frame, not by the
# number of rows in the fold.
fold0_rows = train.filter(pl.col("fold") == 0)
fold0_key_counts = fold0_rows.select(
    pl.col("score_word_length_cat").value_counts()
).unnest("score_word_length_cat")
share_of_train = (pl.col("count") / train.height).alias("percentage")
fold0_key_counts.with_columns(share_of_train).sort("count", descending=True).head(10)

score_word_length_cat,count,percentage
str,u32,f64
"""3_200<x<=400""",1710,0.098804
"""2_200<x<=400""",1269,0.073323
"""4_400<x<=600""",948,0.054776
"""3_400<x<=600""",674,0.038944
"""2_x<=200""",513,0.029641
"""4_200<x<=400""",366,0.021148
"""1_200<x<=400""",355,0.020512
"""1_x<=200""",212,0.012249
"""5_400<x<=600""",191,0.011036
"""4_600<x<=800""",180,0.0104


In [23]:
# Ten most frequent score_word_length_cat values inside fold 1.
# Note: "percentage" divides by the height of the whole train frame, not by the
# number of rows in the fold.
fold1_rows = train.filter(pl.col("fold") == 1)
fold1_key_counts = fold1_rows.select(
    pl.col("score_word_length_cat").value_counts()
).unnest("score_word_length_cat")
share_of_train = (pl.col("count") / train.height).alias("percentage")
fold1_key_counts.with_columns(share_of_train).sort("count", descending=True).head(10)

score_word_length_cat,count,percentage
str,u32,f64
"""3_200<x<=400""",1466,0.084706
"""2_200<x<=400""",950,0.054891
"""4_400<x<=600""",843,0.048709
"""3_400<x<=600""",440,0.025423
"""2_x<=200""",407,0.023516
"""4_200<x<=400""",349,0.020165
"""1_200<x<=400""",134,0.007743
"""5_600<x<=800""",123,0.007107
"""4_600<x<=800""",99,0.00572
"""5_400<x<=600""",88,0.005085


In [24]:
# Which prompt ids ended up in fold 0, and how many essays each contributes.
# Note: "percentage" divides by the height of the whole train frame.
fold0_rows = train.filter(pl.col("fold") == 0)
fold0_prompt_counts = fold0_rows.select(
    pl.col("prompt_id").value_counts()
).unnest("prompt_id")
share_of_train = (pl.col("count") / train.height).alias("percentage")
fold0_prompt_counts.with_columns(share_of_train).sort("count", descending=True)

prompt_id,count,percentage
u32,u32,f64
3,3016,0.174265
1,2093,0.120934
0,1962,0.113365


In [25]:
# Which prompt ids ended up in fold 1, and how many essays each contributes.
# Note: "percentage" divides by the height of the whole train frame.
fold1_rows = train.filter(pl.col("fold") == 1)
fold1_prompt_counts = fold1_rows.select(
    pl.col("prompt_id").value_counts()
).unnest("prompt_id")
share_of_train = (pl.col("count") / train.height).alias("percentage")
fold1_prompt_counts.with_columns(share_of_train).sort("count", descending=True)

prompt_id,count,percentage
u32,u32,f64
2,3497,0.202057
6,1650,0.095337


In [26]:
# Which prompt ids ended up in fold 2, and how many essays each contributes.
# Note: "percentage" divides by the height of the whole train frame.
fold2_rows = train.filter(pl.col("fold") == 2)
fold2_prompt_counts = fold2_rows.select(
    pl.col("prompt_id").value_counts()
).unnest("prompt_id")
share_of_train = (pl.col("count") / train.height).alias("percentage")
fold2_prompt_counts.with_columns(share_of_train).sort("count", descending=True)

prompt_id,count,percentage
u32,u32,f64
5,3043,0.175825
4,2046,0.118218


In [33]:
# Which prompt ids ended up in persuade fold 3, and how many essays each contributes.
# Note: "percentage" divides by the height of the whole persuade_df frame.
fold3_rows = persuade_df.filter(pl.col("fold") == 3)
fold3_prompt_counts = fold3_rows.select(
    pl.col("prompt_id").value_counts()
).unnest("prompt_id")
share_of_persuade = (pl.col("count") / persuade_df.height).alias("percentage")
fold3_prompt_counts.with_columns(share_of_persuade).sort("count", descending=True)

prompt_id,count,percentage
u32,u32,f64
8,1552,0.118248
0,1168,0.08899
1,4,0.000305


In [34]:
# Which prompt ids ended up in persuade fold 4, and how many essays each contributes.
# Note: "percentage" divides by the height of the whole persuade_df frame.
fold4_rows = persuade_df.filter(pl.col("fold") == 4)
fold4_prompt_counts = fold4_rows.select(
    pl.col("prompt_id").value_counts()
).unnest("prompt_id")
share_of_persuade = (pl.col("count") / persuade_df.height).alias("percentage")
fold4_prompt_counts.with_columns(share_of_persuade).sort("count", descending=True)

prompt_id,count,percentage
u32,u32,f64
7,2157,0.164343
2,1750,0.133333
5,1626,0.123886
