In [1]:
import polars as pl

DATA_PATH = "../../data/"

train = pl.read_csv(DATA_PATH + "train.csv")
predicted_prompt = pl.read_csv(DATA_PATH + "predicted_prompt.csv")

In [2]:
train

essay_id,full_text,score
str,str,i64
"""000d118""","""Many people have car where the…",3
"""000fe60""","""I am a scientist at NASA that …",3
"""001ab80""","""People always wish they had th…",4
"""001bdc0""","""We all heard about Venus, the …",4
"""002ba53""","""Dear, State Senator This is a…",3
…,…,…
"""ffd378d""","""the story "" The Challenge of E…",2
"""ffddf1f""","""Technology has changed a lot o…",4
"""fff016d""","""If you don't like sitting arou…",2
"""fffb49b""","""In ""The Challenge of Exporing …",1


In [3]:
train = train.join(  # prompt_nameを付与する
    predicted_prompt.select(pl.col(["essay_id", "prompt_name"])),
    how="left",
    on="essay_id",
)

In [4]:
from datasets import load_dataset

# 追加データ
persuade_dataset = (
    load_dataset(
        "csv",
        data_files={"train": f"{DATA_PATH}/persuade_w_is_tr_con_as_num.csv"},
        split="train",
    )
    .filter(lambda x: not x["is_train_contains"])
    .select_columns(
        ["essay_id_comp", "full_text", "holistic_essay_score", "prompt_name"]
    )
    .rename_columns({"essay_id_comp": "essay_id", "holistic_essay_score": "score"})
)

persuade_df = pl.DataFrame(persuade_dataset.to_pandas())

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# train = pl.concat([train, persuade_df])

train = train.with_columns(  # prompt_nameをlabel encodingする
    pl.col("prompt_name").cast(pl.Categorical).to_physical().alias("prompt_id")
)
persuade_df = persuade_df.with_columns(  # prompt_nameをlabel encodingする
    pl.col("prompt_name").cast(pl.Categorical).to_physical().alias("prompt_id")
)

In [6]:
pl.Config.set_tbl_rows(100)

polars.config.Config

In [7]:
train = train.with_columns(
    pl.col("full_text")
    .map_elements(lambda x: len(x.split()), return_dtype=pl.Int64)
    .alias("word_length"),
)

persuade_df = persuade_df.with_columns(
    pl.col("full_text")
    .map_elements(lambda x: len(x.split()), return_dtype=pl.Int64)
    .alias("word_length"),
)

In [8]:
train = train.with_columns(
    pl.when(pl.col("word_length") <= 200)
    .then(pl.lit("x<=200"))
    .when(pl.col("word_length") <= 400)
    .then(pl.lit("200<x<=400"))
    .when(pl.col("word_length") <= 600)
    .then(pl.lit("400<x<=600"))
    .when(pl.col("word_length") <= 800)
    .then(pl.lit("600<x<=800"))
    .when(pl.col("word_length") <= 1000)
    .then(pl.lit("800<x<=1000"))
    .when(pl.col("word_length") <= 1200)
    .then(pl.lit("1000<x<=1200"))
    .when(pl.col("word_length") > 1200)
    .then(pl.lit("x>1200"))
    .alias("word_length_cat"),
)

persuade_df = persuade_df.with_columns(
    pl.when(pl.col("word_length") <= 200)
    .then(pl.lit("x<=200"))
    .when(pl.col("word_length") <= 400)
    .then(pl.lit("200<x<=400"))
    .when(pl.col("word_length") <= 600)
    .then(pl.lit("400<x<=600"))
    .when(pl.col("word_length") <= 800)
    .then(pl.lit("600<x<=800"))
    .when(pl.col("word_length") <= 1000)
    .then(pl.lit("800<x<=1000"))
    .when(pl.col("word_length") <= 1200)
    .then(pl.lit("1000<x<=1200"))
    .when(pl.col("word_length") > 1200)
    .then(pl.lit("x>1200"))
    .alias("word_length_cat"),
)

In [9]:
train = train.with_columns(
    pl.concat_str(
        pl.col("score"),
        pl.col("word_length_cat"),
        separator="_",
    ).alias("score_word_length_cat")
).with_columns(
    pl.concat_str(
        pl.col("score_word_length_cat"),
        pl.col("prompt_id"),
        separator="_",
    ).alias("concat_score_word_length_prompt_id")
)

persuade_df = persuade_df.with_columns(
    pl.concat_str(
        pl.col("score"),
        pl.col("word_length_cat"),
        separator="_",
    ).alias("score_word_length_cat")
).with_columns(
    pl.concat_str(
        pl.col("score_word_length_cat"),
        pl.col("prompt_id"),
        separator="_",
    ).alias("concat_score_word_length_prompt_id")
)

In [10]:
train.head()

essay_id,full_text,score,prompt_name,prompt_id,word_length,word_length_cat,score_word_length_cat,concat_score_word_length_prompt_id
str,str,i64,str,u32,i64,str,str,str
"""000d118""","""Many people have car where the…",3,"""Car-free cities""",0,498,"""400<x<=600""","""3_400<x<=600""","""3_400<x<=600_0"""
"""000fe60""","""I am a scientist at NASA that …",3,"""The Face on Mars""",1,332,"""200<x<=400""","""3_200<x<=400""","""3_200<x<=400_1"""
"""001ab80""","""People always wish they had th…",4,"""Driverless cars""",2,550,"""400<x<=600""","""4_400<x<=600""","""4_400<x<=600_2"""
"""001bdc0""","""We all heard about Venus, the …",4,"""Exploring Venus""",3,451,"""400<x<=600""","""4_400<x<=600""","""4_400<x<=600_3"""
"""002ba53""","""Dear, State Senator This is a…",3,"""Does the electoral college wor…",4,373,"""200<x<=400""","""3_200<x<=400""","""3_200<x<=400_4"""


In [11]:
persuade_df.head()

essay_id,full_text,score,prompt_name,prompt_id,word_length,word_length_cat,score_word_length_cat,concat_score_word_length_prompt_id
str,str,i64,str,u32,i64,str,str,str
"""423A1CA112E2""","""Phones Modern humans today ar…",3,"""Phones and driving""",0,379,"""200<x<=400""","""3_200<x<=400""","""3_200<x<=400_0"""
"""BC75783F96E3""","""This essay will explain if dri…",4,"""Phones and driving""",0,366,"""200<x<=400""","""4_200<x<=400""","""4_200<x<=400_0"""
"""74C8BC7417DE""","""Driving while the use of cellu…",2,"""Phones and driving""",0,178,"""x<=200""","""2_x<=200""","""2_x<=200_0"""
"""A8445CABFECE""","""Phones & Driving Drivers shou…",3,"""Phones and driving""",0,212,"""200<x<=400""","""3_200<x<=400""","""3_200<x<=400_0"""
"""6B4F7A0165B9""","""Cell Phone Operation While Dri…",4,"""Phones and driving""",0,332,"""200<x<=400""","""4_200<x<=400""","""4_200<x<=400_0"""


In [12]:
(
    train.select(
        pl.col("concat_score_word_length_prompt_id").value_counts(),
    )
    .unnest("concat_score_word_length_prompt_id")
    .with_columns((pl.col("count") / train.height).alias("percentage"))
    .sort("count", descending=True)
)

concat_score_word_length_prompt_id,count,percentage
str,u32,f64
"""3_200<x<=400_2""",954,0.055122
"""3_200<x<=400_5""",910,0.05258
"""4_400<x<=600_2""",779,0.045011
"""3_200<x<=400_3""",737,0.042584
"""3_200<x<=400_1""",617,0.03565
"""2_200<x<=400_3""",576,0.033281
"""3_200<x<=400_6""",512,0.029583
"""2_200<x<=400_2""",511,0.029526
"""2_200<x<=400_5""",504,0.029121
"""2_200<x<=400_6""",439,0.025365


In [13]:
(
    persuade_df.select(
        pl.col("concat_score_word_length_prompt_id").value_counts(),
    )
    .unnest("concat_score_word_length_prompt_id")
    .with_columns((pl.col("count") / train.height).alias("percentage"))
    .sort("count", descending=True)
)

concat_score_word_length_prompt_id,count,percentage
str,u32,f64
"""3_200<x<=400_6""",667,0.038539
"""3_200<x<=400_5""",550,0.031779
"""3_200<x<=400_4""",528,0.030508
"""4_400<x<=600_8""",466,0.026926
"""4_400<x<=600_7""",425,0.024557
"""4_400<x<=600_3""",424,0.024499
"""4_400<x<=600_2""",345,0.019934
"""5_600<x<=800_7""",335,0.019356
"""5_600<x<=800_2""",333,0.019241
"""3_200<x<=400_0""",303,0.017507


In [14]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

fold_arr = np.zeros(train.height)
sgkf = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)

for idx, (_, val_idx) in enumerate(
    sgkf.split(train, train["concat_score_word_length_prompt_id"])
):
    fold_arr[val_idx] = idx

train = train.with_columns(pl.Series(fold_arr).cast(pl.Int64).alias("fold"))



In [15]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

fold_arr = np.zeros(persuade_df.height)
sgkf = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)

for idx, (_, val_idx) in enumerate(
    sgkf.split(persuade_df, persuade_df["concat_score_word_length_prompt_id"])
):
    fold_arr[val_idx] = idx

fold_arr += 3  # trainのfoldと被らないようにずらす
persuade_df = persuade_df.with_columns(pl.Series(fold_arr).cast(pl.Int64).alias("fold"))



In [16]:
persuade_df.head(10)

essay_id,full_text,score,prompt_name,prompt_id,word_length,word_length_cat,score_word_length_cat,concat_score_word_length_prompt_id,fold
str,str,i64,str,u32,i64,str,str,str,i64
"""423A1CA112E2""","""Phones Modern humans today ar…",3,"""Phones and driving""",0,379,"""200<x<=400""","""3_200<x<=400""","""3_200<x<=400_0""",4
"""BC75783F96E3""","""This essay will explain if dri…",4,"""Phones and driving""",0,366,"""200<x<=400""","""4_200<x<=400""","""4_200<x<=400_0""",4
"""74C8BC7417DE""","""Driving while the use of cellu…",2,"""Phones and driving""",0,178,"""x<=200""","""2_x<=200""","""2_x<=200_0""",5
"""A8445CABFECE""","""Phones & Driving Drivers shou…",3,"""Phones and driving""",0,212,"""200<x<=400""","""3_200<x<=400""","""3_200<x<=400_0""",5
"""6B4F7A0165B9""","""Cell Phone Operation While Dri…",4,"""Phones and driving""",0,332,"""200<x<=400""","""4_200<x<=400""","""4_200<x<=400_0""",4
"""97C1CFD04E4B""","""Cell phone use should not be l…",4,"""Phones and driving""",0,492,"""400<x<=600""","""4_400<x<=600""","""4_400<x<=600_0""",5
"""2CE1FE38D0E7""","""Phones and Driving Driving is…",5,"""Phones and driving""",0,604,"""600<x<=800""","""5_600<x<=800""","""5_600<x<=800_0""",3
"""30A8FB981469""","""PHONES AND DRIVING In this wo…",4,"""Phones and driving""",0,469,"""400<x<=600""","""4_400<x<=600""","""4_400<x<=600_0""",4
"""E05C7F5C1156""","""People are debating whether if…",4,"""Phones and driving""",0,592,"""400<x<=600""","""4_400<x<=600""","""4_400<x<=600_0""",4
"""50B3435E475B""","""Texting and driving Over half…",4,"""Phones and driving""",0,353,"""200<x<=400""","""4_200<x<=400""","""4_200<x<=400_0""",3


In [17]:
essay_id_fold_dict = dict(zip(train["essay_id"], train["fold"]))

import json

with open("essay_id_fold_by_slp_only_tr_dict.json", "w") as f:
    # slt = score length topic
    json.dump(essay_id_fold_dict, f)

In [18]:
essay_id_fold_dict = dict(zip(persuade_df["essay_id"], persuade_df["fold"]))

import json

with open("essay_id_fold_by_slp_only_persuade_dict.json", "w") as f:
    # slt = score length topic
    json.dump(essay_id_fold_dict, f)

# Check

In [19]:
(
    train.filter(pl.col("fold") == 0)
    .select(
        pl.col("concat_score_word_length_prompt_id").value_counts(),
    )
    .unnest("concat_score_word_length_prompt_id")
    .with_columns((pl.col("count") / train.height).alias("percentage"))
    .sort("count", descending=True)
    .head(10)
)

concat_score_word_length_prompt_id,count,percentage
str,u32,f64
"""3_200<x<=400_2""",318,0.018374
"""3_200<x<=400_5""",304,0.017565
"""4_400<x<=600_2""",260,0.015023
"""3_200<x<=400_3""",246,0.014214
"""3_200<x<=400_1""",206,0.011903
"""2_200<x<=400_3""",192,0.011094
"""3_200<x<=400_6""",171,0.00988
"""2_200<x<=400_2""",170,0.009823
"""2_200<x<=400_5""",168,0.009707
"""2_200<x<=400_6""",147,0.008494


In [20]:
(
    train.filter(pl.col("fold") == 1)
    .select(
        pl.col("concat_score_word_length_prompt_id").value_counts(),
    )
    .unnest("concat_score_word_length_prompt_id")
    .with_columns((pl.col("count") / train.height).alias("percentage"))
    .sort("count", descending=True)
    .head(10)
)

concat_score_word_length_prompt_id,count,percentage
str,u32,f64
"""3_200<x<=400_2""",318,0.018374
"""3_200<x<=400_5""",303,0.017507
"""4_400<x<=600_2""",259,0.014965
"""3_200<x<=400_3""",246,0.014214
"""3_200<x<=400_1""",206,0.011903
"""2_200<x<=400_3""",192,0.011094
"""3_200<x<=400_6""",171,0.00988
"""2_200<x<=400_2""",170,0.009823
"""2_200<x<=400_5""",168,0.009707
"""3_200<x<=400_4""",146,0.008436


In [29]:
(
    persuade_df.filter(pl.col("fold") == 3)
    .select(
        pl.col("concat_score_word_length_prompt_id").value_counts(),
    )
    .unnest("concat_score_word_length_prompt_id")
    .with_columns((pl.col("count") / persuade_df.height).alias("percentage"))
    .sort("count", descending=True)
    .head(10)
)

concat_score_word_length_prompt_id,count,percentage
str,u32,f64
"""3_200<x<=400_6""",222,0.016914
"""3_200<x<=400_5""",184,0.014019
"""3_200<x<=400_4""",176,0.01341
"""4_400<x<=600_8""",155,0.01181
"""4_400<x<=600_7""",142,0.010819
"""4_400<x<=600_3""",141,0.010743
"""4_400<x<=600_2""",115,0.008762
"""5_600<x<=800_7""",112,0.008533
"""5_600<x<=800_2""",111,0.008457
"""3_200<x<=400_0""",101,0.007695


In [28]:
(
    persuade_df.filter(pl.col("fold") == 4)
    .select(
        pl.col("concat_score_word_length_prompt_id").value_counts(),
    )
    .unnest("concat_score_word_length_prompt_id")
    .with_columns((pl.col("count") / persuade_df.height).alias("percentage"))
    .sort("count", descending=True)
    .head(10)
)

concat_score_word_length_prompt_id,count,percentage
str,u32,f64
"""3_200<x<=400_6""",222,0.016914
"""3_200<x<=400_5""",183,0.013943
"""3_200<x<=400_4""",176,0.01341
"""4_400<x<=600_8""",155,0.01181
"""4_400<x<=600_3""",142,0.010819
"""4_400<x<=600_7""",141,0.010743
"""4_400<x<=600_2""",115,0.008762
"""5_600<x<=800_7""",111,0.008457
"""5_600<x<=800_2""",111,0.008457
"""3_200<x<=400_0""",101,0.007695


In [23]:
# (
#     train.filter(pl.col("fold") == 0)
#     .select(
#         pl.col("score").value_counts(),
#     )
#     .unnest("score")
#     .with_columns((pl.col("count") / train.height).alias("percentage"))
#     .sort("count", descending=True)
# )

In [24]:
# (
#     train.filter(pl.col("fold") == 0)
#     .select(
#         pl.col("word_length_cat").value_counts(),
#     )
#     .unnest("word_length_cat")
#     .with_columns((pl.col("count") / train.height).alias("percentage"))
#     .sort("count", descending=True)
# )

In [25]:
# (
#     train.filter(pl.col("fold") == 1)
#     .select(
#         pl.col("word_length_cat").value_counts(),
#     )
#     .unnest("word_length_cat")
#     .with_columns((pl.col("count") / train.height).alias("percentage"))
#     .sort("count", descending=True)
# )