In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
from hydra import compose, initialize
from matplotlib import pyplot as plt
from matplotlib_venn import venn2

# Let polars show up to 100 characters of each string value when it renders
# a table, so longer text cells are not cut short in the displayed output.
pl.Config.set_fmt_str_lengths(100)


def draw_venn2(train, test, rows, columns, x_size, y_size):
    """Draw one train-vs-test Venn diagram per column of ``test``.

    For every column name in ``test.columns`` the distinct non-missing values
    of that column in ``train`` and in ``test`` are compared with ``venn2`` on
    a dedicated subplot, titled with the column name.

    Args:
        train: Table indexed by each plotted column name (``train[col]``).
        test: Table whose ``columns`` attribute lists the columns to plot.
        rows: Number of subplot rows in the grid.
        columns: Number of subplot columns in the grid.
        x_size: Figure width, forwarded as the first ``figsize`` element.
        y_size: Figure height, forwarded as the second ``figsize`` element.

    Notes:
        * If ``test`` has more columns than ``rows * columns`` the surplus
          columns are silently skipped (``zip(..., strict=False)``).
        * NOTE(review): the body relies on the pandas-style
          ``.dropna().unique()`` Series API. The frames loaded in this
          notebook are polars; presumably they are converted (e.g. with
          ``.to_pandas()``) before being passed in -- confirm at the call site.
    """
    # squeeze=False always yields a 2-D array of Axes. Without it a 1x1 grid
    # returns a bare Axes object, which has no ``ravel`` and would crash below.
    fig, axes = plt.subplots(rows, columns, figsize=(x_size, y_size), squeeze=False)
    flat_axes = axes.ravel()
    column_names = test.columns
    for col, ax in zip(column_names, flat_axes, strict=False):
        venn2(
            subsets=(set(train[col].dropna().unique()), set(test[col].dropna().unique())),
            set_labels=("train", "test"),
            ax=ax,
        )
        ax.set_title(col)
    # Grid cells that received no diagram would otherwise show empty framed
    # axes next to the diagrams; switch them off.
    for unused_ax in flat_axes[len(column_names):]:
        unused_ax.set_axis_off()
    fig.tight_layout()


# Compose the Hydra config named "config" from the "config" directory.
with initialize(version_base=None, config_path="config"):
    cfg = compose(config_name="config")

# Label this run with the name of the resolved current working directory.
cfg.exp_number = Path().resolve().name


In [26]:
# Load the train and test CSVs (in that order) from the configured paths,
# asking polars to try parsing date-like columns automatically.
train_df, test_df = (
    pl.read_csv(csv_path, try_parse_dates=True)
    for csv_path in (cfg.path.train, cfg.path.test)
)


In [3]:
# Preview the first rows of each dataset under its own heading.
for heading, frame in (("■ train", train_df), ("■ test", test_df)):
    print(heading)
    display(frame.head())


In [2]:
# For each dataset: summary statistics, per-column dtypes, then row/column counts.
for heading, frame in (("■ train", train_df), ("\n■ test", test_df)):
    print(heading)
    display(frame.describe())
    print(frame.dtypes)
    print(f"行数: {frame.shape[0]}, 列数: {frame.shape[1]}")

# Column names that exist in train but not in test.
print(set(train_df.columns).difference(test_df.columns))
