# EDA

Predicting Loan Payback (Playground Series - Season 5, Episode 11)

https://www.kaggle.com/competitions/playground-series-s5e11/overview

In [None]:
import sys

sys.path.append("../utils")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from utils import check_df

## データ読み込み

In [None]:
# Load the train and test splits from the local input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("./input/train.csv", "./input/test.csv")
)

# Show (rows, columns) for both splits.
train.shape, test.shape

## 基本情報の確認

In [None]:
# Preview the first three rows of the training data.
train.head(3)

In [None]:
# Preview the first three rows of the test data.
test.head(3)

In [None]:
# Run the project helper `check_df` (imported from utils) on the training data.
# NOTE(review): presumably prints a per-column summary — confirm in utils.
check_df(train)

In [None]:
# Run the project helper `check_df` (imported from utils) on the test data.
# NOTE(review): presumably prints a per-column summary — confirm in utils.
check_df(test)

## 数値型とカテゴリカル型の確認

In [None]:
# Collect the numeric columns, then drop the row identifier and the target.
# list.remove raises ValueError if either column is absent, which surfaces a
# schema mismatch immediately.
numeric_columns = train.select_dtypes(include=[np.number]).columns
num_features = numeric_columns.tolist()
for excluded in ("id", "accident_risk"):
    num_features.remove(excluded)
num_features

In [None]:
# Collect the categorical columns (object and boolean dtypes).
categorical_dtypes = ["object", "bool"]
cat_features = train.select_dtypes(include=categorical_dtypes).columns.tolist()
cat_features

## 目的変数の分布

In [None]:
# Name of the column the rest of the notebook treats as the target.
target_col = "accident_risk"

# Histogram of the raw target values.
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(train[target_col], bins=50)
plt.show()

In [None]:
# Box plot of the raw target values.
fig, ax = plt.subplots(figsize=(8, 5))
ax.boxplot(train[target_col])
ax.grid()
plt.show()

In [None]:
# Helper that scores how close a sample is to a normal distribution (PPCC).
from scipy import stats


def normality_score_ppcc(x):
    """Return the squared probability-plot correlation of *x* against a normal.

    The input is converted to a float array and NaN entries are dropped.
    ``scipy.stats.probplot`` returns ``((osm, osr), (slope, intercept, r))``;
    only ``r`` is used, and the function returns ``r ** 2`` as a plain float.
    Larger values (closer to 1) mean the sample looks more normal.
    """
    values = np.asarray(x, float)
    values = values[~np.isnan(values)]
    fit = stats.probplot(values, dist="norm")[1]
    r = fit[2]
    return float(r**2)


# Score the raw target and its log1p transform with the same helper.
for label, values in (
    ("PPCC:", train[target_col]),
    ("PPCC (LOG):", np.log1p(train[target_col])),
):
    score = normality_score_ppcc(values)
    print(label, score)

対数変換後の分布

In [None]:
# Histogram of the log1p-transformed target.
log_target = np.log1p(train[target_col])
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(log_target, bins=50)
plt.show()

In [None]:
# Box plot of the log1p-transformed target.
log_target = np.log1p(train[target_col])
fig, ax = plt.subplots(figsize=(8, 5))
ax.boxplot(log_target)
ax.grid()
plt.show()

In [None]:
# This cell follows the log1p histogram and box plot, so score the
# log-transformed target here (the raw-target score is printed earlier).
score = normality_score_ppcc(np.log1p(train[target_col]))
print("PPCC (LOG):", score)

## 数値型変数と目的変数の関係

相関係数

In [None]:
# Correlation of each numeric feature with the target, sorted from the most
# positive to the most negative (DataFrame.corr defaults to Pearson).
corr_matrix = train[num_features + [target_col]].corr()
target_corr = corr_matrix[target_col].drop(target_col)
correlation_with_target = target_corr.sort_values(ascending=False)

In [None]:
# Horizontal bar chart of the feature/target correlations, one bar per feature.
bar_positions = range(len(correlation_with_target))
fig, ax = plt.subplots(figsize=(8, 4))
ax.barh(bar_positions, correlation_with_target.values)
ax.set_yticks(bar_positions)
ax.set_yticklabels(correlation_with_target.index)
ax.grid(axis="x")
plt.show()

curvature

In [None]:
# Scatter plot of curvature against the target; alpha softens overplotting.
fig, ax = plt.subplots(figsize=(8, 4))
ax.scatter(train["curvature"], train[target_col], alpha=0.5)
ax.set_xlabel("curvature")
ax.set_ylabel(target_col)
ax.grid()
plt.show()

speed_limit

In [None]:
# Target distribution for each speed_limit value.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(data=train, x="speed_limit", y=target_col, ax=ax)
ax.grid()
plt.show()

num_reported_accidents

In [None]:
# Target distribution for each num_reported_accidents value.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(data=train, x="num_reported_accidents", y=target_col, ax=ax)
ax.grid()
plt.show()

In [None]:
# Row count for each num_reported_accidents value.
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=train, x="num_reported_accidents", ax=ax)
ax.grid()
plt.show()

num_lanes

In [None]:
# Target distribution for each num_lanes value.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(data=train, x="num_lanes", y=target_col, ax=ax)
ax.grid()
plt.show()

## カテゴリ型変数と目的変数の関係

In [None]:
# Display the categorical column names collected earlier.
cat_features

In [None]:
def plot_categorycal_boxplots(data, cat_cols, target_col):
    """Show one box plot of the target per categorical column.

    data       : DataFrame holding the columns to plot
    cat_cols   : iterable of column names used on the x axis, one figure each
    target_col : column name plotted on the y axis

    Each figure is shown immediately; nothing is returned.
    """
    for cat_col in cat_cols:
        fig, ax = plt.subplots(figsize=(8, 4))
        sns.boxplot(data=data, x=cat_col, y=target_col, ax=ax)
        ax.set_title(f"{cat_col} vs {target_col}")
        ax.grid()
        plt.show()

In [None]:
# Draw the target box plot for every categorical column of the training data.
plot_categorycal_boxplots(train, cat_features, target_col)

In [None]:
def cross_tab_print(
    df: pd.DataFrame, row_col: str, col_col: str, normalize: bool = True
):
    """Print a cross-tabulation of two columns.

    df        : pandas.DataFrame holding both columns
    row_col   : column name used for the table rows
    col_col   : column name used for the table columns
    normalize : when True, also print row-normalized ratios rounded to
                three decimals

    Output goes to stdout; nothing is returned.
    """
    row_values = df[row_col]
    col_values = df[col_col]

    # Raw counts are always printed.
    print("=== count ===")
    print(pd.crosstab(row_values, col_values))

    if not normalize:
        return

    # Each row of the ratio table sums to 1 (normalize="index").
    ratio_table = pd.crosstab(row_values, col_values, normalize="index")
    print("\n=== ratio (row-normalized) ===")
    print(ratio_table.round(3))

In [None]:
# Print counts and row-normalized ratios for each pair of categorical columns.
for row_name, col_name in (
    ("lighting", "time_of_day"),
    ("lighting", "weather"),
    ("weather", "time_of_day"),
):
    cross_tab_print(train, row_name, col_name)

# To print counts only, pass normalize=False, e.g.:
# cross_tab_print(train, "lighting", "weather", normalize=False)