### Kaggle タイタニック生存者予測
```
kaggle competitions download -c titanic
```

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

## Common functions

In [None]:
def plot_bar(x_df: pd.Series, y_df: pd.Series) -> None:
    """Draw a bar plot of ``y_df`` grouped by the values of ``x_df``.

    Nothing is drawn when ``x_df`` has more than 100 distinct values. A wide
    figure is used when there are more than 30 distinct values so the
    category labels have room.

    Args:
        x_df: Values used for the x axis; its ``name`` appears in the title.
        y_df: Values plotted on the y axis.
    """
    # Count distinct values once (NaN counts as a single value).
    n_unique: int = len(x_df.drop_duplicates())
    if n_unique > 100:
        return

    col_name: str = x_df.name
    figsize = (60, 10) if n_unique > 30 else (10, 5)
    plt.figure(figsize=figsize)
    # NOTE(review): x is passed sorted while y keeps its original order;
    # this presumably relies on seaborn aligning both Series on their
    # shared index - confirm for the installed seaborn version.
    sns.barplot(x=x_df.sort_values(), y=y_df)
    # Title typo fixed: was "Survied".
    plt.title(f"Survived Rate by {col_name}")
    plt.show()



### データの読み込み

In [None]:

# Read the training and test CSV files from the local "input" directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("input/train.csv", "input/test.csv")
)
# Print the column summary of the training frame.
train_df.info()
# Display the first rows as the cell output.
train_df.head()

In [None]:
# Walk over every training column; columns with at most 100 distinct values
# are plotted against "Survived" and recorded in dummie_col_list.
all_col_map = {}
target_col_map = {}
dummie_col_list = []
train_cols_list: list = train_df.columns.to_list()
for col in train_cols_list:
    if len(train_df[col].drop_duplicates()) <= 100:
        print("----------------------------")
        plot_bar(x_df=train_df[col], y_df=train_df["Survived"])
        dummie_col_list.append(col)



### 年齢だけは相関が把握しきれていないので、分布図に起こす

In [None]:
def create_age_group_dummy(df: pd.DataFrame) -> pd.DataFrame:
    """Bucket ``Age`` into fixed-width groups and return their dummy columns.

    Bin edges start at the truncated minimum age and advance in steps of
    ``INTERVAL_BIN`` until the truncated maximum age is reached or passed.
    Bins are left-closed; the last bin additionally takes any age at or
    above the final edge, so the oldest rows are not dropped from the
    binning. Rows with a missing age are assigned the most frequent group.

    Side effect: the group labels are stored in ``df["Age_Group"]``.

    NOTE: the edges are derived from the frame passed in, so frames with
    different age ranges can produce different column sets.

    Args:
        df: Frame with a numeric ``Age`` column.

    Returns:
        One integer 0/1 column per age group, named ``Age_<low>-<high>``.

    Raises:
        ValueError: If ``Age`` contains no non-missing value.
    """
    INTERVAL_BIN = 10

    ages = df["Age"]
    if not ages.notna().any():
        raise ValueError("Age column has no non-missing values to bin")

    min_age: int = int(ages.min())
    max_age: int = int(ages.max())
    bins: list = list(range(min_age, max_age + INTERVAL_BIN, INTERVAL_BIN))
    if len(bins) < 2:
        # Truncated min and max coincide: a single edge cannot form a bin.
        bins.append(bins[0] + INTERVAL_BIN)
    labels: list = [f"{low}-{high}" for low, high in zip(bins, bins[1:])]

    age_group = pd.cut(ages, bins=bins, labels=labels, right=False)
    # With right=False an age at or above the final edge matches no bin and
    # would be mistaken for a missing age below; keep it in the last group.
    age_group[ages >= bins[-1]] = labels[-1]

    # Missing ages fall back to the most frequent group.
    age_group_mode: str = age_group.mode().values[0]
    df["Age_Group"] = age_group.fillna(age_group_mode)

    # Build the dummy variables for the age intervals.
    age_dummies = pd.get_dummies(df["Age_Group"], prefix="Age", dtype=int)
    return age_dummies


In [None]:
# Candidate columns that are left out of the feature list: the target
# "Survived" and the raw "Age" column ("Embarked" could be listed here too
# to leave it out).
excluede_dummies = ["Survived", "Age"]
new_dummie_col_list = list(
    filter(lambda name: name not in excluede_dummies, dummie_col_list)
)
new_dummie_col_list

## ほかカラムの前処理

In [None]:
def pre_exec(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing ``Embarked`` values and pick the feature columns.

    Missing ``Embarked`` entries are replaced with the column's most frequent
    value. The assignment is made on ``df`` itself, so the caller's frame is
    modified. A summary of the selected columns is printed via ``info()``.

    Args:
        df: Frame containing ``Embarked`` and every column named in the
            module-level ``new_dummie_col_list``.

    Returns:
        A copy of ``df`` restricted to ``new_dummie_col_list``.
    """
    most_frequent = df["Embarked"].mode().values[0]
    df["Embarked"] = df["Embarked"].fillna(most_frequent)

    new_df = df[new_dummie_col_list].copy()
    new_df.info()
    return new_df
    

## ダミー処理

In [None]:
def create_dummy_df(df: pd.DataFrame) -> pd.DataFrame:
    """Assemble the design matrix: intercept, encoded features, age dummies.

    Both helpers called here write to ``df`` (``Age_Group`` is added and
    ``Embarked`` is filled), so the caller's frame is modified.

    Args:
        df: Raw passenger frame.

    Returns:
        Frame whose first column is a constant ``intercept`` of 1, followed
        by the dummy-encoded feature columns and the age-group dummies.
    """
    age_dummies: pd.DataFrame = create_age_group_dummy(df)
    features = pre_exec(df)

    # Cast Pclass to category so get_dummies expands it into indicator
    # columns (Parch / SibSp could be added here in the same way).
    features = features.astype({"Pclass": "category"})
    encoded = pd.get_dummies(features, drop_first=True, dtype=int)

    # NOTE(review): the age dummies are appended with every level kept while
    # an intercept column is also added; if each row falls in exactly one
    # age group these columns are collinear - confirm this is intended.
    x = pd.concat([encoded, age_dummies], axis=1)
    x.insert(0, "intercept", 1)
    return x

In [None]:
# Build the training design matrix and show its column names.
x = create_dummy_df(train_df)
list(x.columns)

## モデルのフィッティング

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line.
x.info()

In [None]:
# Fit a logit model of "Survived" on the design matrix and print the summary.
md = sm.Logit(endog=train_df["Survived"], exog=x)
model = md.fit()
print(model.summary())



## 予測を行うテストデータの前処理

In [None]:
# Apply the same preprocessing to the test data and show its column names.
test_dummy_df = create_dummy_df(test_df)
list(test_dummy_df.columns)

In [None]:

# Model output for the test design matrix.
predict_values = model.predict(test_dummy_df)
# Apply a 0.5 threshold to convert the output into 0 / 1.
predict_binary = (predict_values >= 0.5).astype(int)

# Pair each PassengerId with its predicted label.
merge_df = test_df[["PassengerId"]].assign(Survived=predict_binary)

# Write the result file, creating the output directory when needed.
os.makedirs("./out", exist_ok=True)
merge_df.to_csv("./out/result.csv", index=False)
merge_df


## 提出

In [None]:
! kaggle competitions submit titanic -f out/result.csv -m "api submission"