In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

import statsmodels.formula.api as smf
import statsmodels.api as sm

  from pandas import Int64Index as NumericIndex


# read

In [2]:
# Locations of the Titanic train/test CSV files. The absolute paths are the
# same as before, assembled from a single base directory.
DATA_DIR = "/workspace/data/titanic"
TRAIN_PATH = f"{DATA_DIR}/train.csv"
TEST_PATH = f"{DATA_DIR}/test.csv"

# Read both splits into DataFrames.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# EDA後の変更

## 以下のカラムはモデリングに使わない

In [3]:
# Columns that are not used for modelling; they are removed from both splits.
drop_columns = ['Name', 'Ticket', 'Cabin', 'Embarked']

train_df, test_df = (
    frame.drop(columns=drop_columns) for frame in (train_df, test_df)
)

## カテゴリ変数を数値変数に変更

In [4]:
# Names of the training columns whose dtype is reported as "object".
str_columns = [
    name for name, dtype in train_df.dtypes.items() if str(dtype) == "object"
]

# Label-encode each of those columns: the classes are learned from the
# training split only, and the same mapping is then applied to the test split.
for col in str_columns:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

## Ageの欠損値を「Pclass × Sex」ごとの平均年齢で補完

In [5]:
def fill_age_by_group(df, group_means, default_age, keys=("Pclass", "Sex")):
    """Return a copy of ``df`` with missing ``Age`` values imputed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Age`` and every column named in ``keys``.
    group_means : pandas.Series
        Mean age per group, indexed by the ``keys`` columns.
    default_age : float
        Value used when a row's group has no entry (or a NaN entry) in
        ``group_means``.
    keys : sequence of str
        Columns that define the groups.

    Returns
    -------
    pandas.DataFrame
        Same rows and index as ``df``; only NaN entries of ``Age`` change.
    """
    keys = list(keys)
    # Left join keeps every row of df (and its index) and attaches the
    # group's mean age to it.
    per_row_mean = df[keys].join(group_means.rename("_group_age"), on=keys)["_group_age"]
    return df.assign(Age=df["Age"].fillna(per_row_mean).fillna(default_age))


AGE_GROUP_KEYS = ["Pclass", "Sex"]

# Imputation statistics come from the training split only, matching how the
# label encoder and the scaler in this notebook are fit on train and then
# applied to test. They are computed before train_df is reassigned below.
age_group_means = train_df.groupby(AGE_GROUP_KEYS)["Age"].mean()
age_overall_mean = train_df["Age"].mean()

# Unlike a per-group query + concat, this keeps rows whose (Pclass, Sex)
# combination is missing or absent from the training split.
train_df = fill_age_by_group(train_df, age_group_means, age_overall_mean, AGE_GROUP_KEYS)
test_df = fill_age_by_group(test_df, age_group_means, age_overall_mean, AGE_GROUP_KEYS)

# Keep both frames ordered by passenger id, as before.
train_df = train_df.sort_values("PassengerId")
test_df = test_df.sort_values("PassengerId")

## 標準化
※ 正則化にL2ノルムを使用するため

In [6]:
# Every column except the identifier and the target is standardised.
num_cols = train_df.columns.drop(["PassengerId", "Survived"]).tolist()

# Mean and variance are estimated on the training split only.
scaler = StandardScaler().fit(train_df[num_cols])

# Apply the same fitted transformation to both splits.
for frame in (train_df, test_df):
    frame[num_cols] = scaler.transform(frame[num_cols])

# ロジスティック回帰

線形予測子の変更によるAICの確認

In [7]:
# Candidate linear predictors to compare by AIC.
formula_list = [
    "Survived ~ Pclass + Sex + Age",
    "Survived ~ Pclass + Sex + Age + Fare",
    "Survived ~ Pclass + Sex + Age + SibSp + Parch",
    "Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare"]

# Fit one binomial GLM per formula and record its AIC rounded to 2 decimals.
aic_by_formula = {}
for formula in formula_list:
    logistic = smf.glm(formula=formula, data=train_df, family=sm.families.Binomial()).fit()
    aic = round(logistic.aic, 2)
    aic_by_formula[formula] = aic
    print(f" AIC:{aic} formula:{formula}")

# Formula with the smallest rounded AIC; on ties the earliest formula wins.
min_formula = min(aic_by_formula, key=aic_by_formula.get)
min_aic = aic_by_formula[min_formula]

print("="*50)
print("min")
print(f" AIC:{min_aic} formula:{min_formula}")

 AIC:809.31 formula:Survived ~ Pclass + Sex + Age
 AIC:811.3 formula:Survived ~ Pclass + Sex + Age + Fare
 AIC:796.62 formula:Survived ~ Pclass + Sex + Age + SibSp + Parch
 AIC:797.39 formula:Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare
min
 AIC:796.62 formula:Survived ~ Pclass + Sex + Age + SibSp + Parch


In [9]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,0,0.827377,0.737695,-0.551366,0.432793,-0.473674,-0.502445
1,2,1,-1.566107,-1.355574,0.65403,0.432793,-0.473674,0.786845
2,3,1,0.827377,-1.355574,-0.250017,-0.474545,-0.473674,-0.488854
3,4,1,-1.566107,-1.355574,0.428018,0.432793,-0.473674,0.42073
4,5,0,0.827377,0.737695,0.428018,-0.474545,-0.473674,-0.486337


## 学習

In [8]:
# Refit the binomial GLM using the formula that had the lowest AIC, then show
# the coefficient table.
final_glm = smf.glm(
    formula=min_formula,
    data=train_df,
    family=sm.families.Binomial(),
)
logistic = final_glm.fit()
logistic.summary()

0,1,2,3
Dep. Variable:,Survived,No. Observations:,891.0
Model:,GLM,Df Residuals:,885.0
Model Family:,Binomial,Df Model:,5.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-392.31
Date:,"Sun, 06 Feb 2022",Deviance:,784.62
Time:,10:46:06,Pearson chi2:,931.0
No. Iterations:,5,Pseudo R-squ. (CS):,0.3631
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.6611,0.091,-7.237,0.000,-0.840,-0.482
Pclass,-1.0672,0.107,-9.991,0.000,-1.277,-0.858
Sex,-1.3026,0.095,-13.707,0.000,-1.489,-1.116
Age,-0.6037,0.108,-5.604,0.000,-0.815,-0.393
SibSp,-0.3949,0.120,-3.279,0.001,-0.631,-0.159
Parch,-0.0598,0.093,-0.639,0.523,-0.243,0.123


## 予測

In [8]:
# Preview the first rows of the test frame.
# NOTE(review): the saved output shows unscaled values although the
# standardisation cell comes earlier — probably a stale output from
# out-of-order execution; re-run to confirm.
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare
0,892,3,1,34.5,0,0,7.8292
1,893,3,0,47.0,1,0,7.0
2,894,2,1,62.0,0,0,9.6875
3,895,3,1,27.0,0,0,8.6625
4,896,3,0,22.0,1,1,12.2875


In [12]:
# Predicted survival probability for each test passenger.
survival_proba = logistic.predict(test_df)

# A NaN probability cannot be turned into a class label; fail loudly, as the
# previous per-row round() did.
if survival_proba.isna().any():
    raise ValueError("predicted probabilities contain NaN; check test_df for missing values")

# Build the submission frame. PassengerId is read from test_df itself instead
# of being reconstructed as "row index + 892", which silently produced wrong
# ids whenever the index was not 0..n-1 or the ids did not start at 892.
# `> 0.5` matches Python's round() on [0, 1]: exactly 0.5 rounds to 0.
pred = pd.DataFrame({
    "PassengerId": test_df.loc[survival_proba.index, "PassengerId"].to_numpy(),
    "Survived": (survival_proba > 0.5).astype(int).to_numpy(),
})

In [14]:
# Destination of the submission file (same absolute path as before).
OUTPUT_DIR = "/workspace/data/titanic"
path = f"{OUTPUT_DIR}/result_logistic.csv"

# Write without the index column, encoded as UTF-8 with a BOM.
pred.to_csv(path_or_buf=path, encoding='utf-8-sig', index=False)

In [15]:
# Preview the first rows of the training frame.
# NOTE(review): the saved output shows unscaled values although the
# standardisation cell comes earlier — probably a stale output from
# out-of-order execution; re-run to confirm.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,0,3,1,22.0,1,0,7.25
1,2,1,1,0,38.0,1,0,71.2833
2,3,1,3,0,26.0,0,0,7.925
3,4,1,1,0,35.0,1,0,53.1
4,5,0,3,1,35.0,0,0,8.05


In [6]:
# Disabled scikit-learn baseline, kept for reference only.
# NOTE(review): while these lines stay commented out, `clf` is never defined
# by this notebook, so any later cell that uses `clf` fails on a fresh kernel.
# X = train_df.drop("Survived", axis=1)
# y = train_df[["Survived"]]

# clf = LogisticRegression(random_state=0).fit(X,y)
# clf.score(X, y)

  y = column_or_1d(y, warn=True)


0.8069584736251403

In [57]:
# Show the estimator's hyper-parameters. scikit-learn estimators expose them
# through get_params(); they have no params() method.
# NOTE(review): `clf` is only assigned in a cell that is currently commented
# out — confirm it is defined before running this cell.
clf.get_params()

LogisticRegression(random_state=0)

In [None]:
# NOTE(review): `iris_df` is not defined anywhere in the visible notebook. This
# cell looks like a scratch snippet and needs an iris DataFrame with
# `petal_length` and `species` columns loaded first — confirm or remove.
# Species other than the two mapped below become NaN; presumably iris_df is
# already restricted to versicolor/virginica — TODO confirm.

# Explanatory variable, taken from the same frame as Y so the rows stay
# aligned (train_df, the Titanic frame, has no `petal_length` column).
X = iris_df[['petal_length']]
# Target: versicolor -> class 0, virginica -> class 1.
Y = iris_df['species'].map({'versicolor': 0, 'virginica': 1})
# 80% of the rows for training, 20% for validation.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

# random_state must be an int, a RandomState instance or None, not the string '1'.
lr = LogisticRegression(random_state=1)
# Learn the logistic-regression weights.
lr.fit(X_train, Y_train)