# ロジスティック回帰で予測するサンプル

In [1]:
import pandas as pd
import numpy as np

import re

from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
# Load the training CSV from the local Titanic dataset directory and display it.
df = pd.read_csv('./datasets/titanic/train.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [3]:
# Summary statistics of the numeric columns in the raw training frame.
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,0.363636,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.481622,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,0.0,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,0.0,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,1.0,3.0,39.0,1.0,0.0,31.5
max,1309.0,1.0,3.0,76.0,8.0,9.0,512.3292


In [4]:
# Load the test CSV from the same dataset directory and display it.
test_df = pd.read_csv('./datasets/titanic/test.csv')
test_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


## 名前の前処理: family name, last name に分割

In [5]:
# Matches an honorific-like token: ASCII letters, a period, then one whitespace
# character (e.g. "Mr. ", "Mrs. "). Raw string so the backslashes reach `re` as-is.
title_pattern = re.compile(r'[a-zA-Z]+\.\s')
LAST = 'LastName'
FAMILY = 'FamilyName'


def _preprocess_name(df: pd.DataFrame) -> pd.DataFrame:
    """Replace the ``Name`` column with a title-free ``FamilyName`` column.

    ``Name`` is split at its first comma. The part before the comma is
    discarded; the part after it has every ``title_pattern`` match removed and
    surrounding whitespace stripped, and is stored under ``FAMILY``.

    NOTE(review): in the sample rows ("Kelly, Mr. James") the text after the
    comma looks like the given name rather than a family name -- confirm
    whether the ``LAST``/``FAMILY`` labels are the intended way round.

    Args:
        df: Frame containing a string ``Name`` column.

    Returns:
        A new frame without ``Name`` and with the ``FAMILY`` column appended.
        The input frame is not modified.
    """
    # n=1: split on the first comma only, so a name that contains further
    # commas still yields exactly two columns for the assignment below.
    name_df = df['Name'].str.split(',', n=1, expand=True)
    name_df.columns = [LAST, FAMILY]
    name_df[FAMILY] = name_df[FAMILY].str.replace(title_pattern, '', regex=True).str.strip()
    df = df.drop(columns='Name')
    name_df = name_df.drop(columns=LAST)
    return pd.concat([df, name_df], axis=1)

## pclassの前処理 欠損埋めしてスケーリング

In [6]:
PCLASS = 'Pclass'


def _preprocess_pclass(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing ``Pclass`` values with 0 and min-max scale the column.

    The scaler is fitted on the frame passed in, so the scaling depends on the
    values present in that particular frame.

    Args:
        df: Frame containing a numeric ``Pclass`` column.

    Returns:
        A copy of ``df`` whose ``Pclass`` column has been imputed and scaled.
        The input frame is left untouched.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    df = df.copy()
    df[PCLASS] = df[PCLASS].fillna(0)

    df[PCLASS] = MinMaxScaler().fit_transform(df[[PCLASS]])
    return df
    

## Age, Fare, SibSp, Parchの前処理 欠損埋めしてスケーリング

In [7]:
AGE = 'Age'
FARE = 'Fare'
SIBSP = 'SibSp'
PARCH = 'Parch'


def _preprocess_age_and_fare_sibsp_parch(df: pd.DataFrame) -> pd.DataFrame:
    """Impute and standardize the ``Age``, ``Fare``, ``SibSp`` and ``Parch`` columns.

    Steps:
      1. Add 0/1 indicator columns ``Age_isnull`` and ``Fare_isnull``.
      2. Fill missing ``Age``/``Fare`` with that column's mean in ``df`` and
         missing ``SibSp``/``Parch`` with 0.
      3. Standardize the four columns with a ``StandardScaler`` fitted on ``df``.

    NOTE(review): the means and the scaler come from whichever frame is passed
    in, so separately processed frames are scaled with different statistics --
    confirm this is intended.

    Args:
        df: Frame containing the four numeric columns.

    Returns:
        A copy of ``df`` with the indicator columns added and the four columns
        imputed and scaled. The input frame is left untouched.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    df = df.copy()

    # Record missingness before imputing; afterwards it can no longer be seen.
    df['Age_isnull'] = df[AGE].isnull().astype(int)
    df['Fare_isnull'] = df[FARE].isnull().astype(int)

    df[AGE] = df[AGE].fillna(df[AGE].mean())
    df[FARE] = df[FARE].fillna(df[FARE].mean())
    df[SIBSP] = df[SIBSP].fillna(0)
    df[PARCH] = df[PARCH].fillna(0)

    scaled_columns = [AGE, FARE, SIBSP, PARCH]
    df[scaled_columns] = StandardScaler().fit_transform(df[scaled_columns])
    return df
    
    

In [8]:
def _preprocess_sex_cabin_embarked(df: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode the ``Sex``, ``Cabin`` and ``Embarked`` columns.

    Each of the three columns is replaced by indicator columns. A dedicated
    ``<column>_nan`` indicator is produced for missing values, and the first
    level of every column is dropped. All other columns pass through unchanged.

    Args:
        df: Frame containing the three categorical columns.

    Returns:
        A new frame with the categorical columns replaced by indicators.
    """
    categorical_columns = ['Sex', 'Cabin', 'Embarked']
    return pd.get_dummies(
        df,
        columns=categorical_columns,
        dummy_na=True,
        drop_first=True,
    )

In [9]:
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Run the full feature-preparation pipeline on a raw passenger frame.

    Drops ``PassengerId``, ``Ticket`` and ``Name``, then applies, in order, the
    passenger-class step, the age/fare/sibsp/parch step and the one-hot
    encoding step.

    Args:
        df: Raw frame as loaded from the CSV.

    Returns:
        The prepared feature frame. Any column not handled by a step (for
        example a target column) is passed through unchanged.
    """
    # NOTE: `Name` is dropped outright; the name-splitting helper is currently
    # not part of this pipeline.
    unused_columns = ['PassengerId', 'Ticket', 'Name']
    return (
        df.drop(columns=unused_columns)
        .pipe(_preprocess_pclass)
        .pipe(_preprocess_age_and_fare_sibsp_parch)
        .pipe(_preprocess_sex_cabin_embarked)
    )

In [10]:
# Run the preprocessing pipeline on the training frame.
prep_df = preprocess(df)

In [11]:
# Summary statistics of the preprocessed frame, to check the imputing, scaling and encoding.
prep_df.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Age_isnull,Fare_isnull,Sex_male,Sex_nan,...,Cabin_F E57,Cabin_F G63,Cabin_F2,Cabin_F33,Cabin_F4,Cabin_G6,Cabin_nan,Embarked_Q,Embarked_S,Embarked_nan
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,...,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,0.363636,0.632775,3.187243e-17,-1.5936220000000003e-17,-2.9747600000000003e-17,-1.8061040000000002e-17,0.205742,0.002392,0.636364,0.0,...,0.002392,0.002392,0.002392,0.002392,0.004785,0.002392,0.782297,0.110048,0.645933,0.0
std,0.481622,0.420919,1.001198,1.001198,1.001198,1.001198,0.404727,0.048912,0.481622,0.0,...,0.048912,0.048912,0.048912,0.048912,0.069088,0.048912,0.413179,0.313324,0.478803,0.0
min,0.0,0.0,-2.385419,-0.49947,-0.4002477,-0.6387815,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,-0.5763018,-0.49947,-0.4002477,-0.4972129,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,0.0,1.0,0.0,-0.49947,-0.4002477,-0.3796234,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
75%,1.0,1.0,0.4340463,0.6169924,-0.4002477,-0.07399887,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
max,1.0,1.0,3.623577,8.432229,8.781044,8.547081,1.0,1.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

## 25%でhold-outしてバリデーションデータを作成

In [13]:
from sklearn.model_selection import train_test_split

# (1) Prepare the training data and the validation data.
# Features are every preprocessed column except the target; the target is `Survived`.
df_x = prep_df.drop(columns=['Survived'])
df_y = prep_df['Survived']

# Hold out 25% of the rows for validation; the fixed seed keeps the split reproducible.
train_x, valid_x, train_y, valid_y = train_test_split(df_x, df_y, test_size=0.25, random_state=42)

## 学習

In [48]:
from keras.models import Sequential
from keras.layers import BatchNormalization, Dense, Dropout
from keras import optimizers

# Small fully connected binary classifier: two ReLU layers, each followed by
# dropout (with batch normalization after the first), ending in one sigmoid unit.
model = Sequential([
    Dense(units=train_x.shape[1], activation='relu'), # input layer: one unit per feature column
    Dropout(rate=0.2),
    BatchNormalization(),
    Dense(units=16, activation='relu'),
    Dropout(rate=0.2),
    Dense(units=1, activation='sigmoid')              # output layer: a single unit
])

# SGD on binary cross-entropy; the reported metric is the same quantity as the loss.
model.compile(optimizer=optimizers.SGD(learning_rate=0.01), loss='binary_crossentropy', metrics=['binary_crossentropy'])
# Train for 200 epochs in batches of 32, passing the hold-out split as validation data.
model.fit(x=train_x, y=train_y, validation_data=(valid_x, valid_y), batch_size=32, epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x13e4fff40>

In [24]:
# Display the training feature frame.
train_x

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Age_isnull,Fare_isnull,Sex_male,Sex_nan,Cabin_A18,...,Cabin_F E57,Cabin_F G63,Cabin_F2,Cabin_F33,Cabin_F4,Cabin_G6,Cabin_nan,Embarked_Q,Embarked_S,Embarked_nan
132,1.0,0.000000,-0.49947,3.680326,-0.182174,1,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
60,1.0,-1.051760,-0.49947,-0.400248,-0.497213,0,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
198,0.5,-0.576302,-0.49947,-0.400248,-0.450521,0,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
332,1.0,0.000000,-0.49947,-0.400248,-0.509240,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
63,1.0,-0.655545,-0.49947,-0.400248,-0.500275,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,1.0,-0.734788,-0.49947,-0.400248,-0.497213,0,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
106,1.0,-0.734788,-0.49947,-0.400248,-0.498558,0,0,1,0,0,...,0,0,0,0,0,0,1,1,0,0
270,0.0,1.246287,-0.49947,-0.400248,0.710273,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
348,0.5,-0.497059,-0.49947,-0.400248,-0.396732,0,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0


## バリデーションデータで評価

In [None]:
from sklearn.metrics import log_loss

In [None]:
# `model` is a Keras Sequential model, which has no `predict_proba`; with a
# sigmoid output unit, `predict` already returns the predicted probability.
# `ravel()` flattens the (n, 1) output to one probability per row.
y_pred_train = model.predict(train_x).ravel()
y_pred = model.predict(valid_x).ravel()

# Log loss on the training split and on the hold-out split.
logloss_train = log_loss(y_true=train_y, y_pred=y_pred_train)
logloss_valid = log_loss(y_true=valid_y, y_pred=y_pred)

In [None]:
# Report both losses side by side; a large gap between them suggests overfitting.
print(f"logloss - train: {logloss_train}")
print(f"logloss - valid: {logloss_valid}")

In [None]:
import joblib

# Persist the trained model and load it back to check the round trip.
# NOTE(review): joblib pickles the object; confirm the installed Keras version
# supports pickling models, otherwise use the model's own save/load API.
joblib.dump(model, 'model.joblib')

model = joblib.load('model.joblib')
# Display the reloaded model (the previous stray `m2` was an undefined name).
model

In [None]:
# Run the same preprocessing pipeline on the test frame.
# NOTE(review): the scalers and dummy columns are derived from the test frame
# itself here, not reused from the training frame -- confirm this is intended.
prep_test_df = preprocess(test_df)

In [None]:
# Align the test features with the columns the model was trained on: indicator
# columns missing from the test frame are added as 0, and columns the model
# never saw are dropped. A Keras model has no `predict_proba`; `predict`
# returns the sigmoid probabilities.
model.predict(prep_test_df.reindex(columns=train_x.columns, fill_value=0))