In [1]:
# Copy the CSV files from Google Drive into the working directory
# !cp '/content/drive/My Drive/ai_lesson/projects/kaggle/titanic/data/'*'.csv' ./

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Load the training and test splits from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Fill any missing test fare with the mean fare of third-class
# passengers in the training split.
test = test.fillna({'Fare': train.loc[train['Pclass'] == 3, 'Fare'].mean()})

# Mean age of the training passengers for every (Pclass, Sex) group,
# truncated to a whole number with int().
# Layout: {pclass: {sex: mean_age}}, e.g. avg_ages[1]['male'].
# The groups are derived from the data instead of being spelled out one by one;
# unstack() turns the Sex level into columns so each row is one Pclass.
avg_ages = {
    int(pclass): {sex: int(mean_age) for sex, mean_age in age_by_sex.items()}
    for pclass, age_by_sex in train.groupby(['Pclass', 'Sex'])['Age'].mean().unstack().iterrows()
}

# Columns kept in the modelling frame.
# 'Survived' (the target) stays first: downstream code uses use_list[1:]
# to obtain the feature names only.
use_list = [
    'Survived',
    # Indicator features derived from the passenger name
    'NickName',
    'Miss',
    'Mrs',
    'Mr',
    'Mme',
    # Columns taken from the raw data
    'Age',
    'Pclass',
    # Features derived from the ticket string
    'TicketHeader',
    'TicketNumSlice',
    'Fare',
    'Cabin',
]

In [3]:
def fill_age(df, age_table=None):
    """Fill missing ``Age`` values with a per-(Pclass, Sex) replacement age.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Pclass``, ``Sex`` and ``Age`` columns. It is modified
        in place.
    age_table : dict, optional
        Nested mapping ``{pclass: {sex: age}}`` giving the replacement age
        for each group. Defaults to the module-level ``avg_ages`` table.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    if age_table is None:
        age_table = avg_ages

    # Each row belongs to exactly one (Pclass, Sex) group, so the mask of
    # missing ages can be computed once up front.
    missing_age = df['Age'].isnull()
    for pclass, age_by_sex in age_table.items():
        for sex, age in age_by_sex.items():
            in_group = (df['Pclass'] == pclass) & (df['Sex'] == sex)
            df.loc[in_group & missing_age, 'Age'] = age
    return df


def _ticket_header(ticket):
    """Return up to three characters of a ticket's prefix, or 'ZZ' if it has none."""
    ticket = ticket.upper()
    if ' ' not in ticket:
        # No space means there is no prefix in front of the final token.
        return 'ZZ'
    # Everything before the last space-separated token, without '.' and '/'.
    prefix = ''.join(ticket.split(' ')[:-1])
    return prefix.replace('.', '').replace('/', '')[:3]


def _ticket_num_slice(ticket):
    """Return characters 2-4 of the ticket's trailing token (0 if it cannot be parsed)."""
    ticket = ticket.strip()
    if '. ' in ticket:
        tail = ticket.split('. ')[-1]
        piece = tail[1:4] if len(tail) > 1 else tail
        # Fall back to the whole ticket when nothing is left after slicing.
        return piece if piece != '' else ticket
    if ' ' in ticket:
        return ticket.split(' ')[-1][1:4]
    try:
        return str(int(ticket))[1:4]
    except ValueError:
        # Non-numeric ticket without a space; the integer 0 is kept as the
        # placeholder so the resulting dummy columns stay unchanged.
        return 0


def build_df(df, is_train):
    """Build the model-ready feature frame from a raw passenger frame.

    Only the features named in the module-level ``use_list`` are generated
    and kept. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. The code reads the ``Name``, ``Ticket``, ``Cabin``,
        ``Pclass``, ``Sex`` and ``Age`` columns, plus any other column
        listed in ``use_list``.
    is_train : bool
        When true the ``Survived`` column is kept in the result; otherwise
        it is left out.

    Returns
    -------
    pandas.DataFrame
        The selected columns, one-hot encoded with
        ``pd.get_dummies(..., drop_first=True)``.
    """
    df_shaped = df.copy()
    # Impute ages on the copy so the caller's frame is left untouched.
    df_shaped = fill_age(df_shaped)

    # Indicator features: 1 when the name contains the literal marker, else 0.
    # regex=False matches the marker verbatim, so no escaping is needed.
    name_markers = {
        'NickName': '(',
        'Miss': 'Miss.',
        'Mrs': 'Mrs.',
        'Mr': 'Mr.',
        'Mme': 'Mme.',
    }
    for feature, marker in name_markers.items():
        if feature in use_list:
            df_shaped[feature] = df_shaped['Name'].str.contains(marker, regex=False).astype(int)

    if 'TicketHeader' in use_list:
        df_shaped['TicketHeader'] = [_ticket_header(tk) for tk in df_shaped['Ticket']]

    if 'TicketNumSlice' in use_list:
        df_shaped['TicketNumSlice'] = [_ticket_num_slice(tk) for tk in df_shaped['Ticket']]

    if 'TicketLength' in use_list:
        df_shaped['TicketLength'] = df_shaped['Ticket'].str.len()

    if 'Cabin' in use_list:
        # Missing cabins become 'Z'; otherwise keep the first two characters
        # of the first space-separated cabin entry.
        df_shaped['Cabin'] = df_shaped['Cabin'].fillna('Z')
        df_shaped['Cabin'] = df_shaped['Cabin'].str.split(' ', expand=True)[0].str[:2]

    # Drop the target by name rather than by position in use_list.
    selected = [col for col in use_list if is_train or col != 'Survived']
    df_shaped = df_shaped.loc[:, selected]
    return pd.get_dummies(df_shaped, drop_first=True)

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_validate


# Run both splits through the shared feature pipeline.
train_shaped = build_df(train, True)
test_shaped = build_df(test, False)

# One-hot encoding can produce different dummy columns for each split,
# so collect the union of the feature columns.
train_columns = train_shaped.drop('Survived', axis=1).columns.tolist()
test_columns = test_shaped.columns.tolist()
columns = list(set(train_columns + test_columns))

# Any column missing from one split is added there as all zeros.
for col in columns:
    if col not in train_columns:
        train_shaped[col] = 0
    if col not in test_columns:
        test_shaped[col] = 0

# Sort the columns by name so both frames use the same order.
train_shaped = train_shaped.sort_index(axis=1)
test_shaped = test_shaped.sort_index(axis=1)

# Show the feature names in use (everything after the target column).
print(use_list[1:])

# Feature matrix and target vector.
X = train_shaped.drop(columns='Survived')
y = train_shaped['Survived']

# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=0
)

# Metrics reported by cross-validation.
scoring = {'accuracy': 'accuracy', 'f': 'f1_macro'}

# 10-fold cross-validation on the full training data, followed by a
# separate fit on the hold-out training split.
model = LogisticRegression(penalty='l2', C=0.9, max_iter=700)
scores = cross_validate(model, X, y, cv=10, scoring=scoring)
model.fit(X_train, y_train)

# Print the per-fold arrays with four decimals.
np.set_printoptions(precision=4)

# One array per key in the cross-validation result; print it with its mean.
for key, val in scores.items():
    print('・{}:{}'.format(key, val))
    # The label is the second underscore-separated token of the key,
    # e.g. 'test_accuracy' -> 'accuracy'.
    print('　{}_mean: {:.6f}'.format(key.split('_')[1], val.mean()))

# Accuracy of the fitted model on each hold-out split, then the gap between them.
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print('正解率(train)：{:.6f}'.format(train_accuracy))
print('正解率(test) ：{:.6f}'.format(test_accuracy))
print(f'{train_accuracy - test_accuracy:.3f}')

['NickName', 'Miss', 'Mrs', 'Mr', 'Mme', 'Age', 'Pclass', 'TicketHeader', 'TicketNumSlice', 'Fare', 'Cabin']
・fit_time:[0.3631 0.3655 0.3938 0.4484 0.4061 0.3745 0.393  0.4853 0.4169 0.3682]
　time_mean: 0.401479
・score_time:[0.0036 0.0036 0.0036 0.0036 0.0036 0.0035 0.0036 0.0036 0.0035 0.0035]
　time_mean: 0.003563
・test_accuracy:[0.8222 0.8427 0.8202 0.8876 0.8876 0.8202 0.8539 0.8202 0.8652 0.8315]
　accuracy_mean: 0.845144
・test_f:[0.8109 0.8352 0.7989 0.8823 0.881  0.8048 0.8403 0.8048 0.8572 0.8243]
　f_mean: 0.833972
正解率(train)：0.889246
正解率(test) ：0.843284
0.046


In [5]:
# Predict on the engineered test features.
pred = model.predict(test_shaped)
sample = pd.read_csv('gender_submission.csv')

# The predictions are written into the template by position, so its rows
# must line up with test.csv; fail loudly instead of writing a shuffled file.
assert np.array_equal(sample['PassengerId'], test['PassengerId']), \
    'gender_submission.csv rows do not match test.csv'

sample['Survived'] = pred
# index=False keeps the row index out of the submission file.
sample.to_csv('submit.csv', index=False)
sample.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
