In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
from importlib import reload

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
import sys
sys.path.append("../src/")

from eda import basic
from features import build_features
from models.model import logistic_regression
from models.model import support_vecotor_classification
from models.model import random_forest
from models.model import k_neighbors_classifier

In [5]:
# reload()

In [6]:
# Load the raw train/test splits and replace every missing value with 0.
# NOTE(review): the blanket fill also hits Age and the string columns
# (Cabin shows 0 in the preview, Age has min 0.0) -- confirm this is the
# intended imputation.
train_df = pd.read_csv('../data/raw/train.csv').fillna(0)
test_df = pd.read_csv('../data/raw/test.csv').fillna(0)

# Both frames collected in one list.
combine = [train_df, test_df]

# 基礎集計

In [7]:
# Print an overview of the training set: its shape, the column names, the
# first rows, the per-column non-null counts / dtypes and summary statistics
# (see the output below).
basic.basic_info(train_df)

データ数 (891, 12)
カラム ['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']
dfの上下5行


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,0,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,0,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,0,S


カラム別データ数と型
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        891 non-null    object 
 11  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


None

統計情報


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,23.799293,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,17.596074,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,223.5,0.0,2.0,6.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,24.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [8]:
# Mean of Survived per passenger class (Pclass) on the training set.
# Presumably the 2nd argument is the group key and the 3rd the columns to
# keep -- confirm against eda.basic.groupby_mean.
basic.groupby_mean(train_df, "Pclass", ['Pclass', 'Survived'])

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


# 特徴量

In [9]:
# Feature creation: derive a "title" feature and encode it as an integer.

# Canonical title -> raw title spelling(s) that should be folded into it.
replace_title = {
    # "Mlle" and "Ms" must share ONE "Miss" entry: repeating the "Miss" key in
    # a dict literal silently keeps only the last value, which previously
    # dropped the "Mlle" replacement. A list is used here just like for "Rare".
    "Miss": ["Mlle", "Ms"],
    "Mrs": "Mme",
    "Rare": ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
             'Rev', 'Sir', 'Jonkheer', 'Dona'],
}

# Integer code assigned to each canonical title.
mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

# Feature-building function -> (its configuration, target column name).
# Presumably ModelDF.create_feature() applies these in insertion order
# (title extraction first, then the integer mapping) -- confirm in
# features/build_features.py.
funcs = {
    build_features.create_title: (replace_title, "title"),
    build_features.create_mapping: (mapping, "title")
}

# Apply the same feature pipeline to the training and the test frame.
train  = build_features.ModelDF(train_df, funcs=funcs)
train.create_feature()

test = build_features.ModelDF(test_df, funcs=funcs)
test.create_feature()

In [10]:
# Feature removal: discard the columns that are not fed to the models.
categorical_columns = ["Name", "Sex", "Embarked", "PassengerId"]
numbering_columns = ['Ticket', 'Cabin']

# Remove both groups from each frame in a single drop per frame.
train.df = train.df.drop(columns=numbering_columns + categorical_columns)
test.df = test.df.drop(columns=categorical_columns + numbering_columns)

In [11]:
# Split into the training inputs/target and the inputs to predict on.
# Judging by the shapes displayed below, create_y_column exposes "Survived"
# as train.y and leaves train.df with the feature columns only.
train.create_y_column("Survived")
X_train, Y_train = train.df, train.y

# Work on a copy of the test features.
X_test = test.df.copy()

X_train.shape, Y_train.shape, X_test.shape

((891, 6), (891,), (418, 6))

In [12]:
X_train.head(2)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,title
0,3,22.0,1,0,7.25,1.0
1,1,38.0,1,0,71.2833,3.0


In [13]:
Y_train.head(2)

0    0
1    1
Name: Survived, dtype: int64

# モデル

In [17]:
# Run each classifier helper on the same train/test inputs and keep the score
# it returns (displayed as a percentage in the comparison table below).
# NOTE(review): the scores look like accuracy measured on the training data
# itself (Random Forest reports 98.43) -- confirm in models/model.py and
# consider a held-out validation split before comparing models.
acc_lr = logistic_regression(X_train, Y_train, X_test)
acc_svc = support_vecotor_classification(X_train, Y_train, X_test)
acc_knn = k_neighbors_classifier(X_train, Y_train, X_test)
acc_rf = random_forest(X_train, Y_train, X_test)

# モデル比較

In [18]:
# Pair every model label with its score, keeping the original row order.
score_by_model = {
    'Support Vector Machines': acc_svc,
    'KNN': acc_knn,
    'Logistic Regression': acc_lr,
    'Random Forest': acc_rf,
}

models = pd.DataFrame({
    'Model': list(score_by_model.keys()),
    'Score': list(score_by_model.values()),
})

# Best-scoring model first.
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
3,Random Forest,98.43
1,KNN,84.4
2,Logistic Regression,76.88
0,Support Vector Machines,68.46


In [52]:
# submission = pd.DataFrame({
#         "PassengerId": test_df["PassengerId"],
#         "Survived": Y_pred
#     })
# # submission.to_csv('../output/submission.csv', index=False)