# 라이브러리 로드

In [39]:
# 데이터 분석에 사용할 라이브러리
import pandas as pd
import numpy as np 
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns

# error 창 안 보이게 해줌
import logging
logging.getLogger('matplotlib.font_manager').setLevel(logging.ERROR)

# 가용폰트 확인
for font in mpl.font_manager.findSystemFonts():
    # print(mpl.font_manager.FontProperties(fname=font).get_name())
    mpl.font_manager.fontManager.addfont(font)

# Set font family globally
mpl.rcParams['font.family'] = 'NanumBarunGothic'# 폰트명
print(mpl.rcParams['font.family'])


['NanumBarunGothic']


# 데이터 로드

In [40]:
# pip install easydict

In [41]:
import easydict
args = easydict.EasyDict()

# path 정보
args.default_path = 'C:/titanic_datas/'
args.train_csv = args.default_path+'train.csv'
args.test_csv = args.default_path+'test.csv'
args.default_submission_csv = args.default_path+'submission.csv'

args.submission_csv = args.default_path+'result/submission_0220.csv'
args.save_results = args.default_path+"result/model_results.json"

# 데이터 분석을 위한 변수들
args.random_state = 21
args.results = []

# Titanic Data

- Surived:0=사망, 1=생존
- Pclass: 1=1등석, 2=2등석, 3=3등석
- gender:male=남성, female=여성
- Age: 나이
- SibSp: 타이타닉 호에 동승한 자매/배우자의 수
- Parch: 타이타닉 호에 동승한 부모/자식의 수
- Ticket: 티켓 번호
- Fare: 승객 요금
- Cabin: 방 호수
- Embarked: 탑승지; C=셰르부르, Q=퀴즈타운, S=사우샘프턴

In [42]:
DATA_PATH = "C:/titanic_datas/"

df = pd.read_csv(DATA_PATH+"train.csv")
df.head()

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,0,0,2,"Wheeler, Mr. Edwin Frederick""""",male,,0,0,SC/PARIS 2159,12.875,,S
1,1,0,3,"Henry, Miss. Delia",female,,0,0,382649,7.75,,Q
2,2,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gr...",female,52.0,1,1,12749,93.5,B69,S
3,3,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27.0,0,0,350043,7.7958,,S
4,4,0,2,"Hold, Mr. Stephen",male,44.0,1,0,26707,26.0,,S


In [43]:
plt.style.use('fivethirtyeight')
plt.ion()

import warnings
warnings.filterwarnings('ignore')

In [44]:
ori_train = pd.read_csv(args.train_csv)
ori_test = pd.read_csv(args.test_csv)

ori_train.shape, ori_test.shape

((916, 12), (393, 11))

In [45]:
pd.read_csv(args.default_submission_csv).shape

(393, 2)

In [46]:
ori_train.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'gender', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [47]:
ori_train.columns = [col.lower() for col in ori_train.columns] # 컬럼명 소문자로 변환
ori_train.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'gender', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [48]:
ori_test.columns = [col.lower() for col in ori_test.columns] # 컬럼명 소문자로 변환
ori_test.columns

Index(['passengerid', 'pclass', 'name', 'gender', 'age', 'sibsp', 'parch',
       'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [49]:
ori_train.head()

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,0,0,2,"Wheeler, Mr. Edwin Frederick""""",male,,0,0,SC/PARIS 2159,12.875,,S
1,1,0,3,"Henry, Miss. Delia",female,,0,0,382649,7.75,,Q
2,2,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gr...",female,52.0,1,1,12749,93.5,B69,S
3,3,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27.0,0,0,350043,7.7958,,S
4,4,0,2,"Hold, Mr. Stephen",male,44.0,1,0,26707,26.0,,S


In [50]:
ori_train['passengerid'].nunique(), ori_train.shape[0]

(916, 916)

In [51]:
ori_train.drop('passengerid', axis=1, inplace=True)
ori_train.head()

Unnamed: 0,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,0,2,"Wheeler, Mr. Edwin Frederick""""",male,,0,0,SC/PARIS 2159,12.875,,S
1,0,3,"Henry, Miss. Delia",female,,0,0,382649,7.75,,Q
2,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gr...",female,52.0,1,1,12749,93.5,B69,S
3,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27.0,0,0,350043,7.7958,,S
4,0,2,"Hold, Mr. Stephen",male,44.0,1,0,26707,26.0,,S


In [52]:
ori_test.set_index(['passengerid'], inplace=True) 
print(f'{ori_test.shape}')
ori_test.head()

(393, 10)


Unnamed: 0_level_0,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
916,3,"McGowan, Miss. Anna ""Annie""",female,15.0,0,0,330923,8.0292,,Q
917,2,"Pinsky, Mrs. (Rosa)",female,32.0,0,0,234604,13.0,,S
918,3,"McCarthy, Miss. Catherine Katie""""",female,,0,0,383123,7.75,,Q
919,3,"Franklin, Mr. Charles (Charles Fardon)",male,,0,0,SOTON/O.Q. 3101314,7.25,,S
920,1,"Wick, Mrs. George Dennick (Mary Hitchcock)",female,45.0,1,1,36928,164.8667,,S


In [53]:
ori_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 916 entries, 0 to 915
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  916 non-null    int64  
 1   pclass    916 non-null    int64  
 2   name      916 non-null    object 
 3   gender    916 non-null    object 
 4   age       736 non-null    float64
 5   sibsp     916 non-null    int64  
 6   parch     916 non-null    int64  
 7   ticket    916 non-null    object 
 8   fare      916 non-null    float64
 9   cabin     198 non-null    object 
 10  embarked  915 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 78.8+ KB


# train과 test 분리

In [54]:
new_survived = pd.Categorical(ori_train["survived"])
# ori_train['new_survived'] = ori_train['survived'].astype('category')
new_survived = new_survived.rename_categories(["Died", "Survived"])

new_survived.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Died,570,0.622271
Survived,346,0.377729


In [55]:
from sklearn.model_selection import train_test_split

In [56]:
y = ori_train['survived'] # target data, vector(1차원)
X = ori_train.drop(['survived'], axis=1) # feature data, matrix(2차원)

X

Unnamed: 0,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,2,"Wheeler, Mr. Edwin Frederick""""",male,,0,0,SC/PARIS 2159,12.8750,,S
1,3,"Henry, Miss. Delia",female,,0,0,382649,7.7500,,Q
2,1,"Hays, Mrs. Charles Melville (Clara Jennings Gr...",female,52.0,1,1,12749,93.5000,B69,S
3,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27.0,0,0,350043,7.7958,,S
4,2,"Hold, Mr. Stephen",male,44.0,1,0,26707,26.0000,,S
...,...,...,...,...,...,...,...,...,...,...
911,3,"Thomas, Mr. Charles P",male,,1,0,2621,6.4375,,C
912,3,"Cacic, Mr. Jego Grga",male,18.0,0,0,315091,8.6625,,S
913,2,"Pengelly, Mr. Frederick William",male,19.0,0,0,28665,10.5000,,S
914,3,"Mullens, Miss. Katherine ""Katie""",female,,0,0,35852,7.7333,,Q


In [57]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, stratify=ori_train['survived'], random_state=args.random_state)

X_tr.shape, X_te.shape, y_tr.shape, y_te.shape

((641, 10), (275, 10), (641,), (275,))

# Base ModelV0

In [58]:
train = X_tr.copy() 
test = X_te.copy()
ori_te = ori_test.copy()

train.shape, test.shape, ori_te.shape

((641, 10), (275, 10), (393, 10))

## Data Preprocessing

In [59]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 641 entries, 812 to 277
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    641 non-null    int64  
 1   name      641 non-null    object 
 2   gender    641 non-null    object 
 3   age       512 non-null    float64
 4   sibsp     641 non-null    int64  
 5   parch     641 non-null    int64  
 6   ticket    641 non-null    object 
 7   fare      641 non-null    float64
 8   cabin     135 non-null    object 
 9   embarked  641 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 55.1+ KB


In [60]:
print(f'before: {train.shape} / {test.shape}')
drop_cols = ['name', 'ticket', 'cabin']

train.drop(drop_cols, axis=1, inplace=True)
test.drop(drop_cols, axis=1, inplace=True)
ori_te.drop(drop_cols, axis=1, inplace=True)

print(f'after: {train.shape} / {test.shape}')
train.info()

before: (641, 10) / (275, 10)
after: (641, 7) / (275, 7)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 641 entries, 812 to 277
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    641 non-null    int64  
 1   gender    641 non-null    object 
 2   age       512 non-null    float64
 3   sibsp     641 non-null    int64  
 4   parch     641 non-null    int64  
 5   fare      641 non-null    float64
 6   embarked  641 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 40.1+ KB


In [61]:
print(ori_te.shape)

(393, 7)


In [62]:
train.isnull().sum()

pclass        0
gender        0
age         129
sibsp         0
parch         0
fare          0
embarked      0
dtype: int64

In [63]:
test.isnull().sum()

pclass       0
gender       0
age         51
sibsp        0
parch        0
fare         0
embarked     1
dtype: int64

In [64]:
ori_te.isnull().sum()

pclass       0
gender       0
age         83
sibsp        0
parch        0
fare         1
embarked     1
dtype: int64

In [65]:
age_median = train['age'].median()
fare_median = train['fare'].median()
embarked_mode = train['embarked'].mode().values[0]

age_median, fare_median, embarked_mode

(28.0, 14.4, 'S')

In [66]:
train['age'].fillna(age_median, inplace=True)
test['age'].fillna(age_median, inplace=True)
ori_te['age'].fillna(age_median, inplace=True)

train['fare'].fillna(fare_median, inplace=True)
test['fare'].fillna(fare_median, inplace=True)
ori_te['fare'].fillna(fare_median, inplace=True)

train['embarked'].fillna(embarked_mode, inplace=True)
test['embarked'].fillna(embarked_mode, inplace=True)
ori_te['embarked'].fillna(embarked_mode, inplace=True)

train.isnull().sum().sum(), test.isnull().sum().sum(), ori_te.isnull().sum().sum()

(0, 0, 0)

In [67]:
print(ori_te.shape)

(393, 7)


## Data Encoding

In [68]:
from sklearn.preprocessing import OneHotEncoder

In [69]:
train.columns

Index(['pclass', 'gender', 'age', 'sibsp', 'parch', 'fare', 'embarked'], dtype='object')

In [70]:
enc_cols = ['gender', 'embarked']
normal_cols = list(set(train.columns) - set(enc_cols))
normal_cols

['parch', 'sibsp', 'fare', 'pclass', 'age']

In [71]:
print(f'before: {train.shape} / {test.shape}')

enc = OneHotEncoder()  # 문자를 숫자로 변환
# train
tmp_tr = pd.DataFrame(
    enc.fit_transform(train[enc_cols]).toarray(), 
    columns = enc.get_feature_names_out()
)
enc_tr = pd.concat(
    [train[normal_cols].reset_index(drop=True), tmp_tr.reset_index(drop=True)]
    , axis=1
)
# test
tmp_te = pd.DataFrame(
    enc.transform(test[enc_cols]).toarray(), 
    columns = enc.get_feature_names_out()
)
enc_te = pd.concat(
    [test[normal_cols].reset_index(drop=True), tmp_te.reset_index(drop=True)]
    , axis=1
)
# ori_test
tmp_te = pd.DataFrame(
    enc.transform(ori_te[enc_cols]).toarray(), 
    columns = enc.get_feature_names_out()
)
enc_ori_te = pd.concat(
    [ori_te[normal_cols].reset_index(drop=True), tmp_te.reset_index(drop=True)]
    , axis=1
)

print(f'after: {enc_tr.shape} / {enc_te.shape}')
enc_tr.head()

before: (641, 7) / (275, 7)
after: (641, 10) / (275, 10)


Unnamed: 0,parch,sibsp,fare,pclass,age,gender_female,gender_male,embarked_C,embarked_Q,embarked_S
0,0,0,7.775,3,22.0,1.0,0.0,0.0,0.0,1.0
1,0,0,7.8208,3,21.0,0.0,1.0,0.0,1.0,0.0
2,0,0,7.8542,3,32.0,0.0,1.0,0.0,0.0,1.0
3,0,0,18.7875,3,11.0,0.0,1.0,1.0,0.0,0.0
4,0,0,8.05,3,30.0,0.0,1.0,0.0,0.0,1.0


In [72]:
print(ori_te.shape)

(393, 7)


## Training

In [73]:
enc_tr.isnull().sum().sum(), enc_te.isnull().sum().sum(), enc_ori_te.isnull().sum().sum()

(0, 0, 0)

In [74]:
enc_tr.shape, enc_te.shape, enc_ori_te.shape

((641, 10), (275, 10), (393, 10))

In [75]:
from sklearn.tree import DecisionTreeClassifier

In [76]:
modelV0 = DecisionTreeClassifier(random_state=args.random_state)

print(f'{enc_tr.shape} / {y_tr.shape}')
modelV0.fit(enc_tr, y_tr)

(641, 10) / (641,)


DecisionTreeClassifier(random_state=21)

## Evaluation

In [77]:
score_tr = modelV0.score(enc_tr, y_tr)
score_te = modelV0.score(enc_te, y_te) 

score_tr, score_te 

(0.982839313572543, 0.7781818181818182)

In [78]:
from sklearn.metrics import roc_curve, auc 

y_pred = modelV0.predict_proba(enc_te)[:,1]  # proba: 확률 예측
fpr, tpr, thresholds = roc_curve(y_te,y_pred)

auc_te = auc(fpr, tpr)
print(f'model: {auc_te}')

model: 0.7666160593792173


In [79]:
ori_te_pred = modelV0.predict_proba(enc_ori_te)[:,1]
ori_te_pred.shape

(393,)

In [80]:
modelV0.feature_importances_

array([0.00739608, 0.03987511, 0.2016825 , 0.0638137 , 0.17230406,
       0.49297873, 0.        , 0.00766071, 0.        , 0.01428912])

In [81]:
df_feature_importances = pd.DataFrame(modelV0.feature_importances_, enc_tr.columns).sort_values(by=[0], ascending=False).reset_index()

print(f'{df_feature_importances.shape}')
df_feature_importances

(10, 2)


Unnamed: 0,index,0
0,gender_female,0.492979
1,fare,0.201682
2,age,0.172304
3,pclass,0.063814
4,sibsp,0.039875
5,embarked_S,0.014289
6,embarked_C,0.007661
7,parch,0.007396
8,gender_male,0.0
9,embarked_Q,0.0


In [82]:
args.results.append(
    {
        'model': 'modelV0',
        'score_tr': score_tr,
        'score_te': score_te,
        'auc_te': auc_te,
        'ori_te_pred': ori_te_pred,
        'len_features': X_tr.shape[1],
        'feaute_importances': list(df_feature_importances['index'].values[:X_tr.shape[1]]),
        'create_dt': '0217'
    }
)

args.results

[{'model': 'modelV0',
  'score_tr': 0.982839313572543,
  'score_te': 0.7781818181818182,
  'auc_te': 0.7666160593792173,
  'ori_te_pred': array([1.        , 1.        , 1.        , 0.        , 1.        ,
         1.        , 0.        , 0.125     , 1.        , 0.        ,
         0.        , 0.        , 1.        , 0.        , 1.        ,
         0.        , 1.        , 0.        , 0.        , 0.125     ,
         0.        , 0.        , 0.        , 0.        , 0.125     ,
         0.        , 0.        , 1.        , 1.        , 0.        ,
         1.        , 0.        , 0.        , 0.        , 0.        ,
         0.5       , 1.        , 0.        , 0.        , 0.        ,
         0.        , 1.        , 0.        , 1.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         1.        , 0.        , 1.        , 0.        , 1.        ,
         1.        , 0.        , 0.        , 0.        , 0.        ,
         1.        , 1.        , 1. 

# Submission

In [83]:
df_results = pd.DataFrame(args.results).sort_values(by=['auc_te'], ascending=False)
df_results

Unnamed: 0,model,score_tr,score_te,auc_te,ori_te_pred,len_features,feaute_importances,create_dt
0,modelV0,0.982839,0.778182,0.766616,"[1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.125, 1.0...",10,"[gender_female, fare, age, pclass, sibsp, emba...",217


In [84]:
list_result = df_results.loc[:, 'ori_te_pred'].to_list()
len(list_result[0])

393

In [85]:
len(submission)

NameError: name 'submission' is not defined

In [None]:
submission = pd.read_csv(args.default_submission_csv)
submission.head()

Unnamed: 0,passengerid,survived
0,916,0.5
1,917,0.5
2,918,0.5
3,919,0.5
4,920,0.5


In [None]:
submission['survived'] = df_results.loc[0, ['ori_te_pred']].values[0]
print(f'{submission.isnull().sum().sum()}')

0


In [None]:
submission.head()

Unnamed: 0,passengerid,survived
0,916,1.0
1,917,1.0
2,918,1.0
3,919,0.0
4,920,1.0


In [None]:
submission.to_csv(args.submission_csv, header=True, index=False)