In [288]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc

# 크롤링
from bs4 import BeautifulSoup
from selenium import webdriver
import time, sys

# 시각화 맵
import folium

# Label encoder
from sklearn.preprocessing import LabelEncoder

# dtclf
from sklearn.tree import DecisionTreeClassifier
finan_dtclf = DecisionTreeClassifier()

In [357]:
# 타이타닉 데이터 전처리

df_t = sns.load_dataset('titanic')
df_t.drop(columns=['class', 'alive', 'embark_town', 'who', 'adult_male', 'alone'], inplace=True)

# 연령의 결측치 해결
age_md = df_t.groupby(['pclass', 'sex']).age.agg(['median'])
df_t.loc[(df_t['sex'] == 'male') & (df_t['pclass'] == 1) & (df_t.age.isna()), "age"] = age_md.loc[1, 'male'][0]
df_t.loc[(df_t['sex'] == 'male') & (df_t['pclass'] == 2) & (df_t.age.isna()), "age"] = age_md.loc[2, 'male'][0]
df_t.loc[(df_t['sex'] == 'male') & (df_t['pclass'] == 3) & (df_t.age.isna()), "age"] = age_md.loc[3, 'male'][0]
df_t.loc[(df_t['sex'] == 'female') & (df_t['pclass'] == 1) & (df_t.age.isna()), "age"] = age_md.loc[1, 'female'][0]
df_t.loc[(df_t['sex'] == 'female') & (df_t['pclass'] == 2) & (df_t.age.isna()), "age"] = age_md.loc[2, 'female'][0]
df_t.loc[(df_t['sex'] == 'female') & (df_t['pclass'] == 3) & (df_t.age.isna()), "age"] = age_md.loc[3, 'female'][0]

# embarked 결측치 해결
df_t.embarked.fillna(df_t.embarked.unique()[0], inplace=True)

# 연령층 별 컬럼 생성.
df_t.loc[df_t.age >= 50, "age_new"] = "old"
df_t.loc[(df_t.age < 50) & (df_t.age>=10), "age_new"] = "young"
df_t.loc[df_t.age < 10, "age_new"] = "baby"

# 불필요 컬럼 제거
df_t.drop(columns=['deck', 'sibsp', 'parch', 'age', 'embarked'], inplace=True)

# df_t.info()
# sex, embarked, age_new 해결해야함

# Labeling으로 문자형 데이터를 숫자형으로 변환
for i in ['sex', 'survived', 'age_new']:
    globals()[f'df_t{i}_encoder'] = LabelEncoder()
    globals()[f'df_t{i}_encoder'].fit(df_t[i])
    df_t[i] = globals()[f'df_t{i}_encoder'].transform(df_t[i])

In [360]:
# 타이타닉 머신러닝 예측 학습

# 머신러닝이 목적으로 할 데이터를 설정
X = df_t.drop(columns='survived')
y = df_t['survived']

from sklearn.tree import DecisionTreeClassifier
finan_dtclf_2 = DecisionTreeClassifier(max_depth=5)

# fit = 머신러닝의 학습의 의미
finan_dtclf_2.fit(X, y)
# 분석-decision tree classification

# 시각화
from sklearn.tree import export_graphviz
import graphviz

export_graphviz(finan_dtclf_2, out_file='finance2.dot', 
                feature_names=X.columns,
                class_names=['생존', '사망'],
                max_depth=5,
                filled=True,
                leaves_parallel=False,
                impurity=True,
                node_ids=False,
                proportion=False,)

with open('./finance2.dot') as f :
    finance2 = f.read()
# graphviz.Source(finance2)

# dot_graph의 source 저장
# dot = graphviz.Source(finance2) 
# png로 저장
# dot.render(filename='tree.png') 

In [361]:
# 새로운 타이타닉 데이터 전처리 

df_test = pd.read_csv('./test.csv')

# df_test.info()
# Name, Sex, Ticket, Cabin, Embarked 해결 필요

# 연령의 결측치 해결
# fare 결측치 해결
age_md = df_test.groupby(['Pclass', 'Sex']).Age.agg(['median'])
fare_md = df_test.groupby(['Pclass', 'Sex']).Fare.agg(['median'])
for i in ['male', 'female'] : 
    for y in range(1, 4) : 
        f"df_test.loc[(df_test['Sex'] == '{i}') & (df_test['Pclass'] == {y}) & (df_test.Age.isna()), 'Age'] = age_md.loc[{y}, '{i}'][0]"
        f"df_test.loc[(df_test['Sex'] == '{i}') & (df_test['Pclass'] == {y}) & (df_test.Fare.isna()), 'Fare'] = fare_md.loc[{y}, '{i}'][0]"
        

# 결측치가 너무 많은 데이터, 컬럼 삭제
df_test.drop(columns=['Cabin'], inplace=True)

# age_new 생성
df_test.loc[df_test.Age >= 50, "age_new"] = "old"
df_test.loc[(df_test.Age < 50) & (df_test.Age>=10), "age_new"] = "young"
df_test.loc[df_test.Age < 10, "age_new"] = "baby"

# 필요 없는 데이터 제거
df_test.drop(columns=['Name', 'Ticket', 'PassengerId', "SibSp", "Parch", 'Age', 'Embarked'], inplace=True)


# 컬럼 소문자로 변경 
l1 = []
for i in list(df_test.columns):
    l1.append(i.lower())
df_test.set_axis(l1, axis='columns', inplace=True)

# Index(['pclass', 'sex', 'age', 'fare', 'embarked', 'predict survived'], dtype='object')
# Labeling으로 문자형 데이터를 숫자형으로 변환
for i in ['sex', 'age_new']:
    globals()[f'df_test{i}_encoder'] = LabelEncoder()
    globals()[f'df_test{i}_encoder'].fit(df_test[i])
    df_test[i] = globals()[f'df_test{i}_encoder'].transform(df_test[i])

df_test['fare'].fillna(df_test['fare'].mean(), inplace=True)

In [362]:
# 새로운 타이타닉 데이터 생존 여부 예측
pred_result = finan_dtclf_2.predict(df_test)
pred_result_2 =  df_tsurvived_encoder.inverse_transform(pred_result)
df_test['survived'] = pred_result_2

In [319]:
# 학습 데이터 70%, 테스트 데이터 30%

from sklearn.model_selection import train_test_split

X = df_t.drop('survived', axis=1)
y = df_t.survived

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=12)

# 직접구현
# X_train = df_t.iloc[:round(891 * 0.7), 1:]
# X_test = df_t.iloc[round(891 * 0.3), 1:]
# y_train = df_t.iloc[:round(891 * 0.7), 0]
# y_test = df_t.iloc[round(891 * 0.3), 0]

# np.random.shuffle

In [15]:
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train, y_train)

DecisionTreeClassifier()

In [16]:
# predict이 필요한 이유 : 정답이 없는 경우가 있기 때문.
dt_pred = dt_clf.predict(X_test)

In [25]:
# KFold 학습 실행

from sklearn.model_selection import KFold

# 폴드 세트를 5개인 KFold 객체를 생성, 폴드 수 만큼 예측 결과를 저장을 위한 리스트 객체 생성.
def exe_kfold(clf, folds=5) : 
    
    kFold = KFold(n_splits=folds)
    scores=[]
    
    # KFold 교차 검증 수행
    for i, (train_idx, val_idx) in enumerate(kFold.split(X_train)) :
        # X_train 데이터에서 교차 검증별로 학습과 검증 데이터를 가리키는 index 생성
        X_train_k = X_train.iloc[train_idx]
        y_train_k = y_train.iloc[train_idx]
        X_val_k = X_train.iloc[val_idx]
        y_val_k = y_train.iloc[val_idx]    
        
        # Classfier 학습, 예측, 정확도 계산
        dt_clf.fit(X_train_k, y_train_k)
        scores.append(dt_clf.score(X_val_k, y_val_k))
    
    # 5개 fold에서 평균 정확도 계산
    print(f"{clf} 개별 학습 결과 : ", scores)
    print(f"{clf} 평균 학습 결과 : %.2f" %np.mean(scores))

# DecisionTreeClassifier() 개별 학습 결과 :  [0.7577639751552795, 0.79375, 0.9, 0.8625, 0.84375]
# DecisionTreeClassifier() 평균 학습 결과 : 0.83

In [58]:
dt_clf

DecisionTreeClassifier()

In [45]:
# StratifiedKFold 학습 실행

from sklearn.model_selection import StratifiedKFold

def exe_skfold(clf, folds=5) : 
    
    skFold = StratifiedKFold(n_splits=folds)
    scores=[]
    
    for i, (train_idx, val_idx) in enumerate(skFold.split(X_train, y_train)) :
        X_train_k = X_train.iloc[train_idx]
        y_train_k = y_train.iloc[train_idx]
        X_val_k = X_train.iloc[val_idx]
        y_val_k = y_train.iloc[val_idx]    

        dt_clf.fit(X_train_k, y_train_k)
        scores.append(dt_clf.score(X_val_k, y_val_k))
    print(f"{clf} 개별 학습 결과 : ", scores)
    print(f"{clf} 평균 학습 결과 : %.2f" %np.mean(scores))
    
# DecisionTreeClassifier() 개별 학습 결과 :  [0.7577639751552795, 0.7875, 0.875, 0.84375, 0.85625]
# DecisionTreeClassifier() 평균 학습 결과 : 0.82

In [50]:
# kFold, StratifiedKFold 성능 비교
skFold = StratifiedKFold(n_splits=5)
kFold = KFold(n_splits=5)

for i, (train_idx, val_idx) in enumerate(skFold.split(X_train, y_train)):
        test = y_train.iloc[val_idx]
        print(test.value_counts())
        
# 0    99
# 1    62
# Name: survived, dtype: int64
# 0    99
# 1    61
# Name: survived, dtype: int64
# 0    99
# 1    61
# Name: survived, dtype: int64
# 0    99
# 1    61
# Name: survived, dtype: int64
# 0    98
# 1    62
# Name: survived, dtype: int64

for i, (train_idx, val_idx) in enumerate(kFold.split(X_train, y_train)):
        test = y_train.iloc[val_idx]
        print(test.value_counts())
        
# Name: survived, dtype: int64
# 0    95
# 1    66
# Name: survived, dtype: int64
# 0    100
# 1     60
# Name: survived, dtype: int64
# 0    106
# 1     54
# Name: survived, dtype: int64
# 0    100
# 1     60
# Name: survived, dtype: int64
# 0    93
# 1    67
# Name: survived, dtype: int64

0    99
1    62
Name: survived, dtype: int64
0    99
1    61
Name: survived, dtype: int64
0    99
1    61
Name: survived, dtype: int64
0    99
1    61
Name: survived, dtype: int64
0    98
1    62
Name: survived, dtype: int64
0    95
1    66
Name: survived, dtype: int64
0    100
1     60
Name: survived, dtype: int64
0    106
1     54
Name: survived, dtype: int64
0    100
1     60
Name: survived, dtype: int64
0    93
1    67
Name: survived, dtype: int64


In [62]:
from sklearn.model_selection import cross_val_score

cross_val_score(estimator=dt_clf, X=X_train, y=y_train, cv=5)
# array([0.75776398, 0.7875    , 0.875     , 0.84375   , 0.85625   ])

# n_jobs : cpu 코어당 병렬로 작업을 가능하게 하는 옵션, 코어당 2개이상도 작업이 가능하다.
# cv : 폴드 개수, 몇개를 분활 할지 설정, 5개면 1/5로 분할

np.mean(cross_val_score(estimator=dt_clf, X=X_train, y=y_train, cv=5))
# 0.8253027950310559

0.8253027950310559

In [115]:
from sklearn.model_selection import LeaveOneOut
scores= []

# 행의 갯수만큼, 행을 학습 시킴.
# columns length * columns length
loocv = LeaveOneOut()
for train_idx, val_idx  in loocv.split(X_train) :
    X_train_loo = X_train.iloc[train_idx] # 학습지의 문제
    X_val_loo = X_train.iloc[val_idx] # 시험지의 문제
    y_train_loo = y_train.iloc[train_idx] # 학습지의 정답
    y_val_loo = y_train.iloc[val_idx] # 시험지의 정답
     
    dt_clf.fit(X_train_loo, y_train_loo) # 학습지의 문제와 정답을 보면서 학습
    scores.append(dt_clf.score(X_val_loo, y_val_loo)) # 시험지의 문제로 시험을 치루고 정답값과 비교
    
print("loocv 평균 학습 결과 : %.2f" %np.mean(scores))
# loocv 평균 학습 결과 : 0.83

loocv 평균 학습 결과 : 0.83


In [113]:
from sklearn.model_selection import LeavePOut
scores= []


# 행의 갯수만큼, 행을 학습 시킴.
# columns length * columns length
lpocv = LeavePOut(2)

for i, (train_idx, val_idx)  in enumerate(lpocv.split(X_train)) :
    if i == 500 :
        break
    X_train_lpo = X_train.iloc[train_idx] # 학습지의 문제
    X_val_lpo = X_train.iloc[val_idx] # 시험지의 문제
    y_train_lpo = y_train.iloc[train_idx] # 학습지의 정답
    y_val_lpo = y_train.iloc[val_idx] # 시험지의 정답
    
    dt_clf.fit(X_train_lpo, y_train_lpo) # 학습지의 문제와 정답을 보면서 학습
    scores.append(dt_clf.score(X_val_lpo, y_val_lpo)) # 시험지의 문제로 시험을 치루고 정답값과 비교
    
print("lpocv 평균 학습 결과 : %.2f" %np.mean(scores))

lpocv 평균 학습 결과 : 0.91


In [114]:
dt_clf.score(X_test, y_test)

0.8111111111111111

In [198]:
# ShuffleSplit(REPEATED RANDOM SUB SAMPLING VALIDATION) 검증 

from sklearn.model_selection import ShuffleSplit

ss = ShuffleSplit(test_size=0.2, n_splits=5, random_state=14)

for i, (train_idx, val_idx)  in enumerate(ss.split(X_train)) :
    X_train_ss = X_train.iloc[train_idx] # 학습지의 문제
    X_val_ss = X_train.iloc[val_idx] # 시험지의 문제
    y_train_ss = y_train.iloc[train_idx] # 학습지의 정답
    y_val_ss = y_train.iloc[val_idx] # 시험지의 정답
    
    dt_clf.fit(X_train_ss, y_train_ss) # 학습지의 문제와 정답을 보면서 학습
    scores.append(dt_clf.score(X_val_ss, y_val_ss)) # 시험지의 문제로 시험을 치루고 정답값과 비교
    
    print(len(train_idx), ":" , len(val_idx))
    
print("sft 평균 학습 결과 : %.2f" %np.mean(scores))

# 720 : 81
# sft 평균 학습 결과 : 0.88

720 : 81
720 : 81
720 : 81
720 : 81
720 : 81
720 : 81
720 : 81
720 : 81
720 : 81
720 : 81
sft 평균 학습 결과 : 0.88


In [203]:
X = np.array([[1, 2], 
              [3, 4], 
             [1, 2], 
             [3, 4],
             [1, 2], 
             [3, 4]])
# y = np.array([1, 2, 3, 4, 5, 6])

from sklearn.model_selection import TimeSeriesSplit

ts = TimeSeriesSplit(n_splits=4)
for trn, val in ts.split(X) :
    print(trn,"\n",val)

[0 1] 
 [2]
[0 1 2] 
 [3]
[0 1 2 3] 
 [4]
[0 1 2 3 4] 
 [5]


In [215]:
dt_clf = DecisionTreeClassifier(max_depth=6)
dt_clf.fit(X_train, y_train)
dt_clf.score(X_test, y_test)



0.8

In [330]:
# params = {'max_dept' : [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, None],
#          'min_dept' : [range(1, 8)]}

params = {"criterion":['gini', 'entropy'],
        "splitter":['best', 'random'],
        "max_depth":range(3, 17),
        "min_samples_split":range(2, 20),
        "min_samples_leaf":range(1, 20),
        "max_features" : [None, "auto", "sqrt", "log2"]}

In [331]:
from sklearn.model_selection import GridSearchCV

gs_dtclf = GridSearchCV(dt_clf, param_grid=params, scoring='accuracy', 
                         n_jobs=-1, cv=5, verbose=1)

# verbose : 학습의과정 옵션
# n_jobs : 코어를 얼마나 사용할 것인가, -1을 하게되면 최대의 코어 사용
# cv(cross valueation) : 몇개를 자를것인가

In [332]:
%%time
gs_dtclf.fit(X_train, y_train)

Fitting 5 folds for each of 76608 candidates, totalling 383040 fits
CPU times: user 2min 31s, sys: 4.52 s, total: 2min 36s
Wall time: 4min 17s


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(max_depth=6), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(3, 17),
                         'max_features': [None, 'auto', 'sqrt', 'log2'],
                         'min_samples_leaf': range(1, 20),
                         'min_samples_split': range(2, 20),
                         'splitter': ['best', 'random']},
             scoring='accuracy', verbose=1)

In [354]:
best_params = gs_dtclf.best_params_
# {'criterion': 'gini',
#  'max_depth': 15,
#  'max_features': 'log2',
#  'min_samples_leaf': 1,
#  'min_samples_split': 8,
#  'splitter': 'random'}

In [342]:
from sklearn.tree import DecisionTreeClassifier
dt_clf_best = DecisionTreeClassifier(**best_params)

In [341]:
gs_dtclf.best_score_
# 0.8402639751552796
# test데이터가 아님

0.8452096273291925

In [349]:
dt_clf_best.fit(X, y)

TypeError: Singleton array array(3) cannot be considered a valid collection.

In [347]:
pred_result = dt_clf_best.predict(df_test)
pred_result_2 =  df_tsurvived_encoder.inverse_transform(pred_result)
df_test['survived'] = pred_result_2

In [363]:
# 파일 저장해서 캐글에 올리기.
tit = pd.read_csv('test.csv')
tit.drop(list(tit.columns)[1:], axis = 1, inplace=True)
tit['Survived'] = df_test['survived']
tit.set_index('PassengerId', inplace=True)
tit.to_csv('tit_test.csv')

In [326]:
finan_dtclf_2 = DecisionTreeClassifier(max_depth=5)

# fit = 머신러닝의 학습의 의미
finan_dtclf_2.fit(X, y)

DecisionTreeClassifier(max_depth=17)

In [328]:
pred_result = finan_dtclf_2.predict(df_test)
pred_result_2 =  df_tsurvived_encoder.inverse_transform(pred_result)
df_test['survived'] = pred_result_2

In [351]:
y

3

In [338]:
X = df_t.drop(columns='survived')
y = df_t['survived']

In [345]:
df_test

Unnamed: 0,pclass,sex,fare,age_new,survived
0,3,1,7.8292,2,0
1,3,0,7.0000,2,1
2,2,1,9.6875,1,0
3,3,1,8.6625,2,0
4,3,0,12.2875,2,1
...,...,...,...,...,...
413,3,1,8.0500,3,0
414,1,0,108.9000,2,1
415,3,1,7.2500,2,0
416,3,1,8.0500,3,0


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings(‘ignore’)
df = sns.load_dataset(‘titanic’)
df.drop([‘class’, ‘alive’, ‘embark_town’, ‘who’, ‘adult_male’, ‘alone’], axis=1, inplace=True)
df[‘family’] = df.sibsp + df.parch
df.drop([‘sibsp’, ‘parch’], axis=1, inplace=True)
df1 = df.copy()
df1.embarked.fillna(‘S’, inplace=True)
m1_med = df1.loc[(df1.sex == ‘male’) & (df1.pclass == 1), ‘age’].median()
m2_med = df1.loc[(df1.sex == ‘male’) & (df1.pclass == 2), ‘age’].median()
m3_med = df1.loc[(df1.sex == ‘male’) & (df1.pclass == 3), ‘age’].median()
f1_med = df1.loc[(df1.sex == ‘female’) & (df1.pclass == 1), ‘age’].median()
f2_med = df1.loc[(df1.sex == ‘female’) & (df1.pclass == 2), ‘age’].median()
f3_med = df1.loc[(df1.sex == ‘female’) & (df1.pclass == 3), ‘age’].median()
df1.loc[(df1.sex == ‘male’)&(df1.pclass == 1), ‘age’].fillna(29, inplace=True)
df1.loc[(df1.sex == ‘male’)  &(df1.pclass == 1)&(df1.age.isna()), ‘age’] = m1_med
df1.loc[(df1.sex == ‘male’)  &(df1.pclass == 2)&(df1.age.isna()), ‘age’] = m2_med
df1.loc[(df1.sex == ‘male’)  &(df1.pclass == 3)&(df1.age.isna()), ‘age’] = m3_med
df1.loc[(df1.sex == ‘female’)&(df1.pclass == 1)&(df1.age.isna()), ‘age’] = f1_med
df1.loc[(df1.sex == ‘female’)&(df1.pclass == 2)&(df1.age.isna()), ‘age’] = f2_med
df1.loc[(df1.sex == ‘female’)&(df1.pclass == 3)&(df1.age.isna()), ‘age’] = f3_med
df1.drop(‘deck’, axis=1, inplace=True)
df1.age_new = 0
# 노인의 생존율 (50세 이상) 유아의 생존율 (10세 미만)
df1.loc[df1.age >= 50, ‘age_new’] = ‘old’
df1.loc[(df1.age < 50) & (df1.age>=10), ‘age_new’] = ‘young’
df1.loc[df1.age < 10, ‘age_new’] = ‘baby’
for i in [‘sex’, ‘embarked’, ‘age_new’]:
    globals()[f’df1_{i}_encoder’] = LabelEncoder()
    globals()[f’df1_{i}_encoder’].fit(df1[i])
    df1[i] = globals()[f’df1_{i}_encoder’].transform(df1[i])
df1_sex_encoder = LabelEncoder()
df1_embarked_encoder = LabelEncoder()
df1_agenew_encoder = LabelEncoder()
df1_sex_encoder.fit(df1[‘sex’])
df1_embarked_encoder.fit(df1[‘embarked’])
df1_agenew_encoder.fit(df1[‘age_new’])
df1[‘sex’] = df1_sex_encoder.transform(df1[‘sex’])
df1[‘embarked’] = df1_embarked_encoder.transform(df1[‘embarked’])
df1[‘age_new’] = df1_agenew_encoder.transform(df1[‘age_new’])
X = df1.drop(‘survived’, axis=1)
y = df1.survived

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Tuning Multiple Hyperparameters
# read in hyperopt values
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK
# redefine the function usng a wider range of hyperparameters
def objective(search_space):
    model = DecisionTreeClassifier(**search_space)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return {‘loss’: -accuracy, ‘status’: STATUS_OK}
# new search space
search_space={‘max_depth’:hp.choice(‘max_depth’, range(3, 17)),
              ‘min_samples_split’:hp.uniform(‘min_samples_split’, 0, 1),
              ‘min_samples_leaf’:hp.choice(‘min_samples_leaf’, range(1, 30)),
              ‘criterion’:hp.choice(‘criterion’, [‘gini’,‘entropy’]),
              ‘max_features’:hp.choice(‘max_features’, [None, ‘sqrt’, ‘log2’])}
# set the hyperparam tuning algorithm
algorithm=tpe.suggest
# implement Hyperopt
best_params = fmin(
    fn=objective,
    space=search_space,
    algo=algorithm,
    max_evals=100)
space_eval(search_space, best_params)
5:48
space_eval(search_space, best_params)
5:48
new_dtclf = DecisionTreeClassifier(**space_eval(search_space, best_params))
new_dtclf.fit(X_train, y_train)
5:48
new_dtclf.score(X_test, y_test)