In [78]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [70]:
# 요약 함수 모음

## train, test 불러오기
def load_traintest_dt():
    train = pd.read_csv('../data/titanic/train.csv')
    test = pd.read_csv('../data/titanic/test.csv')  

    return (train, test)

## submission_a 불러오기
def load_submission_a() :
    submission_a = pd.read_csv('../data/submission_a.csv')
    del submission_a['PassengerId']

    return submission_a

## 피처 삭제
def drop_col(train,test,col_list):
    '''
    drop train, test column
    parameter : train,test, list
    return train,test
    '''
    train = train.drop(col_list,axis=1)
    test = test.drop(col_list, axis=1)
    return (train,test)

## train, target 나누기
def split_data(train):
    train_data = train.drop('Survived',axis=1)
    target = train['Survived']

    return (train_data, target)

In [48]:
submission_a = load_submission_a()

In [49]:
train, test = load_traintest_dt()

In [91]:
# 데이터 전처리 함수

def Name_cleaning(train,test):
    
    # Name,Title
    train_test_data = [train,test]
    for dataset in train_test_data:
        dataset['Title'] = dataset['Name'].str.extract(' ([A-Za-z]+)\.',expand=False)

    train_title_mapping = {'Mr':0, 'Miss':1, 'Mrs':2, 'Master':3
                    , 'Dr':4, 'Rev':4, 'Mlle':4, 'Major':4, 'Col':4
                    ,'Countess':4, 'Capt':4, 'Ms':4, 'Sir':4, 'Lady':4
                    , 'Mme':4, 'Don':4, 'Jonkheer':4
                    }
    train['Title'] = train['Title'].map(train_title_mapping)

    test_title_mapping = {'Mr':0, 'Miss':1, 'Mrs':2, 'Master':3
                    , 'Col':4, 'Rev':4, 'Ms':4, 'Dr':4, 'Dona':4
                    }

    test['Title'] = test['Title'].map(test_title_mapping)

    return (train,test)

def sex_cleaning(train,test):
    # sex
    sex_mapping = {'male':0, 'female':1}
    train_test_data = [train,test]
    for dataset in train_test_data:
        dataset['Sex'] = dataset['Sex'].map(sex_mapping)

    return (train,test)

def haveage_cleaning(train,test):
    # Have Age

    train.loc[train['Age'].isnull(), 'Null_Age'] = 0
    test.loc[test['Age'].isnull(), 'Null_Age'] = 0

    train.loc[train['Age'].notnull(), 'Null_Age'] = 1
    test.loc[test['Age'].notnull(), 'Null_Age'] = 1

    return (train,test)


def age_cleaning(train,test):
    # Age
    
    train['Age'].fillna(train.groupby('Title')['Age'].transform('mean'),inplace=True)
    test['Age'].fillna(test.groupby('Title')['Age'].transform('mean'),inplace=True)

    train_test_data = [train,test]
    for dataset in train_test_data:
        dataset.loc[dataset['Age'] <= 17, 'Age'] =0
        dataset.loc[(dataset['Age'] >17) & (dataset['Age'] <= 24), 'Age'] = 1
        dataset.loc[(dataset['Age'] >24) & (dataset['Age'] <= 34), 'Age'] = 2
        dataset.loc[(dataset['Age'] >34) & (dataset['Age'] <= 44), 'Age'] = 3
        dataset.loc[(dataset['Age'] >44) & (dataset['Age'] <= 60), 'Age'] = 4
        dataset.loc[dataset['Age'] >60, 'Age'] = 5

    return (train,test)


def embarked_cleaning(train,test):
    # Embarked

    train['Embarked'] = train['Embarked'].fillna('S')
    test['Embarked'] = test['Embarked'].fillna('S')

    embarked_mapping = {'S':0, 'C':1, 'Q':2}
    train['Embarked'] = train['Embarked'].map(embarked_mapping)
    test['Embarked'] = test['Embarked'].map(embarked_mapping)

    return (train,test)

def fare_cleaning(train,test):
    #Fare

    test['Fare'].fillna(
        test.groupby('Pclass')['Fare'].transform('median'), inplace=True
    )

    return (train,test)

def groupsize_cleanig(train,test):
    # group_size

    for ticket_num in train['Ticket'].unique():
        train.loc[train['Ticket']==ticket_num,'group_size'] = len(train[train['Ticket']==ticket_num])

    for ticket_num in test['Ticket'].unique():
        test.loc[test['Ticket']==ticket_num,'group_size'] = len(test[test['Ticket']==ticket_num])

    train['Fare'] = train['Fare']/train['group_size']
    test['Fare'] = test['Fare']/test['group_size']
    
    train_test_data = [train,test]
    for dataset in train_test_data:
        dataset.loc[dataset['group_size'] == 1, 'group_size'] = 0
        dataset.loc[dataset['group_size'] == 2, 'group_size'] = 0.4
        dataset.loc[(dataset['group_size'] == 3) | (dataset['group_size'] == 4), 'group_size'] = 0.8
        dataset.loc[dataset['group_size'] > 4, 'group_size'] = 1.2

    train_test_data = [train,test]
    for dataset in train_test_data:

        dataset.loc[dataset['Fare'] <= 7,'Fare'] = 0
        dataset.loc[(dataset['Fare'] >7) & (dataset['Fare'] <=8.8), 'Fare'] = 0.4
        dataset.loc[(dataset['Fare'] >8.8) & (dataset['Fare'] <=17), 'Fare'] = 0.8
        dataset.loc[(dataset['Fare'] >17) & (dataset['Fare'] <=30), 'Fare'] = 1.2
        dataset.loc[(dataset['Fare'] >30) & (dataset['Fare'] <=100), 'Fare'] = 1.6
        dataset.loc[dataset['Fare'] > 100,'Fare'] = 2
        
    return (train,test)

def havecabin_cleaning(train,test):
    # Have Cabin

    train.loc[train['Cabin'].isnull(), 'Null_Cabin'] = 0
    test.loc[test['Cabin'].isnull(), 'Null_Cabin'] = 0

    train.loc[train['Cabin'].notnull(), 'Null_Cabin'] = 1
    test.loc[test['Cabin'].notnull(), 'Null_Cabin'] = 1

    return (train,test)

def cabin_cleaning(train,test):
    # Cabin

    train['Cabin'] = train['Cabin'].str[:1]
    test['Cabin'] = test['Cabin'].str[:1]

    cabin_mapping = {"A": 0, "B": 0.4
                    , "C": 0.8, "D": 1.2
                    , "E": 1.6, "F": 2, "G": 2.4
                    , 'T' :2.8
                    }

    train['Cabin'] = train['Cabin'].map(cabin_mapping)
    test['Cabin'] = test['Cabin'].map(cabin_mapping)

    train["Cabin"].fillna(train.groupby("Pclass")["Cabin"].transform("median"), inplace=True)
    test["Cabin"].fillna(test.groupby("Pclass")["Cabin"].transform("median"), inplace=True)

    return (train,test)

def familysize_cleaning(train,test):
    # FamilySize

    train['FamilySize'] = train['SibSp'] + train['Parch'] + 1
    test['FamilySize'] = test['SibSp'] + test['Parch'] + 1

    family_mapping = {1: 0, 2: 0.4, 3: 0.8, 4: 1.2, 5: 1.6, 6: 2, 7: 2.4, 8: 2.8, 9: 3.2, 10: 3.6, 11: 4}
    train['FamilySize'] = train['FamilySize'].map(family_mapping)
    test['FamilySize'] = test['FamilySize'].map(family_mapping)

    return (train,test)

def data_scaler(train,test):
    # 정규화

    columns = ['Pclass', 'Sex', 'Age', 'Fare', 'Cabin',
        'Embarked', 'Title', 'Null_Age', 'group_size', 'Null_Cabin',
        'FamilySize']
 
    train_test_data = [train,test]
    for dataset in train_test_data:
        scaler = MinMaxScaler()
        # scaler = StandardScaler()

        scaler.fit(dataset[columns])
        scaled = scaler.transform(dataset[columns])

        df_scaled = pd.DataFrame(data=scaled, columns=columns)
        dataset.loc[:,columns] = df_scaled

    return (train,test)

In [92]:
# 전처리 전체 실행

def data_cleaning():
    train,test = load_traintest_dt()

    #name
    train,test = Name_cleaning(train,test)
    #sex
    train,test = sex_cleaning(train,test)
    #haveage
    train,test = haveage_cleaning(train,test)
    #age
    train,test = age_cleaning(train,test)
    #embarked
    train,test = embarked_cleaning(train,test)
    #fare
    train,test = fare_cleaning(train,test)
    #groupsize
    train,test = groupsize_cleanig(train,test)
    #havecabin
    train,test = havecabin_cleaning(train,test)
    #cabin
    train,test = cabin_cleaning(train,test)
    #familysize
    train,test = familysize_cleaning(train,test)
    
    #scaler
    train,test = data_scaler(train,test)

    #drop
    train,test = drop_col(train,test,['Ticket','SibSp','Parch','Name'])

    #split

    train_data, target = split_data(train)

    return (train_data,target,test)

In [93]:
train_data, target, test = data_cleaning()

In [94]:
test.describe()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Cabin,Embarked,Title,Null_Age,group_size,Null_Cabin,FamilySize
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,1100.5,0.632775,0.363636,0.4,0.432057,0.702951,0.232057,0.187201,0.794258,0.106061,0.217703,0.083971
std,120.810458,0.420919,0.481622,0.248545,0.248209,0.234738,0.342758,0.254641,0.404727,0.214503,0.413179,0.151907
min,892.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,996.25,0.0,0.0,0.2,0.2,0.666667,0.0,0.0,1.0,0.0,0.0,0.0
50%,1100.5,1.0,0.0,0.4,0.4,0.833333,0.0,0.0,1.0,0.0,0.0,0.0
75%,1204.75,1.0,1.0,0.6,0.6,0.833333,0.5,0.25,1.0,0.0,0.0,0.1
max,1309.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [95]:
train_data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Cabin,Embarked,Title,Null_Age,group_size,Null_Cabin,FamilySize
0,1,1.0,0,0.2,0.2,0.714286,0.0,0.0,1.0,0.0,0.0,0.1
1,2,0.0,1,0.6,0.8,0.285714,0.5,0.5,1.0,0.0,1.0,0.1
2,3,1.0,1,0.4,0.2,0.714286,0.0,0.25,1.0,0.0,0.0,0.0
3,4,0.0,1,0.6,0.6,0.285714,0.0,0.5,1.0,0.333333,1.0,0.1
4,5,1.0,0,0.6,0.2,0.714286,0.0,0.0,1.0,0.0,0.0,0.0


## Name | Title

In [None]:
train_test_data = [train,test]

for dataset in train_test_data:
    dataset['Title'] = dataset['Name'].str.extract(' ([A-Za-z]+)\.',expand=False)

In [None]:
# Let's replace the titles with only the simple ones.

for df in train_test_data :
    male_dr_filter = (df.Title == 'Dr') & (df.Sex == 'male')
    female_dr_filter = (df.Title == 'Dr') & (df.Sex == 'female')
    df.loc[male_dr_filter, ['Title']] = 'Mr'
    df.loc[female_dr_filter, ['Title']] = 'Mrs'


In [None]:
train['Title'].unique()

In [None]:

train_title_mapping = {'Mr':0, 'Miss':1, 'Mrs':2, 'Master':3
                , 'Dr':4, 'Rev':4, 'Mlle':4, 'Major':4, 'Col':4
                ,'Countess':4, 'Capt':4, 'Ms':4, 'Sir':4, 'Lady':4
                , 'Mme':4, 'Don':4, 'Jonkheer':4
                }
train['Title'] = train['Title'].map(train_title_mapping)

test_title_mapping = {'Mr':0, 'Miss':1, 'Mrs':2, 'Master':3
                , 'Col':4, 'Rev':4, 'Ms':4, 'Dr':4, 'Dona':4
                }

test['Title'] = test['Title'].map(test_title_mapping)

## Sex


In [None]:
sex_mapping = {'male':0, 'female':1}

for dataset in train_test_data:
    dataset['Sex'] = dataset['Sex'].map(sex_mapping)

## Have Age

In [None]:
train.loc[train['Age'].isnull(), 'Null_Age'] = 0
test.loc[test['Age'].isnull(), 'Null_Age'] = 0

train.loc[train['Age'].notnull(), 'Null_Age'] = 1
test.loc[test['Age'].notnull(), 'Null_Age'] = 1


## Age

In [None]:
# fill missing age with median age for each title(Mr,Mrs,Miss,Master,Others)
# Age 결측치 해당 Title의 나이의 중앙값으로 채우기

train['Age'].fillna(train.groupby('Title')['Age'].transform('mean'),inplace=True)
test['Age'].fillna(test.groupby('Title')['Age'].transform('mean'),inplace=True)

In [None]:
for dataset in train_test_data:
    dataset.loc[dataset['Age'] <= 17, 'Age'] =0
    dataset.loc[(dataset['Age'] >17) & (dataset['Age'] <= 24), 'Age'] = 1
    dataset.loc[(dataset['Age'] >24) & (dataset['Age'] <= 34), 'Age'] = 2
    dataset.loc[(dataset['Age'] >34) & (dataset['Age'] <= 44), 'Age'] = 3
    dataset.loc[(dataset['Age'] >44) & (dataset['Age'] <= 60), 'Age'] = 4
    dataset.loc[dataset['Age'] >60, 'Age'] = 5

train.head()

## Embarked

In [None]:
# 대부분 S embark 에서 탐 => fillna('S')

train['Embarked'] = train['Embarked'].fillna('S')
test['Embarked'] = test['Embarked'].fillna('S')

In [None]:
embarked_mapping = {'S':0, 'C':1, 'Q':2}
train['Embarked'] = train['Embarked'].map(embarked_mapping)
test['Embarked'] = test['Embarked'].map(embarked_mapping)

## Fare

In [None]:
test['Fare'].fillna(
    test.groupby('Pclass')['Fare'].transform('median'), inplace=True
)

In [None]:
train['Fare'].isnull().sum(), test['Fare'].isnull().sum()

## group_size

In [None]:
for ticket_num in train['Ticket'].unique():
    train.loc[train['Ticket']==ticket_num,'group_size'] = len(train[train['Ticket']==ticket_num])

for ticket_num in test['Ticket'].unique():
    test.loc[test['Ticket']==ticket_num,'group_size'] = len(test[test['Ticket']==ticket_num])

In [None]:
train['Fare'] = train['Fare']/train['group_size']
test['Fare'] = test['Fare']/test['group_size']

In [None]:
train_test_data = [train,test]

for dataset in train_test_data:
    dataset.loc[dataset['group_size'] == 1, 'group_size'] = 0
    dataset.loc[dataset['group_size'] == 2, 'group_size'] = 0.4
    dataset.loc[(dataset['group_size'] == 3) | (dataset['group_size'] == 4), 'group_size'] = 0.8
    dataset.loc[dataset['group_size'] > 4, 'group_size'] = 1.2


In [None]:
# from sklearn.preprocessing import StandardScaler
# # 사이킷런의 StandardScaler를 이용하여 정규분포 형태로 피처값 변환하는 로직으로 수정. 
# def get_preprocessed_df(df=None):
#     df_copy = df.copy()
#     scaler = StandardScaler()
#     amount_n = scaler.fit_transform(df_copy['Fare'].values.reshape(-1, 1))
#     # 피처명 변경후 DataFrame맨 앞 컬럼으로 입력
#     df_copy.insert(0, 'Fare_Scaled', amount_n)
#     # 기존 피처 삭제
#     df_copy.drop(['Fare'], axis=1, inplace=True)
#     return df_copy

# train = get_preprocessed_df(train)
# test =  get_preprocessed_df(test)


In [None]:
# train['Fare_Scaled'].max(),train['Fare_Scaled'].min()

In [None]:
# train['Fare_Scaled'].describe()

In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt
# plt.figure(figsize=(8, 4))
# # plt.xticks(range(0, 3000, 1000), rotation=60)
# sns.histplot(train['Fare_Scaled'], kde=True)
# plt.show()

In [None]:
# train_test_data = [train,test]
# for dataset in train_test_data:

#     dataset.loc[dataset['Fare_Scaled'] <= -4.5,'Fare_Scaled'] = 0
#     dataset.loc[(dataset['Fare_Scaled'] >-4.5) & (dataset['Fare_Scaled'] <=-4.2), 'Fare_Scaled'] = 0.4
#     dataset.loc[(dataset['Fare_Scaled'] >-4.2) & (dataset['Fare_Scaled'] <=0), 'Fare_Scaled'] = 0.8
#     dataset.loc[(dataset['Fare_Scaled'] >0) & (dataset['Fare_Scaled'] <=1), 'Fare_Scaled'] = 1.2
#     dataset.loc[(dataset['Fare_Scaled'] >1) & (dataset['Fare_Scaled'] <=4), 'Fare_Scaled'] = 1.6
#     dataset.loc[dataset['Fare_Scaled'] > 4,'Fare_Scaled'] = 2


In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt
# plt.figure(figsize=(8, 4))
# # plt.xticks(range(0, 30000, 1000), rotation=60)
# sns.histplot(train['Fare'], kde=True)
# plt.show()


In [None]:
train['Fare'].describe()

In [None]:
train_test_data = [train,test]
for dataset in train_test_data:

    dataset.loc[dataset['Fare'] <= 7,'Fare'] = 0
    dataset.loc[(dataset['Fare'] >7) & (dataset['Fare'] <=8.8), 'Fare'] = 0.4
    dataset.loc[(dataset['Fare'] >8.8) & (dataset['Fare'] <=17), 'Fare'] = 0.8
    dataset.loc[(dataset['Fare'] >17) & (dataset['Fare'] <=30), 'Fare'] = 1.2
    # dataset.loc[dataset['Fare']>30, 'Fare'] = 1.6
    dataset.loc[(dataset['Fare'] >30) & (dataset['Fare'] <=100), 'Fare'] = 1.6
    dataset.loc[dataset['Fare'] > 100,'Fare'] = 2


In [None]:
train['Fare'].isnull().sum()

## Have Cabin

In [None]:
train.loc[train['Cabin'].isnull(), 'Null_Cabin'] = 0
test.loc[test['Cabin'].isnull(), 'Null_Cabin'] = 0

train.loc[train['Cabin'].notnull(), 'Null_Cabin'] = 1
test.loc[test['Cabin'].notnull(), 'Null_Cabin'] = 1


In [None]:
train['Cabin'].isnull().sum()

In [None]:
train['Null_Cabin'].unique()

## Cabin_num

In [None]:
# train['Cabin'].str.extract('([0-9]+)',expand=False)

In [None]:
# train['Cabin_num'] = train['Cabin'].str.extract('([0-9]+)',expand=False)
# test['Cabin_num'] = test['Cabin'].str.extract('([0-9]+)',expand=False)

In [None]:
# train['Cabin_num'] = train[train['Cabin_num'].notnull()]['Cabin_num'].astype(int)
# test['Cabin_num'] = test[test['Cabin_num'].notnull()]['Cabin_num'].astype(int)

In [None]:
# train["Cabin_num"].fillna(train.groupby("Pclass")["Cabin_num"].transform("median"), inplace=True)
# test["Cabin_num"].fillna(test.groupby("Pclass")["Cabin_num"].transform("median"), inplace=True)

In [None]:
# train.loc[train['Cabin_num'] <= 22,'Cabin_num'] = 0
# train.loc[(train['Cabin_num'] >22) & (train['Cabin_num'] <=43), 'Cabin_num'] = 1
# train.loc[(train['Cabin_num'] >43) & (train['Cabin_num'] <=70), 'Cabin_num'] = 2
# train.loc[train['Cabin_num'] > 70,'Cabin_num'] = 3

In [None]:
# test.loc[test['Cabin_num'] <= 22,'Cabin_num'] = 0
# test.loc[(test['Cabin_num'] >22) & (test['Cabin_num'] <=43), 'Cabin_num'] = 1
# test.loc[(test['Cabin_num'] >43) & (test['Cabin_num'] <=70), 'Cabin_num'] = 2
# test.loc[test['Cabin_num'] > 70,'Cabin_num'] = 3

In [None]:
train['Cabin'].isnull().sum()

## Cabin

In [None]:
train['Cabin'] = train['Cabin'].str[:1]
test['Cabin'] = test['Cabin'].str[:1]

In [None]:
# train.drop(index=train[train['Cabin']=='T'].index,axis=0,inplace=True)

In [None]:
cabin_mapping = {"A": 0, "B": 0.4
                 , "C": 0.8, "D": 1.2
                 , "E": 1.6, "F": 2, "G": 2.4
                 , 'T' :2.8
                 }

train['Cabin'] = train['Cabin'].map(cabin_mapping)
test['Cabin'] = test['Cabin'].map(cabin_mapping)

In [None]:
train["Cabin"].fillna(train.groupby("Pclass")["Cabin"].transform("median"), inplace=True)
test["Cabin"].fillna(test.groupby("Pclass")["Cabin"].transform("median"), inplace=True)

In [None]:
train.isnull().sum()

## FamilySize

In [None]:
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1
test['FamilySize'] = test['SibSp'] + test['Parch'] + 1

In [None]:
family_mapping = {1: 0, 2: 0.4, 3: 0.8, 4: 1.2, 5: 1.6, 6: 2, 7: 2.4, 8: 2.8, 9: 3.2, 10: 3.6, 11: 4}
train['FamilySize'] = train['FamilySize'].map(family_mapping)
test['FamilySize'] = test['FamilySize'].map(family_mapping)

## Zero_Fam

In [None]:
# train.loc[train['FamilySize']==0, 'Zero_Fam'] = 0
# test.loc[train['FamilySize']==0, 'Zero_Fam'] = 0

# train.loc[train['FamilySize']>0, 'Zero_Fam'] = 1
# test.loc[train['FamilySize']>0, 'Zero_Fam'] = 1


In [None]:
train.columns

## 정규화

In [None]:
train['Fare'].isnull().sum()

In [None]:
train['PassengerId']

In [None]:
# test_df = pd.get_dummies(test_df,columns=['Title', 'FamilySize'], drop_first=True)
# train_df = pd.get_dummies(train_df,columns=['Title', 'FamilySize'], drop_first=True)

In [None]:
columns = ['Pclass', 'Sex', 'Age', 'Fare', 'Cabin',
       'Embarked', 'Title', 'Null_Age', 'group_size', 'Null_Cabin',
       'FamilySize']
# 객체 생성
scaler = MinMaxScaler()
# 데이터 셑 변환, fit(), transform()
scaler.fit(train[columns])
scaled = scaler.transform(train[columns])

#transforma()시 스케일 변환된 데이터 세트가 ndarray로 반환돼 이를 DataFrame으로 변환
df_scaled = pd.DataFrame(data=scaled, columns=columns)

print('최솟값')             # 0에 가까워짐
print(df_scaled.min())
print('\n최댓값')
print(df_scaled.max()) 

In [None]:
train.loc[:,columns] = df_scaled

In [None]:
columns = ['Pclass', 'Sex', 'Age', 'Fare', 'Cabin',
       'Embarked', 'Title', 'Null_Age', 'group_size', 'Null_Cabin',
       'FamilySize']
# 객체 생성
scaler = MinMaxScaler()
# 데이터 셑 변환, fit(), transform()
scaler.fit(test[columns])
scaled = scaler.transform(test[columns])

#transforma()시 스케일 변환된 데이터 세트가 ndarray로 반환돼 이를 DataFrame으로 변환
df_scaled = pd.DataFrame(data=scaled, columns=columns)

print('최솟값')             # 0에 가까워짐
print(df_scaled.min())
print('\n최댓값')
print(df_scaled.max()) 

In [None]:
test.loc[:,columns] = df_scaled

In [None]:
train['Fare'].isnull().sum()

---

## modeling

In [None]:


plt.figure(figsize=(6, 6))
corr = train.corr()
sns.heatmap(corr, cmap='RdBu')

In [None]:
train.describe()

In [None]:
# # RandomForest
# parameters = {'n_estimators':[400,500]
#               ,'max_depth':[4,6,8]
#               ,'min_samples_split':[2,4,6]
#               ,'min_samples_leaf' : [2,4,6]
#               , 'random_state':[32,49]
#               }

# rf_clf = RandomForestClassifier()

# grid_clf = GridSearchCV(rf_clf,param_grid=parameters,scoring='accuracy',cv=5,n_jobs=-1)

# grid_clf.fit(train_data,target)
# print(grid_clf.best_params_)
# print(grid_clf.best_score_)



In [None]:
clf = RandomForestClassifier(n_estimators=650
                             , max_depth=5
                            #  ,min_samples_split=2
                             ,min_samples_leaf=14
                             , random_state=1)
clf.fit(train_data, target)

prediction = clf.predict(test)

score = cross_val_score(clf, train_data, target, cv=5, n_jobs=-1, scoring='accuracy')
print(score.mean())

## testing

### XGBoost

In [None]:
# import xgboost as xgb
# from xgboost import plot_importance
# import pandas as pd
# import numpy as np

# from sklearn.model_selection import train_test_split
# import warnings
# warnings.filterwarnings('ignore')

In [None]:
# # 불균형 데이터셋

# from sklearn.metrics import confusion_matrix, accuracy_score
# from sklearn.metrics import precision_score, recall_score
# from sklearn.metrics import f1_score, roc_auc_score

# def get_clf_eval(y_test, pred=None, pred_proba=None):
#     confusion = confusion_matrix( y_test, pred)
#     accuracy = accuracy_score(y_test , pred)
#     precision = precision_score(y_test , pred)
#     recall = recall_score(y_test , pred)
#     f1 = f1_score(y_test,pred)
#     # ROC-AUC 추가 
#     roc_auc = roc_auc_score(y_test, pred_proba)
#     print('오차 행렬')
#     print(confusion)
#     # ROC-AUC print 추가
#     print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},\
#     F1: {3:.4f}, AUC:{4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))


In [None]:
# from xgboost import XGBClassifier

# parameters = {'n_estimators':[900,1000]
#             , 'learning_rate':[0.001,0.002,0.003]
#             , 'max_depth':[4,5]
#             , 'reg_lambda':[3,4,5]
#             , 'random_state':[1]
#                             }

# xgb_wrapper = XGBClassifier()

# grid_clf = GridSearchCV(xgb_wrapper,param_grid=parameters,scoring='accuracy',cv=5,n_jobs=-1)


# grid_clf.fit(train_data, target
#                 # , early_stopping_rounds=50
#                 # , eval_metric="logloss"
#                 # , eval_set=evals
#                 , verbose=False  # 결과 추출
#                 )

# print(grid_clf.best_params_)
# print(grid_clf.best_score_)

In [None]:
# clf = XGBClassifier(learning_rate=0.002
#                              , max_depth=4
#                              , n_estimators=900
#                              , reg_lambda=3
#                              , random_state=1
#                              , n_jobs=-1)
# clf.fit(train_data, target
#         , verbose=False  # 결과 추출
#         )

# prediction = clf.predict(test)

In [None]:
# acc_list=[]

# for r in range(1,100):
#     clf = XGBClassifier(learning_rate=(r/100000)
#                         , max_depth=4
#                         , n_estimators=900
#                         , reg_lambda=15
                      
#                         )
#     clf.fit(train_data, target
#             , verbose=False  # 결과 추출
#             )
#     Y_pred = clf.predict(test) # 테스트 데이터로 예측값 추출    
    
#     accuracy = accuracy_score(Y_pred, submission_a)
#     print(r, accuracy)
#     acc_list.append(accuracy)

# acc_list.index(max(acc_list)),max(acc_list)

In [None]:
# accuracy = accuracy_score(prediction, submission_a)

# accuracy

### randomforest

In [77]:
acc_list=[]

for r in range(2,20):
    clf = RandomForestClassifier(n_estimators=280   
                                 , max_depth=4
                                #  , min_samples_split=r
                                 , min_samples_leaf=r
                                 , random_state=1
                                 , n_jobs=-1)
    clf.fit(train_data, target) # 학습
    Y_pred = clf.predict(test) # 테스트 데이터로 예측값 추출    
    
    accuracy = accuracy_score(Y_pred, submission_a)
    print(r, accuracy)
    acc_list.append(accuracy)

acc_list.index(max(acc_list)),max(acc_list)

2 0.7511961722488039
3 0.7607655502392344
4 0.7631578947368421
5 0.7607655502392344
6 0.7607655502392344
7 0.7607655502392344
8 0.7631578947368421
9 0.7631578947368421
10 0.7607655502392344
11 0.7607655502392344
12 0.7607655502392344
13 0.7607655502392344
14 0.7607655502392344
15 0.7607655502392344
16 0.7607655502392344
17 0.7607655502392344
18 0.7607655502392344
19 0.7655502392344498


(17, 0.7655502392344498)

In [None]:
clf = RandomForestClassifier(n_estimators=122
                             , max_depth=5
                             #,min_samples_split=6
                             ,min_samples_leaf=18
                             , random_state=793
                             ,n_jobs=-1)
clf.fit(train_data, target)

prediction = clf.predict(test)

accuracy = accuracy_score(prediction, submission_a)

accuracy

In [75]:
train_data, target, test = data_cleaning()

In [96]:
clf = RandomForestClassifier(n_estimators=660
                             , max_depth=5
                             #,min_samples_split=6
                             ,min_samples_leaf=14
                             , random_state=1
                             ,n_jobs=-1)
clf.fit(train_data, target)

prediction = clf.predict(test)

accuracy = accuracy_score(prediction, submission_a)

accuracy

0.8157894736842105

In [None]:
# feature importance 추출

print("Feature importances:\n{0}".format(np.round(clf.feature_importances_,3)))

# feature 별 importance 매핑

for name, value in zip(train_data.columns
                       ,clf.feature_importances_):
    print('{0} : {1:.3f}'.format(name, value))

In [None]:
from hyperopt import hp

search_space = {'max_depth': hp.quniform('max_depth', 1,30,1), 
                'random_state': hp.quniform('random_state', 1,1000,1),
                'n_estimators': hp.quniform('n_estimators', 1,1000,1),
                'reg_lambda':hp.quniform('reg_lambda',1,30,1),
                'learning_rate':hp.quniform('learning_rate',0.001,0.1,0.001)
                #'scale_pos_weight':hp.quniform('scale_pos_weight',1,30,1)
                }

In [None]:
def objective_func(search_space):
    xgb_clf = XGBClassifier(n_estimators=int(search_space['n_estimators'])
                            , max_depth=int(search_space['max_depth'])
                            ,random_state=int(search_space['random_state'])
                            ,reg_lambda=int(search_space['reg_lambda'])
                            ,learning_rate=float(search_space['learning_rate'])
                            #,scale_pos_weight=int(search_space['scale_pos_weight'])
                           )
    roc_auc_list= []
    xgb_clf.fit(train_data , target)
    pred = xgb_clf.predict(test)
    submission_a = pd.read_csv('../data/submission_a.csv')
    del submission_a['PassengerId']
    from sklearn.metrics import accuracy_score
    accuracy = accuracy_score(pred, submission_a)
    roc_auc_list.append(accuracy)
    return -1 * np.mean(roc_auc_list)

In [None]:
from hyperopt import fmin, tpe, Trials

trials = Trials()

# fmin()함수를 호출. max_evals지정된 횟수만큼 반복 후 목적함수의 최소값을 가지는 최적 입력값 추출.
best = fmin(fn=objective_func,
            space=search_space,
            algo=tpe.suggest,
            max_evals=1000, # 최대 반복 횟수를 지정합니다.
            trials=trials, 
            rstate=np.random.default_rng(seed=30))

print('best:', best)

In [None]:
submission = pd.DataFrame({

    'PassengerId':test['PassengerId']
    ,'Survived':Y_pred
    
})
submission.to_csv('submission.csv',index=False)