In [79]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [80]:
# 요약 함수 모음

## train, test 불러오기
def load_traintest_dt():
    train = pd.read_csv('../data/titanic/train.csv')
    test = pd.read_csv('../data/titanic/test.csv')  

    return (train, test)

## submission_a 불러오기
def load_submission_a() :
    submission_a = pd.read_csv('../data/submission_a.csv')
    del submission_a['PassengerId']

    return submission_a

## 피처 삭제
def drop_col(train,test,col_list):
    '''
    drop train, test column
    parameter : train,test, list
    return train,test
    '''
    train = train.drop(col_list,axis=1)
    test = test.drop(col_list, axis=1)
    return (train,test)

## train, target 나누기
def split_data(train):
    train_data = train.drop('Survived',axis=1)
    target = train['Survived']

    return (train_data, target)

In [81]:
# 데이터 전처리 함수

    # Name,Title
def Name_cleaning(train,test):
    
    train_test_data = [train,test]
    for dataset in train_test_data:
        dataset['Title'] = dataset['Name'].str.extract(' ([A-Za-z]+)\.',expand=False)

    train_title_mapping = {'Mr':0, 'Miss':1, 'Mrs':2, 'Master':3
                    , 'Dr':4, 'Rev':4, 'Mlle':4, 'Major':4, 'Col':4
                    ,'Countess':4, 'Capt':4, 'Ms':4, 'Sir':4, 'Lady':4
                    , 'Mme':4, 'Don':4, 'Jonkheer':4
                    }
    train['Title'] = train['Title'].map(train_title_mapping)

    test_title_mapping = {'Mr':0, 'Miss':1, 'Mrs':2, 'Master':3
                    , 'Col':4, 'Rev':4, 'Ms':4, 'Dr':4, 'Dona':4
                    }

    test['Title'] = test['Title'].map(test_title_mapping)

    return (train,test)

    # sex
def sex_cleaning(train,test):
    sex_mapping = {'male':0, 'female':1}
    train_test_data = [train,test]
    for dataset in train_test_data:
        dataset['Sex'] = dataset['Sex'].map(sex_mapping)

    return (train,test)

    # Have Age
def haveage_cleaning(train,test):

    train.loc[train['Age'].isnull(), 'Null_Age'] = 0
    test.loc[test['Age'].isnull(), 'Null_Age'] = 0

    train.loc[train['Age'].notnull(), 'Null_Age'] = 1
    test.loc[test['Age'].notnull(), 'Null_Age'] = 1

    return (train,test)


    # Age
def age_cleaning(train,test):
    
    train['Age'].fillna(train.groupby('Title')['Age'].transform('mean'),inplace=True)
    test['Age'].fillna(test.groupby('Title')['Age'].transform('mean'),inplace=True)

    train_test_data = [train,test]
    for dataset in train_test_data:
        dataset.loc[dataset['Age'] <= 17, 'Age'] =0
        dataset.loc[(dataset['Age'] >17) & (dataset['Age'] <= 24), 'Age'] = 1
        dataset.loc[(dataset['Age'] >24) & (dataset['Age'] <= 34), 'Age'] = 2
        dataset.loc[(dataset['Age'] >34) & (dataset['Age'] <= 44), 'Age'] = 3
        dataset.loc[(dataset['Age'] >44) & (dataset['Age'] <= 60), 'Age'] = 4
        dataset.loc[dataset['Age'] >60, 'Age'] = 5

    return (train,test)


    # Embarked
def embarked_cleaning(train,test):

    train['Embarked'] = train['Embarked'].fillna('S')
    test['Embarked'] = test['Embarked'].fillna('S')

    embarked_mapping = {'S':0, 'C':1, 'Q':2}
    train['Embarked'] = train['Embarked'].map(embarked_mapping)
    test['Embarked'] = test['Embarked'].map(embarked_mapping)

    return (train,test)

    #Fare
def fare_cleaning(train,test):

    test['Fare'].fillna(
        test.groupby('Pclass')['Fare'].transform('median'), inplace=True
    )
    #############################################################################
    
    train.loc[train['Fare'] == 0, 'Zero_Fare'] = 0
    test.loc[test['Fare'] == 0, 'Zero_Fare'] = 0

    train.loc[train['Fare'] != 0, 'Zero_Fare'] = 1
    test.loc[test['Fare'] != 0, 'Zero_Fare'] = 1


    ##############################################################################
    
    return (train,test)

    # group_size
def groupsize_cleanig(train,test):

    for ticket_num in train['Ticket'].unique():
        train.loc[train['Ticket']==ticket_num,'group_size'] = len(train[train['Ticket']==ticket_num])

    for ticket_num in test['Ticket'].unique():
        test.loc[test['Ticket']==ticket_num,'group_size'] = len(test[test['Ticket']==ticket_num])

    train['Fare'] = train['Fare']/train['group_size']
    test['Fare'] = test['Fare']/test['group_size']
    
    train_test_data = [train,test]
    for dataset in train_test_data:
        dataset.loc[dataset['group_size'] == 1, 'group_size'] = 0
        dataset.loc[dataset['group_size'] == 2, 'group_size'] = 0.4
        dataset.loc[(dataset['group_size'] == 3) | (dataset['group_size'] == 4), 'group_size'] = 0.8
        dataset.loc[dataset['group_size'] > 4, 'group_size'] = 1.2

    train_test_data = [train,test]
    for dataset in train_test_data:

        dataset.loc[dataset['Fare'] <= 7,'Fare'] = 0
        dataset.loc[(dataset['Fare'] >7) & (dataset['Fare'] <=8.8), 'Fare'] = 0.4
        dataset.loc[(dataset['Fare'] >8.8) & (dataset['Fare'] <=17), 'Fare'] = 0.8
        dataset.loc[(dataset['Fare'] >17) & (dataset['Fare'] <=30), 'Fare'] = 1.2
        dataset.loc[(dataset['Fare'] >30) & (dataset['Fare'] <=100), 'Fare'] = 1.6
        dataset.loc[dataset['Fare'] > 100,'Fare'] = 2
        
    return (train,test)

    # Have Cabin
def havecabin_cleaning(train,test):

    train.loc[train['Cabin'].isnull(), 'Null_Cabin'] = 0
    test.loc[test['Cabin'].isnull(), 'Null_Cabin'] = 0

    train.loc[train['Cabin'].notnull(), 'Null_Cabin'] = 1
    test.loc[test['Cabin'].notnull(), 'Null_Cabin'] = 1

    return (train,test)

    # Cabin
def cabin_cleaning(train,test):

    train['Cabin'] = train['Cabin'].str[:1]
    test['Cabin'] = test['Cabin'].str[:1]

    cabin_mapping = {"A": 0, "B": 0.4
                    , "C": 0.8, "D": 1.2
                    , "E": 1.6, "F": 2, "G": 2.4
                    , 'T' :2.8
                    }

    train['Cabin'] = train['Cabin'].map(cabin_mapping)
    test['Cabin'] = test['Cabin'].map(cabin_mapping)

    train["Cabin"].fillna(train.groupby("Pclass")["Cabin"].transform("median"), inplace=True)
    test["Cabin"].fillna(test.groupby("Pclass")["Cabin"].transform("median"), inplace=True)

    return (train,test)

    # FamilySize
def familysize_cleaning(train,test):

    train['FamilySize'] = train['SibSp'] + train['Parch'] + 1
    test['FamilySize'] = test['SibSp'] + test['Parch'] + 1

    family_mapping = {1: 0, 2: 0.4, 3: 0.8, 4: 1.2, 5: 1.6, 6: 2, 7: 2.4, 8: 2.8, 9: 3.2, 10: 3.6, 11: 4}
    train['FamilySize'] = train['FamilySize'].map(family_mapping)
    test['FamilySize'] = test['FamilySize'].map(family_mapping)

    return (train,test)

    # 정규화
def data_scaler(train,test):

    columns = ['Pclass', 'Sex', 'Age', 'Fare', 'Cabin',
        'Embarked', 'Title'
        , 'group_size'
        ,'FamilySize','Family_Survival'
        # , 'Null_Cabin'
        # , 'Null_Age'
        # ,'Zero_Fare'
        ]
 
    train_test_data = [train,test]
    for dataset in train_test_data:
        scaler = MinMaxScaler()
        # scaler = StandardScaler()

        scaler.fit(dataset[columns])
        scaled = scaler.transform(dataset[columns])

        df_scaled = pd.DataFrame(data=scaled, columns=columns)
        dataset.loc[:,columns] = df_scaled

    return (train,test)

In [82]:
# 전처리 전체 실행

def data_cleaning():
    train,test = load_traintest_dt()

    #name
    train,test = Name_cleaning(train,test)
    #sex
    train,test = sex_cleaning(train,test)
    #haveage
    train,test = haveage_cleaning(train,test)
    #age
    train,test = age_cleaning(train,test)
    #embarked
    train,test = embarked_cleaning(train,test)
    #fare
    train,test = fare_cleaning(train,test)
    #groupsize
    train,test = groupsize_cleanig(train,test)
    #havecabin
    train,test = havecabin_cleaning(train,test)
    #cabin
    train,test = cabin_cleaning(train,test)
    #familysize
    train,test = familysize_cleaning(train,test)
    
    #scaler
    train,test = data_scaler(train,test)

    #drop
    train,test = drop_col(train,test,['Ticket','SibSp','Parch','Name'])

    #split

    train_data, target = split_data(train)

    return (train_data,target,test)

In [83]:
# train,test = load_traintest_dt()

# #name
# train,test = Name_cleaning(train,test)
# #sex
# train,test = sex_cleaning(train,test)
# #haveage
# train,test = haveage_cleaning(train,test)
# #age
# train,test = age_cleaning(train,test)
# #embarked
# train,test = embarked_cleaning(train,test)
# #fare
# train,test = fare_cleaning(train,test)
# #groupsize
# train,test = groupsize_cleanig(train,test)
# #havecabin
# train,test = havecabin_cleaning(train,test)
# #cabin
# train,test = cabin_cleaning(train,test)
# #familysize
# train,test = familysize_cleaning(train,test)

In [84]:
train,test = load_traintest_dt()
data_df = pd.concat([train,test])
#name
train,test = Name_cleaning(train,test)
#sex
train,test = sex_cleaning(train,test)
#groupsize
train,test = groupsize_cleanig(train,test)
#haveage
train,test = haveage_cleaning(train,test)
#age
train,test = age_cleaning(train,test)
#embarked
train,test = embarked_cleaning(train,test)
#fare
train,test = fare_cleaning(train,test)
#havecabin
train,test = havecabin_cleaning(train,test)
#cabin
train,test = cabin_cleaning(train,test)
#familysize
train,test = familysize_cleaning(train,test)

In [85]:


data_df['Lastname'] = data_df['Name'].apply(lambda x: str.split(x, ",")[0])
# data_df['Fare'].fillna(data_df['Fare'].mean(), inplace=True)

DEFAULT_SURVIVAL_VALUE = 0.5
data_df['Family_Survival'] = DEFAULT_SURVIVAL_VALUE

for grp, grp_df in data_df[['Survived','Name', 'Lastname', 'Fare', 'Ticket', 'PassengerId',
                           'SibSp', 'Parch', 'Age', 'Cabin']].groupby(['Lastname', 'Fare']):
    
    if (len(grp_df) != 1):
        # A Family group is found.
        for ind, row in grp_df.iterrows():
            smax = grp_df.drop(ind)['Survived'].max()
            smin = grp_df.drop(ind)['Survived'].min()
            passID = row['PassengerId']
            if (smax == 1.0):
                data_df.loc[data_df['PassengerId'] == passID, 'Family_Survival'] = 1
            elif (smin==0.0):
                data_df.loc[data_df['PassengerId'] == passID, 'Family_Survival'] = 0

print("Number of passengers with family survival information:", 
      data_df.loc[data_df['Family_Survival']!=0.5].shape[0])

Number of passengers with family survival information: 420


In [86]:
for _, grp_df in data_df.groupby('Ticket'):
    if (len(grp_df) != 1):
        for ind, row in grp_df.iterrows():
            if (row['Family_Survival'] == 0) | (row['Family_Survival']== 0.5):
                smax = grp_df.drop(ind)['Survived'].max()
                smin = grp_df.drop(ind)['Survived'].min()
                passID = row['PassengerId']
                if (smax == 1.0):
                    data_df.loc[data_df['PassengerId'] == passID, 'Family_Survival'] = 1
                elif (smin==0.0):
                    data_df.loc[data_df['PassengerId'] == passID, 'Family_Survival'] = 0
                        
print("Number of passenger with family/group survival information: " 
      +str(data_df[data_df['Family_Survival']!=0.5].shape[0]))

# # Family_Survival in TRAIN_DF and TEST_DF:
train['Family_Survival'] = data_df['Family_Survival'][:891]
test['Family_Survival'] = data_df['Family_Survival'][891:]

Number of passenger with family/group survival information: 546


In [87]:
#drop
train,test = drop_col(train,test,['Ticket','SibSp','Parch','Name'
                                  
                                  ])

In [88]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Cabin,Embarked,Title,group_size,Null_Age,Zero_Fare,Null_Cabin,FamilySize,Family_Survival
0,1,0,3,0,1.0,0.4,2.0,0,0,0.0,1.0,1.0,0.0,0.4,0.5
1,2,1,1,1,3.0,1.6,0.8,1,2,0.0,1.0,1.0,1.0,0.4,0.5
2,3,1,3,1,2.0,0.4,2.0,0,1,0.0,1.0,1.0,0.0,0.0,0.5
3,4,1,1,1,3.0,1.2,0.8,0,2,0.4,1.0,1.0,1.0,0.4,0.0
4,5,0,3,0,3.0,0.4,2.0,0,0,0.0,1.0,1.0,0.0,0.0,0.5


In [89]:
train = train[['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Cabin',
       'Embarked', 'Title', 'group_size',
       'FamilySize', 'Family_Survival']]
test = test[['PassengerId', 'Pclass', 'Sex', 'Age', 'Fare', 'Cabin', 'Embarked',
       'Title', 'group_size', 'FamilySize',
       'Family_Survival']]

In [90]:
# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.linear_model import Ridge
# from sklearn.linear_model import Lasso, ElasticNet

# # alpha값에 따른 회귀 모델의 폴드 평균 RMSE를 출력하고 회귀 계수값들을 DataFrame으로 반환 
# def get_linear_reg_eval(model_name, params=None, X_data_n=None, y_target_n=None, 
#                         verbose=True, return_coeff=True):
#     coeff_df = pd.DataFrame()
#     if verbose : print('####### ', model_name , '#######')
#     for param in params:
#         if model_name =='Ridge': model = Ridge(alpha=param)
#         elif model_name =='Lasso': model = Lasso(alpha=param)
#         elif model_name =='ElasticNet': model = ElasticNet(alpha=param, l1_ratio=0.7)
#         neg_mse_scores = cross_val_score(model, X_data_n, 
#                                              y_target_n, scoring="neg_mean_squared_error", cv = 5)
#         avg_rmse = np.mean(np.sqrt(-1 * neg_mse_scores))
#         print('alpha {0}일 때 5 폴드 세트의 평균 RMSE: {1:.3f} '.format(param, avg_rmse))
#         # cross_val_score는 evaluation metric만 반환하므로 모델을 다시 학습하여 회귀 계수 추출
        
#         model.fit(X_data_n , y_target_n)
#         if return_coeff:
#             # alpha에 따른 피처별 회귀 계수를 Series로 변환하고 이를 DataFrame의 컬럼으로 추가. 
#             coeff = pd.Series(data=model.coef_ , index=X_data_n.columns )
#             colname='alpha:'+str(param)
#             coeff_df[colname] = coeff
    
#     return coeff_df
# # end of get_linear_regre_eval

In [91]:
# def get_scaled_data(method='None', p_degree=None, input_data=None):
#     if method=='Standard':
#         scaled_data = StandardScaler().fit_transform(input_data)
#     elif method =='MinMax':
#         scaled_data = MinMaxScaler().fit_transform(input_data)
#     elif method == 'Log':
#         scaled_data = np.log1p(input_data)
#     else:
#         scaled_data = input_data

#     if p_degree != None:
#         scaled_data = PolynomialFeatures(degree=p_degree
#                                          ,include_bias=False
#                                          ).fit_transform(scaled_data)

#     return scaled_data

In [92]:
# alphas = [0.1,1,10,100]

# scale_methods = [(None,None),('Standard',None),('Standard',2),
#                  ('MinMax',None),('MinMax',2),('Log',None)]

# for scale_method in scale_methods:
#     X_data_scaled = get_scaled_data(method=scale_methods[0]
#                                     ,p_degree=scale_method[1]
#                                     ,input_data=train_data)
#     print('\n ## 변환유형:{0}, Poynomial Degree:{1}'.format(scale_method[0]
#                                                         ,scale_method[1]))
#     get_linear_reg_eval('Ridge', params=alphas, X_data_n=X_data_scaled
#                         , y_target_n=target, verbose=False, return_coeff=False)

In [93]:
train,test = data_scaler(train,test)

In [94]:
train_data, target = split_data(train)

In [95]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Cabin,Embarked,Title,group_size,FamilySize,Family_Survival
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,0.654321,0.352413,0.390348,0.365657,0.59628,0.180696,0.185185,0.205387,0.09046,0.519641
std,257.353842,0.486592,0.418036,0.47799,0.248585,0.246388,0.192977,0.317837,0.263019,0.299548,0.161346,0.323961
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,223.5,0.0,0.5,0.0,0.2,0.2,0.571429,0.0,0.0,0.0,0.0,0.5
50%,446.0,0.0,1.0,0.0,0.4,0.4,0.714286,0.0,0.0,0.0,0.0,0.5
75%,668.5,1.0,1.0,1.0,0.6,0.6,0.714286,0.5,0.25,0.333333,0.1,0.5
max,891.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [96]:
submission_a = load_submission_a()


In [97]:
clf = RandomForestClassifier(
                            max_depth=5
                              ,n_estimators=57
                             ,min_samples_leaf=45
                             , random_state=228
                             ,n_jobs=-1
                             )
clf.fit(train_data, target)

prediction = clf.predict(test)

accuracy = accuracy_score(prediction, submission_a)

accuracy

0.8373205741626795

In [98]:
submission = pd.DataFrame({

    'PassengerId':test['PassengerId']
    ,'Survived':prediction
    
})
submission.to_csv('submission.csv',index=False)



(0, 0.8157894736842105)