In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import  StratifiedKFold
from sklearn import metrics

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:

# Explicit dtypes handed to every read_csv call below: small integer columns
# are stored as int8 and the text columns are read as Python strings.
ctypes = {
    'Survived': np.int8,
    'Pclass': np.int8,
    # The builtin ``str`` is used instead of the ``np.str`` alias: NumPy
    # deprecated that alias in 1.20 and removed it in 1.24, where accessing
    # it raises AttributeError. ``np.str`` was the builtin ``str`` itself, so
    # the parsed frames are unchanged.
    'Name': str,
    'Embarked': str,
    'SibSp': np.int8,
    'Parch': np.int8,
}

# All three frames are indexed by PassengerId.
train = pd.read_csv('../input/tabular-playground-series-apr-2021/train.csv', dtype=ctypes, index_col='PassengerId')
test = pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv', dtype=ctypes, index_col='PassengerId')
submission = pd.read_csv('../input/tabular-playground-series-apr-2021/sample_submission.csv', dtype=ctypes, index_col='PassengerId')


In [None]:
def _add_features(df):
    """Return a copy of ``df`` with missing values filled and derived columns added.

    Every statistic (mean Age, mean Fare, name counts) is computed from ``df``
    itself, so the train and test frames are each processed with their own
    values.

    Steps, in order:
      * ``Embarked`` NaN -> 'No'; ``Cabin`` NaN -> '_'.
      * ``Ticket`` -> its first whitespace-separated token when the ticket has
        more than one token, otherwise 'X'.
      * ``CabinType`` = first character of ``Cabin``.
      * ``Age`` NaN -> rounded mean Age; then rounded and stored as int8.
      * ``Fare`` NaN -> rounded mean Fare.
      * ``FirstName`` / ``SecondName`` = the parts of ``Name`` before / after
        the first ', ' separator.
      * ``n`` = 1 (helper column kept so the frame keeps its original schema).
      * ``SameFirstName`` = number of rows sharing the same ``FirstName``,
        replaced by -1 when that count exceeds 10.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns Embarked, Cabin, Ticket, Age, Fare
        and Name.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    df = df.copy()

    df['Embarked'] = df['Embarked'].fillna('No')
    df['Cabin'] = df['Cabin'].fillna('_')

    def _ticket_prefix(raw):
        # Split once instead of twice; single-token tickets collapse to 'X'.
        tokens = str(raw).split()
        return tokens[0] if len(tokens) > 1 else 'X'

    df['Ticket'] = df['Ticket'].map(_ticket_prefix)
    df['CabinType'] = df['Cabin'].str[0]

    # Results are assigned back to the column explicitly. The previous
    # ``df['Age'].fillna(..., inplace=True)`` form mutates a temporary column
    # selection: it emits a FutureWarning on pandas 2.x and has no effect
    # under Copy-on-Write, which would leave NaN in place and make the
    # integer conversion below fail.
    df['Age'] = df['Age'].fillna(round(df['Age'].mean())).round().astype(np.int8)
    df['Fare'] = df['Fare'].fillna(round(df['Fare'].mean()))

    name_parts = df['Name'].map(lambda full_name: full_name.split(', '))
    df['FirstName'] = name_parts.map(lambda parts: parts[0])
    df['SecondName'] = name_parts.map(lambda parts: parts[1])

    df['n'] = 1
    same_name = df.groupby('FirstName')['n'].transform('sum')
    # Counts above 10 are bucketed into the single sentinel value -1.
    df['SameFirstName'] = same_name.where(same_name <= 10, -1)

    return df


train = _add_features(train)
test = _add_features(test)

# Separate models are trained per sex, so split the training rows here.
train_female = train[train.Sex == 'female']
train_male = train[train.Sex == 'male']

In [None]:

# Features used by the female-passenger models; ``cat_features`` is the subset
# handed to CatBoost as categorical.
columns = ['Pclass', 'Age', 'Embarked', 'Parch', 'SibSp', 'Fare', 'CabinType', 'Ticket', 'SameFirstName']
cat_features = ['Pclass', 'Embarked', 'CabinType', 'Ticket']

models_f = []  # one fitted model per fold
num_folds = 9
folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=2021)  # create folds
X_train = train_female[columns]
y_train = train_female['Survived']
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_train, y_train)):
    train_X, train_y = X_train.iloc[train_idx], y_train.iloc[train_idx]
    valid_X, valid_y = X_train.iloc[valid_idx], y_train.iloc[valid_idx]
    dataset = Pool(train_X, label=train_y, cat_features=cat_features)
    evalset = Pool(valid_X, label=valid_y, cat_features=cat_features)
    model_female = CatBoostClassifier(
        task_type="GPU",
        depth=6,
        max_ctr_complexity=5,
        # iterations is only an upper bound: the overfitting detector
        # (od_type='Iter') stops training once the eval-set metric has not
        # improved for od_wait iterations.
        iterations=50000,
        od_wait=500, od_type='Iter',
        learning_rate=0.04,
        min_data_in_leaf=3,
    )
    model_female.fit(dataset, plot=False, verbose=500, eval_set=evalset)
    models_f.append(model_female)
    # Score the held-out fold only. Scoring all of train_female, as before,
    # mixes in the rows this model was just fitted on and overstates accuracy.
    y_pred_female = model_female.predict(valid_X)
    print('fold {} validation accuracy: {}'.format(n_fold, metrics.accuracy_score(valid_y, y_pred_female)))
    

In [None]:

# Features used by the male-passenger models (no SameFirstName here);
# ``cat_features`` is the subset handed to CatBoost as categorical.
columns = ['Pclass', 'Age', 'Embarked', 'Parch', 'SibSp', 'Fare', 'CabinType', 'Ticket']
cat_features = ['Pclass', 'Embarked', 'CabinType', 'Ticket']

models_m = []  # one fitted model per fold
num_folds = 9
folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=2021)  # create folds
X_train = train_male[columns]
y_train = train_male['Survived']
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_train, y_train)):
    train_X, train_y = X_train.iloc[train_idx], y_train.iloc[train_idx]
    valid_X, valid_y = X_train.iloc[valid_idx], y_train.iloc[valid_idx]
    dataset = Pool(train_X, label=train_y, cat_features=cat_features)
    evalset = Pool(valid_X, label=valid_y, cat_features=cat_features)
    model_male = CatBoostClassifier(
        task_type="GPU",
        depth=6,
        max_ctr_complexity=15,
        # iterations is only an upper bound: the overfitting detector
        # (od_type='Iter') stops training once the eval-set metric has not
        # improved for od_wait iterations.
        iterations=50000,
        od_wait=400, od_type='Iter',
        learning_rate=0.04,
        min_data_in_leaf=3,
    )
    model_male.fit(dataset, plot=False, verbose=500, eval_set=evalset)
    models_m.append(model_male)
    # Score the held-out fold only. Scoring all of train_male, as before,
    # mixes in the rows this model was just fitted on and overstates accuracy.
    y_pred_male = model_male.predict(valid_X)
    print('fold {} validation accuracy: {}'.format(n_fold, metrics.accuracy_score(valid_y, y_pred_male)))
    


In [None]:
%%time
# Run every fold model over the whole test frame and keep each model's
# predictions in its own column: 'fm_<k>' for the female ensemble and
# 'm_<k>' for the male ensemble. The column names are collected in
# m_columns_f / m_columns_m.

# Female ensemble: feature list includes SameFirstName.
columns = ['Pclass', 'Age', 'Embarked', 'Parch', 'SibSp', 'Fare', 'CabinType', 'Ticket', 'SameFirstName']
m_columns_f = []
for fold_no, fold_model in enumerate(models_f):
    pred_col = 'fm_{}'.format(fold_no)
    m_columns_f.append(pred_col)
    fold_pred = fold_model.predict(test[columns])
    test[pred_col] = fold_pred
    print(pred_col, end=' ')
print()

# Male ensemble: same features without SameFirstName.
m_columns_m = []
columns = ['Pclass', 'Age', 'Embarked', 'Parch', 'SibSp', 'Fare', 'CabinType', 'Ticket']
for fold_no, fold_model in enumerate(models_m):
    pred_col = 'm_{}'.format(fold_no)
    m_columns_m.append(pred_col)
    fold_pred = fold_model.predict(test[columns])
    test[pred_col] = fold_pred
    print(pred_col, end=' ')

In [None]:
def vote(r, columns):
    """Majority vote over the entries of ``r`` named by ``columns``.

    Parameters
    ----------
    r : mapping-like
        Anything indexable by the names in ``columns`` (e.g. a DataFrame row).
    columns : iterable
        Keys of the individual votes to tally.

    Returns
    -------
    int
        1 when strictly more entries differ from 0 than equal 0, otherwise 0
        (so a tie, or an empty ``columns``, yields 0).
    """
    is_zero = [bool(r[name] == 0) for name in columns]
    zeros = sum(is_zero)
    ones = len(is_zero) - zeros
    return 1 if ones > zeros else 0

# Reduce each ensemble's per-fold prediction columns to a single
# majority-vote label per test row.
test['model_female'] = test.apply(vote, axis=1, args=(m_columns_f,))
test['model_male'] = test.apply(vote, axis=1, args=(m_columns_m,))

In [None]:
def _s(r):
    if r.Sex=='male':
        return r.model_male
    else:
        return r.model_female
    
# Choose, row by row, the male or female ensemble vote according to Sex and
# store it as the submitted label.
survived_by_sex = test.apply(_s, axis=1)
submission['Survived'] = survived_by_sex

In [None]:
# LB 80588
# NOTE(review): presumably the leaderboard score obtained with this
# submission — confirm.
# Write the submission frame, index included, to result.csv in the current
# working directory.
submission.to_csv(path_or_buf='result.csv', index=True)

In [None]:
# Display the mean of the predicted labels next to the mean of the training
# labels as a quick sanity check.
(submission.Survived.mean(), train.Survived.mean())