In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
from catboost import CatBoostClassifier
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold


In [2]:
train_df = pd.read_csv('../input/tabular-playground-series-apr-2021/train.csv')
test_df = pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')

In [3]:
def label_encoder(c):
    lc = LabelEncoder()
def age_range(age):
    if age >= 0 and age <= 5:
        return "baby"
    elif age >= 6 and age <= 16:
        return "children"
    elif age >= 17 and age <= 29:
        return "young_adult"
    elif age >= 30 and age <=49:
        return "middle_aged"
    else:
        return "senior"
    return lc.fit_transform(c)
def preprocess(df):
    age_map = df[['Age', 'Pclass']].dropna().groupby('Pclass').mean().to_dict()
    df.Age = df.Age.fillna(df.Pclass.map(age_map['Age']))
    df.Cabin = df.Cabin.fillna('X').map(lambda x: x[0].strip())
    df.Ticket = df.Ticket.fillna('X').map(lambda x:str(x).split()[0] if len(str(x).split()) > 1 else 'X')
    df.Fare = df.Fare.fillna(df.Fare.mean())
    df.Fare = df.Fare.map(lambda i: np.log(i) if i > 0 else 0)
    df.Embarked = df.Embarked.fillna('X')
    df.Name = df.Name.map(lambda x: x.split(',')[0])
    df["age_range"] = df["Age"].apply(age_range)
    df["Family"] = df["SibSp"] + df["Parch"] +1   
    df["Alone"] =  df["Family"].map(lambda x : "Yes" if x == 1 else "No")
    label_cols = ['Name', 'Ticket',"age_range","Alone"]
    onehot_cols = ['Pclass', 'Sex', 'Cabin', 'Embarked']
    numerical_cols = ['SibSp', 'Parch', 'Fare','Survived',"Family"]  
    onehot_encoded_df = pd.get_dummies(df[onehot_cols])
    label_encoded_df = df[label_cols].apply(label_encoder)
    numerical_df = df[numerical_cols]
    return pd.concat([numerical_df,onehot_encoded_df], axis=1)

def test_preprocess(df):
    age_map = df[['Age', 'Pclass']].dropna().groupby('Pclass').mean().to_dict()
    df.Age = df.Age.fillna(df.Pclass.map(age_map['Age']))
    df.Cabin = df.Cabin.fillna('X').map(lambda x: x[0].strip())
    df.Ticket = df.Ticket.fillna('X').map(lambda x:str(x).split()[0] if len(str(x).split()) > 1 else 'X')
    df.Fare = df.Fare.fillna(df.Fare.mean())
    df.Fare = df.Fare.map(lambda i: np.log(i) if i > 0 else 0)
    df.Embarked = df.Embarked.fillna('X')
    df.Name = df.Name.map(lambda x: x.split(',')[0])
    df["age_range"] = df["Age"].apply(age_range)
    df["Family"] = df["SibSp"] + df["Parch"] +1   
    df["Alone"] =  df["Family"].map(lambda x : "Yes" if x == 1 else "No")
    label_cols = ['Name', 'Ticket',"age_range","Alone"]
    onehot_cols = ['Pclass', 'Sex', 'Cabin', 'Embarked']
    numerical_cols = ['SibSp', 'Parch', 'Fare',"Family"]  
    onehot_encoded_df = pd.get_dummies(df[onehot_cols])
    label_encoded_df = df[label_cols].apply(label_encoder)
    numerical_df = df[numerical_cols]
    return pd.concat([numerical_df,onehot_encoded_df], axis=1)

In [4]:
scaler = StandardScaler()
train = preprocess(df = train_df)
test = test_preprocess(df=test_df)
y = train.pop("Survived")
# x = train.values
x = train.values


In [5]:
#  params = {'learning_rate': 0.0811247967716156,
#   'min_data_in_leaf': 100,
#   'lambda_l1': 0.016409057851832026,
#   'lambda_l2': 1.507793728677616e-08,
#   'num_leaves': 143,
#   'max_depth': 6,
#   'feature_fraction': 0.5173793819319641,
#   'bagging_fraction': 0.9378987379069205,
#   'bagging_freq': 3,
#   'max_bin': 150}

In [6]:
params = {"task_type":"GPU",
          'loss_function':'Logloss',
          'eval_metric':'AUC', 
          'early_stopping_rounds': 500,
          'n_estimators': 10000}

In [7]:
# test_acc = CatBoostClassifier(**params)
# sk_fold = StratifiedKFold(10)
# print(cross_val_score(test_acc,x,y, n_jobs=-1, cv=sk_fold,scoring='accuracy').mean())
# print(cross_val_score(test_acc,x,y, n_jobs=-1, cv=sk_fold,scoring='f1').mean())

In [8]:
clf = CatBoostClassifier(**params)
clf.fit(x,y)

Learning rate set to 0.003325
0:	learn: 0.8387866	total: 21.4ms	remaining: 3m 33s
1:	learn: 0.8426511	total: 31.3ms	remaining: 2m 36s
2:	learn: 0.8427280	total: 41.4ms	remaining: 2m 17s
3:	learn: 0.8429595	total: 51.2ms	remaining: 2m 8s
4:	learn: 0.8429018	total: 61.2ms	remaining: 2m 2s
5:	learn: 0.8428829	total: 71ms	remaining: 1m 58s
6:	learn: 0.8429870	total: 80.9ms	remaining: 1m 55s
7:	learn: 0.8429856	total: 90.8ms	remaining: 1m 53s
8:	learn: 0.8434067	total: 101ms	remaining: 1m 52s
9:	learn: 0.8433922	total: 111ms	remaining: 1m 50s
10:	learn: 0.8434232	total: 121ms	remaining: 1m 50s
11:	learn: 0.8436966	total: 131ms	remaining: 1m 49s
12:	learn: 0.8437472	total: 141ms	remaining: 1m 48s
13:	learn: 0.8437909	total: 151ms	remaining: 1m 47s
14:	learn: 0.8438488	total: 161ms	remaining: 1m 47s
15:	learn: 0.8437689	total: 171ms	remaining: 1m 46s
16:	learn: 0.8437704	total: 181ms	remaining: 1m 46s
17:	learn: 0.8437991	total: 191ms	remaining: 1m 45s
18:	learn: 0.8439856	total: 201ms	remain

<catboost.core.CatBoostClassifier at 0x7f6c5d90cf10>

In [9]:
preds = clf.predict(test)

In [10]:
submit = pd.DataFrame({"PassengerId":test_df["PassengerId"],"Survived":preds})
submit.head(10)

Unnamed: 0,PassengerId,Survived
0,100000,0
1,100001,0
2,100002,1
3,100003,0
4,100004,1
5,100005,0
6,100006,1
7,100007,0
8,100008,0
9,100009,0


In [11]:
submit.to_csv("submission.csv",index = False)