In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Process Data

In [None]:
def process_data(filename):
    """
    Preprocess data to add some new attributes
    (inspired by user superant)
    """
    cat_types={}
    dd=pd.read_csv(filename).set_index("PassengerId")
    
    # Break tickets to broader categories
    dd["Ticket2"]=dd["Ticket"].str[:2]
    # All passengers with nan cabin assumed to have NO cabin
    dd["Cabin"]=dd["Cabin"].fillna('No')
    
    # Most popular ticket type
    dd["Ticket2"]=dd["Ticket2"].fillna('PC')
    # Most popular embarked location
    dd["Embarked"]=dd["Embarked"].fillna('S')
    dd["Fare"]=dd["Fare"].fillna(dd['Fare'].median())
    
    # Break cabin to larger categories
    dd["Cabin2"]=dd["Cabin"].str[:2]
    # Extract numbers from tickets and fill nan with median
    dd["Ticket_nums"]=dd["Ticket"].str.extract("([0-9]{3,})").astype(float)
    dd["Ticket_nums"]=dd["Ticket_nums"].fillna(dd["Ticket_nums"].median())
    # Fill nan age with median
    dd["Age"]=dd["Age"].fillna(dd["Age"].median())
    # Convert sex to boolean
    dd["Sex"]=dd["Sex"].apply(lambda x: x=='male').astype(object)
    # Set categorical attributes as category type
    for cat_col in ["Sex", "Embarked", "Ticket2", "Cabin2"]:
        if cat_col not in cat_types:
            dd[cat_col]=dd[cat_col].astype("category")
            cat_types[cat_col]=dd[cat_col].cat.categories
    
    return dd

In [None]:
# Run the same preprocessing over the train and test CSVs
X, X_test = map(
    process_data,
    (
        "/kaggle/input/tabular-playground-series-apr-2021/train.csv",
        "/kaggle/input/tabular-playground-series-apr-2021/test.csv",
    ),
)
X

In [None]:
# Attributes the models are trained on
feats = [
    "Pclass", "Sex", "SibSp", "Parch", "Embarked",
    "Fare", "Ticket2", "Cabin2", "Ticket_nums", "Age",
]

# Survived is the target; read it before X is narrowed to the feature columns
y = X["Survived"]

X, X_test = X[feats], X_test[feats]

In [None]:
# Remember the test passenger ids for the submission file
p_id = X_test.index

# One-hot encode every categorical column to help the learning algorithms
X, X_test = (pd.get_dummies(frame) for frame in (X, X_test))
X

In [None]:
# Keep only the attributes present in both datasets. Each split was one-hot
# encoded on its own, so a dummy column can exist in just one of them.
# Selecting with a single ordered list guarantees both frames end up with
# identical columns in identical order, which matters once they are
# matched by position rather than by name.
shared_cols = [col for col in X.columns if col in X_test.columns]
X = X[shared_cols]
X_test = X_test[shared_cols]

X.shape, X_test.shape

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean/std from the training data only, then apply
# that same transformation to the test data. Re-fitting on the test set
# would standardise it with different statistics, so equal raw values would
# map to different scaled values in train and test.
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

# Train Models and Ensemble

In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score

# Gradient-boosted trees (LightGBM), fitted on the full training set
lgbmc_clf = LGBMClassifier(
    learning_rate=0.05,
    num_leaves=25,
    n_estimators=350,
)
lgbmc_clf.fit(X, y)

# Mean score over 3 cross-validation folds
lgbmc_cv_scores = cross_val_score(lgbmc_clf, X, y, cv=3)
lgbmc_cv_scores.mean()

In [None]:
import xgboost

# XGBoost classifier evaluated with log-loss, fitted on the full training set
xgb_clf = xgboost.XGBClassifier(eval_metric='logloss')
xgb_clf.fit(X, y)

# Mean score over 3 cross-validation folds
xgb_cv_scores = cross_val_score(xgb_clf, X, y, cv=3)
xgb_cv_scores.mean()

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# AdaBoost over depth-2 decision trees. `algorithm` is deliberately left at
# the library default: the explicit "SAMME.R" value is deprecated in recent
# scikit-learn releases and rejected by the newest ones, while older
# releases already use it as their default.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    n_estimators=250,
    learning_rate=0.05,
)
ada_clf.fit(X, y)

# Mean score over 3 cross-validation folds
cross_val_score(ada_clf, X, y, cv=3).mean()

In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of the three boosted models defined above
ensemble_members = [
    ('lgbmc', lgbmc_clf),
    ('ada', ada_clf),
    ('boost', xgb_clf),
]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')

# Per-fold scores of the ensemble (3 folds, using all available cores)
cross_val_score(voting_clf, X, y, cv=3, n_jobs=-1)

# Make predictions

In [None]:
# Fit the ensemble on every training row, then predict the test rows
y_pred = voting_clf.fit(X, y).predict(X_test)

# One row per test passenger, written without the DataFrame index
submission = pd.DataFrame({'PassengerId': p_id, 'Survived': y_pred})
submission.to_csv('./ensemble_submission.csv', index=False)