In [None]:
import os
import sys
import warnings

# Silence every warning unless warning options were passed to the interpreter
# (in that case sys.warnoptions is non-empty and the user's choice is kept).
# NOTE(review): this blanket filter also hides library warnings about invalid
# or ignored estimator parameters - consider narrowing it.
if not sys.warnoptions:
    warnings.simplefilter("ignore")

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [None]:
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
# Directory holding the competition files. It was repeated in every path;
# keeping it in one constant makes the notebook runnable elsewhere by editing
# a single line.
DATA_DIR = '/kaggle/input/tabular-playground-series-apr-2021'

# Raw inputs: labelled training rows, unlabelled test rows and the sample
# submission file.
Train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
Test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Preview the first rows of the raw training frame.
Train_df.head()

In [None]:
# Preview the first rows of the raw test frame.
Test_df.head()

In [None]:
# Preview the sample submission to see the expected output columns.
sub.head()

In [None]:
# (rows, columns) of the raw training frame.
Train_df.shape

In [None]:
# Number of missing values per column in the raw training frame.
Train_df.isna().sum()

In [None]:
# Columns removed from both the training and the test frame before modelling.
not_need_columns = ['PassengerId', 'Name', 'Cabin']

In [None]:
# Working copies without the unused columns. `drop` returns a new frame, so
# the raw `Train_df` / `Test_df` are left as loaded.
train_df, test_df = (
    raw_frame.drop(columns=not_need_columns)
    for raw_frame in (Train_df, Test_df)
)

In [None]:
# Column dtypes and non-null counts of the working training frame.
train_df.info()

In [None]:
# Encode the categorical columns as integers.
#
# One encoder per column is fitted on the labels of train and test together,
# so a given label always maps to the same integer in both frames. Refitting
# the encoder separately on each frame can assign different codes whenever the
# two frames do not contain exactly the same set of values. Missing values are
# turned into an explicit 'Missing' category instead of relying on how the
# encoder orders NaN.
for column in ['Sex', 'Embarked']:
    train_labels = train_df[column].fillna('Missing')
    test_labels = test_df[column].fillna('Missing')
    le = LabelEncoder().fit(list(train_labels) + list(test_labels))
    train_df[column] = le.transform(train_labels)
    test_df[column] = le.transform(test_labels)

# Impute Age with the *training* mean in both frames, so the test set is
# filled with the same statistic the models are trained with.
age_mean = train_df['Age'].mean()
train_df['Age'] = train_df['Age'].fillna(value=age_mean)
test_df['Age'] = test_df['Age'].fillna(value=age_mean)

In [None]:
# Column dtypes and non-null counts of the working test frame.
test_df.info()

In [None]:
# Value stored for tickets that are missing or have no numeric part.
TICKET_MISSING = 9999999999


def _ticket_number(raw):
    """Return the numeric part of a raw ticket value.

    The last whitespace-separated token of the ticket is used. Missing
    tickets, blank strings and tokens that are not purely digits map to
    ``TICKET_MISSING``. Values that are already integers are returned
    unchanged, so the cell can be re-run after the column has been converted.
    """
    if isinstance(raw, (int, np.integer)):
        return int(raw)
    if not isinstance(raw, str):
        # NaN / None: the ticket is missing.
        return TICKET_MISSING
    tokens = raw.split()
    if tokens and tokens[-1].isdigit():
        return int(tokens[-1])
    return TICKET_MISSING


# The same parser is applied to both frames (previously copy-pasted twice).
train_df['Ticket'] = [_ticket_number(raw) for raw in train_df['Ticket']]
test_df['Ticket'] = [_ticket_number(raw) for raw in test_df['Ticket']]

# Impute Fare with the training mean in both frames so that train and test
# are filled with the same statistic.
fare_mean = train_df['Fare'].mean()
train_df['Fare'] = train_df['Fare'].fillna(value=fare_mean)
test_df['Fare'] = test_df['Fare'].fillna(value=fare_mean)

In [None]:
# Re-check dtypes and non-null counts of the training frame after the
# Ticket / Fare processing.
train_df.info()

In [None]:
# Fare has already been imputed in the Ticket/Fare cell, so filling it a
# second time here was a no-op. Check the invariant instead, so a regression
# in the preprocessing is caught before any model is fitted.
assert not train_df['Fare'].isna().any(), "train_df['Fare'] still has missing values"
assert not test_df['Fare'].isna().any(), "test_df['Fare'] still has missing values"

# Modeling

In [None]:
# Separate the target from the features. The target is kept as a 1-D Series:
# scikit-learn classifiers expect `y` of shape (n_samples,), and a
# single-column DataFrame makes estimators such as RandomForestClassifier emit
# a DataConversionWarning (hidden here by the global warnings filter).
Y_train = train_df['Survived']
X_train = train_df.drop(columns=['Survived'])

# Hold out 33% of the labelled rows for validation; the seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train,
                                                    test_size=0.33,
                                                    random_state=42)

### 2.1 RandomForestClassifier, GridSearchCV

In [None]:
# Pipeline: keep the 5 highest-scoring features, then fit a random forest.
selector = SelectKBest(k=5)
# The seed is fixed on the estimator instead of being searched over: choosing
# the "best" random_state only fits the noise of the CV folds and multiplied
# the number of candidates by four.
rfc = RandomForestClassifier(random_state=42)
pipe = Pipeline(steps=[('selector', selector), ('rfc', rfc)])

# Hyper-parameter grid; keys use the `<step>__<param>` pipeline convention.
parameters = {'rfc__n_estimators': [13, 25, 50, 102, 124],
              'rfc__max_depth': [5, 7, 18, 47, 100],
              # min_samples_split must be an int >= 2 (or a float in (0, 1]);
              # the former value 1 is rejected by RandomForestClassifier, so
              # every candidate using it failed to fit.
              'rfc__min_samples_split': [2, 3]}
g_search = GridSearchCV(pipe, parameters, n_jobs=-1)

In [None]:
# Run the search on the training split, then keep the refitted best pipeline.
g_search.fit(x_train, y_train)
g_fit = g_search  # `fit` returns the search object itself
best_clf = g_search.best_estimator_
print(best_clf)

In [None]:
# Validation accuracy of the best grid-search pipeline.
best_predictions = best_clf.predict(x_test)
# scikit-learn metrics take (y_true, y_pred); accuracy is symmetric so the
# value is unchanged, but the conventional order keeps this safe if the metric
# is ever swapped for an asymmetric one (precision, recall, ...).
accuracy_score(y_test, best_predictions)

In [None]:
# Predictions of the best grid-search pipeline on the competition test frame;
# these are the values written to the submission file.
y_pred_best_clf = best_clf.predict(test_df)

### 2.3 Stacking models

In [None]:
# Base learners for the stacking ensemble, as (name, estimator) pairs.
# Each pair is closed on its own line: previously the 'lgbm' tuple was left
# open, so the 'xgb' pair ended up nested inside it as a third element and the
# list held two entries instead of three (name, estimator) pairs.
estimators = [
    ('rfc', RandomForestClassifier(max_depth=70,
                                   n_estimators=58,
                                   min_samples_split=2,
                                   n_jobs=-1,
                                   random_state=3)),
    ('lgbm', LGBMClassifier(max_depth=123,
                            n_estimators=95,
                            n_jobs=-1,
                            num_leaves=65,
                            random_state=95)),
    ('xgb', XGBClassifier(max_depth=150,
                          n_estimators=95,
                          random_state=45,
                          n_jobs=-1)),
]

In [None]:
# Stacking ensemble: the base learners' predictions are combined by a
# regularised logistic regression.
# `n_jobs` is not passed to the meta-learner: LogisticRegression ignores it
# when solver='liblinear', so it only produced a (silenced) warning.
stack_clfs = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(C=0.09,
                                       max_iter=150,
                                       random_state=65,
                                       solver='liblinear'),
)

In [None]:
# Fit the stacking ensemble on the training split.
# NOTE(review): the fitted ensemble is not evaluated or used for prediction in
# the cells shown here.
stack_clfs.fit(x_train, y_train)

### 2.4 Pipeline, StandardScaler

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
# Keyword arguments for the LightGBM classifier used in the scaled pipeline.
params = dict(
    n_estimators=500,
    objective='binary',
    max_depth=250,
    num_leaves=180,
    random_state=42,
    metric='auc',
    n_jobs=-1,
)

lgbm = LGBMClassifier(**params)

In [None]:
# Standardise the features, then hand them to the LightGBM classifier.
scaling_step = ('scaler', StandardScaler())
model_step = ('lgbm', lgbm)
pipeline = Pipeline(steps=[scaling_step, model_step])

In [None]:
# Fit the scaler + LightGBM pipeline on the training split.
pipeline.fit(x_train, y_train)

In [None]:
# Validation accuracy of the scaler + LightGBM pipeline.
y_pred = pipeline.predict(x_test)
# scikit-learn metrics take (y_true, y_pred); the value is the same for
# accuracy, but the conventional order avoids mistakes with other metrics.
accuracy_score(y_test, y_pred)

In [None]:
# Predict through the whole pipeline, not the bare `lgbm` step: the model was
# fitted on StandardScaler output, so feeding it the unscaled `test_df`
# directly would apply its split thresholds to features on the wrong scale.
y_pred = pipeline.predict(test_df)

In [None]:
# Assemble the submission: passenger ids from the raw test frame plus the
# predictions of the grid-searched random-forest pipeline.
# NOTE(review): the LightGBM predictions in `y_pred` are not used here -
# confirm that submitting `y_pred_best_clf` is intended.
submission = pd.DataFrame({"PassengerId": Test_df["PassengerId"]})
submission["Survived"] = y_pred_best_clf

In [None]:
# Write the submission to the working directory without the index column.
submission.to_csv('submission.csv', index=False)

# Thanks for reading. Don't forget to upvote the work. Good luck kaggling!