In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file under the Kaggle input directory,
# then print them one per line in the order os.walk yields them.
input_file_paths = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## <p style="background-color:magenta; font-family:newtimeroman; font-size:140%; text-align:center; border-radius: 15px 50px;">Import Required Libraries</p>

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides deprecation and
# convergence warnings that may matter — confirm that is intended.
warnings.filterwarnings('ignore')

# Plot styling: matplotlib style sheet first, then the seaborn style.
# NOTE(review): the seaborn call runs second, so it presumably overrides any
# overlapping settings from the style sheet — confirm the order is deliberate.
plt.style.use("fivethirtyeight")
sns.set_style("darkgrid")

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold

## <p style="background-color:magenta; font-family:newtimeroman; font-size:140%; text-align:center; border-radius: 15px 50px;">Load Data</p>

In [None]:
# Directory holding the competition CSV files. Defined once so the data
# location can be changed in a single place instead of three.
DATA_DIR = "/kaggle/input/tabular-playground-series-apr-2021"

# train: rows used for fitting (contains the 'Survived' column used later as target)
# test:  rows to predict
# sub:   sample submission file, later filled with the predictions
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
sub = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

## First 5 rows
- Print the first 5 rows of the train dataset

In [None]:
# Preview of the training data: first five rows.
train.head(n=5)

- Print the first 5 rows of the test dataset

In [None]:
# Preview of the test data: first five rows.
test.head(n=5)

- Print the first 5 rows of the submission dataset

In [None]:
# Preview of the sample submission: first five rows.
sub.head(n=5)

### Number of rows and columns

In [None]:
# (rows, columns) of each frame, looked up first and then reported.
train_shape, test_shape = train.shape, test.shape
print('Rows and Columns in train dataset:', train_shape)
print('Rows and Columns in test dataset:', test_shape)

## <p style="background-color:magenta; font-family:newtimeroman; font-size:140%; text-align:center; border-radius: 15px 50px;">EDA (Exploratory Data Analysis)</p>

### a) Missing Values

In [None]:
print('Missing values per columns in train dataset')
# Null count for every column in one pass, reported in column order.
train_missing_counts = train.isnull().sum()
for col, missing in train_missing_counts.items():
    print(f'{col}: {missing}')

In [None]:
print('Missing values per columns in test dataset')
# Null count for every column in one pass, reported in column order.
test_missing_counts = test.isnull().sum()
for col, missing in test_missing_counts.items():
    print(f'{col}: {missing}')

## <p style="background-color:magenta; font-family:newtimeroman; font-size:140%; text-align:center; border-radius: 15px 50px;">Feature Engineering</p>

In [None]:
# Columns removed from both frames before modelling.
dropped_columns = ['PassengerId', 'Name', 'Cabin']
train = train.drop(columns=dropped_columns)
test = test.drop(columns=dropped_columns)

In [None]:
# Show the first two rows of each frame, train first, then test.
display(train.head(2), test.head(2))

In [None]:
# Fill Missing values in train and test
train['Age'] = train['Age'].fillna(train['Age'].mean())
test['Age'] = test['Age'].fillna(test['Age'].mean())

train['Fare'] = train['Fare'].fillna(train['Fare'].mean())
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

In [None]:
# Encode the categorical columns as integers.
#
# One encoder is fitted per column on the values of BOTH frames, so a given
# category always maps to the same integer in train and in test. Previously
# the encoder was re-fitted on each frame separately, which yields different
# codes whenever the two frames do not contain exactly the same categories.
#
# Missing entries get an explicit 'Missing' category first, so the result does
# not depend on how the installed scikit-learn version treats NaN.
encoders = {}  # column name -> fitted LabelEncoder, kept for inverse_transform
for col in ['Sex', 'Embarked']:
    train_values = train[col].fillna('Missing')
    test_values = test[col].fillna('Missing')

    le = LabelEncoder()
    le.fit(pd.concat([train_values, test_values]))

    train[col] = le.transform(train_values)
    test[col] = le.transform(test_values)
    encoders[col] = le

In [None]:
# Sentinel stored when a ticket is missing or has no numeric last token.
MISSING_TICKET_NUMBER = 9999999999


def extract_ticket_number(ticket):
    """Return the numeric part of a raw ticket value.

    The ticket string is split on whitespace and its last token is used
    (e.g. 'A/5 21171' -> 21171, '12345' -> 12345).

    Parameters
    ----------
    ticket : str, int or missing value
        Raw entry of the 'Ticket' column.

    Returns
    -------
    int
        The last token as an integer, or MISSING_TICKET_NUMBER when the entry
        is missing, blank, or its last token is not purely numeric.
    """
    # Already numeric (this cell was run before): keep the value unchanged so
    # the cell can be re-executed safely.
    if isinstance(ticket, (int, np.integer)):
        return int(ticket)
    # NaN / None / any other non-string entry counts as missing.
    if not isinstance(ticket, str):
        return MISSING_TICKET_NUMBER
    tokens = ticket.split()
    # A blank or whitespace-only ticket has no tokens at all.
    if not tokens:
        return MISSING_TICKET_NUMBER
    last_token = tokens[-1]
    # isdecimal() only accepts characters that int() can parse.
    return int(last_token) if last_token.isdecimal() else MISSING_TICKET_NUMBER


# Same transformation for both frames, replacing the raw ticket strings.
train['Ticket'] = train['Ticket'].map(extract_ticket_number)
test['Ticket'] = test['Ticket'].map(extract_ticket_number)

In [None]:
# Target as a one-column DataFrame; features are every remaining column.
target_columns = ['Survived']
y = train[target_columns]
X = train.drop(columns=target_columns)

In [None]:
# Hold out 20% of the rows, with a fixed seed so the split is repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

## <p style="background-color:magenta; font-family:newtimeroman; font-size:140%; text-align:center; border-radius: 15px 50px;">Auto-Sklearn</p>

In [None]:
!pip install auto-sklearn

In [None]:
import autosklearn

# Report which auto-sklearn version was installed.
autosklearn_version = autosklearn.__version__
print(autosklearn_version)

In [None]:
import autosklearn.classification

# Search settings collected in one place and passed as keyword arguments.
# NOTE(review): going by the parameter names these are an overall time budget
# and a per-run limit; units are presumably seconds — confirm in the docs.
automl_settings = {
    'time_left_for_this_task': 10*60,
    'per_run_time_limit': 60,
    'n_jobs': -1,
}
cls = autosklearn.classification.AutoSklearnClassifier(**automl_settings)
# Fit on the training part of the split only.
cls.fit(X_train, y_train)

In [None]:
# Predict on the preprocessed competition test frame.
predictions = cls.predict(X=test)

In [None]:
# Fetch the description of the final ensemble built by auto-sklearn, then print it.
ensemble_description = cls.show_models()
print(ensemble_description)

## <p style="background-color:magenta; font-family:newtimeroman; font-size:140%; text-align:center; border-radius: 15px 50px;">Submission</p>

In [None]:
# Put the predictions into the submission template and write it to the
# current working directory, without the index column.
submission_path = 'submission.csv'
sub['Survived'] = predictions
sub.to_csv(submission_path, index=False)