In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# Setting matplotlib defaults
plt.style.use('seaborn-v0_8')
plt.rc('figure', figsize=(8, 5), dpi=145)
plt.rc('axes', labelweight='bold', labelsize='large',
       titleweight='bold', titlesize=15, titlepad=10)


In [3]:
train_df = pd.read_csv("/kaggle/input/titanic/train.csv")
test_df = pd.read_csv("/kaggle/input/titanic/test.csv")

In [None]:
train_df.head()

In [None]:
train_df.info()

In [None]:
print('Missing values on train data')
train_df.isnull().sum()

In [None]:
print('Missing values on test data')
test_df.isnull().sum()

In [None]:
train_df.describe()

In [None]:
train_df.nunique()

In [None]:
sns.countplot(x='Survived', data=train_df, palette='Spectral')
plt.title('Count Plot of Not Survived and Survived')

Feature Engineering

In [4]:
train_df['Family'] = train_df['SibSp'] + train_df['Parch'] + 1
train_df['Family_size'] = pd.cut(
    train_df['Family'],
    bins=[1, 2, 6, float('inf')],
    labels=['alone', 'small', 'large'],
    right=False
)

test_df['Family'] = test_df['SibSp'] + test_df['Parch'] + 1
test_df['Family_size'] = pd.cut(
    train_df['Family'],
    bins=[1, 2, 6, float('inf')],
    labels=['alone', 'small', 'large'],
    right=False
)


In [5]:
train_df['Title'] = train_df['Name'].str.extract(r',\s*([A-Za-z]+)\.', expand=False)
test_df['Title'] = test_df['Name'].str.extract(r',\s*([A-Za-z]+)\.', expand=False)
train_df['Title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', nan, 'Jonkheer'],
      dtype=object)

In [6]:
title_mapping = {
    'Mlle': 'Miss', 'Ms':'Miss', 'Mme': 'Mrs',
    'Lady': 'Royalty', 'Countess': 'Royalty', 'Don': 'Royalty', 
    'Dona': 'Royalty', 'Sir': 'Royalty', 'Jonkheer': 'Royalty',
    'Capt':'Officer', 'Col': 'Officer', 'Dr':'Officer',
    'Major': 'Officer', 'Rev': 'Officer'
}

train_df['Title'] = train_df['Title'].replace(title_mapping)
train_df['Title'] = train_df['Title'].fillna('median')

test_df['Title'] = test_df['Title'].replace(title_mapping)
test_df['Title'] = test_df['Title'].fillna('median')

In [7]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
train_df['Age'] = imputer.fit_transform(train_df[['Age']])

imputer = SimpleImputer(strategy='most_frequent')
train_df[['Cabin', 'Embarked']] = imputer.fit_transform(train_df[['Cabin', 'Embarked']])

imputer = SimpleImputer(strategy='mean')
test_df['Age'] = imputer.fit_transform(test_df[['Age']])

imputer = SimpleImputer(strategy='most_frequent')
test_df[['Cabin', 'Fare']] = imputer.fit_transform(test_df[['Cabin', 'Fare']])

In [8]:
labels = [0, 1, 2, 3, 4]
bins = [0, 12, 18, 35, 60, 100]
train_df['age_group'] = pd.cut(train_df['Age'], bins=bins, labels=labels, include_lowest=True) 
test_df['age_group'] = pd.cut(test_df['Age'], bins=bins, labels=labels, include_lowest=True) 

In [9]:
train_df['fare_bin'] = pd.qcut(train_df['Fare'], 4, labels=[0, 1, 2, 3])
test_df['fare_bin'] = pd.qcut(test_df['Fare'], 4, labels=[0, 1, 2, 3])

In [10]:
from sklearn.preprocessing import LabelEncoder

l_features = ['Sex', 'Embarked', 'Family_size', 'Ticket', 'Title', 'Cabin']
for col in l_features:
   le = LabelEncoder()
   train_df[col] = le.fit_transform(train_df[col])
   test_df[col] = le.fit_transform(test_df[col])

In [11]:
train_df['fare_pclass'] = train_df['Fare'] * train_df['Pclass']
test_df['fare_pclass'] = test_df['Fare'] * test_df['Pclass']
train_df['age_pclass'] = train_df['Age'] * train_df['Pclass']
test_df['age_pclass'] = test_df['Age'] * test_df['Pclass']

In [None]:
train_df.columns

In [12]:
train_df.drop(['Name', 'SibSp', 'Parch'], axis=1, inplace=True)
test_df.drop(['Name', 'SibSp', 'Parch'], axis=1, inplace=True)

In [13]:
X = train_df.copy()
y = X.pop('Survived')
X_test = test_df.copy()


In [14]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import RobustScaler

kf = KFold(n_splits=10, shuffle=True, random_state=540)
for train_index, valid_index in kf.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

In [15]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

rfc = RandomForestClassifier(n_estimators=250, random_state=540)
rfc.fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_valid)
acc = accuracy_score(y_valid, y_pred_rfc)
print(f"Accuracy score: {acc:,.5f}")


Accuracy score: 0.85393


In [16]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

rfc = RandomForestClassifier(n_estimators=300, random_state=540)
abc = AdaBoostClassifier(n_estimators=140, random_state=540)
gpc = GaussianProcessClassifier(random_state=540, kernel=RBF())
gbc = GradientBoostingClassifier(max_leaf_nodes=2, random_state=540)
ensemble_model = VotingClassifier(estimators=[('rfc', rfc), ('abc', abc), ('gpc', gpc), ('gbc', gbc)])
ensemble_model.fit(X_train, y_train)
y_pred_em = ensemble_model.predict(X_valid)
acc = accuracy_score(y_valid, y_pred_em)
print(f"Accuracy score: {acc:,.5f}")

Accuracy score: 0.84270


In [17]:
test_preds = ensemble_model.predict(X_test)
output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': test_preds})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
