In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.impute import SimpleImputer
from sklearn.metrics import log_loss, mean_squared_error, roc_auc_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, FunctionTransformer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC



import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# Load the Titanic CSVs. Note the asymmetry: the training frame uses its first
# CSV column as the index, while the test frame keeps every column and gets the
# default integer index.
df_train = pd.read_csv('../input/titanic/train.csv', index_col=0)
df_test = pd.read_csv('../input/titanic/test.csv')

# Preview the first rows of the training data.
df_train.head()

In [1]:
# Preview the first rows of the test data.
df_test.head()

## Learning More About the Data

In [1]:
# Column dtypes and non-null counts of the training data.
df_train.info()

In [1]:
# Column dtypes and non-null counts of the test data.
df_test.info()

In [1]:
# Summary statistics of the training columns.
df_train.describe()

In [1]:
# Training rows that exactly repeat an earlier row (empty when there are none).
df_train.loc[df_train.duplicated()]

In [1]:
# Test rows that exactly repeat an earlier row (empty when there are none).
df_test.loc[df_test.duplicated()]

In [1]:
# Non-null count of every column, split by the value of Survived.
df_train.groupby(by=['Survived']).count()

In [1]:
# Number of distinct values in each training column.
df_train.nunique()

In [1]:
# (rows, columns) of the training data.
df_train.shape

In [1]:
# (rows, columns) of the test data.
df_test.shape

In [1]:
# Per-column count of missing entries in each dataset.
train_missing = df_train.isna().sum()
test_missing = df_test.isna().sum()

print('Missing values in train dataset:', train_missing)
print('-----------------------------------')
print('Missing values in test dataset:', test_missing)

In [1]:
# Target label, and the predictors: every training column except the label,
# 'Cabin' and 'Name'.
y = df_train['Survived']
features = df_train.drop(columns=['Survived', 'Cabin', 'Name'])

In [1]:
# Feature columns stored as int64 or float64, in their original column order.
numerical_features = [
    column_name
    for column_name, column_dtype in features.dtypes.items()
    if column_dtype in ("int64", "float64")
]
print("Numerical Columns =", numerical_features)

In [1]:
# Same selection over the full training frame, so the label column is
# included here when it is numeric.
n_features = [
    column_name
    for column_name, column_dtype in zip(df_train.columns, df_train.dtypes)
    if column_dtype == "int64" or column_dtype == "float64"
]
print("Numerical Columns =", n_features)

In [1]:
# Feature columns stored with the generic object dtype, in column order.
categorical_features = [
    column_name
    for column_name, column_dtype in zip(features.columns, features.dtypes)
    if column_dtype == "object"
]
print("Categorical Columns =", categorical_features)

In [1]:
# One kernel-density plot per numeric feature of the training data.
for col in numerical_features:
    fig, ax = plt.subplots(figsize=(9, 4))
    # `fill` replaces the deprecated `shade` keyword of kdeplot; the notebook
    # already relies on a seaborn release that has it (it uses histplot).
    sns.kdeplot(df_train[col], fill=True, edgecolor='black', linewidth=1.5,
                alpha=0.9, zorder=3, ax=ax)
    ax.set_title(f'Distribution of {col}')
    plt.show()

In [1]:
fig = plt.figure(figsize=(9, 4))
# Percentage of training rows per Survived value.
survived_pct = df_train['Survived'].value_counts() / len(df_train) * 100
# Name the Series explicitly: on pandas >= 2.0 value_counts() calls its result
# 'count' (and moves the original name onto the index), so wrapping it in a
# DataFrame no longer yields a 'Survived' column and the lookup below raised
# KeyError. Clearing the index name keeps it from clashing with the column.
chart_df = survived_pct.rename('Survived').rename_axis(None).to_frame()
sns.barplot(x=chart_df.index, y=chart_df['Survived'], zorder=3, edgecolor='black', linewidth=1.5)

In [1]:
# Pairwise correlations between the numeric training columns, annotated per cell.
corr_matrix = df_train[n_features].corr()
heatmap_options = dict(
    vmin=-1,
    vmax=1,
    annot=True,
    square=True,
    cbar_kws={"orientation": "horizontal"},
    cbar=False,
    fmt='.1g',
)
fig = plt.figure(figsize=(20, 10))
sns.heatmap(corr_matrix, **heatmap_options)

In [1]:
# Stack train and test rows for plotting only. ignore_index=True gives the
# combined frame a fresh, unique index: df_train is indexed by its first CSV
# column while df_test has a default integer index, so their labels can
# collide, and some seaborn versions refuse data with duplicate index labels.
all_data = pd.concat([df_train, df_test], ignore_index=True)

# Size the grid from the number of features instead of hard-coding 3x2, so the
# cell keeps working if the numeric column list changes.
n_cols = 2
n_rows = max(1, -(-len(n_features) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 4 * n_rows), squeeze=False)
flat_axes = axes.ravel()

for ax, feature in zip(flat_axes, n_features):
    sns.histplot(all_data[feature],
                 color="blue",
                 kde=True,
                 bins=100,
                 ax=ax)
    ax.set_xlabel(feature, fontsize=9)

# Hide any unused panel when the feature count does not fill the grid.
for ax in flat_axes[len(n_features):]:
    ax.set_visible(False)
plt.show()

In [1]:
# Numeric branch: median imputation followed by standardisation.
numerical_transformer_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(numerical_transformer_steps)

# Categorical branch: missing entries become the literal 'missing' category,
# then one-hot encoding that ignores categories not seen during fit.
categorical_transformer_steps = [
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_transformer_steps)

# Route each column group to its branch.
col_transformers = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(col_transformers)

In [1]:
# Hold out 20% of the rows for validation; random_state pins the shuffle so
# the split is the same on every run.
X_train, X_val, y_train, y_val= train_test_split(features, y, test_size= 0.2, random_state= 12)

In [1]:
# Shared seed for every stochastic model so that Restart & Run All reproduces
# the same fitted models and scores.
SEED = 12

# Random Forest parameters.
# 'warm_start' is deliberately not set: with warm_start=True a second fit()
# with an unchanged n_estimators keeps the existing trees, so re-running the
# training cell would silently leave a stale model in place.
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'verbose': 0,
    'random_state': SEED,
}

# Extra Trees parameters (max_features is left at the library default).
et_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0,
    'random_state': SEED,
}

# Support Vector Classifier parameters
svc_params = {
    'kernel': 'linear',
    'C': 0.025,
}

# XGBoost parameters (learning_rate is left at the library default).
xgb_params = {
    'n_estimators': 2000,
    'max_depth': 4,
    'min_child_weight': 2,
    'gamma': 0.9,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'nthread': -1,
    'scale_pos_weight': 1,
    'random_state': SEED,
}


In [1]:
# Build the four unfitted classifiers from their parameter dictionaries.
rf = RandomForestClassifier(**rf_params)
et = ExtraTreesClassifier(**et_params)
gbm = xgb.XGBClassifier(**xgb_params)
svc = SVC(**svc_params)

In [1]:
# Extra-Trees model: shared preprocessing followed by the classifier.
model_et = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', et),
])
model_et.fit(X_train, y_train)
preds_et = model_et.predict(X_val)

# Show held-out accuracy next to training accuracy. The training score alone
# says nothing about generalisation, and the validation predictions were
# previously computed but never looked at.
pd.Series({
    'train_accuracy': model_et.score(X_train, y_train),
    'val_accuracy': float((preds_et == y_val).mean()),
}, name='ExtraTrees')

In [1]:
# Random-Forest model: shared preprocessing followed by the classifier.
model_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', rf),
])
model_rf.fit(X_train, y_train)
preds_rf = model_rf.predict(X_val)

# Show held-out accuracy next to training accuracy. The training score alone
# says nothing about generalisation, and the validation predictions were
# previously computed but never looked at.
pd.Series({
    'train_accuracy': model_rf.score(X_train, y_train),
    'val_accuracy': float((preds_rf == y_val).mean()),
}, name='RandomForest')

In [1]:
# Support-vector model: shared preprocessing followed by the classifier.
model_svc = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', svc),
])
model_svc.fit(X_train, y_train)
preds_svc = model_svc.predict(X_val)

# Show held-out accuracy next to training accuracy. The training score alone
# says nothing about generalisation, and the validation predictions were
# previously computed but never looked at.
pd.Series({
    'train_accuracy': model_svc.score(X_train, y_train),
    'val_accuracy': float((preds_svc == y_val).mean()),
}, name='SVC')

In [1]:
# XGBoost model: shared preprocessing followed by the classifier.
model_gbm = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', gbm),
])
model_gbm.fit(X_train, y_train)
preds_gbm = model_gbm.predict(X_val)

# Show held-out accuracy next to training accuracy. The training score alone
# says nothing about generalisation, and the validation predictions were
# previously computed but never looked at.
pd.Series({
    'train_accuracy': model_gbm.score(X_train, y_train),
    'val_accuracy': float((preds_gbm == y_val).mean()),
}, name='XGBoost')

In [1]:
# Give the test features the same columns as the training features. df_train
# was read with its first CSV column (presumably PassengerId - confirm against
# the file) as the index, so the training features carry no id column, whereas
# df_test still holds PassengerId as a regular column. It is dropped here along
# with the two columns that were also removed for training; df_test itself is
# left untouched so the ids remain available for the submission.
test_features = df_test.drop(['PassengerId', 'Cabin', 'Name'], axis=1)

In [1]:
# Predict labels for the test features with the fitted XGBoost pipeline.
y_pred= model_gbm.predict(test_features)

In [1]:
# Two-column frame: the passenger id from the test data and its predicted label.
submission = df_test[["PassengerId"]].assign(Survived=y_pred)

In [1]:
# Write the submission to submission.csv in the current directory, without the
# DataFrame index column.
submission.to_csv('submission.csv', index=False)