In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

<h2> Loading the data </h2>

In [None]:
# Read the training split into a DataFrame and preview its first five rows.
train_data = pd.read_csv(filepath_or_buffer='../input/titanic/train.csv')
train_data.head(n=5)

In [None]:
# Read the test split into a DataFrame and preview its first five rows.
test_data = pd.read_csv(filepath_or_buffer='../input/titanic/test.csv')
test_data.head(n=5)

<h2><b><u>Exploratory Data Analysis </u></b></h2>

<h3><b>Check whether there are any missing values present in the data</b></h3>

In [None]:
# Heatmap of the boolean missing-value mask: one column per feature,
# one row per passenger, so gaps in a feature show up as contrasting stripes.
plt.figure(figsize=(8, 4))
sns.heatmap(
    data=train_data.isnull(),
    cbar=False,
    yticklabels=False,
    cmap='viridis',
)

* From the above heatmap we can see that there are missing values in the Age and Cabin features

<h3><b> Number of passengers who survived and who died </b> </h3>

In [None]:
sns.countplot(x='Survived',data=train_data, palette= 'Set1')
# 1 -> passenger survived
# 0 -> passenger deceased

In [None]:
# Outcome counts split by sex.
# NOTE(review): for the Kaggle Titanic training set the expected reading is
# that most men died while most women survived - confirm against the rendered plot.
sns.countplot(data=train_data, x='Survived', hue='Sex', palette='Set1')

In [None]:
# Outcome counts split by ticket class.
# NOTE(review): for the Kaggle Titanic training set the expected reading is
# that class 3 accounts for by far the most deaths, and class 1 is the only
# class where survivors outnumber the deceased - confirm against the rendered plot.
sns.countplot(data=train_data, x='Survived', hue='Pclass', palette='Set1')

<h3><b> Age distribution of the passengers on the Titanic </b> </h3>

In [None]:
# Histogram of the known passenger ages (missing ages are excluded), 30 bins.
sns.displot(data=train_data['Age'].dropna(), bins=30)

<h3><b> Distribution of Fare amount in Titanic </b> </h3>

In [None]:
# Histogram of the fare paid by each passenger, 40 bins.
sns.displot(data=train_data['Fare'], bins=40)

<h2><b><u>Handling Missing Values </u> </b> </h2>

In [None]:
# Age distribution per ticket class; the imputation defaults used further
# down are read off this plot.
sns.boxplot(data=train_data, x='Pclass', y='Age')

* We see that the wealthier passengers in classes 1 and 2 tend to be older than the passengers in class 3
* This suggests that older passengers tended to be wealthier and travelled in class 1 or 2
* Whereas younger passengers tended to travel in class 3

In [None]:
def impute_age(cols):
    """Return a passenger's age, substituting a class-based default when it is missing.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values in the order (Age, Pclass), e.g. one row of
        ``df[['Age', 'Pclass']]`` as supplied by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    The original age when it is present; otherwise 37 for class 1,
    29 for class 2 and 24 for any other class.
    """
    # Read the two fields by position explicitly. Integer `[]` lookup on a
    # label-indexed Series is deprecated in pandas, so use `.iloc` when it is
    # available and fall back to plain indexing for lists/tuples.
    values = cols.iloc if hasattr(cols, 'iloc') else cols
    Age = values[0]
    Pclass = values[1]

    if not pd.isna(Age):
        return Age

    # Defaults taken by the author from the Age-by-Pclass boxplot
    # (described there as the per-class median ages).
    if Pclass == 1:
        return 37
    if Pclass == 2:
        return 29
    return 24

In [None]:
# Fill the missing ages in both splits, row by row, with the class-based defaults.
for frame in (train_data, test_data):
    frame['Age'] = frame[['Age', 'Pclass']].apply(impute_age, axis=1)

In [None]:
# Drop Cabin from both splits: the author judged this categorical feature
# not useful for prediction.
for frame in (train_data, test_data):
    frame.drop(columns='Cabin', inplace=True)

In [None]:
# Re-plot the missing-value mask to confirm that the Age column no longer has gaps.
sns.heatmap(
    data=train_data.isnull(),
    cbar=False,
    yticklabels=False,
    cmap='viridis',
)

<h2><b><u>Creating dummy variables for the categorical variables </u> </b> </h2>

In [None]:
# One-hot encode Sex and Embarked; drop_first removes the first level of each
# so the remaining indicator columns are not redundant.
sex = pd.get_dummies(train_data['Sex'], drop_first=True)
embark = pd.get_dummies(train_data['Embarked'], drop_first=True)

# Append the indicator columns, then remove the encoded originals together
# with the free-text Name and Ticket columns.
train_data = pd.concat([train_data, sex, embark], axis='columns').drop(
    columns=['Sex', 'Embarked', 'Name', 'Ticket']
)
train_data.head(n=5)

In [None]:
# Same encoding as for the training split, applied to the test split.
sex = pd.get_dummies(test_data['Sex'], drop_first=True)
embark = pd.get_dummies(test_data['Embarked'], drop_first=True)

# Append the indicator columns, then remove the encoded originals together
# with the free-text Name and Ticket columns.
test_data = pd.concat([test_data, sex, embark], axis='columns').drop(
    columns=['Sex', 'Embarked', 'Name', 'Ticket']
)
test_data.head(n=5)

In [None]:
# Replace any missing Fare in the test split with the mean of the known fares.
mean = test_data['Fare'].mean()
# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# the `test_data['Fare']` selection operates on an intermediate object: pandas
# warns about it, and with Copy-on-Write enabled it leaves `test_data` unchanged.
test_data['Fare'] = test_data['Fare'].fillna(mean)

In [None]:
# Number of missing values left in each column of the test split.
test_data.isnull().sum()

<h2><b><u> Training the models and predicting </u> </b> </h2>

In [None]:
# Feature matrices: everything except the target and the passenger identifier.
X_train = train_data.drop(columns=['Survived', 'PassengerId'])
X_test = test_data.drop(columns='PassengerId')

In [None]:
# Target vector: the Survived column of the training split.
y_train = train_data.loc[:, 'Survived']

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Initialise the random forest: 100 trees, depth capped at 5, fixed seed.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=1,
)

In [None]:
# Train the random forest on the prepared training features and target.
model.fit(X=X_train, y=y_train)

In [None]:
# Random-forest predictions for the test split.
prediction = model.predict(X=X_test)

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
# Tune AdaBoost's n_estimators and learning_rate with an exhaustive grid search.
# The estimator is seeded (like the CV splitter below) so that repeated runs
# of this cell report the same best score and configuration.
ada = AdaBoostClassifier(random_state=1)

grid = {
    'n_estimators': [10, 50, 100, 500],
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0],
}
# define the evaluation procedure: 10-fold stratified CV, repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define the grid search procedure (accuracy scoring, n_jobs=-1 for parallel fits)
grid_search = GridSearchCV(estimator=ada, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy')
# execute the grid search
grid_result = grid_search.fit(X_train, y_train)
# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

In [None]:
import xgboost as xgb
# Gradient-boosted trees: many shallow trees with a small learning rate,
# using row and column subsampling for each tree.
gbm = xgb.XGBClassifier(
    learning_rate=0.02,
    n_estimators=2000,
    max_depth=4,
    min_child_weight=2,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,  # documented thread-count parameter; replaces the deprecated `nthread` alias
    scale_pos_weight=1,
)
gbm.fit(X_train, y_train)

# These predictions are the ones written to the submission file.
xgb_predictions = gbm.predict(X_test)

In [None]:
# Build the two-column submission frame (passenger id + XGBoost prediction)
# and write it to the working directory without the index column.
output = pd.DataFrame(
    {
        'PassengerId': test_data['PassengerId'],
        'Survived': xgb_predictions,
    }
)
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")