In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found;
# the sub-directory list yielded by os.walk is discarded via `_`.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# import the necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the training split into a DataFrame
train_data = pd.read_csv(filepath_or_buffer="/kaggle/input/titanic/train.csv")

In [None]:
# Preview the first five training rows
train_data.head(n=5)

In [None]:
# Read the test split into a DataFrame
test_data = pd.read_csv(filepath_or_buffer="/kaggle/input/titanic/test.csv")

In [None]:
# Preview the first five test rows
test_data.head(n=5)

In [None]:
# Percentage of female passengers in the training data who survived
women = train_data.loc[train_data['Sex'] == 'female', 'Survived']
rate_women = sum(women) / len(women) * 100
print('{:.2f}% Women Survived'.format(rate_women))

In [None]:
# Percentage of male passengers in the training data who survived
men = train_data.loc[train_data['Sex'] == 'male', 'Survived']
rate_men = sum(men) / len(men) * 100
print('{:.2f}% Men Survived'.format(rate_men))

In [None]:
# Inspect the column dtypes and non-null counts; some columns may need
# conversion or imputation before modelling.
train_data.info()

In [None]:
# Count the missing values in each column
train_data.isna().sum()

In [None]:
# Heatmap of the missing-value mask for the raw training data
sns.heatmap(data=train_data.isna(), cmap='viridis', yticklabels=False)

# Finding: the "Age" and "Cabin" columns have a lot of missing values. We can try to impute those values.

In [None]:
# Pairwise correlation of the numeric columns only. The frame still contains
# string columns at this point, and DataFrame.corr() raises on them in
# pandas >= 2.0, so the numeric subset is selected explicitly.
train_data.select_dtypes(include='number').corr()

In [None]:
# Heatmap of the correlation matrix of the numeric columns. String columns are
# excluded explicitly because DataFrame.corr() raises on them in pandas >= 2.0.
sns.heatmap(train_data.select_dtypes(include='number').corr())

In [None]:
# Age correlates most strongly with Pclass, so check whether Pclass can be used to impute Age.

plt.figure(figsize=(12, 12))
sns.boxplot(data=train_data, x='Pclass', y='Age')

In [None]:
# From the Pclass-vs-Age box plot, a missing age is imputed as:
#   1) 38 if the passenger is in class 1
#   2) 29 if the passenger is in class 2
#   3) 24 if the passenger is in class 3

def impute_age(col):
    """Return the passenger's age, filling a missing value from the ticket class.

    Parameters
    ----------
    col : pandas.Series or sequence
        Two values in the order (Age, Pclass), e.g. one row of
        ``df[['Age', 'Pclass']]`` as passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    The original age when it is present; otherwise 38, 29 or 24 for class
    1, 2 or 3. A missing age with any other class value is returned
    unchanged (still missing) instead of an implicit ``None``.
    """
    # Read the two values by position explicitly: integer keys on a
    # label-indexed Series (col[0]) are a deprecated positional fallback in pandas.
    if hasattr(col, 'iloc'):
        Age, Pclass = col.iloc[0], col.iloc[1]
    else:
        Age, Pclass = col[0], col[1]

    if not pd.isnull(Age):
        return Age

    # Pclass may arrive as a float (1.0) when the row is upcast; float and int
    # keys that compare equal hash identically, so the lookup still matches.
    age_by_class = {1: 38, 2: 29, 3: 24}
    return age_by_class.get(Pclass, Age)

In [None]:
# Fill missing training ages row by row from each passenger's (Age, Pclass) pair
train_data['Age'] = train_data.loc[:, ['Age', 'Pclass']].apply(impute_age, axis='columns')

In [None]:
# Heatmap of the missing-value mask after imputing Age
sns.heatmap(data=train_data.isna(), cmap='viridis', yticklabels=False)

In [None]:
# Drop the Cabin column; the result is a new frame and train_data is left untouched

train_data2 = train_data.drop(columns='Cabin')

In [None]:
# Heatmap of the missing-value mask after dropping Cabin
sns.heatmap(data=train_data2.isna(), cmap='viridis', yticklabels=False)

In [None]:
# Re-check the columns and non-null counts of the frame without 'Cabin'
train_data2.info()

In [None]:
# Remove an unused column:
# Name is treated as having nothing to do with the prediction.

train_data3 = train_data2.drop(columns=['Name'])

In [None]:
# Re-check the columns and non-null counts of the frame without 'Name'
train_data3.info()

In [None]:
# Handle the categorical data.
# Categorical columns are of object type; in this frame they are:
#   1) Sex
#   2) Ticket
#   3) Embarked
#
# Sex and Embarked are dummy-encoded, dropping the first level of each.
# Ticket is not encoded.

embarked = pd.get_dummies(data=train_data3['Embarked'], drop_first=True)
sex = pd.get_dummies(data=train_data3['Sex'], drop_first=True)

In [None]:
# Columns and dtypes of train_data3 at this point in the notebook
train_data3.info()

In [None]:
# Remove the raw categorical columns from train_data3 (modifies the frame in place)
train_data3.drop(columns=['Sex', 'Ticket', 'Embarked'], inplace=True)

In [None]:
# Columns and dtypes of train_data3 at this point in the notebook
train_data3.info()

In [None]:
# Append the sex and embarked dummy columns side by side to the training frame
train_data3 = pd.concat(objs=[train_data3, sex, embarked], axis='columns')

In [None]:
# Preview the first five rows of the encoded training frame
train_data3.head(n=5)

In [None]:
# Class balance of the target column

train_data3.Survived.value_counts()

In [None]:
# Arrange the test data with the same steps used for the training data.
# Ages are imputed from test_data's own Age/Pclass values, not train_data's:
# assigning a Series built from train_data would be aligned on the index and
# copy training passengers' ages into the test frame.
test_data['Age'] = test_data[['Age', 'Pclass']].apply(impute_age, axis = 1)
# Drop the columns that are not used as model inputs
test_data2 = test_data.drop('Cabin', axis = 1)
test_data3 = test_data2.drop(['PassengerId', 'Name'], axis = 1)
# Dummy-encode Embarked and Sex, dropping the first level of each
embarked = pd.get_dummies(test_data3['Embarked'], drop_first = True)
sex = pd.get_dummies(test_data3['Sex'], drop_first = True)
# Replace the raw categorical columns with their dummy columns
test_data3.drop(['Sex', 'Ticket', 'Embarked'], axis = 1, inplace = True)
test_data3 = pd.concat([test_data3, sex, embarked], axis = 1)

In [None]:
# Columns and dtypes of train_data3 before the target and features are selected
train_data3.info()


In [None]:
# Label column the model is trained to predict
target = train_data3.loc[:, 'Survived']

In [None]:
# Names of the columns used as model inputs
features = [
    'Pclass',
    'SibSp',
    'Parch',
    'male',
    'Q',
    'S',
]

In [None]:
# Select the same feature columns from the training and test frames
X = train_data3.loc[:, features]
X_test = test_data3.loc[:, features]

In [None]:
# Display the training feature matrix
X

In [None]:
# Count the missing values in each test feature column
X_test.isna().sum()

In [None]:
# Fit a random forest on the training features, then predict labels for the test features

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=300,
    max_depth=8,
    random_state=1,
)
model.fit(X, target)
predictions = model.predict(X_test)

In [None]:
# Display the predicted labels for the test set
predictions

In [None]:
# Pair each test PassengerId with its predicted label and write the result to CSV
output = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions,
})
output.to_csv('my_submission.csv', index=False)

In [None]:
# Display the submission frame
output