In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build every file path below the Kaggle input directory, then print
# them one per line in the order os.walk discovers them.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the Titanic training set from the attached Kaggle dataset.
df = pd.read_csv(
    '../input/titanicdataset-traincsv/train.csv'
)

The Data
Let's start by reading the Titanic training data (`train.csv`) into a pandas DataFrame.

In [None]:
# Preview the first five rows to get a feel for the columns.
df.head(n=5)

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape


Exploratory Data Analysis
Let's begin some exploratory data analysis! We'll start by checking out missing data!

Missing Data
We can use seaborn to create a simple heatmap to see where we are missing data!

In [None]:
# Summarise missing values per column. Displaying the full boolean mask only
# shows a truncated grid of True/False that is impossible to read at a glance.
df.isnull().sum()

In [None]:
# Count the populated (non-null) entries per column rather than printing the
# whole boolean mask, which pandas would truncate anyway.
df.notnull().sum()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# One cell per value in the frame; highlighted cells mark missing entries.
missing_mask = df.isnull()
sns.heatmap(missing_mask, cmap='viridis', cbar=False, yticklabels=False)

Roughly 20 percent of the Age data is missing. That proportion is likely small enough to fill in reasonably with some form of imputation. The Cabin column, on the other hand, is missing too much data to be useful at a basic level. We'll probably drop it later, or change it to another feature like "Cabin Known: 1 or 0".

In [None]:
# Overall survivor / non-survivor counts on a large canvas with the dark grid theme.
plt.figure(figsize=(15, 10))
sns.set_style('darkgrid')
sns.countplot(data=df, x='Survived')

In [None]:
# Survival counts split by passenger sex.
plt.figure(figsize=(15, 10))
sns.countplot(data=df, x='Survived', hue='Sex', palette='rainbow')

In [None]:
# Survival counts split by ticket class.
sns.countplot(
    data=df,
    x='Survived',
    hue='Pclass',
    palette='rainbow',
)

In [None]:
# Age distribution of passengers with a known age.
# sns.distplot is deprecated, so the same picture is assembled from its
# documented replacements: a density-scaled histogram with a KDE overlay,
# plus a rug plot drawn on the same axes.
age_known = df['Age'].dropna()
ax = sns.histplot(x=age_known, bins=40, kde=True, stat='density', color='darkred')
sns.rugplot(x=age_known, color='darkred', ax=ax)

In [None]:
# Joint density of age against the survival flag.
age_values, survived_values = df['Age'], df['Survived']
sns.jointplot(x=age_values, y=survived_values, kind='kde')

In [None]:
# Age distribution for each sex, drawn as horizontal violins.
sns.violinplot(data=df, y='Sex', x='Age')


In [None]:
# Pairwise relationships between the columns seaborn can plot.
sns.pairplot(df)

In [None]:
# Translucent histogram of passenger ages using pandas' own plotting helper.
age_column = df['Age']
age_column.hist(alpha=0.3, color='darkred', bins=30)

In [None]:
# How many passengers travelled with 0, 1, 2, ... siblings or spouses.
sns.countplot(data=df, x='SibSp')

In [None]:
# Distribution of ticket fares on a compact figure.
fare_column = df['Fare']
fare_column.hist(figsize=(8, 4), bins=40, color='green')

In [None]:
# Age spread within each ticket class.
plt.figure(figsize=(12, 7))
sns.boxplot(
    data=df,
    x='Pclass',
    y='Age',
    palette='winter',
)

We want to fill in the missing age data instead of just dropping the rows where age is missing. One way to do this is to fill in the mean age of all the passengers (imputation). However, we can be smarter about this and check the average age by passenger class. For example:

In [None]:
def impute_age(cols):
    """Return a passenger's age, substituting a per-class default when it is missing.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values in positional order: the age first, the passenger class
        second (for example one row of ``df[['Age', 'Pclass']]`` as passed by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    The original age when it is not null; otherwise 35 for class 1, 28 for
    class 2 and 22 for any other class value.
    """
    # Read the two fields by position explicitly. Plain ``cols[0]`` on a
    # label-indexed Series depends on pandas' deprecated positional fallback.
    if hasattr(cols, 'iloc'):
        age, pclass = cols.iloc[0], cols.iloc[1]
    else:
        age, pclass = cols[0], cols[1]

    # A known age is passed through untouched.
    if not pd.isnull(age):
        return age

    if pclass == 1:
        return 35
    if pclass == 2:
        return 28
    return 22

We can see that the wealthier passengers in the higher classes tend to be older, which makes sense. We'll use these approximate per-class ages (35, 28 and 22 for classes 1, 2 and 3) to impute missing Age values based on Pclass.

In [None]:
# Row-wise pass over the (Age, Pclass) pairs; impute_age decides the value kept for each row.
age_and_class = df[['Age', 'Pclass']]
df['Age'] = age_and_class.apply(impute_age, axis=1)

In [None]:
# Re-draw the missing-value map after the Age column has been filled.
missing_after_age = df.isnull()
sns.heatmap(missing_after_age, cmap='viridis', cbar=False, yticklabels=False)

In [None]:
# Remove the Cabin column. Re-binding with errors='ignore' (instead of an
# in-place drop) keeps this cell safe to re-run: a second execution no longer
# raises KeyError once the column is already gone.
df = df.drop(columns='Cabin', errors='ignore')

In [None]:
# Column labels of the frame after the 'Cabin' drop in the previous cell.
df.columns

In [None]:
# Final missing-value map, drawn after the Cabin column was dropped.
missing_after_drop = df.isnull()
sns.heatmap(missing_after_drop, cmap='viridis', cbar=False, yticklabels=False)

In [None]:
# Predictor columns and target for the survival model.
features = ['Fare', 'Pclass', 'Sex', 'Embarked']

# 'Sex' and 'Embarked' hold strings, which scikit-learn estimators cannot fit
# on directly, so they are one-hot encoded. Rows with a missing 'Embarked'
# value get zeros in every Embarked_* indicator column, so the encoded matrix
# carries no NaNs from that column.
categorical_features = ['Sex', 'Embarked']
X = pd.get_dummies(df[features], columns=categorical_features, dtype=int)
y = df.Survived

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a validation partition; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, random_state=0)
train_X, val_X, train_y, val_y = split_parts

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_absolute_error

# Shallow random forest; the fixed seed makes the fitted model repeatable.
rfc_model = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=1)
rfc_model.fit(train_X, train_y)

# Score on the held-out rows. Accuracy is the conventional metric for a
# classifier. The mean absolute error between 0/1 labels and 0/1 predictions
# is the share of misclassified rows, so it is reported as an error rate
# instead of an unlabelled number.
val_predictions = rfc_model.predict(val_X)
val_accuracy = accuracy_score(val_y, val_predictions)
val_error_rate = mean_absolute_error(val_y, val_predictions)
print(f"Validation accuracy: {val_accuracy:.3f}")
print(f"Validation error rate (MAE on 0/1 labels): {val_error_rate:.3f}")