# Data Analytics Process: An example with the Titanic dataset

This notebook demonstrates the different phases of the data analytics process.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

## 1. Define a Question or a Problem to be solved

### Question: 
### *Which variables were associated with survival on the Titanic?*

## 2. Data Acquisition

In [None]:
# Let's get some data
# `DataFrame.from_csv` was deprecated in pandas 0.21 and removed in 1.0;
# `pd.read_csv` is the supported reader and exists in old releases as well.
# Forward slashes keep the relative path valid on Windows, macOS and Linux.
path_to_file = '../datasets/titanic/titanic.csv'
titanic = pd.read_csv(path_to_file, index_col=None)

### Know your data set

* What do you have in every row?
* What do you have in every column?
* What are the types of values you have?
* How big is the data set?
* Many other questions...

In [None]:
# Dimensions of the loaded table as (n_rows, n_columns)
titanic.shape

In [None]:
# Summary of the frame: column names, non-null counts and dtypes
titanic.info()

In [None]:
# Peek at the first rows (head() shows five by default)
titanic.head()

## 3. Data Wrangling

### Cleaning the data

In [None]:
# Re-check the non-null count of every column to spot missing data before cleaning
titanic.info()

In [None]:
# Inspect the last rows of the table (tail() shows five by default)
titanic.tail()

In [None]:
# Rows holding fewer than 5 non-null values
# (DataFrame.count(axis=1) counts the non-null cells of each row)
titanic.loc[titanic.count(axis=1) < 5]

Get rid of columns (variables, features) that are not useful and rows that contain no data.

In [None]:
# Dropping columns
# Reassigning the result (instead of inplace=True) and passing errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# for labels that were already removed.
titanic = titanic.drop(['ticket', 'boat', 'body', 'home.dest'], axis=1, errors='ignore')
# Dropping rows
# Index label 1309 is the row this notebook treats as containing no data.
titanic = titanic.drop([1309], axis=0, errors='ignore')

In [None]:
# Confirm which columns and how many rows remain after the drops
titanic.info()

### Feature Engineering

In [None]:
# We might want to explore whether people who were traveling alone had a
# different chance of survival.
# Creating the "alone" variable: 'Alone' when parch + sibsp is 0, otherwise
# 'With Family'. np.where assigns both labels in one vectorised step; the
# previous chained `titanic['alone'].ix[...] = ...` relied on the `.ix` indexer
# (removed in pandas 1.0) and on writing through an intermediate Series.
relatives_aboard = titanic['parch'] + titanic['sibsp']
titanic['alone'] = np.where(relatives_aboard == 0, 'Alone', 'With Family')

In [None]:
# Transforming variable types
# Example: cast the pclass column to integer
titanic['pclass'] = titanic['pclass'].astype(int)

In [None]:
# Verify the cast. The function-call form of print runs on Python 2 and 3;
# the bare `print expr` statement is a SyntaxError on Python 3.
print(titanic['pclass'].dtype)

### Missing value imputation

In [None]:
# Fill missing `age` values with the median age and missing `embarked` values
# with the most frequent value of that column.
# The results are assigned back explicitly: calling fillna(inplace=True) on a
# column selection can operate on a temporary copy and leave `titanic`
# unchanged (pandas Copy-on-Write behaviour).
titanic['age'] = titanic['age'].fillna(titanic['age'].median())
most_frequent_embarked = titanic['embarked'].value_counts().idxmax()
titanic['embarked'] = titanic['embarked'].fillna(most_frequent_embarked)

## 4. Data Exploration

In [None]:
# List the column names on a single line, separated by spaces.
# `print x,` is Python 2-only syntax; joining the names and calling print()
# gives the same one-line output on Python 2 and 3.
print(' '.join(str(column) for column in titanic.columns))

### Univariate exploration

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns
titanic.describe()

#### Survived Variable

In [None]:
# Number of rows for each distinct value of `survived`
titanic['survived'].value_counts()

In [None]:
# Same counts as a bar chart
titanic['survived'].value_counts().plot(kind='bar')

In [None]:
# Mean of `survived` (presumably 0/1-coded, so this is the survival rate -- confirm against the data)
titanic['survived'].mean() # only about 38% of the people survived

#### Age variable

In [None]:
# Summary statistics of the `age` column
titanic['age'].describe()

In [None]:
# Histogram of the `age` column (20 bins) drawn on an explicit axes
fig, ax = plt.subplots()
titanic['age'].plot.hist(bins=20, ax=ax)
ax.set_title('Age distribution')

### Exploring relationships

In [None]:
# Mean of `survived` (with its confidence interval) for each value of `sex`.
# `sns.factorplot` and its `size` argument were renamed and later removed from
# seaborn. Its default chart is a point plot, so drawing `sns.pointplot` on an
# explicit 10x5 inch axes (size=5, aspect=2) works on old and new versions.
fig, ax = plt.subplots(figsize=(10, 5))
sns.pointplot(x='sex', y='survived', data=titanic, ax=ax);

In [None]:
# Mean of `survived` (with its confidence interval) for each value of `embarked`.
# `sns.factorplot` and its `size` argument were renamed and later removed from
# seaborn. Its default chart is a point plot, so drawing `sns.pointplot` on an
# explicit 10x5 inch axes (size=5, aspect=2) works on old and new versions.
fig, ax = plt.subplots(figsize=(10, 5))
sns.pointplot(x='embarked', y='survived', data=titanic, ax=ax);

In [None]:
# Mean of `survived` (with its confidence interval) for each value of `alone`.
# `sns.factorplot` and its `size` argument were renamed and later removed from
# seaborn. Its default chart is a point plot, so drawing `sns.pointplot` on an
# explicit 10x5 inch axes (size=5, aspect=2) works on old and new versions.
fig, ax = plt.subplots(figsize=(10, 5))
sns.pointplot(x='alone', y='survived', data=titanic, ax=ax);

In [None]:
# Distribution of `age` within each `survived` group, shown as box plots
sns.boxplot(data=titanic, x='survived', y='age')

In [None]:
# Overlaid kernel density estimates of `age`, one curve per `survived` value
facet = sns.FacetGrid(titanic, hue="survived",aspect=4)
# NOTE(review): recent seaborn releases replace kdeplot's `shade` argument with
# `fill`; switch to fill=True if this notebook is run on a newer seaborn -- confirm
# against the installed version.
facet.map(sns.kdeplot,'age', shade= True)
# Restrict the x axis to the observed range, from 0 to the maximum age
facet.set(xlim=(0, titanic['age'].max()))
facet.add_legend()

## 5. Modelling

Choose a model that will help you to answer the question

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Columns available before building the model inputs
titanic.columns

In [None]:
# Again some wrangling is necessary before modelling: encode the categorical
# variables as 0/1 indicator columns.
# A boolean comparison replaces the chained `titanic['alone2'].ix[...] = 1`
# assignment (`.ix` was removed in pandas 1.0).
titanic['alone2'] = (titanic['alone'] == 'Alone').astype(int)
# drop_first=True leaves one indicator column for a two-valued `sex`; select it
# as a Series (rather than assigning a DataFrame) and store it as integers so
# the dtype does not depend on the pandas version.
titanic['male'] = pd.get_dummies(titanic['sex'], prefix='gender', drop_first=True).iloc[:, 0].astype(int)
embarked_dummies = pd.get_dummies(titanic['embarked'], prefix='embarked', drop_first=True).astype(int)
# Remove indicator columns left by a previous run of this cell so that
# re-executing it does not append duplicate embarked_* columns.
titanic = pd.concat(
    [titanic.drop(list(embarked_dummies.columns), axis=1, errors='ignore'), embarked_dummies],
    axis=1,
)

In [None]:
# Check that the new indicator columns are present
titanic.columns

In [None]:
# Columns used as model inputs; their order defines the column order of X
features = ['pclass','male','age','alone2','embarked_Q','embarked_S']
# Feature matrix and target vector as plain NumPy arrays
X = titanic[features].values
y = titanic['survived'].values

In [None]:
# Fit a random forest on the full data set; the fixed random_state makes the
# fitted model (and therefore the importances below) repeatable between runs
rf_model = RandomForestClassifier(random_state=124)
rf_model.fit(X,y)

In [None]:
# Importance of each feature, in the same order as `features`
rf_model.feature_importances_

In [None]:
# Forest-level importances, their spread across the individual trees, and the
# feature positions sorted from most to least important
importances = rf_model.feature_importances_
per_tree_importances = np.array([estimator.feature_importances_ for estimator in rf_model.estimators_])
std = per_tree_importances.std(axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")
for rank, feature_pos in enumerate(indices, start=1):
    print("Rank: {}, {}, {:0.3f}".format(rank, features[feature_pos], importances[feature_pos]))

In [None]:
# Feature names ordered from most to least important
[features[feature_pos] for feature_pos in indices]

In [None]:
# Bar chart of the importances, most important first, with the standard
# deviation across trees as error bars.
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
       color="g", yerr=std[indices], align="center")
# Label each bar with its feature name instead of the raw column position, so
# the chart can be read without cross-referencing the `features` list.
plt.xticks(range(X.shape[1]), [features[feature_pos] for feature_pos in indices])
plt.xlim([-1, X.shape[1]])

In [None]:
# Second forest: same seed, but every leaf must hold at least 10 samples
rf_model2 = RandomForestClassifier(random_state=124, min_samples_leaf=10)
rf_model2.fit(X,y)

# Importances, their spread across trees, and the descending sort order
importances2 = rf_model2.feature_importances_
per_tree_importances2 = np.array([estimator.feature_importances_ for estimator in rf_model2.estimators_])
std2 = per_tree_importances2.std(axis=0)
indices2 = np.argsort(importances2)[::-1]

# Print the feature ranking
print("Feature ranking, model 2:")
for rank, feature_pos in enumerate(indices2, start=1):
    print("Rank: {}, {}, {:0.3f}".format(rank, features[feature_pos], importances2[feature_pos]))

## 7. Communication

In [None]:
# Plot the feature importances of the forest
# Presentation labels are keyed by feature name and looked up in the sorted
# order, so every bar is labelled with its own feature. The previous hard-coded
# label list assumed one specific ranking and would mislabel the bars whenever
# the fitted model ranked the features differently.
display_names = {
    'pclass': 'Pass. Class',
    'male': 'Gender',
    'age': 'Age',
    'alone2': 'Alone',
    'embarked_Q': 'Embarked_Q',
    'embarked_S': 'Embarked_S',
}
tick_labels = [display_names.get(features[feature_pos], features[feature_pos]) for feature_pos in indices]

plt.figure(figsize=(12,6))
plt.title("Feature Importances", size=22)
plt.bar(range(X.shape[1]), importances[indices], color="#FFA500", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), tick_labels, size=14)
# The importances are fractions that sum to 1, not percentages, so the axis
# label no longer carries a "(%)" unit.
plt.ylabel('Normalized Feature Importance', size=15)
plt.xlim([-1, X.shape[1]]);