In [None]:
import pandas as pd
import numpy as np

## DATA UNDERSTANDING

In [None]:
# Read the training split into a DataFrame and preview its first five rows.
titanic_train = pd.read_csv(filepath_or_buffer=r'../input/titanic/train.csv')
titanic_train.head(n=5)

In [None]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
titanic_train.info()

There are 3 types of attributes in the dataset which we will be using for our analysis:
* **Categorical** : Survived, Sex, Cabin and Embarked
* **Ordinal** : Pclass, SibSp, Parch
* **Continuous/Numerical** : Fare, Age

I am ignoring the 'PassengerId', 'Name' and 'Ticket' fields because they seem uninformative. If they were kept, a complex model might latch onto spurious patterns, for example that passengers in the training data whose names contain 'A' or 'S' have a higher survival rate, which doesn't seem reasonable. The same goes for the other two attributes.

In [None]:
# Frequency of each Sex category (value_counts skips missing values by default).
titanic_train['Sex'].value_counts()

In [None]:
# How often each Ticket value occurs.
titanic_train['Ticket'].value_counts()

In [None]:
# Occurrences of each recorded Cabin value; rows with a missing Cabin are not counted.
titanic_train['Cabin'].value_counts()

In [None]:
# Number of passengers per Embarked value.
titanic_train['Embarked'].value_counts()
# Author's observation: the sample is heavily skewed in favour of Southampton.

In [None]:
# Passenger count per Pclass value.
titanic_train['Pclass'].value_counts()

In [None]:
# Balance of the label, reported as proportions (normalize=True) rather than raw counts.
titanic_train['Survived'].value_counts(normalize=True)

In [None]:
# Frequency of each SibSp value.
titanic_train['SibSp'].value_counts()

In [None]:
# Frequency of each Parch value.
titanic_train['Parch'].value_counts()

In [None]:
# How often each distinct Fare amount occurs.
titanic_train['Fare'].value_counts()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
titanic_train.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Draw a 50-bin histogram for each numeric column on one large figure.
titanic_train.hist(figsize=(20, 15), bins=50)

## EDA

In [None]:
# Explore on an independent copy so the raw training frame is never modified by EDA.
titanic = titanic_train.copy(deep=True)
titanic.head(n=5)

With the help of a few visualization charts, we will try to find the relationship between each predictor (attribute) and our label.
This type of analysis is usually known as **Bivariate Analysis**.

In [None]:
import seaborn as sns

In [None]:
# Survival counts per sex, then the percentage of each outcome within every sex.
sns.countplot(data=titanic, x='Sex', hue='Survived')
cross_tab = (
    pd.crosstab(titanic['Sex'], titanic['Survived'])
    # count * 100 / row total, i.e. row-wise percentages
    .pipe(lambda counts: counts.mul(100).div(counts.sum(axis=1), axis=0))
)
print(cross_tab)

## Very strong association between sex and survival rate.

In [None]:
# Survival counts per SibSp value, then the percentage of each outcome within every value.
sns.countplot(data=titanic, x='SibSp', hue='Survived')
cross_tab = (
    pd.crosstab(titanic['SibSp'], titanic['Survived'])
    # count * 100 / row total, i.e. row-wise percentages
    .pipe(lambda counts: counts.mul(100).div(counts.sum(axis=1), axis=0))
)
print(cross_tab)

In [None]:
# Survival counts per Parch value, then the percentage of each outcome within every value.
sns.countplot(data=titanic, x='Parch', hue='Survived')
cross_tab = (
    pd.crosstab(titanic['Parch'], titanic['Survived'])
    # count * 100 / row total, i.e. row-wise percentages
    .pipe(lambda counts: counts.mul(100).div(counts.sum(axis=1), axis=0))
)
print(cross_tab)

In [None]:
# Derive a binary family flag from SibSp + Parch.
# NOTE: despite its name, is_alone is 1 when the passenger has at least one
# relative aboard (SibSp + Parch > 0) and 0 when the passenger travels alone.
titanic['is_alone'] = (titanic['SibSp'] + titanic['Parch']).gt(0).astype('int64')

sns.countplot(data=titanic, x='is_alone', hue='Survived')
cross_tab = (
    pd.crosstab(titanic['is_alone'], titanic['Survived'])
    # count * 100 / row total, i.e. row-wise percentages
    .pipe(lambda counts: counts.mul(100).div(counts.sum(axis=1), axis=0))
)
print(cross_tab)

# Passengers travelling with family have a slightly better chance of survival.

In [None]:
# Survival counts per embarkation port, then the percentage of each outcome within every port.
sns.countplot(data=titanic, x='Embarked', hue='Survived')
cross_tab = (
    pd.crosstab(titanic['Embarked'], titanic['Survived'])
    # count * 100 / row total, i.e. row-wise percentages
    .pipe(lambda counts: counts.mul(100).div(counts.sum(axis=1), axis=0))
)
print(cross_tab)

# People who embarked at Cherbourg have a slightly better survival rate.

In [None]:
# Survival counts per ticket class, then the percentage of each outcome within every class.
sns.countplot(data=titanic, x='Pclass', hue='Survived')
cross_tab = (
    pd.crosstab(titanic['Pclass'], titanic['Survived'])
    # count * 100 / row total, i.e. row-wise percentages
    .pipe(lambda counts: counts.mul(100).div(counts.sum(axis=1), axis=0))
)
print(cross_tab)

# Survival rate decreases with the travel class: 1 > 2 > 3.

In [None]:
# Overlay the Age histograms of the two Survived groups (semi-transparent so both stay visible).
graph = sns.FacetGrid(data=titanic, hue="Survived", palette="Set1").map(plt.hist, "Age", alpha=0.5)
graph.add_legend()
# Passengers under 20 (children) show higher survival; ages 20-60 (adults) show poor survival.

In [None]:
# Overlay the Fare histograms of the two Survived groups (semi-transparent so both stay visible).
graph = sns.FacetGrid(data=titanic, hue="Survived", palette="Set1").map(plt.hist, "Fare", alpha=0.5)
graph.add_legend()
# The higher the fare, the better the survival rate.

In [None]:
# Collapse Cabin into a flag: 'No' when the cabin is missing, 'Yes' otherwise.
# The flag is derived from the untouched titanic_train column so that re-running
# this cell is safe. Recomputing it from titanic['Cabin'] itself is not idempotent:
# on a second run the column no longer contains missing values, so every passenger
# would be marked 'Yes'.
titanic['Cabin'] = titanic_train['Cabin'].isna().map({True: 'No', False: 'Yes'})
titanic['Cabin'].value_counts()

In [None]:
# Survival counts per Cabin flag, then the percentage of each outcome within every flag value.
sns.countplot(data=titanic, x='Cabin', hue='Survived')
cross_tab = (
    pd.crosstab(titanic['Cabin'], titanic['Survived'])
    # count * 100 / row total, i.e. row-wise percentages
    .pipe(lambda counts: counts.mul(100).div(counts.sum(axis=1), axis=0))
)
print(cross_tab)

**INSIGHTS :**
1. Females are more likely to survive
2. Passengers in first class have a higher chance of survival, followed by second class and third class respectively
3. Passengers who embarked at Cherbourg have a slightly better survival rate
4. Passengers under 20 (children) show higher survival; passengers aged 20-60 (adults) show poor survival
5. People with cabins have better survival chances than those without
6. Passengers travelling with family have a higher survival rate

## DATA CLEANING

In [None]:
# Separate the label (Survived) from the predictors (every other column).
titanic_labels = titanic_train['Survived'].copy()
titanic = titanic_train.drop('Survived', axis=1)

In [None]:
# Preview the predictor frame (the Survived column has been removed).
titanic.head()

In [None]:
# Engineer the two features explored during EDA.
# Both are computed from the untouched titanic_train columns, which keeps this
# cell idempotent: recomputing Cabin from titanic['Cabin'] would mark every row
# 'Yes' on a second run, and SibSp/Parch are dropped from `titanic` in a later
# cell, so reading them from `titanic` would fail on a re-run.

# 'No' when the cabin is missing, 'Yes' otherwise.
titanic['Cabin'] = titanic_train['Cabin'].isna().map({True: 'No', False: 'Yes'})

# NOTE: despite its name, is_alone is 1 when the passenger has at least one
# relative aboard (SibSp + Parch > 0) and 0 otherwise. The encoding is kept
# because the test-set preparation must use the same one.
titanic['is_alone'] = (titanic_train['SibSp'] + titanic_train['Parch']).gt(0).astype('int64')

titanic.head()

In [None]:
# Remove identifier-like columns and the raw family counts (already summarised by is_alone).
# `columns=` already selects the column axis, so no separate `axis` argument is needed.
titanic = titanic.drop(columns=['PassengerId', 'Name', 'Ticket', 'SibSp', 'Parch'])
titanic

## DATA PIPELINE

In [None]:
# custom DataFrameSelector class for column transformation 
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts a fixed set of columns from a DataFrame.

    Parameters
    ----------
    attribute_names
        Column label(s) used to index the frame passed to ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the selected columns of ``X`` as a NumPy array."""
        chosen_columns = X[self.attribute_names]
        return chosen_columns.values

In [None]:
# Create a pipeline for data cleaning 
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer

# Column names handled by each sub-pipeline (indexing the frame first makes a
# missing column fail here rather than inside the pipeline).
num_attrib = titanic[['Age', 'Fare', 'is_alone']].columns.tolist()
cat_attrib = titanic[['Sex', 'Pclass', 'Cabin', 'Embarked']].columns.tolist()

# Numeric branch:
#   selector   -> pick the numeric columns
#   imputer    -> fill missing numeric values with the column median
#   std_scaler -> standardise the numeric attributes
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(num_attrib)),
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

# Categorical branch:
#   selector        -> pick the categorical columns
#   imputer         -> fill missing values with the most frequent value
#   one_hot_encoder -> turn every category into numeric indicator columns
cat_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(cat_attrib)),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot_encoder', OneHotEncoder()),
])

# Concatenate the outputs of both branches into one feature matrix.
full_pipeline = FeatureUnion([
    ('num_pipepline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

titanic_prepared = full_pipeline.fit_transform(titanic)
titanic_prepared

## MODEL TRAINING

In [None]:
# LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression
# Fit a default logistic-regression classifier on the prepared training features.
# fit() returns the estimator itself, so it can be bound in the same statement.
log_reg = LogisticRegression().fit(titanic_prepared, titanic_labels)
log_reg

In [None]:
# Spot check: compare the model's predictions with the known labels for the first five training rows.
some_data, some_labels = titanic.iloc[:5], titanic_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print('Predictions:\t', log_reg.predict(some_data_prepared))
print('Labels:\t', list(some_labels))

In [None]:
# evaluate model
from sklearn.model_selection import cross_val_score

# Cross-validate the preprocessing and the classifier together on the raw predictor frame.
# Scoring on titanic_prepared leaks information: its imputer, scaler and encoder were
# fitted on all rows, including the rows each fold later uses for validation.
# Wrapping both stages in a Pipeline makes cross_val_score refit the preprocessing on
# the training part of every fold. cross_val_score fits clones of the estimator, so
# full_pipeline and log_reg themselves stay fitted on the full training data.
cv_model = Pipeline([('preprocess', full_pipeline), ('classifier', log_reg)])
scores = cross_val_score(cv_model, titanic, titanic_labels, scoring='accuracy')
# report performance: mean accuracy across folds (standard deviation in parentheses)
print('Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

## EVALUATING THE MODEL ON TEST SET

In [None]:
# final_model with the best available hyperparameters
final_model = log_reg

test = pd.read_csv(r'../input/titanic/test.csv')
result = pd.read_csv(r'../input/titanic/gender_submission.csv')

# Work on a copy so the feature engineering below does not mutate `test`,
# which is read again later to build the submission.
X_test = test.copy()

# Apply exactly the same feature engineering as on the training data.
# 'No' when the cabin is missing, 'Yes' otherwise. The previous lambda had the
# comparison inverted ('NA' if x == 'No'), which marked every test row 'Yes'.
X_test['Cabin'] = X_test['Cabin'].isna().map({True: 'No', False: 'Yes'})
# Same encoding as in training: 1 when SibSp + Parch > 0, otherwise 0.
X_test['is_alone'] = (X_test['SibSp'] + X_test['Parch']).gt(0).astype('int64')

X_test = X_test.drop(columns=['PassengerId', 'Name', 'Ticket', 'SibSp', 'Parch'])

# NOTE(review): gender_submission.csv looks like a sample submission rather than
# the true test labels, so the score below would measure agreement with that
# file, not real accuracy. Confirm before reporting it.
# Assumes its rows are in the same order as test.csv.
y_test = result['Survived'].copy()

# Only transform here: the pipeline must keep the statistics learned on the training data.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

# Evaluate the predictions of the trained model directly. Running cross_val_score
# on the test data would retrain fresh models on the test set and never score
# final_predictions at all.
accuracy = np.mean(final_predictions == y_test.to_numpy())
# report performance
print('Accuracy: %.3f' % accuracy)

In [None]:
# Normalise the model output to integers: round, take the absolute value, cast to int.
predictions = np.abs(np.around(final_predictions)).astype(int)

In [None]:
# Pair every test PassengerId with its predicted Survived value in a two-column frame.
passenger_id = list(test['PassengerId'])
prediction_submission = pd.DataFrame(
    data=[(pid, survived) for pid, survived in zip(passenger_id, predictions)],
    columns=('PassengerId','Survived'),
)
prediction_submission

In [None]:
# Write the submission file. DataFrame.to_csv returns None when a path is given,
# so its result must not be assigned back to prediction_submission: doing so
# replaces the frame with None, the cell displays nothing, and a re-run fails.
prediction_submission.to_csv("final_result.csv", index=False)
prediction_submission