# Titanic Survival Exploration

In [219]:
import pandas as pd
from pandas import Series,DataFrame
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('whitegrid')
%matplotlib inline

# Loading Data

**1.** First step for any problem would be to load data and pre-process such that data is accesible to us.

In [220]:

# Read required data sets.
train_data = pd.read_csv('data/train.csv')
test_data  = pd.read_csv('data/test.csv')
test_df  = pd.read_csv('data/test.csv')
print(train_data.info(), train_data.head())
print(test_data.info(), test_data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None    PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence

# Preprocessing Data
## Dropping columns which don't add value.

In [221]:
train_data.drop( "Name", axis=1, inplace = True)
train_data.drop( "Ticket", axis=1, inplace = True)
train_data.drop( "PassengerId", axis=1, inplace = True)

test_data.drop( "Name", axis=1, inplace = True)
test_data.drop( "Ticket", axis=1, inplace = True)
test_data.drop( "PassengerId", axis=1, inplace = True)


## Handling Missing Values

Age :- Replace by mean

Cabin :- Drop the column since lot of values are missing.

Embarked :- Takes 3 entries, replace with the one that has been used the most.

In [222]:
train_data.Age.fillna( train_data.Age.mean(),inplace=True )
train_data.drop( "Cabin", axis=1, inplace = True)
train_data.Embarked.fillna('S',inplace=True)
print(train_data.isnull().sum())

test_data.Age.fillna( test_data.Age.mean(),inplace=True )
test_data.Fare.fillna( test_data.Fare.mean(),inplace=True )
test_data.drop( "Cabin", axis=1, inplace = True)
test_data.Embarked.fillna('S',inplace=True)

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


## Coverting Categorical Data to Numerical Values

Sex and Embarked can be coverted to numerical values

In [223]:
enc = LabelEncoder()
train_data.Sex = enc.fit_transform(train_data.Sex)
train_data.Embarked = enc.fit_transform(train_data.Embarked)
print(train_data.head())

test_data.Sex = enc.fit_transform(test_data.Sex)
test_data.Embarked = enc.fit_transform(test_data.Embarked)

   Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
0         0       3    1  22.0      1      0   7.2500         2
1         1       1    0  38.0      1      0  71.2833         0
2         1       3    0  26.0      0      0   7.9250         2
3         1       1    0  35.0      1      0  53.1000         2
4         0       3    1  35.0      0      0   8.0500         2


In [224]:
target_train    = train_data.Survived
features_train = train_data
features_train.drop('Survived',inplace=True,axis=1)
print(features_train.head())


features_test = test_data

   Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
0       3    1  22.0      1      0   7.2500         2
1       1    0  38.0      1      0  71.2833         0
2       3    0  26.0      0      0   7.9250         2
3       1    0  35.0      1      0  53.1000         2
4       3    1  35.0      0      0   8.0500         2


## Normalizing the Data

In [225]:
scaler = MinMaxScaler()
features_train[:] = scaler.fit_transform( features_train[:] )
print( features_train.head())

   Pclass  Sex       Age  SibSp  Parch      Fare  Embarked
0     1.0  1.0  0.271174  0.125    0.0  0.014151       1.0
1     0.0  0.0  0.472229  0.125    0.0  0.139136       0.0
2     1.0  0.0  0.321438  0.000    0.0  0.015469       1.0
3     0.0  0.0  0.434531  0.125    0.0  0.103644       1.0
4     1.0  1.0  0.434531  0.000    0.0  0.015713       1.0


## Training and Cross validation data

In [226]:
X_train,X_test,y_train,y_test = train_test_split(features_train,target_train,random_state=42)

### Using Decision Tree Classifier to predict

In [238]:
from sklearn  import tree

clf = tree.DecisionTreeClassifier()

clf.fit(features_train,target_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [249]:
predictions = clf.predict(test_data)

In [250]:
print(predictions)

[1 0 1 1 0 1 0 1 1 1 1 1 0 1 0 1 1 1 0 1 1 1 0 1 0 1 0 1 1 1 1 1 0 0 1 1 0
 0 1 1 1 1 1 0 0 1 1 1 1 0 1 1 0 0 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 0 0 1 0 1
 1 1 1 0 1 0 1 1 1 1 1 1 0 0 0 1 0 1 0 1 1 1 0 1 0 1 1 1 1 1 0 1 1 1 1 1 1
 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1
 1 1 0 1 1 0 1 1 0 0 1 0 0 1 0 1 1 0 1 1 1 0 1 1 1 1 1 0 0 1 0 0 1 1 0 1 0
 1 0 1 0 1 1 1 1 1 1 1 1 0 1 0 0 1 1 0 1 1 0 1 1 1 1 1 1 0 0 1 0 1 0 1 1 1
 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1 1 0 1 1 0 1 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 0
 1 1 1 0 0 1 1 0 1 0 1 1 1 1 0 1 0 1 1 1 1 0 0 0 0 0 1 1 1 1 1 1 0 1 1 1 1
 0 1 1 1 1 1 1 1 0 0 1 0 1 0 1 1 1 0 1 0 1 1 1 1 1 1 1 1 0 1 0 1 1 1 0 1 1
 0 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 0 0 1 1 1 0 1 0 1 1 0 1 0 0 1 1 0 1 0 0 1
 1 1 0 1 0 1 0 1 1 1 1 1 0 0 1 0 1 1 1 1 1 0 1 1 1 0 1 0 1 1 0 1 0 1 1 1 1
 1 0 0 0 0 0 1 1 1 1 1]


In [251]:
predictions = best_clf.predict(test_data)

In [252]:
submission = pd.DataFrame({ 'PassengerId': test_df['PassengerId'],
                            'Survived': predictions })
submission.to_csv("submission.csv", index=False)

In [None]:
This model produced 70