# Titanic Competition by Kaggle
predict who's gonna survived :) 

In [4]:
import numpy as np 
import pandas as pd

import os
# print(os.listdir("../input"))

## Load Data 

In [5]:
train_original = pd.read_csv('../input/train.csv')
test_original = pd.read_csv('../input/test.csv')

#### look at the Data

In [6]:
train_original.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
test_original.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


### Work and play with a Copy of the Data
I'll work on a copy of the original data so nothing will get lost.  
also, I'll do the pre-processing on the training and the testing data.

In [8]:

datasets = [pd.DataFrame(train_original,copy=True),pd.DataFrame(test_original,copy=True)]

for df in datasets :
    print(df.info())
    print()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null

# Pclass
A proxy for socio-economic status  
1st = Upper
2nd = Middle
3rd = Lower

no missing values | no categorical values

so lets check if it has impact on the result

In [9]:
# Impact
df = datasets[0]
print(df[['Survived','Pclass']].groupby(['Pclass']).mean())

        Survived
Pclass          
1       0.629630
2       0.472826
3       0.242363


good. keeping it!

# Family Size
SibSp - number of siblings/spouse  
Parch - number of parents/children

i'll sum it up to get the hole family size

In [10]:
# CREATE
for df in datasets :
    df['FamilySize'] = df.SibSp+df.Parch+1
    df.FamilySize = df.FamilySize.astype('int64')

# IMPACT
df = datasets[0]
print(df[['Survived','FamilySize']].groupby(['FamilySize']).mean())


            Survived
FamilySize          
1           0.303538
2           0.552795
3           0.578431
4           0.724138
5           0.200000
6           0.136364
7           0.333333
8           0.000000
11          0.000000


good. keeping it!

### Alone
acording to family size lets check a new feature - Alone or not  
if it has an impact - i'll keep it

In [11]:
# CREATE
for df in datasets :
    df['Alone'] = 0
    df.loc[df.FamilySize==1,'Alone'] = 1
    df.Alone = df.Alone.astype('int64')

# IMPACT
df = datasets[0]
print(df[['Survived','Alone']].groupby(['Alone']).mean())

       Survived
Alone          
0      0.505650
1      0.303538


good. keeping it!

# Sex
from catagory to binary  
~ male to 1  
~ female to 0

In [12]:
# CREATE
for df in datasets :
    df['IsMale'] = df.Sex.map(lambda s : 1 if s=='male' else 0)
    df.IsMale = df.IsMale.astype('int64')

# IMPACT
df = datasets[0]
print(df[['Survived','IsMale']].groupby(['IsMale']).mean())

        Survived
IsMale          
0       0.742038
1       0.188908


good. keeping it!

# Embark
embarked Port of Embarkation  
C = Cherbourg | Q = Queenstown | S = Southampton

~ fill out missing value  
~ numberize it  
~ check impact  

In [13]:
for df in datasets :
    # Fill Missing Values
    df.Embarked.fillna('S',inplace=True)
    
    # CREATE
    df['EmbarkedNumber'] = 0;

    # numberize it 
    dic = {'S':0,'C':1,'Q':2}
    df.EmbarkedNumber = df.Embarked.replace(dic,inplace=False)
    df.EmbarkedNumber = df.EmbarkedNumber.astype('int64')

# IMPACT
df = datasets[0]
print(df[['Survived','EmbarkedNumber']].groupby(['EmbarkedNumber']).mean())

                Survived
EmbarkedNumber          
0               0.339009
1               0.553571
2               0.389610


good. keeping it!

# Fare
price of the ticket

~ missing values    
~ devide for ranges  
~ check for impact  


In [14]:
# TRAIN
df = datasets[0]

# Missing Values - just in case
fare_median = df.Fare.median()
df.Fare.fillna(fare_median,inplace=True) 

# devide for Fare Range
df['FareRange'],bins = pd.qcut(df.Fare,4,labels=[0,1,2,3],retbins=True)
df.FareRange = df.FareRange.astype('int64')
# else :
#     df['FareRange'] = pd.cut(df.Fare,labels=[0,1,2,3],bins=bins)
# print(df['FareRange'].value_counts())

# IMPACT
print(df[['Survived','FareRange']].groupby(['FareRange']).mean())
###################

# TEST
df = datasets[1]

# Missing Values - just in case
df.Fare.fillna(fare_median,inplace=True) 

# devide for Fare Range
df['FareRange'] = pd.cut(df.Fare,labels=[0,1,2,3],bins=bins)
df.FareRange = df.FareRange.astype('int64')

           Survived
FareRange          
0          0.197309
1          0.303571
2          0.454955
3          0.581081


# Age
the age of the person

~ missing values    
~ devide for ranges  
~ check for impact  

In [15]:
# TRAIN
df = datasets[0]

# Missing Values
age_median = int(df.Age.median())
age_std = int(df.Age.std())
r = lambda : np.random.randint(age_median - age_std,age_median + age_std)
df.Age.fillna(r(), inplace=True)
df.Age = df.Age.astype('int64')

# devide for Fare Range
df['AgeRange'],bins = pd.qcut(df.Age,5,labels=[0,1,2,3,4],retbins=True)
df.AgeRange = df.AgeRange.astype('int64')

# IMPACT
print(df[['Survived','AgeRange']].groupby(['AgeRange']).agg(['mean','sum']))
#########

# TEST
df = datasets[1]

# Missing Values
df.Age.fillna(r(), inplace=True)
df.Age = df.Age.astype('int64')

# devide for Fare Range
df['AgeRange'] = pd.cut(df.Age,labels=[0,1,2,3,4],bins=bins)
df.AgeRange = df.AgeRange.astype('int64')

          Survived    
              mean sum
AgeRange              
0         0.455556  82
1         0.358696  66
2         0.304569  60
3         0.444444  68
4         0.372881  66


# Name to Title
extract the Title from the name

In [16]:

# TRAIN
df = datasets[0]

# CREATE
def get_title(name):
    import re
    title_search = re.search(' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""

df['Title'] = df.Name.map(lambda n : get_title(n))

# IMPACT
# print(df[['Title','Survived']].groupby(['Title']).agg(['mean','sum']))

# FROM CATAGORY TO NUMBERS
titles = df[['Title','Survived']].groupby(['Title']).agg(['mean','sum']).reset_index()  
titlesRare = titles.Title[titles.Survived['sum']<titles.Survived['sum'].mean()].tolist()
titlesNotRare = titles.Title[titles.Survived['sum']>=titles.Survived['sum'].mean()].tolist()
di = {'Master':1, 'Miss':2, 'Mr':3, 'Mrs':4}
df['TitleNumber'] = df['Title'].replace(to_replace=titlesRare, value=0).replace(di)
df.TitleNumber = df.TitleNumber.astype('int64')

# IMPACT
df = datasets[0]
print(df[['TitleNumber','Survived']].groupby(['TitleNumber']).agg(['mean','sum']))
#################

df = datasets[1]
df['Title'] = df.Name.map(lambda n : get_title(n))
df['TitleNumber'] = df['Title'].replace(to_replace=titlesRare, value=0).replace(di).replace('Dona',0)
df.TitleNumber = df.TitleNumber.astype('int64')




             Survived     
                 mean  sum
TitleNumber               
0            0.444444   12
1            0.575000   23
2            0.697802  127
3            0.156673   81
4            0.792000   99


In [17]:
for df in datasets:
    print(df.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex  Age  SibSp  \
0                            Braund, Mr. Owen Harris    male   22      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   38      1   
2                             Heikkinen, Miss. Laina  female   26      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   35      1   
4                           Allen, Mr. William Henry    male   35      0   

   Parch            Ticket     Fare Cabin Embarked  FamilySize  Alone  IsMale  \
0      0         A/5 21171   7.2500   NaN        S           2      0       1   
1      0          PC 17599  71.2833   C85        C           2      0       0   
2      0  STON/O2. 3101282   7.9250   NaN        S           1      1       0   
3 

# Clean Data

~ drop cols  
~ drop catagory


In [18]:
datasets_clean = []

for df in datasets:
    # Drop SibSp & Parch cuz FamilySize & Alone
    # Drop Sex cuz IsMale
    # Drop Embarked cuz EmbarkedNumber
    # Drop Fare cuz FareRange
    # Drop Age cuz AgeRange
    # Drop Name & Title cuz TitleNumber
    cols_to_drop = ['SibSp','Parch','Sex','Fare','Embarked','Fare','Age','Name','Title']

    # Drop Column i did not have the power to extract valuable features
    cols_to_drop.append('Ticket')
    cols_to_drop.append('Cabin')

    datasets_clean.append(df.drop(columns=cols_to_drop,inplace=False,errors='ignore'))

for df in datasets_clean:
    print(df.info())
    print()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
PassengerId       891 non-null int64
Survived          891 non-null int64
Pclass            891 non-null int64
FamilySize        891 non-null int64
Alone             891 non-null int64
IsMale            891 non-null int64
EmbarkedNumber    891 non-null int64
FareRange         891 non-null int64
AgeRange          891 non-null int64
TitleNumber       891 non-null int64
dtypes: int64(10)
memory usage: 69.7 KB
None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
PassengerId       418 non-null int64
Pclass            418 non-null int64
FamilySize        418 non-null int64
Alone             418 non-null int64
IsMale            418 non-null int64
EmbarkedNumber    418 non-null int64
FareRange         418 non-null int64
AgeRange          418 non-null int64
TitleNumber       418 non-null int64
dtypes: int64(9)
memory usage: 29.5 KB
Non

# Training-Fitting-Predict

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.ensemble import GradientBoostingClassifier
# checked RandomForestClassifier, AdaBoostClassifier
# checked LogisticRegressionCV
# checked SVC

df_clean = datasets_clean[0]

X_original = df_clean.loc[:, df_clean.columns != 'Survived']
y_original = df_clean.Survived

X_train, X_test, y_train, y_test = train_test_split(X_original,y_original,train_size=0.9,test_size=0.1,random_state=0)

model = GradientBoostingClassifier()
model.fit(X_train,y_train)
y_pre = model.predict(X_test)
target_names = ['0-not Survived', '1-Survived']
print(classification_report(y_test, y_pre, target_names=target_names))

                precision    recall  f1-score   support

0-not Survived       0.82      0.88      0.85        51
    1-Survived       0.83      0.74      0.78        39

   avg / total       0.82      0.82      0.82        90



In [23]:
df_clean = datasets_clean[1]

y_pre = model.predict(df_clean)

test_original['Survived'] = y_pre



Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,1
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0


In [33]:
# sub = pd.read_csv('../input/gender_submission.csv')
# sub.head()

my_submission = pd.DataFrame({'PassengerId': test_original.PassengerId, 'Survived': test_original.Survived})
my_submission.to_csv('submission.csv', index=False)