In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Apply seaborn's default plot styling to subsequent matplotlib figures.
seaborn.set()

In [2]:
# List the files in the working directory (shell command).
!ls

genderbasedmodel.csv  gendermodel.py		test.csv
genderclassmodel.csv  kaggle_titanic_csv.ipynb	train.csv
genderclassmodel.py   kaggle_titanic_pd.ipynb
gendermodel.csv       myfirstforest.py


In [3]:
# Peek at the raw training data: the header line plus the first records.
!head train.csv

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. Gosta Leonard",male,2,3,1,349909,21.075,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27,0,2,347742,11.1333,,S


In [4]:
import csv

# Load train.csv into a 2-D NumPy array of strings (one row per passenger).
# newline='' is what the csv module asks for, so that quoted fields containing
# line breaks are parsed correctly on every platform.
with open('train.csv', newline='') as f:
    csv_file_object = csv.reader(f)
    header = next(csv_file_object)  # column names from the first line
    rows = list(csv_file_object)    # run through iterator

# Every field is kept as a string here; numeric columns are converted on use.
data = np.array(rows)

data

array([['1', '0', '3', ..., '7.25', '', 'S'],
       ['2', '1', '1', ..., '71.2833', 'C85', 'C'],
       ['3', '1', '3', ..., '7.925', '', 'S'],
       ..., 
       ['889', '0', '3', ..., '23.45', '', 'S'],
       ['890', '1', '1', ..., '30', 'C148', 'C'],
       ['891', '0', '3', ..., '7.75', '', 'Q']], 
      dtype='<U82')

In [5]:
# show every column of the first passenger record
print(data[0, :])

['1' '0' '3' 'Braund, Mr. Owen Harris' 'male' '22' '1' '0' 'A/5 21171'
 '7.25' '' 'S']


In [6]:
# show every column of the final passenger record
print(data[-1, :])

['891' '0' '3' 'Dooley, Mr. Patrick' 'male' '32' '0' '0' '370376' '7.75' ''
 'Q']


In [7]:
# first record, fourth column (index 3) -- the passenger's name
print(data[0][3])

Braund, Mr. Owen Harris


In [8]:
# Overall survival rate across the training set.
# Column 1 ('Survived') holds '0'/'1' strings; convert it to floats once.
# The builtin float replaces np.float, an alias that was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
survived_flags = data[:, 1].astype(float)

number_passengers = np.size(survived_flags)
number_survived = np.sum(survived_flags)
proportion_survivors = number_survived / number_passengers

number_passengers, number_survived, proportion_survivors

(891, 342.0, 0.38383838383838381)

In [9]:
# Boolean row masks built from column 4 ('Sex'): True where the passenger is
# female, and the element-wise complement for everyone else.
women_only_stats = data[:, 4] == 'female'
men_only_stats = ~women_only_stats

In [10]:
# Display the female-passenger mask (one boolean per training row).
women_only_stats

array([False,  True,  True,  True, False, False, False, False,  True,
        True,  True,  True, False, False,  True,  True, False, False,
        True,  True, False, False,  True, False,  True,  True, False,
       False,  True, False, False,  True,  True, False, False, False,
       False, False,  True,  True,  True,  True, False,  True,  True,
       False, False,  True, False,  True, False, False,  True,  True,
       False, False,  True, False,  True, False, False,  True, False,
       False, False, False,  True, False,  True, False, False,  True,
       False, False, False, False, False, False, False,  True, False,
       False,  True, False,  True,  True, False, False,  True, False,
       False, False, False, False, False, False, False, False,  True,
       False,  True, False, False, False, False, False,  True, False,
       False,  True, False,  True, False,  True,  True, False, False,
       False, False,  True, False, False, False,  True, False, False,
       False, False,

In [11]:
# select the 'Survived' column (index 1) for each sex using the masks above
# (builtin float replaces np.float, which was removed in NumPy 1.24)
women_onboard = data[women_only_stats, 1].astype(float)
men_onboard = data[men_only_stats, 1].astype(float)

In [12]:
# survival rate per sex = number of survivors / number of passengers
proportion_women_survived = women_onboard.sum() / women_onboard.size
proportion_men_survived = men_onboard.sum() / men_onboard.size

proportion_women_survived, proportion_men_survived

(0.7420382165605095, 0.18890814558058924)

---------
## working with `test` data using a simple model

### creating first submission ('genderbasedmodel.csv')

* if a passenger is female, predict **survival** (1)
* if a passenger is male, predict **death** (0)

In [13]:
# Gender-only model: predict survival (1) for women and death (0) for men.
# newline='' on both files follows the csv module's guidance and prevents
# extra blank lines between rows on Windows.
with open('test.csv', newline='') as f, open('genderbasedmodel.csv', 'w', newline='') as g:
    test_file_object = csv.reader(f)
    header = next(test_file_object)  # consume the header line of test.csv

    prediction_file_object = csv.writer(g)
    # 'PassengerId' matches the spelling used in the train.csv header.
    prediction_file_object.writerow(['PassengerId', 'Survived'])

    for row in test_file_object:
        # Sex is read from column 3 of test.csv (one to the left of train.csv,
        # presumably because test.csv has no Survived column -- verify).
        prediction = '1' if row[3] == 'female' else '0'
        prediction_file_object.writerow([row[0], prediction])

In [14]:
# Check the first lines of the submission file that was just written.
!head genderbasedmodel.csv

PassengerID,Survived
892,0
893,1
894,0
895,0
896,1
897,0
898,1
899,0
900,1


------

## using a more advanced model on `test` data

### second submission ('genderclassmodel.csv')

* model the survival outcome as a function of:
    * class
    * gender
    * ticket price

In [16]:
# binning fares into $0-9, $10-19, $20-29, $30+

fare_ceiling = 40
# Clamp every fare at or above the ceiling to just below it so that it lands
# in the top bin. `data` is a string array, so the value is stored as text.
# (builtin float replaces np.float, which was removed in NumPy 1.24)
data[data[:, 9].astype(float) >= fare_ceiling, 9] = fare_ceiling - 1.0

In [18]:
fare_bracket_size = 10
number_of_price_brackets = fare_ceiling / fare_bracket_size  # true division -> float

number_of_classes = len(np.unique(data[:, 2]))

# initialize survival table with all zeros
# Axes: (sex, passenger class, fare bracket). Array dimensions must be
# integers, so the float bracket count is cast explicitly; passing a float
# raises TypeError in current NumPy.
survival_table = np.zeros((2, number_of_classes, int(number_of_price_brackets)))

In [22]:
# Displayed as a float because it was computed with true division (/).
number_of_price_brackets

4.0

In [30]:
# Fill the survival table with the mean survival rate of every
# (sex, passenger class, fare bin) group in the training data.
# Axis 0 is sex (0 = female, 1 = everyone else), axis 1 is class - 1,
# axis 2 is the fare bin index.

# Convert the string columns once instead of on every loop iteration.
# (builtin float replaces np.float, which was removed in NumPy 1.24)
is_female = data[:, 4] == 'female'
pclass_values = data[:, 2].astype(float)
fare_values = data[:, 9].astype(float)
survived_values = data[:, 1].astype(float)

for i in range(number_of_classes):  # loop through each class
    in_class = pclass_values == i + 1
    for j in range(int(number_of_price_brackets)):  # loop through each price bin
        in_bin = (fare_values >= j * fare_bracket_size) \
                 & (fare_values < (j + 1) * fare_bracket_size)
        for sex_index, sex_mask in enumerate((is_female, ~is_female)):
            group = survived_values[sex_mask & in_class & in_bin]
            # The mean of an empty group would warn and yield NaN; record 0.0
            # for groups with no passengers instead.
            survival_table[sex_index, i, j] = group.mean() if group.size else 0.0



In [31]:
# Replace any NaN entries (e.g. a mean taken over an empty group) with 0.
survival_table[np.isnan(survival_table)] = 0.

In [33]:
# First axis: 0 = women, 1 = men; within each, rows are passenger classes
# and columns are fare bins.
print(survival_table)

[[[ 0.          0.          0.83333333  0.97727273]
  [ 0.          0.91428571  0.9         1.        ]
  [ 0.59375     0.58139535  0.33333333  0.125     ]]

 [[ 0.          0.          0.4         0.38372093]
  [ 0.          0.15873016  0.16        0.21428571]
  [ 0.11153846  0.23684211  0.125       0.24      ]]]


In [34]:
# bin these values into "survive" or "not survive" where p>=0.5 -> survive

survival_threshold = 0.5
# Both assignments use the same cut-off, so the second line expresses the
# stated rule directly instead of relying on the first line having already
# zeroed everything below the threshold.
survival_table[survival_table < survival_threshold] = 0
survival_table[survival_table >= survival_threshold] = 1
survival_table

array([[[ 0.,  0.,  1.,  1.],
        [ 0.,  1.,  1.,  1.],
        [ 1.,  1.,  0.,  0.]],

       [[ 0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.]]])

In [42]:
# read through test data and assign survive/not survive using survival table (pivot table)

# Index of the top fare bin; kept as an int because NumPy rejects float indices.
last_bin = int(number_of_price_brackets) - 1

# newline='' on both files follows the csv module's guidance.
with open('test.csv', newline='') as f, open('genderclassmodel.csv', 'w', newline='') as g:
    test_file_object = csv.reader(f)
    header = next(test_file_object)  # consume the header line of test.csv

    p = csv.writer(g)
    p.writerow(["PassengerId", "Survived"])

    for row in test_file_object:
        pclass = int(float(row[1]))  # passenger class as an integer (column 1)

        try:
            fare_value = float(row[8])
        except ValueError:
            # Fare is missing or not numeric: fall back to a bin derived from
            # the passenger class (3 - class), as the original logic did.
            bin_fare = 3 - pclass
        else:
            # Fares at or above the ceiling (including exactly the ceiling,
            # which previously matched no bin) go to the top bin.
            bin_fare = min(int(fare_value // fare_bracket_size), last_bin)

        sex_index = 0 if row[3] == 'female' else 1
        p.writerow([row[0], "%d" % int(survival_table[sex_index, pclass - 1, bin_fare])])

In [43]:
# Check the first lines of the second submission file.
!head genderclassmodel.csv

PassengerId,Survived
892,0
893,1
894,0
895,0
896,1
897,0
898,1
899,0
900,1
