<a href="https://colab.research.google.com/github/straylight77/arcade-games/blob/master/titanic3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd

In [0]:
# Reference notebook:
# https://www.kaggle.com/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy

# Both CSV files sit in the same directory of the repository, so build the
# two URLs from one shared prefix.
titanic_base_url = 'https://raw.githubusercontent.com/straylight77/data-sci-learning/master/titanic/'
data_raw = pd.read_csv(titanic_base_url + 'train.csv')
data_val = pd.read_csv(titanic_base_url + 'test.csv')

# data1 is an independent (deep) copy of the training frame; data_raw itself
# is not referenced by all_data, which holds data1 and data_val so later
# cells can apply the same step to both frames.
data1 = data_raw.copy()
all_data = [data1, data_val]

# Peek at five random rows of the untouched training data.
data_raw.sample(5)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
352,353,0,3,"Elias, Mr. Tannous",male,15.0,1,1,2695,7.2292,,C
668,669,0,3,"Cook, Mr. Jacob",male,43.0,0,0,A/5 3536,8.05,,S
373,374,0,1,"Ringhini, Mr. Sante",male,22.0,0,0,PC 17760,135.6333,,C
649,650,1,3,"Stanley, Miss. Amy Zillah Elsie",female,23.0,0,0,CA. 2314,7.55,,S
37,38,0,3,"Cann, Mr. Ernest Charles",male,21.0,0,0,A./5. 2152,8.05,,S


In [0]:
# COMPLETE: fill in null values
for ds in all_data:
    # Assign the filled column back rather than calling
    # ds[col].fillna(..., inplace=True): the in-place form operates on a
    # temporary selected from the frame, which raises a chained-assignment
    # warning on pandas 2.x and leaves the frame unchanged under Copy-on-Write.
    ds['Age'] = ds['Age'].fillna(ds['Age'].median())
    ds['Embarked'] = ds['Embarked'].fillna(ds['Embarked'].mode()[0])
    ds['Fare'] = ds['Fare'].fillna(ds['Fare'].median())

# PassengerId and Ticket are treated as identifiers with no impact on
# survivability; too much Cabin data is missing to be useful.
drop_columns = ['PassengerId', 'Cabin', 'Ticket']
# The drop stays in place on purpose: all_data holds a reference to this same
# data1 object, so rebinding the name would desynchronise the two.
# errors='ignore' makes the cell safe to re-run once the columns are gone.
data1.drop(columns=drop_columns, inplace=True, errors='ignore')

# Remaining null counts per column for both frames.
print(data1.isnull().sum())
print("-"*10)
print(data_val.isnull().sum())

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64
----------
PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64


In [0]:
# CREATE: Feature Engineering

# Bin edges are learned once, from the training frame, and then applied to
# every frame. Binning each frame on its own (as pd.qcut / pd.cut do when
# given a bin count) yields different boundaries per frame, so the same bin
# number would describe different fare/age ranges in train and validation.
# qcut vs cut: https://stackoverflow.com/questions/30211923/what-is-the-difference-between-pandas-qcut-and-pandas-cut
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.qcut.html
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html
_, fare_edges = pd.qcut(data1['Fare'], 4, retbins=True)
_, age_edges = pd.cut(data1['Age'].astype(int), 5, retbins=True)

for ds in all_data:
    ds['Relatives'] = ds['SibSp'] + ds['Parch']

    # 1 when the passenger has no relatives aboard, 0 otherwise
    ds['IsAlone'] = np.where(ds['Relatives'] > 0, 0, 1)

    # Clip to the training range first so a value outside it lands in the
    # outermost bin instead of becoming NaN; include_lowest keeps values equal
    # to the lowest edge inside the first bin.
    fare_clipped = ds['Fare'].clip(lower=fare_edges[0], upper=fare_edges[-1])
    ds['FareBin'] = pd.cut(fare_clipped, bins=fare_edges, include_lowest=True)

    age_clipped = ds['Age'].astype(int).clip(lower=age_edges[0], upper=age_edges[-1])
    ds['AgeBin'] = pd.cut(age_clipped, bins=age_edges, include_lowest=True)

    # extract the passenger's title from their name (Mr, Miss, Dr, etc.)
    ds['Title'] = ds['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]


# replace rare Titles (< stat_min occurrences in the training frame) with 'Other'
stat_min = 10
title_names = (data1['Title'].value_counts() < stat_min)  # True marks a rare title
common_titles = title_names.index[~title_names.values]

# Apply the same grouping to every frame. Titles that are rare in, or absent
# from, the training frame all collapse to 'Other', so the validation frame
# never carries a Title category the model was not trained on.
for ds in all_data:
    ds['Title'] = ds['Title'].where(ds['Title'].isin(common_titles), 'Other')

data1.sample(5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Relatives,IsAlone,FareBin,AgeBin,Title
716,1,1,"Endres, Miss. Caroline Louise",female,38.0,0,0,227.525,C,0,1,"(31.0, 512.329]","(32.0, 48.0]",Miss
651,1,2,"Doling, Miss. Elsie",female,18.0,0,1,23.0,S,1,0,"(14.454, 31.0]","(16.0, 32.0]",Miss
855,1,3,"Aks, Mrs. Sam (Leah Rosen)",female,18.0,0,1,9.35,S,1,0,"(7.91, 14.454]","(16.0, 32.0]",Mrs
134,0,2,"Sobey, Mr. Samuel James Hayden",male,25.0,0,0,13.0,S,0,1,"(7.91, 14.454]","(16.0, 32.0]",Mr
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,23.45,S,3,0,"(14.454, 31.0]","(16.0, 32.0]",Miss


In [0]:
# CONVERT: change to category types
from sklearn import preprocessing

label = preprocessing.LabelEncoder()

# Fit each encoder ONCE on the values of every frame, then transform each
# frame with that single mapping. Re-fitting per frame (fit_transform inside
# the loop) numbers the categories independently, so the same Title could get
# a different code in the training and validation frames.
for col in ['Sex', 'Embarked', 'Title']:
    label.fit(pd.concat([ds[col] for ds in all_data], ignore_index=True))
    for ds in all_data:
        ds[col + '_Code'] = label.transform(ds[col])

# The bin columns are ordered categoricals, so the position of each bin within
# its category list is already the ordinal code. Unlike a re-fitted encoder,
# this does not renumber the bins when one of them happens to be empty.
for ds in all_data:
    ds['AgeBin_Code'] = ds['AgeBin'].cat.codes.astype('int64')
    ds['FareBin_Code'] = ds['FareBin'].cat.codes.astype('int64')

#data1.sample(10)
data1.info()
print("-"*10)
data_val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 19 columns):
Survived         891 non-null int64
Pclass           891 non-null int64
Name             891 non-null object
Sex              891 non-null object
Age              891 non-null float64
SibSp            891 non-null int64
Parch            891 non-null int64
Fare             891 non-null float64
Embarked         891 non-null object
Relatives        891 non-null int64
IsAlone          891 non-null int64
FareBin          891 non-null category
AgeBin           891 non-null category
Title            891 non-null object
Sex_Code         891 non-null int64
Embarked_Code    891 non-null int64
Title_Code       891 non-null int64
AgeBin_Code      891 non-null int64
FareBin_Code     891 non-null int64
dtypes: category(2), float64(2), int64(11), object(4)
memory usage: 120.7+ KB
----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 21 columns):
Pas

In [0]:
# Column groupings used when training the models.

Target = ['Survived']

# Original features, two flavours: readable values (pretty names for charts)
# and label-encoded values (coded for algorithm calculation).
data1_x = [
    'Sex', 'Pclass', 'Embarked', 'Title', 'SibSp',
    'Parch', 'Age', 'Fare', 'Relatives', 'IsAlone',
]
data1_x_calc = [
    'Sex_Code', 'Pclass', 'Embarked_Code', 'Title_Code',
    'SibSp', 'Parch', 'Age', 'Fare',
]
data1_xy = [*Target, *data1_x]
print('Original X Y: ', data1_xy, '\n')


# Binned variant: the continuous Age / Fare columns are swapped for their
# bin codes.
data1_x_bin = [
    'Sex_Code', 'Pclass', 'Embarked_Code', 'Title_Code',
    'Relatives', 'AgeBin_Code', 'FareBin_Code',
]
data1_xy_bin = [*Target, *data1_x_bin]
print('Bin X Y: ', data1_xy_bin, '\n')


# Dummy (one-hot) variant built from the readable feature columns.
data1_dummy = pd.get_dummies(data1.loc[:, data1_x])
data1_x_dummy = list(data1_dummy.columns)
data1_xy_dummy = [*Target, *data1_x_dummy]
print('Dummy X Y: ', data1_xy_dummy, '\n')


Original X Y:  ['Survived', 'Sex', 'Pclass', 'Embarked', 'Title', 'SibSp', 'Parch', 'Age', 'Fare', 'Relatives', 'IsAlone'] 

Bin X Y:  ['Survived', 'Sex_Code', 'Pclass', 'Embarked_Code', 'Title_Code', 'Relatives', 'AgeBin_Code', 'FareBin_Code'] 

Dummy X Y:  ['Survived', 'Pclass', 'SibSp', 'Parch', 'Age', 'Fare', 'Relatives', 'IsAlone', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Other'] 



In [0]:
# Hold back part of the data for validation after training.
from sklearn import model_selection


def _split_with_target(features):
    """Split `features` together with data1[Target] using a fixed random_state of 0.

    Returns the four-element sequence produced by train_test_split:
    train features, test features, train target, test target.
    """
    return model_selection.train_test_split(features, data1[Target], random_state=0)


# The same seed is used for every feature set (calc, bin, dummy).
train1_x, test1_x, train1_y, test1_y = _split_with_target(data1[data1_x_calc])
train1_x_bin, test1_x_bin, train1_y_bin, test1_y_bin = _split_with_target(data1[data1_x_bin])
train1_x_dummy, test1_x_dummy, train1_y_dummy, test1_y_dummy = _split_with_target(data1_dummy[data1_x_dummy])


# Report the size of the full frame and of the two halves of the first split.
for template, frame in (
    ("Data1 Shape: {}", data1),
    ("Train1 Shape: {}", train1_x),
    ("Test1 Shape: {}", test1_x),
):
    print(template.format(frame.shape))

train1_x_bin.head()

Data1 Shape: (891, 19)
Train1 Shape: (668, 8)
Test1 Shape: (223, 8)


Unnamed: 0,Sex_Code,Pclass,Embarked_Code,Title_Code,Relatives,AgeBin_Code,FareBin_Code
105,1,3,2,2,0,1,0
68,0,3,2,1,6,1,1
253,1,3,2,2,1,1,2
320,1,3,2,2,0,1,0
706,0,2,2,3,0,2,1


In [0]:
from sklearn import ensemble
from sklearn import linear_model

# Candidate algorithms to compare. The forest is seeded so that repeated runs
# of this cell give the same scores and predictions.
MLA = [
    ensemble.RandomForestClassifier(random_state=0),
    linear_model.LogisticRegressionCV()
]


#split dataset in cross-validation with this splitter class: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html#sklearn.model_selection.ShuffleSplit
#note: this is an alternative to train_test_split
cv_split = model_selection.ShuffleSplit(n_splits = 10, test_size = .3, train_size = .6, random_state = 0 ) # run model 10x with 60/30 split intentionally leaving out 10%

#columns of the table used to compare MLA metrics
MLA_columns = ['MLA Name', 'MLA Parameters','MLA Train Accuracy Mean', 'MLA Test Accuracy Mean', 'MLA Test Accuracy 3*STD' ,'MLA Time']

#table to compare MLA predictions; .copy() so that adding prediction columns
#below writes to an independent frame rather than to a slice of data1
MLA_predict = data1[Target].copy()

mla_features = data1[data1_x_bin]
# 1-D target: passing the single-column DataFrame data1[Target] makes
# scikit-learn warn that a column-vector y was passed.
mla_target = data1[Target[0]]

#score every algorithm and collect one record per algorithm; the comparison
#table is built once from the records after the loop
mla_rows = []
for alg in MLA:

    MLA_name = alg.__class__.__name__

    #score model with cross validation: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate
    #return_train_score=True is required for 'train_score' to be present in the results
    cv_results = model_selection.cross_validate(alg, mla_features, mla_target, cv = cv_split, return_train_score = True)

    mla_rows.append({
        'MLA Name': MLA_name,
        'MLA Parameters': str(alg.get_params()),
        'MLA Train Accuracy Mean': cv_results['train_score'].mean(),
        'MLA Test Accuracy Mean': cv_results['test_score'].mean(),
        #if this is a non-bias random sample, then +/-3 standard deviations (std) from the mean, should statistically capture 99.7% of the subsets
        'MLA Test Accuracy 3*STD': cv_results['test_score'].std()*3,   #let's know the worst that can happen!
        'MLA Time': cv_results['fit_time'].mean(),
    })

    #save MLA predictions (fit on, and predicted for, the full training frame)
    alg.fit(mla_features, mla_target)
    MLA_predict[MLA_name] = alg.predict(mla_features)

MLA_compare = pd.DataFrame(mla_rows, columns = MLA_columns)
MLA_compare

In [0]:
# Just use RandomForest
from sklearn import ensemble

# random_state is fixed so the generated predictions are reproducible from
# one run of the notebook to the next.
model = ensemble.RandomForestClassifier(n_estimators=100, random_state=0)

# Fit on a 1-D target: data1[Target] is a single-column DataFrame, which
# makes scikit-learn warn that a column-vector y was passed.
model.fit(data1[data1_x_bin], data1[Target[0]])

# Predict for the validation frame using the same binned feature columns.
predictions = model.predict(data_val[data1_x_bin])

# Pair each validation PassengerId with its predicted Survived value.
output_df = pd.DataFrame({'PassengerId': data_val.PassengerId, 'Survived': predictions})

output_df


  """


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
