### Similar to before, but this time I'll play with the features a bit

In [1]:
from autogluon import TabularPrediction as task
import pandas as pd

In [2]:
# Load the train and test splits from the shared data directory.
train_data, test_data = (
    pd.read_csv("../data/train.csv"),
    pd.read_csv("../data/test.csv"),
)

In [3]:
# Summarize column dtypes and non-null counts to spot features with missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


##### Survival Rate Per Class

In [4]:
# Count passengers by ticket class (rows) and survival outcome (columns).
pd.crosstab(train_data['Pclass'], train_data['Survived'])

Survived,0,1
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,80,136
2,97,87
3,372,119


##### Effect of Age on Survival

In [5]:
# Summary statistics of the raw (numeric) ages; the min/max show the range the age buckets must cover.
train_data['Age'].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [6]:
# Recode age into buckets.
# pd.cut uses right-closed intervals by default: (0, 3], (3, 12], (12, 18], ...
# Missing ages stay missing (NaN) after bucketing.
age_cut_bins = [0, 3, 12, 18, 40, 60, 90]
age_cut_labels = ['young child', 'child', 'teen', 'adult', 'older adult', 'elderly']

# 'Age' is overwritten in place, so only bucket it while it is still numeric.
# This keeps the cell safe to re-run: re-applying pd.cut to the already-bucketed
# (categorical, string-labelled) column would fail or discard the buckets.
for frame in (train_data, test_data):
    if pd.api.types.is_numeric_dtype(frame['Age']):
        frame['Age'] = pd.cut(frame['Age'], bins=age_cut_bins, labels=age_cut_labels)

# Survival counts per age bucket (last expression, so it is displayed).
pd.crosstab(index=train_data['Age'], columns=train_data['Survived'])

Survived,0,1
Age,Unnamed: 1_level_1,Unnamed: 2_level_1
young child,10,20
child,19,20
teen,40,30
adult,260,165
older adult,78,50
elderly,17,5


##### Only keep features that I think intuitively would matter

In [7]:
# NOTE: this feature selection is disabled, so the fit below uses every column.
# Before re-enabling: 'Age_bin' is not a column (the bucketed ages are stored
# in 'Age'), and test_data would lose 'PassengerId' (the submission cell
# re-reads it from the CSV, so that part still works).
# feature_cols = ['Pclass', 'Sex', 'Age_bin', 'Embarked']
# test_data = test_data[feature_cols]

# feature_cols.append('Survived')
# train_data = train_data[feature_cols]

In [8]:
# Wrap both frames in AutoGluon datasets, then train on 'Survived' and
# predict labels for the test split.
train_dataset = task.Dataset(df=train_data)
test_dataset = task.Dataset(df=test_data)

predictor = task.fit(train_data=train_dataset, label='Survived')
predictions = predictor.predict(test_dataset)

No output_directory specified. Models will be saved in: AutogluonModels/ag-20201009_043819/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20201009_043819/
AutoGluon Version:  0.0.14
Train Data Rows:    891
Train Data Columns: 11
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    6503.4 MB
	Train Data (Original)  Memory Usage: 0.31 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dt

In [9]:
# Passenger IDs are re-read from the raw test file so the submission does not
# depend on which columns test_data still carries.
test_ids = pd.read_csv("../data/test.csv")['PassengerId']

# Wrap the predictions in a Series and pair them with the IDs.
predictions_ser = pd.Series(predictions)
submission = pd.DataFrame({'PassengerId': test_ids, 'Survived': predictions_ser})

In [10]:
# Preview the first rows to sanity-check the PassengerId / Survived columns.
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0


In [11]:
# index=False keeps the row index out of the file, so the CSV holds only
# the PassengerId and Survived columns.
submission.to_csv("../submissions/kaggle_titanic_autogluon_engineered.csv", index=False)

##### Under the Hood

In [12]:
# fit_summary() prints the report itself and returns a dict of results; the
# trailing semicolon hides that dict's repr. Do not append `.values` here:
# without a call it only displays the dict method's repr, not any results.
predictor.fit_summary();

*** Summary of fit() ***
Estimated performance of each model:
                         model  score_val  pred_time_val  fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      weighted_ensemble_k0_l1   0.865922       0.178993  8.322848                0.000510           0.257026            1       True         12
1     LightGBMClassifierCustom   0.832402       0.009358  0.409891                0.009358           0.409891            0       True         11
2           LightGBMClassifier   0.832402       0.009634  0.413623                0.009634           0.413623            0       True          7
3           CatboostClassifier   0.826816       0.008275  0.645415                0.008275           0.645415            0       True          9
4     ExtraTreesClassifierGini   0.826816       0.035844  0.322851                0.035844           0.322851            0       True          3
5          NeuralNetClassifier   0.821229       0.030433  5.232455  

<function dict.values>

In [13]:
# leaderboard() both prints the table and returns it as a DataFrame, which
# shows the same table twice in the cell output. silent=True skips the print
# so only the rich DataFrame display remains.
predictor.leaderboard(silent=True)

                         model  score_val  pred_time_val  fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      weighted_ensemble_k0_l1   0.865922       0.178993  8.322848                0.000510           0.257026            1       True         12
1     LightGBMClassifierCustom   0.832402       0.009358  0.409891                0.009358           0.409891            0       True         11
2           LightGBMClassifier   0.832402       0.009634  0.413623                0.009634           0.413623            0       True          7
3           CatboostClassifier   0.826816       0.008275  0.645415                0.008275           0.645415            0       True          9
4     ExtraTreesClassifierGini   0.826816       0.035844  0.322851                0.035844           0.322851            0       True          3
5          NeuralNetClassifier   0.821229       0.030433  5.232455                0.030433           5.232455            0       T

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,weighted_ensemble_k0_l1,0.865922,0.178993,8.322848,0.00051,0.257026,1,True,12
1,LightGBMClassifierCustom,0.832402,0.009358,0.409891,0.009358,0.409891,0,True,11
2,LightGBMClassifier,0.832402,0.009634,0.413623,0.009634,0.413623,0,True,7
3,CatboostClassifier,0.826816,0.008275,0.645415,0.008275,0.645415,0,True,9
4,ExtraTreesClassifierGini,0.826816,0.035844,0.322851,0.035844,0.322851,0,True,3
5,NeuralNetClassifier,0.821229,0.030433,5.232455,0.030433,5.232455,0,True,10
6,RandomForestClassifierEntr,0.821229,0.040029,0.430527,0.040029,0.430527,0,True,2
7,ExtraTreesClassifierEntr,0.815642,0.034025,0.337128,0.034025,0.337128,0,True,4
8,LightGBMClassifierXT,0.810056,0.009832,0.182218,0.009832,0.182218,0,True,8
9,RandomForestClassifierGini,0.810056,0.036896,0.414565,0.036896,0.414565,0,True,1


In [14]:
# Permutation importance of the original input columns, scored on train_data.
# NOTE(review): these are the same rows the model was fit on, so identifier-like
# columns (PassengerId, Name, Ticket) may look more useful than they would on
# held-out data — consider scoring on a validation split instead.
predictor.feature_importance(train_data)

Computing raw permutation importance for 11 features on weighted_ensemble_k0_l1 ...
	6.46s	= Expected runtime
	5.49s	= Actual runtime


Sex            0.172840
Name           0.151515
Pclass         0.074074
Fare           0.048260
PassengerId    0.038159
Ticket         0.032548
Cabin          0.024691
Age            0.019080
SibSp          0.015713
Embarked       0.014590
Parch          0.010101
dtype: float64

In [15]:
# Permutation importance at the 'transformed' feature stage, i.e. over the
# features AutoGluon generated during preprocessing rather than the raw columns.
predictor.feature_importance(feature_stage='transformed')

Computing raw permutation importance for 31 features on weighted_ensemble_k0_l1 ...
	7.97s	= Expected runtime
	8.18s	= Actual runtime


Sex                    0.117318
Pclass                 0.078212
Fare                   0.039106
Ticket                 0.039106
__nlp__.mr             0.033520
Name.lower_ratio       0.027933
Age                    0.027933
PassengerId            0.027933
Embarked               0.027933
Cabin                  0.022346
Parch                  0.016760
Name.char_count        0.016760
__nlp__.master         0.016760
Name.symbol_ratio..    0.016760
Name.special_ratio     0.016760
Name.capital_ratio     0.016760
SibSp                  0.011173
__nlp__.miss           0.011173
Name.symbol_count.     0.011173
__nlp__.mrs            0.011173
Name.symbol_count..    0.005587
Name.word_count        0.005587
Name.symbol_ratio.     0.005587
Name.symbol_count.-    0.005587
__nlp__.william        0.000000
Name.symbol_count./    0.000000
Name.symbol_ratio.-    0.000000
__nlp__.henry          0.000000
__nlp__.john           0.000000
__nlp__.mr william     0.000000
__nlp__._total_        0.000000
dtype: f