In [1]:
%matplotlib inline
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set() # 플롯의 seaborn 기본값 설정

In [2]:
df_train = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test.csv')

### Feature engineering

In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


#### feature engineering base
- 결측치 데이터있는 Age, Cabin, Embarked 제거
- PassengerID 제거 -> 의미 없음
- Ticket, Name 제거 -> 당장 처리가 어려움
- Sex male : 0, female : 1


In [4]:
train_Y = df_train['Survived']
train_X = df_train.drop(['PassengerId','Survived','Age','Cabin','Name','Ticket','Embarked'],axis=1)
test_X = df_test.drop(['PassengerId','Age','Cabin','Name','Ticket','Embarked'],axis=1)

In [5]:
sex_mapping = {'male':0,'female':0}
train_X['Sex'] = train_X['Sex'].map(sex_mapping)
test_X['Sex'] = test_X['Sex'].map(sex_mapping)

In [6]:
test_X['Fare'] = test_X['Fare'].fillna(test_X['Fare'].mean())

In [7]:
test_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  418 non-null    int64  
 1   Sex     418 non-null    int64  
 2   SibSp   418 non-null    int64  
 3   Parch   418 non-null    int64  
 4   Fare    418 non-null    float64
dtypes: float64(1), int64(4)
memory usage: 16.5 KB


In [8]:
train_X

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare
0,3,0,1,0,7.2500
1,1,0,1,0,71.2833
2,3,0,0,0,7.9250
3,1,0,1,0,53.1000
4,3,0,0,0,8.0500
...,...,...,...,...,...
886,2,0,0,0,13.0000
887,1,0,0,0,30.0000
888,3,0,1,2,23.4500
889,1,0,0,0,30.0000


In [9]:
from sklearn.ensemble import RandomForestClassifier

In [10]:
model = RandomForestClassifier(n_estimators=5, random_state=0)
model.fit(train_X, train_Y)

In [11]:
model.score(train_X,train_Y)

0.819304152637486

In [12]:
test_Y = model.predict(test_X)

In [13]:
sub_df = pd.read_csv('./gender_submission.csv')
sub_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [14]:
sub_df['Survived'] = test_Y
sub_df.to_csv('./submission.csv',index=False)

In [15]:
train = pd.concat([train_X,train_Y],axis=1)

In [16]:
from pycaret.classification import *
from pycaret.classification import ClassificationExperiment
s = setup(train, target = 'Survived', session_id = 123)
exp = ClassificationExperiment()
exp.setup(train, target = 'Survived', session_id = 123)
best = compare_models()

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(891, 6)"
4,Transformed data shape,"(891, 6)"
5,Transformed train set shape,"(623, 6)"
6,Transformed test set shape,"(268, 6)"
7,Numeric features,5
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Description,Value
0,Session id,123
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(891, 6)"
4,Transformed data shape,"(891, 6)"
5,Transformed train set shape,"(623, 6)"
6,Transformed test set shape,"(268, 6)"
7,Numeric features,5
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.7126,0.7326,0.5357,0.6568,0.5816,0.3689,0.3786,0.02
xgboost,Extreme Gradient Boosting,0.7047,0.7316,0.5024,0.6531,0.5583,0.3458,0.3571,0.009
catboost,CatBoost Classifier,0.7047,0.7556,0.4736,0.6676,0.5431,0.3385,0.3538,0.364
lightgbm,Light Gradient Boosting Machine,0.7015,0.7385,0.4902,0.6568,0.5503,0.3374,0.3505,82.5
et,Extra Trees Classifier,0.6982,0.7054,0.4899,0.6459,0.5448,0.3297,0.3428,0.021
gbc,Gradient Boosting Classifier,0.6951,0.7433,0.4942,0.6319,0.5518,0.327,0.3341,0.014
dt,Decision Tree Classifier,0.6934,0.6878,0.4645,0.6434,0.5339,0.3158,0.328,0.077
ada,Ada Boost Classifier,0.6885,0.7293,0.4696,0.6242,0.5253,0.306,0.3166,0.011
lda,Linear Discriminant Analysis,0.6744,0.6992,0.415,0.6492,0.4866,0.2672,0.2912,0.004
ridge,Ridge Classifier,0.6728,0.0,0.4107,0.6476,0.4834,0.2631,0.2872,0.005


Processing:   0%|          | 0/69 [00:00<?, ?it/s]

In [17]:
exp.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.7126,0.7326,0.5357,0.6568,0.5816,0.3689,0.3786,0.021
xgboost,Extreme Gradient Boosting,0.7047,0.7316,0.5024,0.6531,0.5583,0.3458,0.3571,0.01
catboost,CatBoost Classifier,0.7047,0.7556,0.4736,0.6676,0.5431,0.3385,0.3538,0.009
lightgbm,Light Gradient Boosting Machine,0.7015,0.7385,0.4902,0.6568,0.5503,0.3374,0.3505,0.071
et,Extra Trees Classifier,0.6982,0.7054,0.4899,0.6459,0.5448,0.3297,0.3428,0.021
gbc,Gradient Boosting Classifier,0.6951,0.7433,0.4942,0.6319,0.5518,0.327,0.3341,0.012
dt,Decision Tree Classifier,0.6934,0.6878,0.4645,0.6434,0.5339,0.3158,0.328,0.004
ada,Ada Boost Classifier,0.6885,0.7293,0.4696,0.6242,0.5253,0.306,0.3166,0.013
lda,Linear Discriminant Analysis,0.6744,0.6992,0.415,0.6492,0.4866,0.2672,0.2912,0.004
ridge,Ridge Classifier,0.6728,0.0,0.4107,0.6476,0.4834,0.2631,0.2872,0.005


Processing:   0%|          | 0/69 [00:00<?, ?it/s]

In [18]:
holdout_pred = predict_model(best,data = test_X)

In [22]:
sub_df['Survived'] = holdout_pred['prediction_label']
sub_df.to_csv('./submission.csv',index=False)

In [23]:
!kaggle competitions submit -c titanic -f submission.csv -m "Message"

100%|██████████████████████████████████████| 2.77k/2.77k [00:01<00:00, 1.56kB/s]
Successfully submitted to Titanic - Machine Learning from Disaster