# Data Import

In [None]:
!pip install scikit-learn==0.23.2

In [None]:
import pandas as pd
import numpy as np
import os

import matplotlib
import matplotlib.pyplot as plt
import graphviz
import seaborn as sns

# Report the versions of the core libraries used by this notebook.
for banner, library in (
    ("Version Pandas", pd),
    ("Version Matplotlib", matplotlib),
    ("Version Numpy", np),
    ("Version Seaborn", sns),
):
    print(banner, library.__version__)

# List the competition input files; as the cell's final expression the
# result is rendered by the notebook.
os.listdir('../input/tabular-playground-series-apr-2021/')

In [None]:
# Directory that holds the competition CSV files.
BASE_DIR = '../input/tabular-playground-series-apr-2021/'

train_df, test_df, sample_submission = (
    pd.read_csv(BASE_DIR + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Stack the train rows on top of the test rows; the preprocessing cells below
# operate on this combined frame. ignore_index=True discards the original row
# labels and renumbers the rows from 0.
all_df = pd.concat([train_df, test_df], ignore_index=True)

for caption, frame in (
    ('Rows and Columns in train dataset:', train_df),
    ('Rows and Columns in test dataset:', test_df),
):
    print(caption, frame.shape)

In [None]:
import random


from sklearn.metrics import accuracy_score # 분류 평가지표
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

# modeling
import lightgbm as lgb
import catboost as ctb
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier, export_graphviz

import warnings
warnings.simplefilter('ignore')

In [None]:
# Name of the label column.
TARGET = 'Survived'

# Notebook-wide settings. In the code visible in this notebook only SEED and
# TARGET are read again; the remaining values are presumably kept for manual
# LightGBM/CatBoost training experiments.
N_ESTIMATORS = 1000
N_SPLITS = 10
SEED = 2021
EARLY_STOPPING_ROUNDS = 100
VERBOSE = 100

In [None]:
# Seed every global random source the notebook relies on.
def set_seed(seed=42):
    """Seed the global random number generators.

    Seeds Python's ``random`` module and NumPy's global generator, and writes
    the seed to the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Seed value applied to every generator. Defaults to 42.
    """
    # NOTE: PYTHONHASHSEED is read when an interpreter starts, so setting it
    # here only affects processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Seed Python's and NumPy's global generators with the notebook-wide SEED.
set_seed(SEED)

# Data Preprocessing

### 1) Null values

In [None]:
# Print the number of missing values in every column, first for the train
# frame and then for the test frame, separated by a blank line.
missing_reports = (
    ('Missing values per columns in train dataset', train_df),
    ('Missing values per columns in test dataset', test_df),
)
for position, (title, frame) in enumerate(missing_reports):
    if position:
        print()
    print(title)
    for col, n_missing in frame.isnull().sum().items():
        print(f'{col}: {n_missing}')

### 2) Filling missing values

In [None]:
def _cabin_deck(cabin):
    """Return the first character of a cabin code, stripped of whitespace."""
    return cabin[0].strip()


def _ticket_prefix(ticket):
    """Return the first token of a multi-token ticket, otherwise 'X'."""
    tokens = str(ticket).split()
    return tokens[0] if len(tokens) > 1 else 'X'


def _before_comma(name):
    """Return the part of a name that precedes the first comma."""
    return name.partition(',')[0]


# Age: replace missing values with the mean age of the combined frame.
mean_age = all_df['Age'].mean()
all_df['Age'] = all_df['Age'].fillna(mean_age)

# Cabin: missing -> 'X', then keep only the first character of the code.
all_df['Cabin'] = all_df['Cabin'].fillna('X').map(_cabin_deck)

# Ticket: missing -> 'X'. Tickets made of several whitespace-separated tokens
# keep their first token; single-token tickets collapse to 'X'.
all_df['Ticket'] = all_df['Ticket'].fillna('X').map(_ticket_prefix)

# Fare: fill gaps with the median fare of the passenger's Pclass, then
# apply log1p to the whole column.
fare_map = (
    all_df[['Fare', 'Pclass']]
    .dropna()
    .groupby('Pclass')
    .median()
    .to_dict()
)
class_median_fare = all_df['Pclass'].map(fare_map['Fare'])
all_df['Fare'] = np.log1p(all_df['Fare'].fillna(class_median_fare))

# Embarked: missing -> 'X'.
all_df['Embarked'] = all_df['Embarked'].fillna('X')

# Name: keep only the text before the first comma
# (presumably the family name — verify against the raw data).
all_df['Name'] = all_df['Name'].map(_before_comma)

In [None]:
# Ticket frequency ranking within each Pclass.
def _ticket_counts(pclass):
    """Count rows per Ticket value within one Pclass, most frequent first."""
    in_class = all_df.loc[all_df['Pclass'] == pclass]
    return in_class.groupby('Ticket')['Ticket'].count().sort_values(ascending=False)


data_1, data_2, data_3 = (_ticket_counts(pclass) for pclass in (1, 2, 3))
for ranking in (data_1, data_2, data_3):
    print(ranking)
    print()

## Encoding

In [None]:
# Columns that are integer-coded with LabelEncoder.
label_cols = ['Name', 'Ticket', 'Sex', 'Pclass', 'Embarked']
# Columns that are expanded into indicator columns with pd.get_dummies.
onehot_cols = ['Cabin']
# Columns that are standardised with StandardScaler.
numerical_cols = ['Age', 'SibSp', 'Parch', 'Fare']

In [None]:
# Label-encoding helper.
def label_encoder(c):
    """Integer-encode the values of a single column.

    A fresh ``LabelEncoder`` is fitted on every call, so the codes of one
    column are independent of any other column.

    Args:
        c: The column of values to encode.

    Returns:
        The integer codes returned by ``LabelEncoder.fit_transform``.
    """
    return LabelEncoder().fit_transform(c)

In [None]:
# StandardScaler removes the mean and scales each column to unit variance;
# note that outliers influence the resulting spread.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(all_df[numerical_cols])
numerical_df = pd.DataFrame(scaled_values, columns=numerical_cols)

# Integer codes for the label-encoded columns, indicator columns for the
# one-hot columns, and the untouched target column.
label_encoded_df = all_df[label_cols].apply(label_encoder)
onehot_encoded_df = pd.get_dummies(all_df[onehot_cols])
target_df = all_df[TARGET]

# Re-assemble the frame column-wise from the transformed pieces; columns that
# belong to none of the groups above are not carried over.
encoded_parts = [numerical_df, label_encoded_df, onehot_encoded_df, target_df]
all_df = pd.concat(encoded_parts, axis=1)

## Data split

In [None]:
# Columns removed from the training frame when the feature matrix X is built:
# the label itself plus 'Parch'.
drop_list=['Survived','Parch']

In [None]:
# Split the combined, encoded frame back into its train and test parts.
# all_df was built as [train rows, test rows], so the boundary is the number
# of rows in train_df rather than a hard-coded count.
n_train = len(train_df)
train = all_df.iloc[:n_train, :]
test = all_df.iloc[n_train:, :]

# Only the label column is dropped from the test set.
# NOTE(review): the feature matrix X additionally drops 'Parch' (see
# drop_list), so `test` keeps one column the models are not trained on —
# confirm that prediction ignores it.
test = test.drop('Survived', axis=1)
model_results = pd.DataFrame()
folds = 5

In [None]:
test.head()

In [None]:
# Separate the label from the features; every column named in drop_list
# (the label and 'Parch') is left out of X.
y = train['Survived']
X = train.drop(columns=drop_list)

In [None]:
# Hold out 25% of the training rows for validation.
# NOTE(review): random_state=21 differs from the notebook-wide SEED (2021) and
# the split is not stratified — confirm both are intentional. The PyCaret
# cells below build their own frame from X and y and do not use this split.
X_train, X_valid, y_train, y_valid = train_test_split(X,y,test_size=0.25, random_state=21)

# **Scikit Learn**
> 

# **Pycaret**
> *references*\
> [https://www.kaggle.com/j2hoon85/tps-april-sklearn-pycaret-for-newbies#PyCaret](https://www.kaggle.com/j2hoon85/tps-april-sklearn-pycaret-for-newbies#PyCaret)\
> [https://www.kaggle.com/subinium/how-to-use-pycaret-with-feature-engineering](https://www.kaggle.com/subinium/how-to-use-pycaret-with-feature-engineering)

> Scores
> - Accuracy, AUC, Recall, Prec, F1-score
> - AUC, F1-score를 함께 보도록 한다.

In [None]:
#!pip install pycaret==2.2.3

In [None]:
!pip install pycaret

In [None]:
from pycaret.utils import version
import sklearn
# Show which PyCaret and scikit-learn versions are in use. PyCaret is
# installed without a version pin in the cell above, while the first cell of
# the notebook pins scikit-learn to 0.23.2.
print("pycaret version:", version())
print("sklearn version:", sklearn.__version__)

In [None]:
# Recombine features and label into one frame for PyCaret (the target is
# passed to setup() by column name) and store the label as int64.
all_df_pycaret = pd.concat([X, y], axis=1).astype({'Survived': 'int64'})
all_df_pycaret.info()

## 1. Model Training

### 1-1. Compare Model

In [None]:
from pycaret.classification import *

# Level order for the columns PyCaret should treat as ordinal features.
# NOTE(review): the levels are written as strings, while these columns were
# label-encoded to integers earlier — confirm PyCaret matches them as intended.
category_caret={'Sex':['0','1'],'Pclass':['0','1','2'], 'Embarked':['0','1','2','3']}

# Initialise the PyCaret experiment on the features+label frame: 5 shuffled
# cross-validation folds and a fixed session_id.
# NOTE(review): session_id=1 is independent of the notebook-wide SEED, and
# `silent=True` presumably suppresses the interactive dtype confirmation —
# verify it is accepted by the PyCaret version that the unpinned install
# resolves to.
setup(data = all_df_pycaret, 
      target = 'Survived',
      ordinal_features= category_caret,
      #numeric_imputation = 'Age','SibSp','Name','Ticket','Fare',
      fold=5,
      silent = True,
      session_id=1,
      #data_split_shuffle=True
      fold_shuffle=True

     )
#set_config('seed',SEED)

In [None]:
#best_model = compare_models(sort = 'Accuracy', n_select = 4)

In [None]:
#print(best_model)

### 1-2. Create Model
- Top4 모델: gbc / lightgbm / catboost / xgboost

In [None]:
# Train and cross-validate the model registered under PyCaret's 'gbc' ID
# (Gradient Boosting Classifier).
gbc= create_model('gbc')

In [None]:
# Train and cross-validate the model registered under PyCaret's 'lightgbm' ID.
lgbm= create_model('lightgbm')

In [None]:
# Train and cross-validate the model registered under PyCaret's 'catboost' ID.
cb=create_model('catboost')

In [None]:
#xgb= create_model('xgboost')

### 1-3. Tune Model
- tune_model()을 써서 튜닝된 하이퍼 파라미터들을 확인해보겠습니다. 

In [None]:
# Hyperparameter search for the gbc model with tune_model's default settings.
# NOTE(review): the tuned model is presumably not guaranteed to beat the base
# model — compare the two score grids before relying on it.
tuned_gbc= tune_model(gbc)

In [None]:
# Hyperparameter search for the lightgbm model with tune_model's defaults.
# NOTE(review): the ensembling cells below use the untuned `lgbm`, not this
# result — confirm that is deliberate.
tuned_lgbm= tune_model(lgbm)

In [None]:
# Hyperparameter search for the catboost model with tune_model's defaults.
tuned_cb=tune_model(cb)

In [None]:
#tuned_xgb= tune_model(xgb)

In [None]:
# Print each tuned estimator to inspect the hyperparameters it ended up with.
for tuned_estimator in (tuned_gbc, tuned_lgbm, tuned_cb):
    print(tuned_estimator)
#print(tuned_xgb)

In [None]:
# Draw plot_model's default diagnostic plot for the tuned gbc model
# (presumably the ROC/AUC curve — verify for the installed PyCaret version).
plot_model(tuned_gbc)

In [None]:
# Draw plot_model's default diagnostic plot for the tuned lightgbm model.
plot_model(tuned_lgbm)

In [None]:
# Draw plot_model's default diagnostic plot for the tuned catboost model.
plot_model(tuned_cb)

## 2. Model Ensembling

### 2-1. Blend Models

In [None]:
# Train a soft-voting classifier over three base models.
# NOTE(review): the list mixes the tuned gbc/catboost models with the untuned
# `lgbm` — confirm whether `tuned_lgbm` was intended.
blended_soft= blend_models(estimator_list=[tuned_gbc, lgbm, tuned_cb], method='soft')

In [None]:
# Train a hard-voting classifier; only the tuned gbc and catboost models take
# part here (lightgbm is left out of this blend).
blended_hard= blend_models(estimator_list=[tuned_gbc,tuned_cb], method='hard')

### 2-2. Stack Models

In [None]:
# Stack the three base models; no meta_model is passed, so PyCaret's default
# meta learner is used. This stacked model produces the final predictions.
# NOTE(review): the untuned `lgbm` is used here as well — confirm whether
# `tuned_lgbm` was intended.
stacked_models=stack_models(estimator_list=[tuned_gbc,lgbm, tuned_cb])

### 2-3.

### Predictions and Submissions

In [None]:
# Score the test rows with the stacked ensemble; the predictions are read from
# the 'Label' column of the returned frame in the cells below.
# NOTE(review): `test` still contains 'Parch', which was dropped from the
# training features — confirm predict_model ignores the extra column.
pred= predict_model(stacked_models, data= test)
pred.info()

In [None]:
pred['Label']

In [None]:
sample_submission['Survived']

In [None]:
# Give pred a fresh 0..n-1 index; because drop=True is not passed, the old
# index is kept as a new 'index' column. Presumably done so that pred lines up
# row-for-row with sample_submission — verify.
pred = pred.reset_index()

In [None]:
pred['Label']

In [None]:
#sample_submission = pd.read_csv(BASE_DIR + 'sample_submission.csv')
# Copy the predicted labels into the submission by position. Assigning the raw
# array instead of the Series skips pandas' index alignment, so the result does
# not depend on pred's index, and a length mismatch raises instead of silently
# producing NaN rows.
sample_submission['Survived'] = pred['Label'].to_numpy()
sample_submission

In [None]:
# Write the submission file without the index column, then preview the first rows.
sample_submission.to_csv('Pycaret Submission.csv', index=False)
sample_submission.head()