In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in file_names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's training and test tables into DataFrames.
train_csv = '../input/porto-seguro-safe-driver-prediction/train.csv'
test_csv = '../input/porto-seguro-safe-driver-prediction/test.csv'
train, test = pd.read_csv(train_csv), pd.read_csv(test_csv)

# Print the column dtypes / non-null counts of the training table.
train.info()

In [None]:
# Preview the first five rows of the training table.
train.head(n=5)

In [None]:
# Per-column count of missing entries in the training table.
# Per the competition's data description, a value of -1 means "feature was missing",
# so a plain isna() count would report every column as complete. Map -1 to NaN first
# so that both encodings of "missing" are counted.
train.replace(-1, np.nan).isna().sum()

In [None]:
# Stack train and test so that later preprocessing is fitted on both at once;
# ignore_index=True gives the combined frame a fresh 0..N-1 row index.
all_data = pd.concat([train, test], ignore_index=True)

# Drop the label (it must not be a feature, and is absent for the test rows).
all_data = all_data.drop(columns='target')
# Also drop the row identifier: it carries no predictive signal and would otherwise be
# fed to the model as a feature. The submission takes its ids from the sample
# submission file, so nothing downstream needs this column.
# errors='ignore' keeps the cell working if the files were loaded without an 'id' column.
all_data = all_data.drop(columns='id', errors='ignore')

# Names of the columns that remain as model features.
all_features = all_data.columns

In [None]:
# Show the size of the combined frame and a bounded preview instead of the whole frame.
print(all_data.shape)
all_data.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Categorical features are the columns whose name contains 'cat'.
cat_features = [feature for feature in all_features if 'cat' in feature]

# Fit the encoder on train+test together so both share one category vocabulary.
# uint8 keeps the indicator columns compact once they are densified below.
onehot_encoder = OneHotEncoder(dtype=np.uint8)
encoded_cat_matrix = onehot_encoder.fit_transform(all_data[cat_features])

# Build one readable column name per (feature, category) pair from the fitted
# encoder's categories_, in the same order as the encoder's output columns.
encoded_cat_names = [
    f'{feature}_{category}'
    for feature, categories in zip(cat_features, onehot_encoder.categories_)
    for category in categories
]
encoded_cat_frame = pd.DataFrame(
    encoded_cat_matrix.toarray(),
    columns=encoded_cat_names,
    index=all_data.index,
)

# Previously the encoded matrix was computed and then never used, so the model was
# trained on the raw category codes. Swap the raw categorical columns for their
# one-hot indicators; all_data stays a DataFrame so the later cells keep working.
# NOTE: this cell replaces all_data, so re-run the cell that builds all_data first
# if this cell has to be executed again.
all_data = pd.concat([all_data.drop(columns=cat_features), encoded_cat_frame], axis=1)

all_data.head()

In [None]:
from sklearn.model_selection import train_test_split

# Number of rows that came from the training file.
num_train = train.shape[0]

# Undo the earlier concatenation: the first num_train rows are the training
# features, the remaining rows are the test features.
X, X_test = all_data.iloc[:num_train], all_data.iloc[num_train:]

# Training labels as a NumPy array.
y = train['target'].values

# Hold out 20% of the training rows for validation; the fixed seed makes the split repeatable.
split_options = {'test_size': 0.2, 'random_state': 0}
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)



In [None]:
# Print a summary (columns, dtypes, non-null counts) of the training feature frame.
X.info()

In [None]:
# Validation split handed to model.fit() for per-round evaluation / early stopping.
eval_set = [
    (X_valid, y_valid),
]

def eval_gini(y_true, y_prob):
    """Compute the normalized Gini coefficient of a set of scores.

    Rows are ranked by ``y_prob``; for every row, its label is multiplied by the
    sum of ``1 - label`` over the rows ranked above it. For 0/1 labels this counts
    the (positive, negative) pairs in which the negative is ranked higher. The
    result is ``1 - 2 * that_sum / (n_pos * n_neg)``: 1.0 for a perfect ranking,
    -1.0 for a perfectly inverted one.

    Parameters
    ----------
    y_true : array-like of numbers (expected to be 0/1 labels)
        Ground-truth labels.
    y_prob : array-like of numbers
        Predicted scores; only their ordering matters. Ties are ordered however
        ``np.argsort`` orders them.

    Returns
    -------
    float
        The normalized Gini coefficient, or ``nan`` when it is undefined
        (empty input, or labels that contain only one class).
    """
    # Work in float64 regardless of the input dtype so that the pair count is not
    # accumulated in a narrower type (e.g. float32 labels) and rounded.
    labels = np.asarray(y_true, dtype=np.float64)

    # Labels ordered from the highest score to the lowest.
    labels_desc = labels[np.argsort(y_prob)][::-1]

    n = len(labels_desc)
    n_pos = labels_desc.sum()
    n_neg = n - n_pos
    if n_pos == 0 or n_neg == 0:
        # No (positive, negative) pair exists, so the ratio below would be 0/0.
        return float('nan')

    # For each row: total of (1 - label) over the rows ranked strictly above it
    # (an exclusive cumulative sum) -- the vectorized form of the running `delta`.
    negatives = 1.0 - labels_desc
    negatives_ranked_higher = np.cumsum(negatives) - negatives

    discordant = float(np.dot(labels_desc, negatives_ranked_higher))
    return 1.0 - 2.0 * discordant / float(n_pos * n_neg)

def gini_xgb(preds, dtrain):
    """Custom XGBoost evaluation metric: the negated Gini score.

    The labels are read from ``dtrain.get_label()`` and scored against ``preds``
    with ``eval_gini``. The sign is flipped because, as the original author notes,
    the XGBClassifier used here looks for the minimum of its evaluation metric
    (as with an error such as rmse), whereas a larger Gini coefficient is better
    for this competition.

    Returns a one-element list holding the ``("gini", value)`` pair.
    """
    negated_gini = -eval_gini(dtrain.get_label(), preds)
    return [("gini", negated_gini)]

In [None]:
from xgboost import XGBClassifier

# Hyperparameters of the gradient-boosted tree classifier.
xgb_params = dict(
    n_estimators=400,
    max_depth=17,
    objective="binary:logistic",
    learning_rate=0.05,
    subsample=.8,
    min_child_weight=6,
    colsample_bytree=.8,
    scale_pos_weight=1.6,
    gamma=10,
    reg_alpha=8,
    reg_lambda=1.3,
)
model = XGBClassifier(**xgb_params)

# Fit on the training split, scoring the validation split with the custom
# (negated) Gini metric each round; early_stopping_rounds=7 is passed so training
# can stop once that metric stops improving.
model.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    eval_metric=gini_xgb,
    early_stopping_rounds=7,
    verbose=True,
)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy on the training split (kept for reference). It is measured on the rows
# the model was fitted on, so it says nothing about generalization.
y_pred = model.predict(X_train)
print(accuracy_score(y_train, y_pred))

# Held-out check with the competition metric: Gini on the validation split,
# computed from the predicted probability of the positive class (column 1),
# because the metric depends on how rows are ranked, not on hard 0/1 labels.
y_valid_prob = model.predict_proba(X_valid)[:, 1]
print(eval_gini(y_valid, y_valid_prob))

In [None]:
# Load the sample submission; it serves as the template for the final output file.
sample_submission_csv = "../input/porto-seguro-safe-driver-prediction/sample_submission.csv"
sub = pd.read_csv(sample_submission_csv)
sub

In [None]:
# Score the test rows with the predicted probability of the positive class
# (column 1 of predict_proba). The competition is scored with the Gini
# coefficient, which is computed from the ranking of the predictions; hard 0/1
# labels from predict() collapse that ranking into two tied groups.
y_test_pred = model.predict_proba(X_test)[:, 1]

# Fill the template's target column and write the submission file.
# Assumes the test rows are in the same order as the sample submission's rows.
sub['target'] = y_test_pred
sub.to_csv('xgb_submit.csv', index=False)