# Higgs Boson Machine Learning Challenge

In [1]:
# !unzip training.zip
# !unzip test.zip
# !unzip random_submission.zip

## 1. Import Libraries

In [2]:
import pandas as pd
import numpy as np

import os
import random

# scikit-learn: Machine learning(ML)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesClassifier, AdaBoostClassifier
from sklearn.utils import check_random_state

### 1-1. Fix Random Seed

In [3]:
def seed_everything(seed: int = 42) -> None:
    """Seed the global random number generators used by this notebook.

    Seeds Python's `random` module and NumPy's legacy global generator, and
    exports PYTHONHASHSEED for child processes.

    - seed: integer seed shared by all generators
    """
    random.seed(seed)
    # Only takes effect for Python processes started after this point; the
    # hash seed of the already-running interpreter is fixed at startup.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    # NOTE: the former `check_random_state(seed)` call was removed. It only
    # builds and returns a RandomState instance (discarded here) and seeds
    # nothing globally; scikit-learn estimators need `random_state=` passed
    # to them explicitly.

seed_everything(42)

## 2. Load Dataset

In [4]:
def get_dataset(train_path='training.csv', test_path='test.csv'):
    """
    Load the training and test CSV files and split them into arrays.

      - train_path: path of the training CSV (must contain 'Label' and 'Weight' columns)
      - test_path: path of the test CSV

    Returns (X, y, W, test_ids, X_test):
      - X: columns 1..30 (by position) of the training file as an ndarray
      - y: int64 ndarray, 1 where Label == 's' and 0 otherwise
      - W: writable copy of the 'Weight' column as an ndarray
      - test_ids: first column of the test file (pandas Series)
      - X_test: columns 1..30 (by position) of the test file as an ndarray
    """
    # Load the training data
    train_df = pd.read_csv(train_path)

    # Encode the label: 's' -> 1, anything else -> 0
    y = train_df['Label'].eq('s').astype('int64').to_numpy()

    # Split features and weights. W is copied so that callers may rescale it
    # in place even when pandas hands out read-only views.
    X = train_df.iloc[:, 1:31].to_numpy()
    W = train_df['Weight'].to_numpy(copy=True)

    # Load the test data
    test_df = pd.read_csv(test_path)

    # Split the id column (first column) from the test features
    test_ids = test_df.iloc[:, 0]
    X_test = test_df.iloc[:, 1:31].to_numpy()

    return X, y, W, test_ids, X_test

In [5]:
# Load the training features, labels and weights plus the test ids and test features.
X, y, W, test_ids, X_test = get_dataset()

## 3. Data Preprocessing

In [6]:
def _append_inverse_log_features(features, cols):
    """Return `features` with log(1 / (1 + x)) of the columns `cols` appended on the right."""
    return np.hstack((features, np.log(1 / (1 + features[:, cols]))))


def preprocess(X, X_test,
               inv_log_cols=(0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13, 16, 19, 21, 23, 26)):
    """
    Preprocess the training and test feature matrices.
      1) impute missing values
      2) append inverse-log features
      3) standardize

      - X: training features
      - X_test: test features
      - inv_log_cols: column indices for which log(1 / (1 + x)) is appended;
        the default list is the one used by this notebook

    The imputer and the scaler are fitted on X only and then applied to
    X_test. Returns the transformed (X, X_test); the inputs are not modified.
    """
    inv_log_cols = list(inv_log_cols)

    # Missing values: replace -999.0 with the most frequent value of the column
    imputer = SimpleImputer(missing_values=-999.0, strategy='most_frequent')
    X = imputer.fit_transform(X)
    X_test = imputer.transform(X_test)

    # Inverse-log features. log(1 / (1 + x)) is only finite for x > -1, so the
    # selected columns are assumed to be non-negative after imputation
    # (TODO confirm for any custom column list).
    X = _append_inverse_log_features(X, inv_log_cols)
    X_test = _append_inverse_log_features(X_test, inv_log_cols)

    # Scaling: standardize using statistics of the training data
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    X_test = scaler.transform(X_test)

    return X, X_test

In [7]:
# Replace the raw feature matrices with their preprocessed versions.
# NOTE: X and X_test are overwritten, so this cell is not idempotent — re-run
# the data-loading cell before running it a second time.
X, X_test = preprocess(X, X_test)

In [8]:
# Rescale the event weights by n_test / n_train (550000 / len(y) for the full test set).
# A new array is bound instead of scaling in place, so this also works when W
# is a read-only array.
# NOTE: re-running this cell without reloading the data rescales W again.
W = W * (X_test.shape[0] / len(y))

## 4. Model
- 평가지표: approximate median significance (AMS)

### 4-1. Model Train

In [9]:
# Model definition: AdaBoost over an extremely-randomized-trees base learner.
base_forest = ExtraTreesClassifier(
    n_estimators=500,
    max_features=25,
    max_depth=13,
    min_samples_leaf=100,
    min_samples_split=100,
    verbose=1,
    n_jobs=-1,
)

clf = AdaBoostClassifier(
    estimator=base_forest,
    n_estimators=20,  # number of boosting rounds, i.e. number of base estimators
    learning_rate=0.7,
    random_state=42,
)

In [10]:
# Train the model, passing the event weights W as per-sample weights.
clf.fit(X, y, sample_weight=W)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   17.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  4.3min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    2.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    4.8s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    5.5s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   18.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  4.3min 

[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.8min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.6s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    3.9s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    4.5s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   16.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.9min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:  

### 4-2. Model Prediction

In [11]:
# Column 1 of predict_proba is taken as the score for label 1 (signal).
# y_pred is computed on the same X the model was fitted on, so it is an
# in-sample prediction.
y_pred, y_test_pred = (
    clf.predict_proba(features)[:, 1] for features in (X, X_test)
)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.9s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    4.6s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    5.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    2.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    4.8s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    5.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.9s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    4.6s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    5.3s finished
[

[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   10.7s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:   12.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    4.5s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   10.3s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:   11.8s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    4.3s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   10.2s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:   11.7s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    4.1s
[

In [12]:
def get_AMS_score(W, Y, Y_pred):
    """
    Compute the approximate median significance (AMS) of a selection.
      - W: per-event weights
      - Y: true labels (1 = signal, 0 = background)
      - Y_pred: predicted labels; events equal to 1 count as selected

    Returns sqrt(2 * ((s + b + b_reg) * ln(1 + s / (b + b_reg)) - s)), where
    s / b are the weighted sums of selected true-signal / true-background
    events and b_reg = 10.
    """
    selected = Y_pred == 1
    true_positive = np.logical_and(selected, Y == 1)
    false_positive = np.logical_and(selected, Y == 0)

    signal = W.dot(true_positive)
    background = W.dot(false_positive)

    b_reg = 10.0
    log_term = np.log(1.0 + signal / (background + b_reg))
    return np.sqrt(2 * ((signal + background + b_reg) * log_term - signal))

def check_threshold(W, Y, Y_pred, cut_scores=None):
    """
    Scan percentile thresholds 0..99 and return the one with the highest AMS.
      - W: per-event weights
      - Y: true labels
      - Y_pred: predicted scores for the events described by W and Y
      - cut_scores: scores whose percentiles define the cut values. When
        omitted, the notebook-level `y_test_pred` is used, which is what this
        function always did implicitly.

    Returns {'AMS': best score, 'best_threshold': percentile}. If no threshold
    yields an AMS above 0, 'best_threshold' is None and 'AMS' is 0.
    """
    if cut_scores is None:
        # Backward-compatible default: cuts come from the test-set scores.
        cut_scores = y_test_pred

    scores = np.asarray(Y_pred)
    percentiles = np.arange(0, 100, 1)
    # All cut values in one call instead of one np.percentile call per step.
    cuts = np.percentile(cut_scores, percentiles)

    best = 0
    best_threshold = None  # stays None when nothing beats an AMS of 0
    for threshold, cut in zip(percentiles, cuts):
        threshold_y_pred = np.where(scores > cut, 1, 0)
        ams = get_AMS_score(W, Y, threshold_y_pred)
        # Strict comparison keeps the lowest percentile among equal scores.
        if ams > best:
            best = ams
            best_threshold = int(threshold)
    return {'AMS': best, 'best_threshold': best_threshold}

In [13]:
# Scan the percentile thresholds using the training weights, labels and
# training-set predictions, then show the best AMS and its threshold.
# NOTE: y_pred comes from the data the model was fitted on, so this AMS is an
# in-sample figure.
clf_score = check_threshold(W, y, y_pred)
print(clf_score)

{'AMS': 17.107508393295042, 'best_threshold': 94}


In [14]:
# threshold = clf_score['best_threshold']
# The public score was better at threshold 83 than at the best_threshold of 94, so 83 is used.
# In the end, the private score turned out to be best at threshold 86.
threshold = 83
cut = np.percentile(y_test_pred, threshold)
print(cut)

# Boolean mask: True where the test score is strictly above the cut.
threshold_y_test_pred = y_test_pred > cut

0.43719743137212974


In [15]:
# Rank the test events by predicted signal score: rank 1 is the highest score.
# A stable sort keeps tied scores in their original row order, matching a
# stable descending sort of (id, score) pairs.
# NOTE(review): confirm the expected rank direction against the challenge's
# submission format; this keeps the convention the notebook already used.
n_test = len(y_test_pred)
descending_order = np.argsort(-np.asarray(y_test_pred), kind='stable')
ranks = np.empty(n_test, dtype=np.int64)
ranks[descending_order] = np.arange(1, n_test + 1)

# Event ids stay integers throughout (no round-trip through float64).
event_ids = np.asarray(test_ids).astype(np.int64)

# EventId -> RankOrder lookup.
test_ids_map = dict(zip(event_ids.tolist(), ranks.tolist()))

# Per-row submission columns, aligned with the row order of the test set.
y_event_id = event_ids.tolist()
y_rank_order = ranks.tolist()
y_class = np.where(np.asarray(threshold_y_test_pred), 's', 'b').tolist()

## 5. Submission

In [16]:
# Build the submission table directly from the per-row lists. Every column is
# supplied here, so the random_submission.csv template is not needed.
# NOTE(review): assumes the template held exactly the columns
# EventId, RankOrder, Class in this order — confirm.
submission = pd.DataFrame({
    'EventId': y_event_id,
    'RankOrder': y_rank_order,
    'Class': y_class,
})
submission.head()

Unnamed: 0,EventId,RankOrder,Class
0,350000,536582,b
1,350001,334761,b
2,350002,171797,b
3,350003,70174,s
4,350004,454311,b


In [17]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)