# 준비

In [0]:
import sys
IN_COLAB = 'google.colab' in sys.modules

## 패키지 설치

In [2]:
!pip install pandas numpy tqdm==4.43.0 bayesian-optimization lightgbm



## 라이브러리 임포트

In [0]:
import pandas as pd                         # 데이터 분석 라이브러리
import numpy as np                          # 계산 라이브러리
from tqdm.auto import tqdm                  # 진행바
from sklearn.metrics import roc_auc_score   # AUC 스코어 계산
from sklearn.model_selection import KFold   # K-fold CV    
from bayes_opt import BayesianOptimization  # 베이지안 최적화 라이브러리  
from functools import partial               # 함수 변수 고정
import lightgbm as lgb                      # LightGBM 라이브러리
import gc
import warnings                             
warnings.filterwarnings("ignore")           # 경고 문구 미표시
tqdm.pandas()

## 상수 정의

In [0]:
DATA_DIR = 'data'

## Colab 설정

In [5]:
if IN_COLAB:
    DRIVE_DIR = '/content/drive'
    from google.colab import drive
    drive.mount(DRIVE_DIR)

    import os
    os.symlink(f'{DRIVE_DIR}/My Drive/data', DATA_DIR)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


FileExistsError: ignored

# 데이터 샘플링

In [0]:
train.to_feather('data/train.feather')

In [0]:
train = pd.read_csv('data/train.csv')

In [0]:
train = pd.read_feather('data/train.feather')

In [0]:
train.head()

Unnamed: 0,game_id,winner,time,player,species,event,event_contents
0,0,1,0.0,0,T,Camera,"at (145.25, 21.5078125)"
1,0,1,0.0,1,T,Camera,"at (22.75, 147.0078125)"
2,0,1,0.02,0,T,Selection,['OrbitalCommand [3080001]']
3,0,1,0.02,0,T,Ability,(1360) - TrainSCV
4,0,1,0.14,0,T,Camera,"at (142.99609375, 24.50390625)"


In [0]:
train.shape

(67091776, 7)

In [0]:
len(train['game_id'].unique())

38872

In [0]:
def sample_data(df, n_games, seed=None):
    if seed is not None:
        np.random.seed(seed)

    game_ids = df['game_id'].unique()
    sampled_game_ids = np.random.choice(game_ids, size=n_games, replace=False)

    return df[df['game_id'].isin(sampled_game_ids)]

In [0]:
sampled_train = sample_data(train, n_games=300, seed=0)

In [0]:
sampled_train.to_csv('data/sampled_train.csv', index=False)

# 특징 추출

In [0]:
train = pd.read_csv('data/sampled_train.csv')

## 승리자 추출

In [0]:
def extract_winner(df):
    return df.groupby(['game_id'])['winner'].first()

## 종족 추출

In [0]:
def species_converter(string):
    if string == 'T':
        return 0
    elif string == 'P':
        return 1
    elif string == 'Z':
        return 2
    else:
        raise ValueError

def extract_species(df):
    species = df.groupby(['game_id', 'player'])['species'].first()

    species_df = species.unstack(level=-1)
    species_df.columns = species_df.columns.map(lambda x: f'p{x}_species')
    species_df.columns.name = None

    species_df = species_df.applymap(species_converter)

    return species_df

## 플레이어별 이벤트 횟수

In [0]:
def extract_event_counts(df):
    events = ['Ability', 'AddToControlGroup', 'Camera', 'ControlGroup', 'GetControlGroup', 'Right Click', 'Selection', 'SetControlGroup']

    event_counts = df.groupby(['game_id', 'player'])['event'].value_counts()
    event_counts = event_counts.unstack(level=-1).unstack(level=-1)
    event_counts.columns = event_counts.columns.map(lambda x: f'p{x[1]}_event_{x[0]}')
    event_counts = event_counts.fillna(0)

    result = pd.DataFrame(index=event_counts.index)

    for player in ['p0', 'p1']:
        for event in events:
            result[f'{player}_event_{event}'] = event_counts.get(f'{player}_event_{event}', 0.0)

    for event in events:
        result[f'delta_event_{event}'] = result[f'p0_event_{event}'] - result[f'p1_event_{event}']

    return result

## 게임 시간 추출

In [0]:
def extract_playtime(df):
    def min_to_sec(t):
        m = int(t)
        s = (t - m) * 100
        return (m * 60) + s

    return df.groupby(['game_id'])['time'].max().apply(min_to_sec)

## 시뮬레이션 후 특징 추출

In [0]:
class GameState:
    def init(self):
        pass

    def update(self, game_id, time, player, species, event, event_contents):
        pass

    def to_dict(self):
        return {}


class GameStateManager:
    def __init__(self):
        self._states = []

    def add(self, game_state):
        self._states.append(game_state)

    def init(self):
        for state in self._states:
            state.init()

    def update(self, game_id, time, player, species, event, event_contents):
        for state in self._states:
            state.update(game_id, time, player, species, event, event_contents)

    def to_dict(self):
        ret = {}
        for state in self._states:
            ret.update(state.to_dict())
        return ret


def extract_game_states(df, game_states):
    mat = df.to_numpy()

    data = {}

    cur_game_id = -1

    game_state = GameStateManager()
    for state in game_states:
        game_state.add(state)

    for row in tqdm(mat):
        game_id, time, player, species, event, event_contents = row

        if game_id != cur_game_id:
            if cur_game_id != -1:
                data[cur_game_id] = game_state.to_dict()

            cur_game_id = game_id
            game_state.init()

        game_state.update(game_id, time, player, species, event, event_contents)

    if cur_game_id != -1:
        data[cur_game_id] = game_state.to_dict()

    del mat
    gc.collect()

    return pd.DataFrame.from_dict(data, orient='index')

### 카메라 분산

In [0]:
class CameraState(GameState):
    def init(self):
        self.p0_camera_x = []
        self.p0_camera_y = []
        self.p1_camera_x = []
        self.p1_camera_y = []

    def update(self, game_id, time, player, species, event, event_contents):
        if event == 'Camera':
            camera_x, camera_y = CameraState.parse_at(event_contents)

            if player == 0:
                self.p0_camera_x.append(camera_x)
                self.p0_camera_y.append(camera_y)
            else:
                self.p1_camera_x.append(camera_x)
                self.p1_camera_y.append(camera_y)

    def to_dict(self):
        return {
            'p0_camera_x_var': np.var(self.p0_camera_x),
            'p0_camera_y_var': np.var(self.p0_camera_y),
            'p1_camera_x_var': np.var(self.p1_camera_x),
            'p1_camera_y_var': np.var(self.p1_camera_y),
        }

    @staticmethod
    def parse_at(event_contents):
        mid = event_contents.find(',', 4)
        x = float(event_contents[4:mid])
        y = float(event_contents[mid+2:-1])
        return x, y

In [0]:
df = train.drop(columns=['winner'])

In [16]:
extract_game_states(df, [
    CameraState(),
])

HBox(children=(FloatProgress(value=0.0, max=67091776.0), HTML(value='')))




Unnamed: 0,p0_camera_x,p0_camera_y,p1_camera_x,p1_camera_y
0,763.434578,1029.515414,24.314657,78.746863
1,671.434895,837.446806,1412.369713,2348.367561
2,1493.570080,628.467848,1435.172816,550.632165
3,1327.289239,1272.379892,1433.182454,1184.508068
4,1457.014632,1279.786625,650.662409,754.768815
...,...,...,...,...
38867,1246.874820,1214.428014,1914.465605,1433.958132
38868,1014.613696,2252.216101,596.899852,1344.258487
38869,1713.881894,721.842525,2348.298356,1414.454222
38870,6.834018,137.199129,668.822659,668.614059


## 실험

In [0]:
df = train

## Feature Engineering

In [0]:
def prepare_x_data(df):
    df = df.drop(columns=['winner'])
    features = []
    features.append(extract_playtime(df))
    features.append(extract_species(df))
    features.append(extract_event_counts(df))
    features.append(extract_game_states(df, [
        CameraState(),
    ]))

    return pd.concat(features, axis=1)

In [0]:
def prepare_y_data(df):
    winners = extract_winner(df)
    return np.array(winners)

In [44]:
x_train = prepare_x_data(train)
y_train = prepare_y_data(train)
x_train.head()

HBox(children=(FloatProgress(value=0.0, max=520835.0), HTML(value='')))




Unnamed: 0,time,p0_species,p1_species,p0_event_Ability,p0_event_AddToControlGroup,p0_event_Camera,p0_event_ControlGroup,p0_event_GetControlGroup,p0_event_Right Click,p0_event_Selection,p0_event_SetControlGroup,p1_event_Ability,p1_event_AddToControlGroup,p1_event_Camera,p1_event_ControlGroup,p1_event_GetControlGroup,p1_event_Right Click,p1_event_Selection,p1_event_SetControlGroup,delta_event_Ability,delta_event_AddToControlGroup,delta_event_Camera,delta_event_ControlGroup,delta_event_GetControlGroup,delta_event_Right Click,delta_event_Selection,delta_event_SetControlGroup,p0_camera_x_var,p0_camera_y_var,p1_camera_x_var,p1_camera_y_var
18,295.0,2,2,26.0,0.0,228.0,0.0,47.0,68.0,103.0,5.0,42.0,0.0,82.0,0.0,41.0,126.0,95.0,7.0,-16.0,0.0,146.0,0.0,6.0,-58.0,8.0,-2.0,768.272004,921.704322,1785.817478,1072.86221
759,332.0,1,2,35.0,0.0,375.0,0.0,191.0,118.0,95.0,7.0,25.0,0.0,311.0,0.0,167.0,71.0,281.0,1.0,10.0,0.0,64.0,0.0,24.0,47.0,-186.0,6.0,1971.391818,1514.360742,1937.170697,1055.699525
937,463.0,2,1,52.0,0.0,434.0,0.0,49.0,209.0,184.0,1.0,56.0,0.0,205.0,0.0,174.0,290.0,78.0,4.0,-4.0,0.0,229.0,0.0,-125.0,-81.0,106.0,-3.0,1590.372118,1108.987084,3023.422341,1418.252719
954,573.0,0,1,26.0,0.0,265.0,0.0,14.0,156.0,69.0,0.0,57.0,0.0,483.0,0.0,0.0,157.0,59.0,0.0,-31.0,0.0,-218.0,0.0,14.0,-1.0,10.0,0.0,46.170852,216.606557,1443.99294,1242.162422
1044,599.0,2,1,64.0,6.0,344.0,0.0,88.0,268.0,167.0,4.0,46.0,0.0,899.0,0.0,73.0,185.0,85.0,7.0,18.0,6.0,-555.0,0.0,15.0,83.0,82.0,-3.0,1248.83247,897.182721,996.457435,562.700024


# 변수 선택 및 모델 구축

In [45]:
# train = pd.read_csv('data/train.csv')
train = pd.read_csv('data/sampled_train.csv')
x_train = prepare_x_data(train)
y_train = prepare_y_data(train)
x_train.head()

HBox(children=(FloatProgress(value=0.0, max=520835.0), HTML(value='')))




Unnamed: 0,time,p0_species,p1_species,p0_event_Ability,p0_event_AddToControlGroup,p0_event_Camera,p0_event_ControlGroup,p0_event_GetControlGroup,p0_event_Right Click,p0_event_Selection,p0_event_SetControlGroup,p1_event_Ability,p1_event_AddToControlGroup,p1_event_Camera,p1_event_ControlGroup,p1_event_GetControlGroup,p1_event_Right Click,p1_event_Selection,p1_event_SetControlGroup,delta_event_Ability,delta_event_AddToControlGroup,delta_event_Camera,delta_event_ControlGroup,delta_event_GetControlGroup,delta_event_Right Click,delta_event_Selection,delta_event_SetControlGroup,p0_camera_x_var,p0_camera_y_var,p1_camera_x_var,p1_camera_y_var
18,295.0,2,2,26.0,0.0,228.0,0.0,47.0,68.0,103.0,5.0,42.0,0.0,82.0,0.0,41.0,126.0,95.0,7.0,-16.0,0.0,146.0,0.0,6.0,-58.0,8.0,-2.0,768.272004,921.704322,1785.817478,1072.86221
759,332.0,1,2,35.0,0.0,375.0,0.0,191.0,118.0,95.0,7.0,25.0,0.0,311.0,0.0,167.0,71.0,281.0,1.0,10.0,0.0,64.0,0.0,24.0,47.0,-186.0,6.0,1971.391818,1514.360742,1937.170697,1055.699525
937,463.0,2,1,52.0,0.0,434.0,0.0,49.0,209.0,184.0,1.0,56.0,0.0,205.0,0.0,174.0,290.0,78.0,4.0,-4.0,0.0,229.0,0.0,-125.0,-81.0,106.0,-3.0,1590.372118,1108.987084,3023.422341,1418.252719
954,573.0,0,1,26.0,0.0,265.0,0.0,14.0,156.0,69.0,0.0,57.0,0.0,483.0,0.0,0.0,157.0,59.0,0.0,-31.0,0.0,-218.0,0.0,14.0,-1.0,10.0,0.0,46.170852,216.606557,1443.99294,1242.162422
1044,599.0,2,1,64.0,6.0,344.0,0.0,88.0,268.0,167.0,4.0,46.0,0.0,899.0,0.0,73.0,185.0,85.0,7.0,18.0,6.0,-555.0,0.0,15.0,83.0,82.0,-3.0,1248.83247,897.182721,996.457435,562.700024


In [0]:
def lgb_cv(num_leaves, learning_rate, n_estimators, subsample, colsample_bytree, reg_alpha, reg_lambda, bagging_fraction, feature_fraction, x_data=None, y_data=None, n_splits=5, output='score'):
    score = 0
    kf = KFold(n_splits=n_splits)
    models = []
    for train_index, valid_index in kf.split(x_data):
        x_train, y_train = x_data.iloc[train_index], y_data[train_index]
        x_valid, y_valid = x_data.iloc[valid_index], y_data[valid_index]
        
        model = lgb.LGBMClassifier(
            num_leaves = int(num_leaves), 
            learning_rate = learning_rate, 
            n_estimators = int(n_estimators), 
            subsample = np.clip(subsample, 0, 1), 
            colsample_bytree = np.clip(colsample_bytree, 0, 1), 
            reg_alpha = reg_alpha, 
            reg_lambda = reg_lambda,
            bagging_fraction = bagging_fraction,
            feature_fraction = feature_fraction,
        )
        
        model.fit(x_train, y_train)
        models.append(model)
        
        pred = model.predict_proba(x_valid)[:, 1]
        true = y_valid
        score += roc_auc_score(true, pred)/n_splits
    
    if output == 'score':
        return score
    if output == 'model':
        return models

In [47]:
# 모델과 관련없는 변수 고정
func_fixed = partial(lgb_cv, x_data=x_train, y_data=y_train, n_splits=5, output='score') 
# 베이지안 최적화 범위 설정
lgbBO = BayesianOptimization(
    func_fixed, 
    {
        'num_leaves': (16, 128),        # num_leaves,       범위(16~1024)
        'learning_rate': (0.0001, 0.1),  # learning_rate,    범위(0.0001~0.1)
        'n_estimators': (16, 1024),      # n_estimators,     범위(16~1024)
        'subsample': (0, 0.2),             # subsample,        범위(0~1)
        'colsample_bytree': (0, 1),      # colsample_bytree, 범위(0~1)
        'reg_alpha': (0, 10),            # reg_alpha,        범위(0~10)
        'reg_lambda': (0, 50),           # reg_lambda,       범위(0~50)
        'bagging_fraction': (0.1, 1.0),
        'feature_fraction': (0.1, 1.0),
    }, 
    random_state=4321                    # 시드 고정
)
lgbBO.maximize(init_points=5, n_iter=30) # 처음 5회 랜덤 값으로 score 계산 후 30회 최적화

# 이 예제에서는 7개 하이퍼 파라미터에 대해 30회 조정을 시도했습니다.
# 다양한 하이퍼 파라미터, 더 많은 iteration을 시도하여 최상의 모델을 얻어보세요!
# LightGBM Classifier: https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html

|   iter    |  target   | baggin... | colsam... | featur... | learni... | n_esti... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.6382  [0m | [0m 0.1637  [0m | [0m 0.8151  [0m | [0m 0.7911  [0m | [0m 0.02871 [0m | [0m 210.6   [0m | [0m 125.6   [0m | [0m 4.062   [0m | [0m 37.89   [0m | [0m 0.01783 [0m |
| [95m 2       [0m | [95m 0.6576  [0m | [95m 0.3789  [0m | [95m 0.6189  [0m | [95m 0.5139  [0m | [95m 0.02191 [0m | [95m 684.8   [0m | [95m 92.01   [0m | [95m 9.503   [0m | [95m 14.06   [0m | [95m 0.124   [0m |
| [0m 3       [0m | [0m 0.6418  [0m | [0m 0.445   [0m | [0m 0.4004  [0m | [0m 0.9484  [0m | [0m 0.093   [0m | [0m 972.0   [0m | [0m 58.05   [0m | [0m 3.423   [0m | [0m 33.24   [0m | [0m 0.008464[0m |
| [0m 4       [0m | [0m 0.6142  [0m | [0m 0.309   

# 모델 학습 및 검증

In [0]:
params = lgbBO.max['params']
models = lgb_cv(
    params['num_leaves'], 
    params['learning_rate'], 
    params['n_estimators'], 
    params['subsample'], 
    params['colsample_bytree'], 
    params['reg_alpha'], 
    params['reg_lambda'], 
    x_data=x_train, y_data=y_train, n_splits=5, output='model')

In [0]:
test = pd.read_csv('data/test.csv')
x_test, _ = data_preparation(test, answer=False)

100%|██████████| 16787/16787 [03:47<00:00, 73.67it/s]


In [0]:
preds = []
for model in models:
    pred = model.predict_proba(x_test)[:, 1]
    preds.append(pred)
pred = np.mean(preds, axis=0)

submission = pd.read_csv('data/sample_submission.csv', index_col=0)
submission['winner'] = submission['winner'] + pred
submission.to_csv('mini_submission.csv')
submission.head()

Unnamed: 0_level_0,winner
game_id,Unnamed: 1_level_1
38872,0.545845
38873,0.463037
38874,0.424968
38875,0.384724
38876,0.446188
