# 준비

In [0]:
import sys
IN_COLAB = 'google.colab' in sys.modules

## 패키지 설치

In [2]:
!pip install pandas numpy tqdm==4.43.0 bayesian-optimization lightgbm



## 라이브러리 임포트

In [0]:
import pandas as pd                         # 데이터 분석 라이브러리
import numpy as np                          # 계산 라이브러리
from tqdm.auto import tqdm                  # 진행바
from sklearn.metrics import roc_auc_score   # AUC 스코어 계산
from sklearn.model_selection import KFold   # K-fold CV    
from bayes_opt import BayesianOptimization  # 베이지안 최적화 라이브러리  
from functools import partial               # 함수 변수 고정
import lightgbm as lgb                      # LightGBM 라이브러리
import gc
import warnings                             
warnings.filterwarnings("ignore")           # 경고 문구 미표시
tqdm.pandas()

## 상수 정의

In [0]:
DATA_DIR = 'data'

## Colab 설정

In [5]:
if IN_COLAB:
    DRIVE_DIR = '/content/drive'
    from google.colab import drive
    drive.mount(DRIVE_DIR)

    import os
    os.symlink(f'{DRIVE_DIR}/My Drive/data', DATA_DIR)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


FileExistsError: ignored

# 데이터 샘플링

In [0]:
train = pd.read_csv('data/train.csv')

In [0]:
train.head()

Unnamed: 0,game_id,winner,time,player,species,event,event_contents
0,0,1,0.0,0,T,Camera,"at (145.25, 21.5078125)"
1,0,1,0.0,1,T,Camera,"at (22.75, 147.0078125)"
2,0,1,0.02,0,T,Selection,['OrbitalCommand [3080001]']
3,0,1,0.02,0,T,Ability,(1360) - TrainSCV
4,0,1,0.14,0,T,Camera,"at (142.99609375, 24.50390625)"


In [0]:
train.shape

(67091776, 7)

In [0]:
len(train['game_id'].unique())

38872

In [0]:
def sample_data(df, n_games, seed=None):
    if seed is not None:
        np.random.seed(seed)

    game_ids = df['game_id'].unique()
    sampled_game_ids = np.random.choice(game_ids, size=n_games, replace=False)

    return df[df['game_id'].isin(sampled_game_ids)]

In [0]:
sampled_train = sample_data(train, n_games=300, seed=0)

In [0]:
sampled_train.to_csv('data/sampled_train.csv', index=False)

# 탐색적 자료분석

In [0]:
train = pd.read_csv('data/sampled_train.csv')

## 승리자 추출

In [0]:
def extract_winner(df):
    return df.groupby(['game_id'])['winner'].first()

## 종족 추출

In [0]:
def species_converter(string):
    if string == 'T':
        return 0
    elif string == 'P':
        return 1
    elif string == 'Z':
        return 2
    else:
        raise ValueError

def extract_species(df):
    species = df.groupby(['game_id', 'player'])['species'].first()

    species_df = species.unstack(level=-1)
    species_df.columns = species_df.columns.map(lambda x: f'p{x}_species')
    species_df.columns.name = None

    species_df = species_df.applymap(species_converter)

    return species_df

## 시뮬레이션 후 피쳐뽑기

In [0]:
class GameState:
    def init(self):
        pass

    def update(self, game_id, time, player, species, event, event_contents):
        pass

    def to_dict(self):
        return {}


class GameStateManager:
    def __init__(self):
        self._states = []

    def add(self, game_state):
        self._states.append(game_state)

    def init(self):
        for state in self._states:
            state.init()

    def update(self, game_id, time, player, species, event, event_contents):
        for state in self._states:
            state.update(game_id, time, player, species, event, event_contents)

    def to_dict(self):
        ret = {}
        for state in self._states:
            ret.update(state.to_dict())
        return ret

In [0]:
class CameraState(GameState):
    def init(self):
        self.n_camera = 0

    def update(self, game_id, time, player, species, event, event_contents):
        if event == 'Camera':
            self.n_camera += 1

    def to_dict(self):
        return {'n_camera': self.n_camera}

In [0]:
def extract_game_states(df):
    mat = df.to_numpy()

    data = {}

    cur_game_id = -1

    game_state = GameStateManager()
    game_state.add(CameraState())

    for row in tqdm(mat):
        game_id, time, player, species, event, event_contents = row

        if game_id != cur_game_id:
            if cur_game_id != -1:
                data[cur_game_id] = game_state.to_dict()

            cur_game_id = game_id
            game_state.init()

        game_state.update(game_id, time, player, species, event, event_contents)

    if cur_game_id != -1:
        data[cur_game_id] = game_state.to_dict()

    del mat
    gc.collect()

    return pd.DataFrame.from_dict(data, orient='index')

In [0]:
df = train.drop(columns=['winner'])

In [48]:
extract_game_states(df)

HBox(children=(FloatProgress(value=0.0, max=67091776.0), HTML(value='')))




Unnamed: 0,n_camera
0,869
1,1485
2,1138
3,1101
4,702
...,...
38867,963
38868,1191
38869,1115
38870,706


## 실험

In [0]:
df = train

In [0]:
def extract_camera_counts_v1(df):
    data = {}

    cur_game_id = -1
    game_state = None

    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        game_id = row['game_id']

        # Init game_state
        if game_id != cur_game_id:
            if game_state is not None:
                data[cur_game_id] = game_state

            cur_game_id = game_id
            game_state = {
                'n_camera': 0
            }

        if row['event'] == "Camera":
            game_state['n_camera'] += 1
    
    if game_state is not None:
        data[cur_game_id] = game_state
    
    return pd.DataFrame.from_dict(data, orient='index')


def extract_camera_counts_v2(df):
    data = {}

    cur_game_id = -1
    game_state = None

    def process_row(row):
        nonlocal data
        nonlocal cur_game_id
        nonlocal game_state

        game_id = row['game_id']

        # Init game_state
        if game_id != cur_game_id:
            if game_state is not None:
                data[cur_game_id] = game_state

            cur_game_id = game_id
            game_state = {
                'n_camera': 0
            }

        if row['event'] == "Camera":
            game_state['n_camera'] += 1

    df.progress_apply(process_row, axis=1)
    
    if game_state is not None:
        data[cur_game_id] = game_state
    
    return pd.DataFrame.from_dict(data, orient='index')


def extract_camera_counts_v3(df):
    mat = df.to_numpy()

    data = {}

    cur_game_id = -1
    game_state = None

    for row in tqdm(mat):
        game_id, time, player, species, event, event_contents = row

        # Init game_state
        if game_id != cur_game_id:
            if game_state is not None:
                data[cur_game_id] = game_state

            cur_game_id = game_id
            game_state = {
                'n_camera': 0
            }

        if event == "Camera":
            game_state['n_camera'] += 1
    
    if game_state is not None:
        data[cur_game_id] = game_state

    del mat
    gc.collect()

    return pd.DataFrame.from_dict(data, orient='index')


# slower than v3
def extract_camera_counts_v4(df):
    mat = df.to_numpy()

    data = {}

    cur_game_id = -1
    game_state = None


    for row in tqdm(mat):
        row = dict(zip(df.columns, row))

        game_id = row['game_id']

        # Init game_state
        if game_id != cur_game_id:
            if game_state is not None:
                data[cur_game_id] = game_state

            cur_game_id = game_id
            game_state = {
                'n_camera': 0
            }

        if row['event'] == "Camera":
            game_state['n_camera'] += 1
    
    if game_state is not None:
        data[cur_game_id] = game_state

    del mat
    gc.collect()

    return pd.DataFrame.from_dict(data, orient='index')

In [0]:
species = df.groupby(['game_id', 'player'])['species'].first()

In [0]:
extract_species(df)

Unnamed: 0_level_0,p0_species,p1_species
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1
18,2,2
759,1,2
937,2,1
954,0,1
1044,2,1
...,...,...
38645,0,0
38657,0,1
38740,0,2
38812,2,0


In [0]:
species_df.applymap(species_converter)

Unnamed: 0_level_0,p0_species,p1_species
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1
18,2,2
759,1,2
937,2,1
954,0,1
1044,2,1
...,...,...
38645,0,0
38657,0,1
38740,0,2
38812,2,0


In [0]:
species_df.columns = species_df.columns.map(lambda x: f'p{x}_species')

In [0]:
species_df.columns.name = None

In [0]:
species_df

Unnamed: 0_level_0,p0_species,p1_species
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1
18,Z,Z
759,P,Z
937,Z,P
954,T,P
1044,Z,P
...,...,...
38645,T,T
38657,T,P
38740,T,Z
38812,Z,T


In [0]:
event_count = df.groupby(['game_id', 'player']).event.value_counts()

In [0]:
a = event_count.unstack(level=-1).unstack(level=-1)

In [0]:
a.columns = a.columns.map(lambda x: f'p{x[1]}_{x[0]}')

In [0]:
a.columns

Index(['p0_Ability', 'p1_Ability', 'p0_AddToControlGroup',
       'p1_AddToControlGroup', 'p0_Camera', 'p1_Camera', 'p0_ControlGroup',
       'p1_ControlGroup', 'p0_GetControlGroup', 'p1_GetControlGroup',
       'p0_Right Click', 'p1_Right Click', 'p0_Selection', 'p1_Selection',
       'p0_SetControlGroup', 'p1_SetControlGroup'],
      dtype='object')

In [0]:
a.columns = a.columns.get_level_values(0)

In [0]:
species.reorder_levels()

ValueError: ignored

## Feature Engineering

In [0]:
def prepare_x_data(df):
    features = []
    features.append(extract_species(df))

    return pd.concat(features, axis=1)

In [0]:
def prepare_y_data(df):
    winners = extract_winner(df)
    return np.array(winners)

In [0]:
x_train = prepare_x_data(train)
y_train = prepare_y_data(train)
x_train.head()

ValueError: ignored

# 변수 선택 및 모델 구축

In [0]:
def species_converter(string):
    if string == 'T':
        return 0
    elif string == 'P':
        return 1
    elif string == 'Z':
        return 2
    else:
        raise ValueError

def data_preparation(df, answer=False):
    game_ids = df['game_id'].unique()
    events = ['Ability', 'AddToControlGroup', 'Camera', 'ControlGroup', 'GetControlGroup', 'Right Click', 'Selection', 'SetControlGroup']
    unique_event_0, unique_event_1, delta_event = {}, {}, {}
    for event in events:
        unique_event_0['P0_' + event] = 0
        unique_event_1['P1_' + event] = 0
        delta_event['delta_' + event] = 0
        
    species = df.groupby(['game_id', 'player']).species.unique()
    event_count = df.groupby(['game_id', 'player']).event.value_counts()
    if answer:
        winners = df.groupby(['game_id']).winner.max()
    
    x_data, y_data = [], []
    for game_id in tqdm(game_ids):
        df_event_count = event_count[game_id].unstack(level=-1)
        df = pd.DataFrame(species[game_id])
        df = pd.concat([df, df_event_count], axis=1)   
        df = df.fillna(0)
        
        df_P0_species = pd.DataFrame([species_converter(df.loc[0]['species'][0])], columns=['P0_species'])        
        df_P1_species = pd.DataFrame([species_converter(df.loc[1]['species'][0])], columns=['P1_species'])
        df = df.drop(['species'], axis=1)

        df_P0_event = unique_event_0.copy()
        for column in df.columns:
            df_P0_event['P0_' + column] = df.loc[0][column]
        df_P0_event = pd.DataFrame(pd.Series(df_P0_event)).T

        df_P1_event = unique_event_1.copy()
        for column in df.columns:
            df_P1_event['P1_' + column] = df.loc[1][column]
        df_P1_event = pd.DataFrame(pd.Series(df_P1_event)).T
        
        df_delta_event = delta_event.copy()
        for column in df.columns:
            df_delta_event['delta_' + column] = df_P0_event['P0_' + column][0] - df_P1_event['P1_' + column][0]
        df_delta_event = pd.DataFrame(pd.Series(df_delta_event)).T

        out = pd.concat([df_P0_species, df_P0_event, df_P1_species, df_P1_event, df_delta_event], axis=1)
        out.index = [game_id]
        out.index.name = 'game_id'
        
        x_data.append(out)
        if answer:
            y_data.append(winners[game_id])  

    x_data = pd.concat(x_data)
    y_data = np.array(y_data)
    
    return x_data, y_data

In [0]:
# train = pd.read_csv('data/train.csv')
train = pd.read_csv('data/sampled_train.csv')
x_train = prepare_x_data(train)
y_train = prepare_y_data(train)
x_train.head()

Unnamed: 0_level_0,p0_species,p1_species
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1
18,Z,Z
759,P,Z
937,Z,P
954,T,P
1044,Z,P


In [0]:
def lgb_cv(num_leaves, learning_rate, n_estimators, subsample, colsample_bytree, reg_alpha, reg_lambda, x_data=None, y_data=None, n_splits=5, output='score'):
    score = 0
    kf = KFold(n_splits=n_splits)
    models = []
    for train_index, valid_index in kf.split(x_data):
        x_train, y_train = x_data.iloc[train_index], y_data[train_index]
        x_valid, y_valid = x_data.iloc[valid_index], y_data[valid_index]
        
        model = lgb.LGBMClassifier(
            num_leaves = int(num_leaves), 
            learning_rate = learning_rate, 
            n_estimators = int(n_estimators), 
            subsample = np.clip(subsample, 0, 1), 
            colsample_bytree = np.clip(colsample_bytree, 0, 1), 
            reg_alpha = reg_alpha, 
            reg_lambda = reg_lambda,
        )
        
        model.fit(x_train, y_train)
        models.append(model)
        
        pred = model.predict_proba(x_valid)[:, 1]
        true = y_valid
        score += roc_auc_score(true, pred)/n_splits
    
    if output == 'score':
        return score
    if output == 'model':
        return models

In [0]:
# 모델과 관련없는 변수 고정
func_fixed = partial(lgb_cv, x_data=x_train, y_data=y_train, n_splits=5, output='score') 
# 베이지안 최적화 범위 설정
lgbBO = BayesianOptimization(
    func_fixed, 
    {
        'num_leaves': (16, 1024),        # num_leaves,       범위(16~1024)
        'learning_rate': (0.0001, 0.1),  # learning_rate,    범위(0.0001~0.1)
        'n_estimators': (16, 1024),      # n_estimators,     범위(16~1024)
        'subsample': (0, 0.2),             # subsample,        범위(0~1)
        'colsample_bytree': (0, 1),      # colsample_bytree, 범위(0~1)
        'reg_alpha': (0, 10),            # reg_alpha,        범위(0~10)
        'reg_lambda': (0, 50),           # reg_lambda,       범위(0~50)
    }, 
    random_state=4321                    # 시드 고정
)
lgbBO.maximize(init_points=5, n_iter=30) # 처음 5회 랜덤 값으로 score 계산 후 30회 최적화

# 이 예제에서는 7개 하이퍼 파라미터에 대해 30회 조정을 시도했습니다.
# 다양한 하이퍼 파라미터, 더 많은 iteration을 시도하여 최상의 모델을 얻어보세요!
# LightGBM Classifier: https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html

|   iter    |  target   | colsam... | learni... | n_esti... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------


ValueError: ignored

# 모델 학습 및 검증

In [0]:
params = lgbBO.max['params']
models = lgb_cv(
    params['num_leaves'], 
    params['learning_rate'], 
    params['n_estimators'], 
    params['subsample'], 
    params['colsample_bytree'], 
    params['reg_alpha'], 
    params['reg_lambda'], 
    x_data=x_train, y_data=y_train, n_splits=5, output='model')

In [0]:
test = pd.read_csv('data/test.csv')
x_test, _ = data_preparation(test, answer=False)

100%|██████████| 16787/16787 [03:47<00:00, 73.67it/s]


In [0]:
preds = []
for model in models:
    pred = model.predict_proba(x_test)[:, 1]
    preds.append(pred)
pred = np.mean(preds, axis=0)

submission = pd.read_csv('data/sample_submission.csv', index_col=0)
submission['winner'] = submission['winner'] + pred
submission.to_csv('mini_submission.csv')
submission.head()

Unnamed: 0_level_0,winner
game_id,Unnamed: 1_level_1
38872,0.545845
38873,0.463037
38874,0.424968
38875,0.384724
38876,0.446188
