In [None]:
############################################################
############################################################
# 데이콘 3월 스타크래프트 경기 승률 예측 프로젝트
# dyddl1993@naver.com | https://github.com/saeu5407/
############################################################
############################################################

# 사용 라이브러리
# 시각화용 라이브러리는 노트북의 한계로 사용안함
# Print a three-line banner announcing that the library imports are starting.
for banner_line in ("=============================", "import lib", "============================="):
    print(banner_line)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import os
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from bayes_opt import BayesianOptimization
from lightgbm import LGBMClassifier, plot_importance
from sklearn.cluster import KMeans

import warnings
warnings.filterwarnings("ignore")

In [None]:
############################################################
# 기본 전처리용 함수들
# 각 플레이어의 행동을 카운트 한 변수들, 플레이어의 행동 차이를 계산한 변수
# 각 플레이어의 종족
############################################################
# 전처리용 함수 1
def make_feature(x, i, type='train'):
    """Build the one-row feature frame for a single game.

    Parameters
    ----------
    x : pandas.DataFrame
        Event log. The code reads the ``game_id``, ``player``, ``species``,
        ``event`` and ``time`` columns by name; the identifier columns are
        taken by position (first column for test data, first two otherwise).
    i : int
        ``game_id`` of the game to summarise.
    type : str
        ``'test'`` keeps only the first column as identifier; any other
        value keeps the first two. (The name shadows the builtin but is part
        of the existing call interface.)

    Returns
    -------
    pandas.DataFrame
        A single row holding, in order: the identifier column(s), one
        ``<event>_delta`` column per event (player 0 count minus player 1
        count), one ``<event>_0`` and one ``<event>_1`` column per event,
        then ``time`` (largest timestamp of the game), ``0_class`` and
        ``1_class`` (species of the two players). An extra pseudo-event
        ``ALL`` carries the totals over every event.
    """
    game = x.loc[x.game_id == i, :]

    n_id_cols = 1 if type == 'test' else 2
    ids = game.iloc[0:1, 0:n_id_cols].reset_index(drop=True)

    last_time = game['time'].max()

    # One row per (player, species); groupby sorts by player, so row 0 is
    # player 0 and row 1 is player 1.
    species = game.groupby(['player', 'species'])['time'].count().reset_index()

    # Number of log rows per (player, event).
    counts = game.set_index(['game_id']).groupby(['player', 'event'])['time'].count().reset_index()
    counts_p0 = counts.loc[counts.player == 0, ['event', 'time']]
    counts_p1 = counts.loc[counts.player == 1, ['event', 'time']]

    # time_x = player 0 count, time_y = player 1 count; an event used by only
    # one player gets 0 for the other.
    c = pd.merge(counts_p0, counts_p1, on='event', how='outer').fillna(0)

    # DataFrame.append was removed in pandas 2.0, so the totals row is added
    # with pd.concat instead.
    total_row = pd.DataFrame([{'event': 'ALL',
                               'time_x': c.time_x.sum(),
                               'time_y': c.time_y.sum()}])
    c = pd.concat([c, total_row], ignore_index=True)
    c['per_0'] = (c.time_x - c.time_y)

    # After transposing, the rows are: event names, player 0 counts,
    # player 1 counts, differences.
    c = c.T
    event_names = c.iloc[0, :]
    d = c.iloc[-1:, :].reset_index(drop=True)  # count difference (player 0 - player 1)
    d.columns = event_names + "_delta"
    e = c.iloc[1:2, :].reset_index(drop=True)  # player 0 counts
    e.columns = event_names + "_0"
    f = c.iloc[2:3, :].reset_index(drop=True)  # player 1 counts
    f.columns = event_names + "_1"

    df = pd.concat([ids, d, e, f, pd.Series(last_time)], axis=1, ignore_index=True)
    species_by_row = species.iloc[:, 1].reset_index(drop=True)
    df['0_class'] = species_by_row[0]
    df['1_class'] = species_by_row[1]
    df.columns = (list(ids.columns) + list(d.columns) + list(e.columns) + list(f.columns)
                  + ['time', '0_class', '1_class'])
    return df

# 전처리용 함수 2
def make_data(x, type='train'):
    """Build the per-game feature table by calling ``make_feature`` on every game.

    Parameters
    ----------
    x : pandas.DataFrame
        Raw event log; must contain ``game_id`` and ``event_contents``
        (the latter is dropped before feature extraction).
    type : str
        Forwarded to ``make_feature``; ``'test'`` marks data without a
        winner column.

    Returns
    -------
    pandas.DataFrame
        One row per game. The fixed columns listed in ``cl`` come first (and
        are always present, filled with NaN where a game lacks them - this
        includes ``winner`` for test data); any further columns produced by
        ``make_feature`` follow in order of first appearance.
    """
    print("=============================")
    print("start analysis")
    print("=============================")
    before_data = x.drop('event_contents', axis=1)
    column_list = ['Ability', 'AddToControlGroup', 'Camera', 'ControlGroup', 'GetControlGroup',
                   'Right Click', 'Selection', 'SetControlGroup']
    cl = (['game_id', 'winner']
          + [name + '_delta' for name in column_list]
          + [name + '_0' for name in column_list]
          + [name + '_1' for name in column_list]
          + ['time', '0_class', '1_class'])

    # Iterate over the ids that actually occur instead of range(min, max + 1),
    # so a gap in the id sequence no longer crashes make_feature.
    game_ids = sorted(int(g) for g in x.game_id.unique())
    if not game_ids:
        return pd.DataFrame({}, columns=cl)

    end = game_ids[-1]
    # Width of the progress bar; computed up front because the final print
    # needs it even when no id is a multiple of 3000.
    ij = end // 3000

    # Collect the rows and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and appending inside the loop was quadratic anyway.
    frames = []
    for i in game_ids:
        frames.append(make_feature(before_data, i, type))
        if i % 3000 == 0:
            ii = i // 3000
            print("=" * ii + " " * (ij - ii) + "|")
            print(str(i) + "/" + str(end) + " 작업 중..")
    print("=" * ij + "|")
    print(str(end) + "/" + str(end) + " 작업 완료!")

    data = pd.concat(frames, sort=False)
    extra_cols = [col for col in data.columns if col not in cl]
    data = data.reindex(columns=cl + extra_cols)
    return data

In [None]:
############################################################
# 행동 중 첫 카메라 좌표를 활용해 추가 feature 생성
# KNN Clustering을 이용, 각 플레이어 첫 카메라 좌표를 조합해 스타팅 포인트 예측
# 예측된 스타팅 포인트를 통해 경기에 사용된 map 생성(map 변수)
# 사용된 스타팅 포인트의 좌표 간 거리 계산(distance 변수)
############################################################
def _first_camera_contents(events, player):
    """Return one player's first Camera ``event_contents`` per game, indexed by ``game_id``."""
    is_player_camera = (events.event == 'Camera') & (events.player == player)
    camera_rows = events.loc[is_player_camera, ['game_id', 'event_contents']]
    first_rows = camera_rows.drop_duplicates(subset='game_id', keep='first')
    return first_rows.set_index('game_id')['event_contents']


def eda_starting(train, type='train'):
    """Collect each player's first recorded Camera event for every game.

    Parameters
    ----------
    train : pandas.DataFrame
        Raw event log with ``game_id``, ``player``, ``event`` and
        ``event_contents`` columns.
    type : str
        Kept for backward compatibility only. It used to choose the position
        of ``event_contents`` (7th column for train, 6th for test); the
        column is now selected by name, so the value no longer matters.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``game_id`` (every game in ``train``, in order of first
        appearance) with columns ``player0_starting`` and
        ``player1_starting`` holding the raw ``event_contents`` value of the
        player's first Camera event, or NaN when the player has none.
    """
    game_ids = pd.Index(train.game_id.unique(), name='game_id')
    starting = pd.DataFrame(index=game_ids)
    for player in (0, 1):
        first_camera = _first_camera_contents(train, player)
        starting['player%d_starting' % player] = first_camera.reindex(game_ids)
    return starting

In [None]:
# 기본 전처리 함수를 사용 첫 전처리 진행
if __name__ == '__main__':
    print("=============================")
    print("load data")
    print("=============================")
    data_path = ""
    sample_submission = pd.read_csv(data_path + "data/sample_submission.csv")

    # The raw logs are loaded unconditionally: the starting-point clustering
    # further down calls eda_starting(train) / eda_starting(test) even when
    # the per-game feature tables come from the cache.
    train = pd.read_csv(data_path + "data/train.csv")
    test = pd.read_csv(data_path + "data/test.csv")

    # Cached feature tables. The existence check uses exactly the path that
    # is read and written (the later merge step also reads these literal
    # paths), and the directory is created so the first to_csv cannot fail.
    train_cache = "output/dacon3_data.csv"
    test_cache = "output/dacon3_test_data.csv"
    os.makedirs("output", exist_ok=True)

    if os.path.isfile(train_cache):
        print("already have train data")
        data = pd.read_csv(train_cache)
    else:
        data = make_data(train)
        data = data.fillna(0)
        data.to_csv(train_cache, index=False)

    if os.path.isfile(test_cache):
        print("already have test data")
        test_data = pd.read_csv(test_cache)
    else:
        test_data = make_data(test, 'test')
        test_data = test_data.fillna(0)
        test_data.to_csv(test_cache, index=False)

In [None]:
    # Starting-point features: cluster every player's first camera position
    # with K-means, derive a map id from which starting points face each
    # other, and compute the distance between the two players' positions.
    df1 = eda_starting(train)
    df2 = eda_starting(test, type='test')

    # Train and test games are clustered together; the index is game_id.
    df_train = pd.concat([df1, df2], axis=0)

    # The raw value carries the coordinates as "...(x, y)..."; keep the text
    # between the parentheses and split it into numeric x / y columns.
    for player in ('player0', 'player1'):
        inner = df_train[player + '_starting'].str.split('(').str[1].str.split(')').str[0]
        df_train[player + '_starting'] = inner
        split_xy = inner.str.split(',')
        df_train[player + '_x'] = split_xy.str[0].astype('float')
        df_train[player + '_y'] = split_xy.str[1].astype('float')
        del inner, split_xy

    # Stack both players' coordinates into one frame. Player 1 rows get
    # their labels shifted past the largest game_id so every row keeps a
    # unique label that can be mapped back to its game afterwards.
    index_offset = df_train.index.max() + 1
    location_p0 = df_train.loc[:, ['player0_x', 'player0_y']]
    location_p0 = location_p0.rename({'player0_x': 'location_x', 'player0_y': 'location_y'}, axis=1)
    location_p1 = df_train.loc[:, ['player1_x', 'player1_y']]
    location_p1 = location_p1.rename({'player1_x': 'location_x', 'player1_y': 'location_y'}, axis=1)
    location_p1.index += index_offset
    location = pd.concat([location_p0, location_p1]).dropna()
    del location_p0, location_p1

    n_clusters = 15          # number of starting-point clusters
    max_center_distance = 5  # farther from its centre than this => unknown start
    min_pair_games = 100     # fewer games than this => not a real third start

    # A fixed seed keeps the cluster numbering, and therefore the map ids,
    # identical between runs.
    kmeans_clst = KMeans(n_clusters=n_clusters, random_state=0).fit(location)
    location['starting'] = kmeans_clst.labels_ + 1  # 1..15; 0 is reserved for "unknown"

    # Euclidean distance from every point to the centre of its own cluster.
    assigned_centers = kmeans_clst.cluster_centers_[kmeans_clst.labels_]
    location['distance'] = np.sqrt(
        np.square(location.location_x.values - assigned_centers[:, 0])
        + np.square(location.location_y.values - assigned_centers[:, 1]))

    # A first camera position far from any centre is not treated as a start.
    location.loc[location.distance > max_center_distance, 'starting'] = 0

    # Map the cluster labels back onto the games. reindex (not .loc) is used
    # because games dropped by dropna() are absent from `location`, and .loc
    # raises KeyError for missing labels in current pandas.
    df_train['player0_starting'] = location['starting'].reindex(df_train.index)
    location.index -= index_offset
    df_train['player1_starting'] = location['starting'].reindex(df_train.index)
    del location

    # NOTE(review): this also zero-fills missing x / y coordinates, so
    # distance_delta below is computed against (0, 0) for such games.
    df_train = df_train.fillna(0)

    # For every starting point, find the point(s) it is most often paired
    # with. A map is a sorted triple of starting points; 999 pads maps that
    # only have two.
    map_list = []
    for point in range(1, n_clusters + 1):
        couple = df_train[df_train.player0_starting == point].player1_starting.value_counts()
        if len(couple) == 0:
            # No game has player 0 on this point; nothing to pair it with.
            continue
        if len(couple) < 2 or couple.iloc[1] < min_pair_games:
            map_list.append([point, couple.index[0], 999])
        else:
            map_list.append([point, couple.index[0], couple.index[1]])
    map_list = np.sort(map_list, axis=1)
    map_list = np.unique(map_list, axis=0)

    # Fill in an unknown (0) starting point from the opponent's known one.
    # The four assignments per map run in this exact order; each only
    # touches rows that are still unknown when it executes.
    for m in map_list:
        unknown_p0 = (df_train.player0_starting == 0) & (
            (df_train.player1_starting == m[0]) | (df_train.player1_starting == m[2]))
        df_train.loc[unknown_p0, 'player0_starting'] = m[1]
        unknown_p0 = (df_train.player0_starting == 0) & (
            (df_train.player1_starting == m[1]) | (df_train.player1_starting == m[2]))
        df_train.loc[unknown_p0, 'player0_starting'] = m[0]

        unknown_p1 = (df_train.player1_starting == 0) & (
            (df_train.player0_starting == m[0]) | (df_train.player0_starting == m[2]))
        df_train.loc[unknown_p1, 'player1_starting'] = m[1]
        unknown_p1 = (df_train.player1_starting == 0) & (
            (df_train.player0_starting == m[1]) | (df_train.player0_starting == m[2]))
        df_train.loc[unknown_p1, 'player1_starting'] = m[0]

    # The map id is the position of the triple that contains player 0's start.
    for map_num, m in enumerate(map_list):
        on_map = ((df_train.player0_starting == m[0])
                  | (df_train.player0_starting == m[1])
                  | (df_train.player0_starting == m[2]))
        df_train.loc[on_map, 'map'] = map_num
    del map_list

    # Distance between the two players' first camera positions.
    df_train['distance_delta'] = np.sqrt(np.square(df_train.player0_x - df_train.player1_x)
                                         + np.square(df_train.player0_y - df_train.player1_y))
    df_train = df_train.loc[:, ['map', 'distance_delta']]

    # Split back into train / test games and persist (game_id is the index).
    clu_train = df_train.loc[train.game_id.unique(), :]
    clu_test = df_train.loc[test.game_id.unique(), :]
    clu_train.to_csv("clust_map_train.csv")
    clu_test.to_csv("clust_map_test.csv")

In [None]:
    ############################################################
    # Final merge, encoding and feature selection.
    # Modelling follows.
    ############################################################

    # The per-player action counts were judged uninformative and are left
    # out; only the count differences between the players are kept.
    # (Author's afterthought: clustering players by their actions to spot
    # high-win-rate users might have been worth trying.)
    feature_cols = ['game_id', 'winner', 'Ability_delta', 'AddToControlGroup_delta',
                    'Camera_delta', 'ControlGroup_delta', 'GetControlGroup_delta',
                    'Right Click_delta', 'Selection_delta', 'time', '0_class', '1_class',
                    'map', 'distance_delta']

    data = pd.read_csv("output/dacon3_data.csv")
    clu = pd.read_csv("clust_map_train.csv")
    data = pd.merge(data, clu, how='inner', on='game_id').loc[:, feature_cols]

    test_data = pd.read_csv("output/dacon3_test_data.csv")
    clu = pd.read_csv("clust_map_test.csv")
    test_data = pd.merge(test_data, clu, how='inner', on='game_id').loc[:, feature_cols]

    # Standard scaling of the numeric features was tried and made no
    # noticeable difference, so it is not applied.

    # Matchup feature (e.g. 'TZ') for race-vs-race win rates, then integer
    # codes for each player's race.
    species_codes = {'T': 0, 'Z': 1, 'P': 2}
    for frame in (data, test_data):
        frame['vs_class'] = frame['0_class'] + frame['1_class']
        for class_col in ('0_class', '1_class'):
            frame[class_col] = frame[class_col].map(species_codes).astype('int')

    data = data.set_index('game_id')
    test_data = test_data.set_index('game_id')

    # Index.difference returns the column names sorted.
    x_train = data[data.columns.difference(['winner'])]
    y_train = data.winner
    # Build the test matrix with exactly the same column order as x_train,
    # in case the model matches features by position rather than by name.
    # Dropping 'winner' from test_data would keep the file order instead.
    test = test_data[x_train.columns]
    def onehot(data, col, col_name=None):
        """Replace column `col` of `data` with its one-hot encoding.

        Parameters
        ----------
        data : pandas.DataFrame
            Frame containing `col`.
        col : str
            Name of the column to encode.
        col_name : sequence of str, optional
            Names for the new indicator columns. When omitted, the encoder's
            own feature names are used.

        Returns
        -------
        pandas.DataFrame
            A new frame: `data` without `col`, with the indicator columns
            appended on the right and the original index preserved.
        """
        encoder = OneHotEncoder()
        encoded = encoder.fit_transform(data[col].values.reshape(-1, 1)).toarray()
        if col_name is None:
            # get_feature_names() was removed in scikit-learn 1.2; prefer its
            # replacement and fall back only on old installations.
            if hasattr(encoder, 'get_feature_names_out'):
                col_name = encoder.get_feature_names_out()
            else:
                col_name = encoder.get_feature_names()
        onehot_df = pd.DataFrame(encoded, columns=col_name, index=data.index)
        return pd.concat([data, onehot_df], axis=1).drop(col, axis=1)

    # One-hot encode the matchup and drop the mirror matchups (same race on
    # both sides), which must not be used as features. (Author's note: they
    # might have been usable had the players been clustered.)
    x_train = onehot(data=x_train, col='vs_class')
    test = onehot(data=test, col='vs_class')
    mirror_cols = ['x0_PP', 'x0_TT', 'x0_ZZ']
    x_train = x_train.drop(mirror_cols, axis=1)
    test = test.drop(mirror_cols, axis=1)

    # Experiments that were tried and abandoned: one-hot encoding of map and
    # of each player's race (label encoding is used instead), cross-product
    # features of matchup x map (judged meaningless), and dropping the
    # one-hot matchup columns altogether.

    # The logs are cut at roughly ten minutes, so the time column carries no
    # information and is removed. x_train2 / test2 keep the versions that
    # still include it.
    x_train2 = x_train
    x_train = x_train.drop('time', axis=1)
    test2 = test
    test = test.drop('time', axis=1)

    # Give the test matrix exactly the training columns in the training
    # order, in case the model matches features by position. A matchup
    # column absent from test is all zeros.
    test = test.reindex(columns=x_train.columns, fill_value=0)

    # Mark the label-encoded columns as categorical. The original cast
    # '0_class' twice and never cast '1_class'.
    for frame in (x_train, test):
        for cat_col in ('0_class', '1_class', 'map'):
            frame[cat_col] = frame[cat_col].astype('category')

    # Quick baseline model on a single hold-out split.
    from lightgbm import early_stopping

    x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=120)
    model = LGBMClassifier(n_estimators=1000, learning_rate=0.01, reg_lambda=1)
    # The early_stopping_rounds fit() argument was removed in LightGBM 4.0;
    # the callback does the same job on old and new versions.
    model.fit(x_tr, y_tr, eval_metric='auc', eval_set=[(x_val, y_val)],
              callbacks=[early_stopping(stopping_rounds=200)])

    pred_test = model.predict_proba(test)[:, 1]
    pd.DataFrame({'winner': pred_test}, index=test.index).to_csv('output/time_sub.csv')
    plot_importance(model)

    # Author's notes: vs_class works better than "all"; try plain 0/1 codes
    # (no one-hot) once and keep only one of the two.
    # map is left as is: 0.6448

In [None]:
    ############################################################
    # 베이지안 옵티마이저를 통해 모델 튜닝 진행
    # lightGBM 사용
    ############################################################
    def partial(func, *args, **keywords):
        """Return a callable that invokes `func` with some arguments pre-bound.

        Positional `args` are placed before the call-time positional
        arguments; `keywords` are defaults that call-time keyword arguments
        override. The result exposes `.func`, `.args` and `.keywords`.

        Delegates to the standard library, which implements this exact
        contract, instead of maintaining a hand-written copy.
        """
        import functools
        return functools.partial(func, *args, **keywords)

    from sklearn.metrics import roc_auc_score   # AUC 스코어 계산
    from sklearn.model_selection import KFold   # K-fold CV
    from bayes_opt import BayesianOptimization
    import lightgbm as lgb

    def lgb_cv(num_leaves, learning_rate, n_estimators, subsample, colsample_bytree, reg_alpha, reg_lambda, x_data=None,
               y_data=None, n_splits=5, output='score'):
        """Train one LightGBM classifier per K-fold split and score it with ROC AUC.

        Parameters
        ----------
        num_leaves, n_estimators : float
            Truncated to int before use (the optimizer proposes floats).
        learning_rate, reg_alpha, reg_lambda : float
            Passed to the classifier unchanged.
        subsample, colsample_bytree : float
            Clipped into (0, 1] before use.
        x_data, y_data : pandas objects
            Features and labels; rows are selected with ``.iloc``.
        n_splits : int
            Number of (unshuffled) K-fold splits.
        output : {'score', 'model'}
            ``'score'`` returns the mean validation AUC over the folds,
            ``'model'`` returns the list of fitted per-fold models.

        Raises
        ------
        ValueError
            If `output` is neither ``'score'`` nor ``'model'``.
        """
        if output not in ('score', 'model'):
            # Previously this fell through and returned None after training
            # every fold.
            raise ValueError("output must be 'score' or 'model', got %r" % (output,))

        # LightGBM requires both fractions to be strictly positive, and the
        # optimizer can propose the lower bound 0 exactly.
        min_fraction = 1e-3
        # NOTE(review): subsample presumably has no effect unless
        # subsample_freq > 0 is also set - confirm against the LightGBM docs.

        score = 0
        models = []
        for train_index, valid_index in KFold(n_splits=n_splits).split(x_data):
            x_fit, y_fit = x_data.iloc[train_index], y_data.iloc[train_index]
            x_valid, y_valid = x_data.iloc[valid_index], y_data.iloc[valid_index]

            model = lgb.LGBMClassifier(
                num_leaves=int(num_leaves),
                learning_rate=learning_rate,
                n_estimators=int(n_estimators),
                subsample=float(np.clip(subsample, min_fraction, 1)),
                colsample_bytree=float(np.clip(colsample_bytree, min_fraction, 1)),
                reg_alpha=reg_alpha,
                reg_lambda=reg_lambda,
            )
            model.fit(x_fit, y_fit)
            models.append(model)

            pred = model.predict_proba(x_valid)[:, 1]
            score += roc_auc_score(y_valid, pred) / n_splits

        return score if output == 'score' else models

    # Freeze the data and CV settings so the optimizer only varies the seven
    # hyper-parameters.
    func_fixed = partial(lgb_cv, x_data=x_train, y_data=y_train, n_splits=5, output='score')

    # Search range (low, high) for each tuned hyper-parameter.
    search_space = {
        'num_leaves': (16, 1024),
        'learning_rate': (0.0001, 0.1),
        'n_estimators': (16, 1024),
        'subsample': (0, 1),
        'colsample_bytree': (0, 1),
        'reg_alpha': (0, 10),
        'reg_lambda': (0, 50),
    }
    lgbBO = BayesianOptimization(func_fixed, search_space, random_state=1025)  # fixed seed

    # Score 5 random points first, then run 30 optimization steps.
    lgbBO.maximize(init_points=5, n_iter=30)

    # Refit the per-fold models with the best parameters found; the values
    # are passed positionally in lgb_cv's parameter order.
    params = lgbBO.max['params']
    tuned_order = ('num_leaves', 'learning_rate', 'n_estimators', 'subsample',
                   'colsample_bytree', 'reg_alpha', 'reg_lambda')
    models = lgb_cv(*[params[name] for name in tuned_order],
                    x_data=x_train, y_data=y_train, n_splits=5, output='model')

    # Average the positive-class probabilities of the fold models.
    preds = [fold_model.predict_proba(test)[:, 1] for fold_model in models]
    pred = np.mean(preds, axis=0)

    submission = pd.DataFrame({'winner': pred}, index=test.index)
    submission.to_csv('submission_pred.csv')

In [None]:
############################################################
############################################################
# 최종 0.628로 총 500명 중 전체 60등 정도지만 베이스라인으로 제시된 코드보다 낮음
# 다음 대회부터는 eda부터 상세히 진행해봐야 할 듯
# 아직 갈 길이 멀다
############################################################
############################################################