In [1]:
! pip install pytorch-tabnet

Collecting pytorch-tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-4.1.0


In [2]:
! pip install optuna

Collecting optuna
  Downloading optuna-3.4.0-py3-none-any.whl (409 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.6/409.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.12.1-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.8/226.8 kB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.12.1 colorlog-6.7.0 optuna-3.4.0


In [3]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [4]:
match_data = pd.read_csv('/content/drive/MyDrive/nextorial/data/match_data.csv')
test_data = pd.read_csv('/content/drive/MyDrive/nextorial/data/test_data.csv')

In [5]:
class DataProcessor:

    def __init__(self, df):
        self.data = df

    @staticmethod
    # def convert_tier_to_numeric(tier):
    #     tier_dict = {
    #         'unranked': 0,
    #         'bronze': 1,
    #         'silver': 2,
    #         'gold': 3,
    #         'platinum': 4,
    #         'diamond': 5,
    #         'master': 6
    #     }
    #     return tier_dict.get(tier, -1)

    @staticmethod
    def convert_tier_to_exponential_weight(tier):
        tier_dict = {
            'unranked': 0,
            'bronze': 1,
            'silver': 2,
            'gold': 3,
            'platinum': 4,
            'diamond': 5,
            'master': 6
        }
        tier_numeric = tier_dict.get(tier, -1)
        return np.exp(tier_numeric)

    def add_tier_exponential_weight(self):
        self.data['tier_exp_weight'] = self.data['tier'].apply(self.convert_tier_to_exponential_weight)
        return self.data

    # def calculate_team_features(self):
    #     return self.data.groupby(['matchid', 'teamid']).agg({
    #         'mmr': 'mean',
    #         'winstreak': 'mean',
    #         'losestreak': 'mean',
    #         'recentwinprob': 'mean'
    #     }).reset_index()

    def preprocess(self):
        # match_team_features = self.calculate_team_features()
        # self.data = self.data.merge(match_team_features, on=['matchid', 'teamid'], suffixes=('', '_team_avg'))
        self.data = self.normalize_column('accumatches')
        self.data = self.compute_team_stats()
        self.data = self.compute_recent_performance_index()
        self.data = self.process_guild_info(202068.571428571428400)
        self.data = self.compute_mmr_diff_and_variance()
        self.data = self.compute_recent_winprob_stats()
        self.data = self.apply_tier_conversion_and_compute_average()
        self.data = self.compute_streak_rate()
        self.data = self.guild_mean()
        self.data = self.guild_median()
        self.data = self.guild_mode()
        self.data = self.add_tier_exponential_weight()

    # guild_membership
    def guild_mean(self):
        df = self.data.copy()
        df['guildid'].fillna('NoGuild', inplace=True)
        df['tier_numeric'] = df['tier'].map(self.convert_tier_to_exponential_weight)
        guild_mean = df.groupby('guildid')[['mmr', 'winstreak', 'recentwinprob', 'accumatches', 'tier_numeric']].mean()
        new_columns = {col: f'{col}guild_mean' for col in guild_mean.columns}
        guild_mean.rename(columns=new_columns, inplace=True)
        df = pd.merge(df, guild_mean, on='guildid', how='left')
        return df

    def guild_median(self):
        df = self.data.copy()
        df['guildid'].fillna('NoGuild', inplace=True)
        df['tier_numeric'] = df['tier'].map(self.convert_tier_to_exponential_weight)
        guild_median = df.groupby('guildid')[['mmr', 'winstreak', 'recentwinprob', 'accumatches', 'tier_numeric']].median()
        new_columns = {col: f'{col}guild_median' for col in guild_median.columns}
        guild_median.rename(columns=new_columns, inplace=True)
        df = pd.merge(df, guild_median, on='guildid', how='left')
        return df

    def guild_mode(self):
        df = self.data.copy()
        df['guildid'].fillna('NoGuild', inplace=True)
        df['tier_numeric'] = df['tier'].map(self.convert_tier_to_exponential_weight)

        def calculate_mode(group):
            return group.mode().iloc[0]

        guild_mode = df.groupby('guildid')[['mmr', 'winstreak', 'recentwinprob', 'accumatches', 'tier_numeric']].apply(calculate_mode)
        new_columns = {col: f'{col}guild_mode' for col in guild_mode.columns}
        guild_mode.rename(columns=new_columns, inplace=True)
        df = pd.merge(df, guild_mode, on='guildid', how='left')
        return df

    def normalize_column(self, column):
        self.data[f'normalized_{column}'] = (self.data[column] - self.data[column].min()) / (self.data[column].max() - self.data[column].min())
        return self.data

    def compute_team_stats(self):
        grouped = self.data.groupby(['matchid', 'teamid'])
        self.data['team_max_accumatches'] = grouped['accumatches'].transform('max')
        self.data['team_min_accumatches'] = grouped['accumatches'].transform('min')
        self.data['accumatches_diff'] = self.data['team_max_accumatches'] - self.data['team_min_accumatches']
        self.data['accumatches_variance'] = grouped['accumatches'].transform('var')
        return self.data

    def compute_recent_performance_index(self):
        self.data['recent_performance_index'] = self.data['winstreak'] * self.data['recentwinprob']
        return self.data

    def process_guild_info(self, threshold):
        guild_mean_mmr = self.data.groupby('guildid')['mmr'].mean()
        self.data['guild_mean_mmr'] = self.data['guildid'].map(guild_mean_mmr)
        self.data['high_mmr_guild'] = (self.data['guild_mean_mmr'] > threshold).astype(int)
        return self.data

    def compute_mmr_diff_and_variance(self):
        mmr_diff_grouped = self.data.groupby('teamid')['mmr'].agg(['max', 'min'])
        self.data['mmr_diff'] = self.data['teamid'].map(mmr_diff_grouped['max'] - mmr_diff_grouped['min'])
        mmr_variance_grouped = self.data.groupby('teamid')['mmr'].var()
        self.data['mmr_variance'] = self.data['teamid'].map(mmr_variance_grouped)
        return self.data

    def compute_recent_winprob_stats(self):
        grouped = self.data.groupby('matchid')
        self.data['recentwinprob_max'] = grouped['recentwinprob'].transform('max')
        self.data['recentwinprob_min'] = grouped['recentwinprob'].transform('min')
        self.data['recentwinprob_diff'] = self.data['recentwinprob_max'] - self.data['recentwinprob_min']
        self.data['recentwinprob_mean'] = grouped['recentwinprob'].transform('mean')
        self.data['recentwinprob_diff_from_mean'] = (self.data['recentwinprob'] - self.data['recentwinprob_mean'])**2
        self.data['recentwinprob_variance'] = grouped['recentwinprob_diff_from_mean'].transform('mean')
        return self.data

    def apply_tier_conversion_and_compute_average(self):
        self.data['tier_numeric'] = self.data['tier'].apply(self.convert_tier_to_exponential_weight)
        average_tier = self.data.groupby(['matchid', 'teamid'])['tier_numeric'].mean().reset_index()
        average_tier.rename(columns={'tier_numeric': 'average_tier'}, inplace=True)
        self.data = self.data.merge(average_tier, on=['matchid', 'teamid'])
        return self.data

    @staticmethod
    def calculate_streak_rate(row):
        winstreak, losestreak = row['winstreak'], row['losestreak']
        if winstreak + losestreak == 0:
            return 0
        return winstreak / (winstreak + losestreak)

    def compute_streak_rate(self):
        self.data['streak_rate'] = self.data.apply(self.calculate_streak_rate, axis=1)
        return self.data

In [6]:
processor = DataProcessor(match_data)
processor.preprocess()
processed_data = processor.data

In [None]:
# # Processed data is already defined in the context, we'll just export it to a CSV file
# processed_data.to_csv('/content/drive/MyDrive/nextorial/data/processed_match_data.csv', index=False)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# 데이터에서 특성과 타겟 변수를 분리합니다.
target = 'matchresult'
# Drop identifiers and the original 'tier' feature
features_to_drop = ['createdatekst', 'matchid', 'accountid', 'guildid', 'tier', 'matchscore', 'isDrop', 'isEscape']
features = processed_data.columns.drop([target] + features_to_drop)
X = processed_data[features]
y = processed_data[target]

# 수치형 및 범주형 데이터를 위한 변환기 정의
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# 순서형 특성은 이미 'tier_numeric'으로 변환되어 있으므로 여기서는 처리하지 않습니다.

# 수치형 및 범주형 데이터에 대한 파이프라인을 만듭니다.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))  # NaN 값을 중앙값으로 대체
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # NaN 값을 최빈값으로 대체
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # 원-핫 인코딩
])

# ColumnTransformer를 생성하여 변환기를 결합합니다.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough'  # 나머지 열은 변경하지 않고 유지
)

# 전처리를 특성 데이터에 적용합니다.
X_encoded = preprocessor.fit_transform(X)

# 데이터셋을 훈련 세트와 검증/테스트 세트로 분할합니다.
X_temp, X_test, y_temp, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2


In [None]:
import torch, gc

gc.collect()
torch.cuda.empty_cache()

In [None]:
# from pytorch_tabnet.tab_model import TabNetClassifier

# clf = TabNetClassifier()
# clf.fit(
#     X_train, y_train,
#     eval_set=[(X_valid, y_valid)],
#     batch_size = 8,
#     max_epochs = 2,
# )
# preds = clf.predict(X_test)



epoch 0  | loss: 0.68641 | val_0_auc: 0.58574 |  0:08:25s
epoch 1  | loss: 0.68401 | val_0_auc: 0.58654 |  0:17:22s
Stop training because you reached max_epochs = 2 with best_epoch = 1 and best_val_0_auc = 0.58654




In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier

clf = TabNetClassifier()
clf.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    batch_size = 4096,
    max_epochs = 50,
)
preds = clf.predict(X_test)



epoch 0  | loss: 0.68944 | val_0_auc: 0.57001 |  0:00:12s
epoch 1  | loss: 0.67928 | val_0_auc: 0.59253 |  0:00:24s
epoch 2  | loss: 0.678   | val_0_auc: 0.59655 |  0:00:36s
epoch 3  | loss: 0.67802 | val_0_auc: 0.59728 |  0:00:48s
epoch 4  | loss: 0.67714 | val_0_auc: 0.59968 |  0:01:00s
epoch 5  | loss: 0.67629 | val_0_auc: 0.60159 |  0:01:12s
epoch 6  | loss: 0.67559 | val_0_auc: 0.60327 |  0:01:24s
epoch 7  | loss: 0.67462 | val_0_auc: 0.60496 |  0:01:36s
epoch 8  | loss: 0.67441 | val_0_auc: 0.60629 |  0:01:48s
epoch 9  | loss: 0.67373 | val_0_auc: 0.60862 |  0:02:00s
epoch 10 | loss: 0.67488 | val_0_auc: 0.5941  |  0:02:13s
epoch 11 | loss: 0.6761  | val_0_auc: 0.60057 |  0:02:25s
epoch 12 | loss: 0.67553 | val_0_auc: 0.6047  |  0:02:37s
epoch 13 | loss: 0.67401 | val_0_auc: 0.60788 |  0:02:49s
epoch 14 | loss: 0.67305 | val_0_auc: 0.60875 |  0:03:01s
epoch 15 | loss: 0.67302 | val_0_auc: 0.60912 |  0:03:13s
epoch 16 | loss: 0.67268 | val_0_auc: 0.61012 |  0:03:26s
epoch 17 | los

