In [2]:
import pandas as pd
from pycaret.classification import *

In [3]:
# 데이터 로딩
match_data = pd.read_csv('/Users/lhe339/Documents/GitHub/nextorial/data/데이터 분석가_과제_Type A/match_data.csv')
test_data = pd.read_csv('/Users/lhe339/Documents/GitHub/nextorial/data/데이터 분석가_과제_Type A/test_data.csv')

In [4]:
def calculate_team_features(data):
    return data.groupby(['matchid', 'teamid']).agg({
        'mmr': 'mean',
        'winstreak': 'mean',
        'losestreak': 'mean',
        'recentwinprob': 'mean'
    }).reset_index()

# match_data에 대한 전처리
match_team_features = calculate_team_features(match_data)

# 원래 match_data와 merge하여 각 유저 옆에 팀의 평균 특성값 추가하기
match_data = match_data.merge(match_team_features, on=['matchid', 'teamid'], suffixes=('', '_team_avg'))

In [5]:
def guild_avg(df):
    # 'guildid'에서 NaN 값을 처리합니다. 이 예에서는 'NoGuild'로 대체합니다.
    df['guildid'].fillna('NoGuild', inplace=True)

    # 티어를 숫자로 변환합니다.
    tier_mapping = {'unranked': 0, 'bronze': 1, 'silver': 2, 'gold': 3, 'platinum': 4, 'diamond': 5, 'master': 6}
    df['tier_numeric'] = df['tier'].map(tier_mapping)

    # 길드별로 데이터를 그룹화하고 그룹별로 평균을 계산합니다.
    guild_avg = df.groupby('guildid')[['mmr', 'winstreak', 'recentwinprob', 'accumatches', 'tier_numeric']].mean()

    # 새로운 컬럼 이름을 생성합니다.
    new_columns = {col: f'{col}_guild_avg' for col in guild_avg.columns}
    guild_avg.rename(columns=new_columns, inplace=True)

    # 원본 데이터셋에 계산된 길드 성능 지표를 병합합니다.
    df = pd.merge(df, guild_avg, on='guildid', how='left')
    
    return df


match_data = guild_avg(match_data)

In [6]:
def normalize_column(df, column):
    df[f'normalized_{column}'] = (df[column] - df[column].min()) / (df[column].max() - df[column].min())
    return df

match_data = normalize_column(match_data, 'accumatches')

In [7]:

def compute_team_stats(df):
    grouped = df.groupby(['matchid', 'teamid'])
    df['team_max_accumatches'] = grouped['accumatches'].transform('max')
    df['team_min_accumatches'] = grouped['accumatches'].transform('min')
    df['accumatches_diff'] = df['team_max_accumatches'] - df['team_min_accumatches']
    df['accumatches_variance'] = grouped['accumatches'].transform('var')
    return df

match_data = compute_team_stats(match_data)

In [8]:
def compute_recent_performance_index(df):
    df['recent_performance_index'] = df['winstreak'] * df['recentwinprob']
    return df

match_data = compute_recent_performance_index(match_data)

In [9]:
def process_guild_info(df, threshold):
    guild_mean_mmr = df.groupby('guildid')['mmr'].mean()
    df['guild_mean_mmr'] = df['guildid'].map(guild_mean_mmr)
    df['high_mmr_guild'] = (df['guild_mean_mmr'] > threshold).astype(int)
    return df

match_data = process_guild_info(match_data, 2000)

In [10]:
def compute_mmr_diff_and_variance(df):
    mmr_diff_grouped = df.groupby('teamid')['mmr'].agg(['max', 'min'])
    df['mmr_diff'] = df['teamid'].map(mmr_diff_grouped['max'] - mmr_diff_grouped['min'])
    mmr_variance_grouped = df.groupby('teamid')['mmr'].var()
    df['mmr_variance'] = df['teamid'].map(mmr_variance_grouped)
    return df

match_data = compute_mmr_diff_and_variance(match_data)

In [11]:
def compute_recent_winprob_stats(df):
    grouped = df.groupby('matchid')
    df['recentwinprob_max'] = grouped['recentwinprob'].transform('max')
    df['recentwinprob_min'] = grouped['recentwinprob'].transform('min')
    df['recentwinprob_diff'] = df['recentwinprob_max'] - df['recentwinprob_min']
    df['recentwinprob_mean'] = grouped['recentwinprob'].transform('mean')
    df['recentwinprob_diff_from_mean'] = (df['recentwinprob'] - df['recentwinprob_mean'])**2
    df['recentwinprob_variance'] = grouped['recentwinprob_diff_from_mean'].transform('mean')
    return df

match_data = compute_recent_winprob_stats(match_data)

In [12]:
def convert_tier_to_numeric(tier):
    tier_dict = {
        'unranked': 0,
        'bronze': 1,
        'silver': 2,
        'gold': 3,
        'platinum': 4,
        'diamond': 5,
        'master': 6
    }
    return tier_dict.get(tier, -1)  # -1 for any unexpected tier value

# DataFrame의 'tier' 열에 함수 적용
match_data['tier_numeric'] = match_data['tier'].apply(convert_tier_to_numeric)

In [13]:
def apply_tier_conversion_and_compute_average(df):
    df['tier_numeric'] = df['tier'].apply(convert_tier_to_numeric)
    average_tier = df.groupby(['matchid', 'teamid'])['tier_numeric'].mean().reset_index()
    average_tier.rename(columns={'tier_numeric': 'average_tier'}, inplace=True)
    df = df.merge(average_tier, on=['matchid', 'teamid'])
    return df

match_data = apply_tier_conversion_and_compute_average(match_data)

In [14]:
def calculate_streak_rate(row):
    winstreak, losestreak = row['winstreak'], row['losestreak']
    if winstreak + losestreak == 0:
        return 0
    return winstreak / (winstreak + losestreak)

def compute_streak_rate(df):
    df['streak_rate'] = df.apply(calculate_streak_rate, axis=1)
    return df

# 사용 예
match_data = compute_streak_rate(match_data)

In [15]:
# PyCaret 설정
clf1 = setup(data = match_data,
             target = 'matchresult',
             ignore_features = ['createdatekst', 'matchid', 'accountid', 'guildid'],  # 무시할 특성
             ordinal_features = {'tier': ['unranked', 'bronze', 'silver', 'gold', 'platinum', 'diamond', 'master']},  # 순서형 특성
             session_id=123)  # 재현 가능성을 위한 세션 ID 설정

Unnamed: 0,Description,Value
0,Session id,123
1,Target,matchresult
2,Target type,Binary
3,Original data shape,"(410384, 44)"
4,Transformed data shape,"(410384, 46)"
5,Transformed train set shape,"(287268, 46)"
6,Transformed test set shape,"(123116, 46)"
7,Ignore features,4
8,Ordinal features,1
9,Numeric features,38


In [16]:
# 모델 비교를 통해 최적의 모델 선택
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.6345,0.6866,0.6479,0.6311,0.6393,0.2691,0.2692,33.045
et,Extra Trees Classifier,0.6066,0.6509,0.6303,0.6018,0.6157,0.2133,0.2135,38.392
gbc,Gradient Boosting Classifier,0.5885,0.6316,0.6289,0.5819,0.6045,0.1771,0.1777,48.356
ada,Ada Boost Classifier,0.5817,0.6215,0.6099,0.5774,0.5932,0.1635,0.1637,10.092
ridge,Ridge Classifier,0.5757,0.0,0.6595,0.5648,0.6085,0.1514,0.1536,1.065
lda,Linear Discriminant Analysis,0.5757,0.6103,0.6593,0.5648,0.6084,0.1513,0.1535,3.012
knn,K Neighbors Classifier,0.565,0.5882,0.5672,0.5648,0.566,0.1301,0.1301,34.297
dt,Decision Tree Classifier,0.55,0.55,0.5502,0.55,0.5501,0.0999,0.0999,4.401
nb,Naive Bayes,0.5373,0.5537,0.768,0.5255,0.624,0.0746,0.084,1.228
lr,Logistic Regression,0.5306,0.5485,0.5387,0.5295,0.5335,0.0612,0.0614,4.088


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

In [None]:
# 모델 생성 및 튜닝 (예: Light Gradient Boosting Machine 선택)
lgbm = create_model('lightgbm')
tuned_lgbm = tune_model(lgbm)

In [None]:
pc.evaluate_model(lightgbm_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [None]:
predictions = pc.predict_model(lightgbm_model, data=test_data)

KeyError: "['matchscore', 'isDrop', 'isEscape', 'tier_numeric', 'mmr_guild_avg', 'winstreak_guild_avg', 'recentwinprob_guild_avg', 'accumatches_guild_avg', 'tier_numeric_guild_avg'] not in index"