In [1]:
import pandas as pd
from sklearn.impute import KNNImputer
import json
import numpy as np
from scipy.stats import mode
import statsmodels.api as sm

# 매치 로그 데이터 로드
with open('match_logs.json', 'r', encoding='utf8') as file:
    match_logs = json.load(file)

# 매치 로그를 데이터프레임으로 변환
match_logs_list = []
for date, events in match_logs.items():
    for event in events:
        try:
            minute_str = event['time'].strip().split('’')[0]
            if '+' in minute_str:
                base_minute, added_minute = map(int, minute_str.split('+'))
                minute = base_minute + added_minute
            else:
                minute = int(minute_str)
            match_logs_list.append({"date": date, "minute": minute, "type": event['type']})
        except ValueError:
            continue

df_match_logs = pd.DataFrame(match_logs_list)

# 날짜 형식 변환
df_match_logs['date'] = pd.to_datetime(df_match_logs['date'])

# 원핫 인코딩
df_match_logs = pd.concat([df_match_logs, pd.get_dummies(df_match_logs['type'])], axis=1).drop(columns=['type'])

# 원핫 인코딩된 데이터프레임 표시
df_match_logs.head()

Unnamed: 0,date,minute,assist,goal,own_goal,penalty_goal,penalty_miss,red_card,substitute_in,substitute_out,yellow_card
0,2011-08-13,61,0,1,0,0,0,0,0,0,0
1,2011-08-20,87,0,0,0,0,0,0,0,1,0
2,2011-08-27,62,0,1,0,0,0,0,0,0,0
3,2011-08-27,75,0,0,0,0,0,0,0,1,0
4,2011-09-17,56,0,0,0,0,0,0,1,0,0


In [2]:
# 손흥민 선수 경기 데이터 로드
with open('f_son_heung_min_match_stats.json', 'r', encoding='utf8') as file:
    son_stats = json.load(file)

# 데이터를 딕셔너리 리스트로 변환
son_stats_list = []
for date, stats in son_stats.items():
    stats['date'] = date
    son_stats_list.append(stats)

# 리스트를 데이터프레임으로 변환
son_stats_df = pd.DataFrame(son_stats_list)

# 'date' 열을 datetime 형식으로 변환
son_stats_df['date'] = pd.to_datetime(son_stats_df['date'])

# 'result' 값을 0, -1, 1로 매핑
result_mapping = {'Drew': 0, 'Loose': -1, 'Win': 1}
son_stats_df['result'] = son_stats_df['result'].map(result_mapping)

# 변경된 데이터프레임의 첫 몇 행을 출력
son_stats_df.head()

Unnamed: 0,dayofweek,comp,round,venue,result,team,opponent,game_started,position,minutes,...,progressive_carries,carries_into_final_third,carries_into_penalty_area,miscontrols,dispossessed,passes_received,progressive_passes_received,win_score,loose_score,date
0,Sat,Bundesliga,Matchweek 2,Home,0,Hamburger SV,Hertha BSC,Y,,90,...,,,,,,,,2,2,2011-08-13
1,Sat,Bundesliga,Matchweek 3,Away,-1,Hamburger SV,Bayern Munich,Y,,86,...,,,,,,,,0,5,2011-08-20
2,Sat,Bundesliga,Matchweek 4,Home,-1,Hamburger SV,Köln,Y,,74,...,,,,,,,,3,4,2011-08-27
3,Sat,Bundesliga,Matchweek 6,Home,-1,Hamburger SV,Gladbach,N,,35,...,,,,,,,,0,1,2011-09-17
4,Fri,Bundesliga,Matchweek 7,Away,1,Hamburger SV,Stuttgart,Y,,80,...,,,,,,,,2,1,2011-09-23


In [3]:
# 매치 로그 데이터와 손흥민 선수 경기 데이터 병합
merged_df = pd.merge(df_match_logs, son_stats_df, how='inner', on='date')

# 병합된 데이터 확인
merged_df.head()

merged_df.to_csv('data.csv', index=False, encoding='utf8')

In [4]:
'''
중앙값 함수 정의: central.value 함수는 입력된 값의 타입에 따라 결측값을 채울 중앙값을 결정합니다.
* 숫자형이면 중앙값(median)
* 범주형이면 최빈값(mode)
패키지 로드: cluster 패키지를 로드하여 daisy 함수를 사용합니다.
거리 행렬 계산: daisy 함수를 사용하여 x-행렬의 모든 쌍 간의 거리 행렬을 계산합니다.
결측값 채우기: 결측값이 있는 행을 반복하면서 결측값을 k개의 최근접 이웃의 중앙값으로 채웁니다.
'''

# Load the CSV file
file_path = 'data.csv'
data = pd.read_csv(file_path)

def central_value(column):
    if column.dtype.kind in 'biufc':  # check if the column is of numeric type
        return np.nanmedian(column)
    else:
        return column.dropna().mode()[0]

def knn_impute(data, k=10):
    # Convert categorical variables to numerical values for KNN
    categorical_columns = data.select_dtypes(include=['object']).columns
    data_encoded = data.copy()
    for col in categorical_columns:
        data_encoded[col], _ = pd.factorize(data_encoded[col])
        data_encoded[col] = data_encoded[col].astype(float)
    
    # Apply KNN imputation
    imputer = KNNImputer(n_neighbors=k)
    data_imputed = imputer.fit_transform(data_encoded)
    data_imputed = pd.DataFrame(data_imputed, columns=data.columns)
    
    # Restore original categorical values where possible
    for col in categorical_columns:
        reverse_factor = dict(enumerate(data[col].astype('category').cat.categories))
        data_imputed[col] = data_imputed[col].round().astype(int).map(reverse_factor)
    
    # Fill remaining NaNs with central values
    for col in data.columns:
        if data_imputed[col].isnull().any():
            data_imputed[col] = data_imputed[col].fillna(central_value(data[col]))
    
    return data_imputed

data_imputed = knn_impute(data)
data_imputed.head()

# 문자 데이터는 제거 or 라벨링
columns_to_drop = ['position', 'win_score', 'loose_score', 'match_report', 'opponent', 'team', 'round', 'comp', 'dayofweek']
data_cleaned = data_imputed.drop(columns=columns_to_drop)

data_cleaned['venue'] = data_imputed['venue'].map({'Home': 1, 'Away': 0, 'Neutral': 2})
data_cleaned['game_started'] = data_imputed['game_started'].map({'N': 0, 'Y': 1, 'Y*': 1})

data_cleaned.to_csv('data2.csv', index=False, encoding='utf8')

In [5]:
# result 값과 숫자 데이터로 라벨링한 game_started 값이 통계적으로 유의미한지 파악하였다 p-value 값을 통해 그렇지 않다는 것을 알 수 있음
file_path = 'data2.csv'
data = pd.read_csv(file_path)

X_game_started = data[['game_started']]
y_result = data['result']

X_game_started = sm.add_constant(X_game_started)

model_game_started = sm.OLS(y_result, X_game_started).fit()

model_game_started_summary = model_game_started.summary()
model_game_started_summary

0,1,2,3
Dep. Variable:,result,R-squared:,0.003
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,1.37
Date:,"Wed, 19 Jun 2024",Prob (F-statistic):,0.242
Time:,23:45:14,Log-Likelihood:,-480.77
No. Observations:,411,AIC:,965.5
Df Residuals:,409,BIC:,973.6
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.5116,0.049,10.518,0.000,0.416,0.607
game_started,-0.0933,0.080,-1.171,0.242,-0.250,0.063

0,1,2,3
Omnibus:,63.286,Durbin-Watson:,1.501
Prob(Omnibus):,0.0,Jarque-Bera (JB):,82.198
Skew:,-1.066,Prob(JB):,1.42e-18
Kurtosis:,2.495,Cond. No.,2.43


In [6]:
# result 값과 숫자 데이터로 라벨링한 venue 값이 통계적으로 유의미한지 파악하였다 p-value 값을 통해 유의미하다는 것을 알 수 있음
file_path = 'data2.csv'
data = pd.read_csv(file_path)

X_venue = data[['venue']]
y_result = data['result']

# Add a constant term to the predictors to include the intercept in the model
X_venue = sm.add_constant(X_venue)

# Fit the linear regression model
model_venue = sm.OLS(y_result, X_venue).fit()

# Get the model summary
model_venue_summary = model_venue.summary()
model_venue_summary

0,1,2,3
Dep. Variable:,result,R-squared:,0.031
Model:,OLS,Adj. R-squared:,0.029
Method:,Least Squares,F-statistic:,13.12
Date:,"Wed, 19 Jun 2024",Prob (F-statistic):,0.000329
Time:,23:45:14,Log-Likelihood:,-474.97
No. Observations:,411,AIC:,953.9
Df Residuals:,409,BIC:,962.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.5994,0.051,11.782,0.000,0.499,0.699
venue,-0.2387,0.066,-3.622,0.000,-0.368,-0.109

0,1,2,3
Omnibus:,59.601,Durbin-Watson:,1.529
Prob(Omnibus):,0.0,Jarque-Bera (JB):,81.888
Skew:,-1.08,Prob(JB):,1.65e-18
Kurtosis:,2.662,Cond. No.,2.34


In [7]:
# result 값과 venue 값의 가중치에 맞춰 venue 데이터 값을 수정함
mean_result_by_venue = data.groupby('venue')['result'].mean()
print(mean_result_by_venue)

venue_weights = {0: 0.608, 1: 0.339, 2: 0.235}
data['venue'] = data['venue'].map(venue_weights)

X_weighted_venue = data[['venue']]
y_result = data['result']

X_weighted_venue = sm.add_constant(X_weighted_venue)

model_weighted_venue = sm.OLS(y_result, X_weighted_venue).fit()

model_weighted_venue_summary = model_weighted_venue.summary()
model_weighted_venue_summary

data.to_csv('data2.csv', index=False, encoding='utf8')

venue
0    0.608295
1    0.338983
2    0.235294
Name: result, dtype: float64
