In [1]:
import pandas as pd
import numpy as np
import glob

# 1. 파일 불러오기
pitcher_files = sorted(glob.glob("../kbodata_output/pitchers_20*.csv"))
scoreboard_files = sorted(glob.glob("../kbodata_output/scoreboards_20*.csv"))

pitchers = pd.concat([pd.read_csv(f) for f in pitcher_files], ignore_index=True)
scoreboards = pd.concat([pd.read_csv(f) for f in scoreboard_files], ignore_index=True)

# 2. 연도 정보 붙이기 (예: pitchers_2023.csv → 2023)
def extract_year_from_filename(path):
    return int(path.split("_")[-1].split(".")[0])

for f in pitcher_files:
    year = extract_year_from_filename(f)
    df = pd.read_csv(f)
    df["year"] = year
    pitchers = pd.concat([pitchers, df], ignore_index=True)

for f in scoreboard_files:
    year = extract_year_from_filename(f)
    df = pd.read_csv(f)
    df["year"] = year
    scoreboards = pd.concat([scoreboards, df], ignore_index=True)

In [2]:
import pandas as pd

# 1. 전처리: 이닝 문자열을 float으로 변환하는 함수
def convert_inning_str_to_float(s):
    try:
        s = str(s).zfill(2)
        full = int(s[:-1])
        partial = int(s[-1])
        return full + {0: 0.0, 1: 1/3, 2: 2/3}.get(partial, 0)
    except:
        return 0

# 2. IP 컬럼 생성
pitchers["IP"] = pitchers["inning"].apply(convert_inning_str_to_float)

# 3. RS 계산용 함수
def get_inning_range(ip_float):
    full = int(ip_float)
    partial = ip_float - full
    innings = list(range(1, full + 1))
    if partial >= 1/3:
        innings.append(full + 1)
    return innings

def calculate_rs(pitcher_df, scoreboard_df):
    rs_list = []
    for _, row in pitcher_df.iterrows():
        idx = row["idx"]
        team = row["team"]
        ip = row["IP"]

        innings = get_inning_range(ip)
        sb_row = scoreboard_df[scoreboard_df["idx"] == idx]
        if sb_row.empty:
            rs_list.append(None)
            continue

        sb = sb_row.iloc[0]
        team_name = team.strip()
        home = sb["home"].strip()
        away = sb["away"].strip()

        # 홈/원정 여부는 고려하지 않고 전체 득점으로 계산
        scores = []
        for i in innings:
            col = f"i_{i}"
            try:
                val = int(sb[col]) if sb[col] != "-" else 0
            except:
                val = 0
            scores.append(val)
        rs_list.append(sum(scores))
    pitcher_df["RS"] = rs_list
    return pitcher_df

# 4. RS 계산
pitchers_with_ip = pitchers[pitchers["IP"] > 0]
pitchers_with_rs = calculate_rs(pitchers_with_ip, scoreboards)

# 5. 시즌별 요약 테이블 생성
season_stats = pitchers_with_rs.groupby(["name", "year"]).agg({
    "IP": "sum",
    "RS": "sum",
    "result": lambda x: (x == "W").sum()  # 승수 계산
}).reset_index()

derived_stats = pitchers_with_rs.groupby(["name", "year"]).agg({
    "strikeout": "sum",
    "dead4ball": "sum",
    "earnedrun": "sum",
    "hitted": "sum",
    "homerun": "sum"
}).reset_index()

# 6. 파생 변수 계산
season_stats = pd.merge(season_stats, derived_stats, on=["name", "year"])

season_stats = season_stats.rename(columns={"result": "W"})

season_stats["ERA"] = season_stats["earnedrun"] * 9 / season_stats["IP"]
season_stats["WHIP"] = (season_stats["hitted"] + season_stats["dead4ball"]) / season_stats["IP"]
season_stats["FIP"] = (13 * season_stats["homerun"] + 3 * season_stats["dead4ball"] - 2 * season_stats["strikeout"]) / season_stats["IP"] + 3.2
season_stats["K9"] = season_stats["strikeout"] * 9 / season_stats["IP"]
season_stats["BB9"] = season_stats["dead4ball"] * 9 / season_stats["IP"]

# 8. 불필요한 컬럼 제거
season_stats = season_stats.drop(columns=["strikeout", "dead4ball", "earnedrun", "hitted", "homerun"])

# 9. 최종 확인
import seaborn as sns
import matplotlib.pyplot as plt

display(season_stats.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pitcher_df["RS"] = rs_list


Unnamed: 0,name,year,IP,RS,W,ERA,WHIP,FIP,K9,BB9
0,0,2012,12.0,14,2,1.5,0.666667,5.2,3.0,1.5
1,가뇽,2020,319.333333,160,22,4.340292,1.471816,3.375365,7.947808,4.114823
2,가득염,2008,65.333333,56,8,3.857143,1.22449,3.842857,6.612245,2.755102
3,가득염,2009,16.0,20,0,4.5,1.25,5.575,3.375,4.5
4,가득염,2010,41.333333,24,0,4.354839,1.258065,4.216129,6.967742,3.919355


In [None]:
season_stats[season_stats["name"] == "양현종"].sort_values("year")

In [45]:
import pandas as pd

# 1. player_info_sample.csv 불러오기
player_info = pd.read_csv("../player_info_sample.csv")

# 2. 컬럼명 통일 (병합을 위해)
player_info = player_info.rename(columns={
    "player_name": "name",
    "season": "year"
})

# 3. year 컬럼을 정수형으로 변환
player_info["year"] = player_info["year"].astype(int)
season_stats["year"] = season_stats["year"].astype(int)

# 4. 두 데이터프레임 병합
merged_df = pd.merge(season_stats, player_info, on=["name", "year"], how="inner")

# 5. 결과 확인
# 병합된 데이터 상위 10개만 확인
print(merged_df.head(10))

  name  year          IP   RS   W       ERA      WHIP       FIP        K9  \
0  고영표  2021  359.333333  228  22  2.955473  1.135436  2.966234  7.063080   
1  고영표  2022  384.000000  162  26  3.328125  1.276042  2.663542  7.593750   
2  고영표  2023  387.333333  170  26  2.834768  1.218589  2.885026  5.901893   
3  김도현  2019   80.666667   52  10  4.685950  1.735537  5.208264  4.016529   
4  김도현  2020  129.333333   60   4  6.819588  1.902062  5.968041  5.706186   
5  김도현  2021   63.333333   38   0  6.821053  2.052632  5.726316  4.263158   
6  김도현  2022   39.333333   16   0  5.949153  1.474576  5.538983  5.491525   
7  박세웅  2019  120.000000   58   6  4.200000  1.483333  3.533333  6.600000   
8  박세웅  2020  294.666667  178  16  4.703620  1.574661  4.618552  6.597285   
9  박세웅  2021  340.000000  196  20  4.076471  1.282353  4.494118  6.882353   

        BB9 salary_next  fa  age  
0  2.053803      30,000   0   29  
1  1.828125      43,000   0   30  
2  1.580034     200,000   0   31  
3  5.801653 

### 앞으로 계속 사용할 df는 위에 두고, 연봉 로그 변환

In [19]:
# 기존 테이블을 복사 (원본 유지)
df_log = merged_df.copy()

# 'salary_next' 컬럼을 정수형으로 처리하고 로그 변환
df_log["salary_next_won"] = (
    df_log["salary_next"]
    .astype(str)
    .str.replace(",", "")
    .astype(float) * 10000
)

# 로그 변환된 컬럼 추가
import numpy as np
df_log["log_salary_next"] = np.log1p(df_log["salary_next_won"])

# ✅ 연도 더미 변수 생성 (year 컬럼 기준)
df_log = pd.get_dummies(df_log, columns=["year"], prefix="year", drop_first=True)

# 결과 확인
print(df_log.head())

  name          IP   RS   W       ERA      WHIP       FIP        K9       BB9  \
0  고영표  359.333333  228  22  2.955473  1.135436  2.966234  7.063080  2.053803   
1  고영표  384.000000  162  26  3.328125  1.276042  2.663542  7.593750  1.828125   
2  고영표  387.333333  170  26  2.834768  1.218589  2.885026  5.901893  1.580034   
3  김도현   80.666667   52  10  4.685950  1.735537  5.208264  4.016529  5.801653   
4  김도현  129.333333   60   4  6.819588  1.902062  5.968041  5.706186  6.680412   

  salary_next  fa  age  salary_next_won  log_salary_next  year_2020  \
0      43,000   0   30     4.300000e+08        19.879296      False   
1     200,000   0   31     2.000000e+09        21.416413      False   
2     200,000   0   32     2.000000e+09        21.416413      False   
3       3,700   0   20     3.700000e+07        17.426428      False   
4       4,100   0   21     4.100000e+07        17.529083       True   

   year_2021  year_2022  year_2023  
0       True      False      False  
1      False

In [14]:
import statsmodels.api as sm

# 사용할 설명 변수 (연봉 예측에 쓰이는 모든 수치형 변수)
features = [
    "W", "ERA", "IP", "RS", "K9", "BB9", "WHIP", "FIP", "age", "fa"
]

# 설명 변수(X), 종속 변수(y)
X = df_log[features]
X = sm.add_constant(X)
y = df_log["log_salary_next"]

# NaN 제거
valid_idx = X.notnull().all(axis=1) & y.notnull()
X_valid = X.loc[valid_idx]
y_valid = y.loc[valid_idx]

# 회귀 실행
model = sm.OLS(y_valid, X_valid).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:        log_salary_next   R-squared:                       0.817
Model:                            OLS   Adj. R-squared:                  0.764
Method:                 Least Squares   F-statistic:                     15.23
Date:                Mon, 02 Jun 2025   Prob (F-statistic):           7.79e-10
Time:                        11:41:12   Log-Likelihood:                -31.940
No. Observations:                  45   AIC:                             85.88
Df Residuals:                      34   BIC:                             105.8
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         18.1811      2.003      9.076      0.0

In [16]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# 설명 변수만 추출
X_for_vif = X_valid.drop(columns=["const"])

# VIF 계산
vif_df = pd.DataFrame({
    "feature": X_for_vif.columns,
    "VIF": [variance_inflation_factor(X_for_vif.values, i) for i in range(X_for_vif.shape[1])]
})

print(vif_df.sort_values("VIF", ascending=False))

  feature         VIF
6    WHIP  306.274307
1     ERA  150.379444
7     FIP   92.611173
8     age   44.905748
2      IP   42.983651
5     BB9   33.344621
3      RS   31.342024
0       W   30.656414
4      K9   20.509324
9      fa    1.214420


In [20]:
# log(age) 파생 변수 생성
df_log["log_age"] = np.log(df_log["age"])

In [21]:
import statsmodels.api as sm
import pandas as pd

# 연도 더미 추가
df_log_dummies = pd.get_dummies(df_log, columns=["year"], prefix="year", drop_first=True)

# 사용할 변수들
features = [
    "W", "ERA", "IP", "K9", "BB9",
    "WHIP", "FIP", "log_age", "fa"
] + [col for col in df_log_dummies.columns if col.startswith("year_")]

X = df_log_dummies[features]
X = sm.add_constant(X)
y = df_log_dummies["log_salary_next"]

# 결측값 제거
valid_idx = X.notnull().all(axis=1) & y.notnull()
X_valid = X.loc[valid_idx].astype(float)
y_valid = y.loc[valid_idx].astype(float)

# 회귀 실행
model = sm.OLS(y_valid, X_valid).fit()
print(model.summary())

KeyError: "None of [Index(['year'], dtype='object')] are in the [columns]"

In [26]:
# 사용할 변수들
features = [
    "W", "ERA", "IP", "K9", "BB9",
    "WHIP", "FIP", "log_age", "fa"
] + [col for col in df_log.columns if col.startswith("year_")]

X = df_log[features]
X = sm.add_constant(X)
y = df_log["log_salary_next"]

valid_idx = X.notnull().all(axis=1) & y.notnull()
X_valid = X.loc[valid_idx].astype(float)
y_valid = y.loc[valid_idx].astype(float)

model = sm.OLS(y_valid, X_valid).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:        log_salary_next   R-squared:                       0.830
Model:                            OLS   Adj. R-squared:                  0.759
Method:                 Least Squares   F-statistic:                     11.67
Date:                Mon, 02 Jun 2025   Prob (F-statistic):           1.52e-08
Time:                        11:48:33   Log-Likelihood:                -30.304
No. Observations:                  45   AIC:                             88.61
Df Residuals:                      31   BIC:                             113.9
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         13.5135      3.185      4.242      0.0

In [27]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

X_vif = X_valid.drop(columns=["const"])

vif_df = pd.DataFrame({
    "feature": X_vif.columns,
    "VIF": [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])]
})

print(vif_df.sort_values("VIF", ascending=False))

      feature         VIF
5        WHIP  384.826929
7     log_age  226.440757
1         ERA  163.177197
6         FIP  130.946713
4         BB9   47.299192
3          K9   46.297975
2          IP   39.430901
0           W   24.575794
11  year_2022    3.607549
10  year_2021    2.864645
12  year_2023    2.491411
9   year_2020    2.469073
8          fa    1.314151


In [28]:
# 사용할 변수들
features = [
    "W", "IP",
    "FIP", "fa"
] + [col for col in df_log.columns if col.startswith("year_")]

X = df_log[features]
X = sm.add_constant(X)
y = df_log["log_salary_next"]

valid_idx = X.notnull().all(axis=1) & y.notnull()
X_valid = X.loc[valid_idx].astype(float)
y_valid = y.loc[valid_idx].astype(float)

model = sm.OLS(y_valid, X_valid).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:        log_salary_next   R-squared:                       0.651
Model:                            OLS   Adj. R-squared:                  0.573
Method:                 Least Squares   F-statistic:                     8.385
Date:                Mon, 02 Jun 2025   Prob (F-statistic):           2.38e-06
Time:                        11:49:07   Log-Likelihood:                -46.538
No. Observations:                  45   AIC:                             111.1
Df Residuals:                      36   BIC:                             127.3
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         19.2976      1.084     17.799      0.0

In [29]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

X_vif = X_valid.drop(columns=["const"])

vif_df = pd.DataFrame({
    "feature": X_vif.columns,
    "VIF": [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])]
})

print(vif_df.sort_values("VIF", ascending=False))

     feature        VIF
1         IP  31.906178
0          W  19.799288
2        FIP   6.821161
6  year_2022   2.940216
5  year_2021   2.666609
7  year_2023   2.344901
4  year_2020   2.233258
3         fa   1.123634


## 단순 회귀

In [52]:
import statsmodels.api as sm

# 설명변수와 종속변수 설정
X = df_log[["RS"]]
X = sm.add_constant(X)  # 상수항 추가
y = df_log["W"]

# 결측값 제거
valid_idx = X.notnull().all(axis=1) & y.notnull()
X_valid = X.loc[valid_idx].astype(float)
y_valid = y.loc[valid_idx].astype(float)

# 회귀 실행
model = sm.OLS(y_valid, X_valid).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      W   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.735
Method:                 Least Squares   F-statistic:                     123.0
Date:                Mon, 02 Jun 2025   Prob (F-statistic):           3.39e-14
Time:                        10:30:49   Log-Likelihood:                -132.32
No. Observations:                  45   AIC:                             268.6
Df Residuals:                      43   BIC:                             272.3
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.6667      1.668     -0.400      0.6

## 1단계 2차 OLS

In [23]:
import statsmodels.api as sm

# ✅ 연도 더미 컬럼을 포함한 설명 변수 구성
feature_cols = ["RS", "ERA", "IP"] + [col for col in df_log.columns if col.startswith("year_")]
X = df_log[feature_cols]

# ✅ 상수항 추가
X = sm.add_constant(X)

# ✅ 종속 변수
y = df_log["W"]

# ✅ 결측값 제거
valid_idx = X.notnull().all(axis=1) & y.notnull()
X_valid = X.loc[valid_idx].astype(float)
y_valid = y.loc[valid_idx].astype(float)

# ✅ 회귀 실행
model = sm.OLS(y_valid, X_valid).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      W   R-squared:                       0.840
Model:                            OLS   Adj. R-squared:                  0.810
Method:                 Least Squares   F-statistic:                     27.84
Date:                Mon, 02 Jun 2025   Prob (F-statistic):           6.61e-13
Time:                        11:46:05   Log-Likelihood:                -121.42
No. Observations:                  45   AIC:                             258.8
Df Residuals:                      37   BIC:                             273.3
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          7.4701      4.422      1.689      0.1