In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold

!pip install lightgbm scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.1.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.1.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-25.1.0 scikit-optimize-0.10.2


In [2]:
from google.colab import files
uploaded = files.upload()

Saving track2_test_participant.csv to track2_test_participant.csv
Saving track2_train_participant.csv to track2_train_participant.csv


In [3]:
train = pd.read_csv('track2_train_participant.csv')
test = pd.read_csv('track2_test_participant.csv')

In [4]:
# 1) 문자열로 된 "0.6  < = t < = 0.7" 같은 구간에서 숫자만 뽑아주는 함수
def parse_range(s):
    nums = re.findall(r"[\d\.]+", s)
    return float(nums[0]), float(nums[1])

# 2) 두 그룹(GI 외, GI 오일링)에 대한 암페어 매트릭스를 숫자구간으로 정의
amp_matrix = {
    'GI 외': {
        (0.4, 0.5):    {(800,1850):200},
        (0.5, 0.6):    {(800,1600):200, (1600,1700):210, (1700,1800):220, (1800,1850):230},
        (0.6, 0.7):    {(800,1400):200, (1400,1500):220, (1500,1600):230, (1600,1700):250, (1700,1800):260, (1800,1850):270},
        (0.7, 0.8):    {(800,1200):200, (1200,1300):220, (1300,1400):230, (1400,1500):250, (1500,1600):260, (1600,1700):290, (1700,1800):300, (1800,1850):310},
        (0.8, 0.9):    {(800,1000):200, (1000,1100):210, (1100,1200):230, (1200,1300):250, (1300,1400):270, (1400,1500):280, (1500,1600):290, (1600,1700):310, (1700,1800):330, (1800,1850):340},
        (0.9, 1.0):    {(800,900):200, (900,1000):210, (1000,1100):230, (1100,1200):250, (1200,1300):270, (1300,1400):290, (1400,1500):310, (1500,1600):320, (1600,1700):340, (1700,1800):360, (1800,1850):370},
        (1.0, 1.1):    {(800,900):210, (900,1000):230, (1000,1100):250, (1100,1200):270, (1200,1300):290, (1300,1400):320, (1400,1500):340, (1500,1600):350, (1600,1700):360, (1700,1800):390, (1800,1850):400},
        (1.1, 1.2):    {(800,900):230, (900,1000):240, (1000,1100):280, (1100,1200):290, (1200,1300):310, (1300,1400):330, (1400,1500):350, (1500,1600):360, (1600,1700):390, (1700,1800):410, (1800,1850):420},
        (1.2, 1.3):    {(800,900):250, (900,1000):250, (1000,1100):300, (1100,1200):310, (1200,1300):320, (1300,1400):340, (1400,1500):370, (1500,1600):380, (1600,1700):400, (1700,1800):420, (1800,1850):430},
        (1.3, 1.4):    {(800,900):270, (900,1000):270, (1000,1100):310, (1100,1200):320, (1200,1300):330, (1300,1400):350, (1400,1500):380, (1500,1600):390, (1600,1700):410, (1700,1800):430, (1800,1850):430},
        (1.4, 1.5):    {(800,900):280, (900,1000):290, (1000,1100):320, (1100,1200):340, (1200,1300):350, (1300,1400):360, (1400,1500):390, (1500,1600):420, (1600,1700):430, (1700,1800):430, (1800,1850):430},
        (1.5, 1.6):    {(800,900):300, (900,1000):310, (1000,1100):330, (1100,1200):350, (1200,1300):360, (1300,1400):390, (1400,1500):410, (1500,1600):430, (1600,1700):430, (1700,1800):430, (1800,1850):430},
        (1.6, 1.7):    {(800,900):310, (900,1000):320, (1000,1100):340, (1100,1200):360, (1200,1300):380, (1300,1400):410, (1400,1500):430, (1500,1600):430, (1600,1700):430, (1700,1800):430, (1800,1850):430},
        (1.7, 1.8):    {(800,900):330, (900,1000):340, (1000,1100):350, (1100,1200):370, (1200,1300):400, (1300,1850):430},
        (1.8, 1.9):    {(800,900):340, (900,1100):360, (1100,1200):390, (1200,1850):430},
        (1.9, 2.0):    {(800,900):350, (900,1100):380, (1100,1200):410, (1200,1850):430}
    },
    'GI 오일링': {
        (0.6, 0.7):    {(800,1100):200, (1100,1200):210, (1200,1300):220, (1300,1400):240, (1400,1500):260, (1500,1600):280, (1600,1700):300, (1700,1800):320},
        (0.7, 0.8):    {(800,900):200, (900,1000):200, (1000,1100):220, (1100,1200):240, (1200,1300):260, (1300,1400):280, (1400,1500):300, (1500,1600):320, (1600,1700):330, (1700,1800):340},
        (0.8, 0.9):    {(800,900):200, (900,1000):220, (1000,1100):250, (1100,1200):270, (1200,1300):290, (1300,1400):320, (1400,1500):340, (1500,1600):360, (1600,1700):370, (1700,1800):380},
        (0.9, 1.0):    {(800,900):220, (900,1000):250, (1000,1100):280, (1100,1200):300, (1200,1300):330, (1300,1500):360, (1500,1600):370, (1600,1700):380, (1700,1800):390},
        (1.0, 1.1):    {(800,900):250, (900,1000):280, (1000,1100):300, (1100,1200):330, (1200,1300):360, (1300,1500):370, (1500,1600):380, (1600,1700):400, (1700,1800):430},
        (1.1, 1.2):    {(800,900):270, (900,1000):300, (1000,1100):330, (1100,1200):370, (1200,1400):370, (1400,1500):390, (1500,1600):410, (1600,1700):440},
        (1.2, 1.3):    {(800,900):290, (900,1000):330, (1000,1100):340, (1100,1200):370, (1200,1300):380, (1300,1400):390, (1400,1500):420, (1500,1700):450},
        (1.3, 1.4):    {(800,900):320, (900,1000):350, (1000,1100):360, (1100,1200):390, (1200,1300):390, (1300,1400):420, (1400,1700):450},
        (1.4, 1.5):    {(800,900):340, (900,1000):360, (1000,1100):370, (1100,1200):400, (1200,1300):420, (1300,1700):450},
        (1.5, 1.6):    {(800,900):370, (900,1000):370, (1000,1100):380, (1100,1200):410, (1200,1300):450, (1300,1700):450},
        (1.6, 1.7):    {(800,900):370, (900,1000):380, (1000,1100):400, (1100,1200):440, (1200,1300):450, (1300,1700):450},
        (1.7, 1.8):    {(800,900):370, (900,1000):390, (1000,1100):420, (1100,1200):450, (1200,1700):450},
        (1.8, 1.9):    {(800,900):380, (900,1000):410, (1000,1100):450, (1100,1200):450, (1200,1400):450, (1400,1700):450},
        (1.9, float('inf')):{(800,900):390, (900,1000):430, (1000,1100):450, (1100,1200):450, (1200,1400):450, (1400,1700):450}
    }
}

In [5]:
import re

def check_invalid_size_format(df):
    pattern = re.compile(r'^\s*\d+(\.\d+)?\s*x\s*\d+(\.\d+)?\s*x\s*[Cc]\s*$')  # 예: 123x456xC 또는 12.5x45.3xC

    for col in ['Size', 'Size.1']:
        if col in df.columns:
            invalid_values = df[~df[col].astype(str).str.match(pattern)][col].unique()
            if len(invalid_values) > 0:
                print(f"열 '{col}'에서 형식이 다른 값들:")
                print(invalid_values)
            else:
                print(f"열 '{col}'은 모두 올바른 형식입니다.")
        else:
            print(f"열 '{col}'이(가) 데이터프레임에 존재하지 않습니다.")

check_invalid_size_format(train)
check_invalid_size_format(test)

print(train[['Size', 'Size.1']].head())

열 'Size'은 모두 올바른 형식입니다.
열 'Size.1'은 모두 올바른 형식입니다.
열 'Size'은 모두 올바른 형식입니다.
열 'Size.1'은 모두 올바른 형식입니다.
                 Size              Size.1
0   1.49 x 1253.2 x C   1.49 x 1253.2 x C
1     1.178 x 942 x C     1.178 x 924 x C
2  4.018 x 1268.6 x C    4.018 x 1251 x C
3  1.541 x 1221.6 x C  1.541 x 1221.6 x C
4  0.807 x 1545.2 x C    0.807 x 1530 x C


In [6]:
def convert_size_columns(df):
    df = df.copy()
    for col in ['Size', 'Size.1']:
        if col in df.columns:
            # 공백 제거
            cleaned = df[col].astype(str).str.replace(" ", "", regex=False)
            # 정규식으로 숫자 추출
            extracted = cleaned.str.extract(r'^(\d+\.?\d*)x(\d+\.?\d*)x[Cc]$')
            # float 변환 및 곱하기
            area = extracted[0].astype(float) * extracted[1].astype(float)
            df[col] = area
    return df

train = convert_size_columns(train)
test = convert_size_columns(test)

print(train[['Size', 'Size.1']].head())

print("Size 열의 NaN 개수:", train['Size'].isnull().sum())
print("Size.1 열의 NaN 개수:", train['Size.1'].isnull().sum())

        Size     Size.1
0  1867.2680  1867.2680
1  1109.6760  1088.4720
2  5097.2348  5026.5180
3  1882.4856  1882.4856
4  1246.9764  1234.7100
Size 열의 NaN 개수: 0
Size.1 열의 NaN 개수: 0


In [7]:
from sklearn.linear_model import LinearRegression

def fill_length_with_weight(df):
    df = df.copy()

    # 0을 NaN으로 변경
    df['소재길이'] = df['소재길이'].replace(0, np.nan)

    # 회귀 학습용 데이터 준비 (둘 다 값이 있는 경우만)
    valid1 = df[['소재길이', '소재중량']].dropna()
    X1_train = valid1[['소재중량']]
    y1_train = valid1['소재길이']

    # 선형 회귀 모델 학습
    model1 = LinearRegression()
    model1.fit(X1_train, y1_train)

    # 결측치 채우기
    missing = df['소재길이'].isna()
    df.loc[missing, '소재길이'] = model1.predict(df.loc[missing, ['소재중량']])

    return df

train = fill_length_with_weight(train)
print("소재길이 열의 NaN 개수:", train['소재길이'].isnull().sum())

소재길이 열의 NaN 개수: 0


In [8]:
def preprocess(df):
    df = df.copy()
    # 결측치 처리
    most_freq = df['도유'].mode()[0]
    df['도유'].fillna(most_freq, inplace=True)
    df['강종'].fillna('일반', inplace=True)
    for col in ['장력1','스피드1','장력2','스피드2','장력3','스피드3']:
        df[col].fillna(0, inplace=True)

    # 기준 암페어 계산
    def get_amp(r):
        lo_t, hi_t = parse_range(r['두께그룹'])
        lo_w, hi_w = parse_range(r['폭그룹'])
        mat = amp_matrix.get(r['품명그룹'], {})
        for (t_lo, t_hi), wmap in mat.items():
            if lo_t >= t_lo and hi_t <= t_hi:
                for (w_lo, w_hi), amp in wmap.items():
                    if lo_w >= w_lo and hi_w <= w_hi:
                        return amp
        return None

    df['기준_암페어'] = df.apply(get_amp, axis=1)
    df['기준_암페어'].fillna(0, inplace=True)
    df['가공불가_플래그'] = (df['기준_암페어'] == 0).astype(int)

    # 불필요 컬럼 제거
    drop_cols = ['OK','NG','라인','작업장','소유','유형','수량','제품구분','수요가','월','일자','No.','제품번호', '폭그룹', '두께그룹', '마진그룹', '생산지시번호','소재번호']
    df.drop(columns=[c for c in drop_cols if c in df.columns], inplace=True)

    # 범주형 인코딩
    # for c in df.select_dtypes(include=['object']).columns:
    #     df[c] = df[c].astype('category')
    # cat_cols = df.select_dtypes(include=['category']).columns.tolist()
    # oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    # df[cat_cols] = oe.fit_transform(df[cat_cols])
    # df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

    # df["장력_스피드_ratio"] = df["장력"] / (df["스피드"] + 1)
    # df["장력1_장력2_diff"] = df["장력1"] - df["장력2"]
    # df["장력1x스피드1"] = df["장력1"] * df["스피드1"]
    # df["두께x소재폭"] = df["두께"] * df["소재폭"]

    # df["생산년도"] = df["생산일"].astype(str).str[:4].astype(int)
    # df["생산월"] = df["생산일"].astype(str).str[4:6].astype(int)
    # df["생산월_sin"] = np.sin(2 * np.pi * df["생산월"] / 12)
    # df["생산월_cos"] = np.cos(2 * np.pi * df["생산월"] / 12)

    # df["강종별_기준장력평균차"] = df["기준장력"] - df.groupby("강종")["기준장력"].transform("mean")
    # df["품명그룹_단중비율"] = df["단중"] / (df.groupby("품명그룹")["단중"].transform("mean") + 1)

    # df["log_소재중량"] = np.log1p(df["소재중량"])
    # df["log_장력"] = np.log1p(df["장력"])

    # df["장력_std"] = df[["장력1", "장력2", "장력3"]].std(axis=1)
    # df["스피드_max"] = df[["스피드1", "스피드2", "스피드3"]].max(axis=1)

    df["장력1_기준장력_ratio"] = df["장력1"] / (df["기준장력"] + 1)

    # 2. 장력 평균, 표준편차
    df["장력_avg"] = df[["장력1", "장력2", "장력3"]].mean(axis=1)
    df["장력_std"] = df[["장력1", "장력2", "장력3"]].std(axis=1)

    # 3. 스피드1 대비 장력1 비율 + 곱
    df["스피드1_장력1_ratio"] = df["스피드1"] / (df["장력1"] + 1)
    df["장력x스피드1"] = df["장력1"] * df["스피드1"]

    # 4. 생산월을 주기형 변수로 변환
    df["생산월"] = df["생산일"].astype(str).str[4:6].astype(int)
    df["생산월_sin"] = np.sin(2 * np.pi * df["생산월"] / 12)
    df["생산월_cos"] = np.cos(2 * np.pi * df["생산월"] / 12)

    # 5. 단중 / 소재폭 비율
    df["단중_소재폭_ratio"] = df["단중"] / (df["소재폭"] + 1)

    # 6. 소재중량 로그 변환
    df["log_소재중량"] = np.log1p(df["소재중량"])

    # 7. 강종별 기준장력 평균과의 차이
    df["강종별_기준장력평균차"] = df["기준장력"] - df.groupby("강종")["기준장력"].transform("mean")

    return df

In [9]:
# 전처리 적용
X_train = preprocess(train)
X_test  = preprocess(test)

# train/test 컬럼 정렬
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
y = train['OK']

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['도유'].fillna(most_freq, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['강종'].fillna('일반', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves a

In [10]:
print(X_train.isnull().sum()[X_train.isnull().sum() > 0])
print(X_test.isnull().sum()[X_test.isnull().sum() > 0])



X_train['강종'] = X_train['강종'].astype('category')
X_train['품명그룹'] = X_train['품명그룹'].astype('category')
X_train['품명'] = X_train['품명'].astype('category')
X_train['재질'] = X_train['재질'].astype('category')
X_train['도유'] = X_train['도유'].astype('category')

X_test['강종'] = X_test['강종'].astype('category')
X_test['품명그룹'] = X_test['품명그룹'].astype('category')
X_test['품명'] = X_test['품명'].astype('category')
X_test['재질'] = X_test['재질'].astype('category')
X_test['도유'] = X_test['도유'].astype('category')


Series([], dtype: int64)
Series([], dtype: int64)


In [11]:
# from sklearn.model_selection import StratifiedKFold
# from sklearn.metrics import f1_score, roc_auc_score
# from lightgbm import LGBMClassifier
# import numpy as np

# def evaluate_lgbm(X, y, cv=5):
#     model = LGBMClassifier()
#     f1_scores = []
#     roc_scores = []

#     skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

#     for train_idx, val_idx in skf.split(X, y):
#         X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
#         y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

#         model.fit(X_train, y_train, categorical_feature=['강종','품명그룹', '품명', '재질', '도유'])
#         y_pred = model.predict(X_val)
#         y_prob = model.predict_proba(X_val)[:, 1]

#         f1 = f1_score(y_val, y_pred)
#         roc = roc_auc_score(y_val, y_prob)

#         f1_scores.append(f1)
#         roc_scores.append(roc)

#     f1_mean = np.mean(f1_scores)
#     roc_mean = np.mean(roc_scores)

#     weighted_score = (0.4 * f1_mean + 0.6 * roc_mean) ** 2
#     print(f'LightGBM: {weighted_score:.5f}')
#     print(f'F1 Score: {f1_mean:.5f}')
#     print(f'ROC AUC Score: {roc_mean:.5f}')

# evaluate_lgbm(X_train, y)

In [12]:
# # 모델 학습 및 예측
# model = LGBMClassifier(random_state=42)
# model.fit(X_train, y)

# y_pred_test = model.predict(X_test)

# #파일 다운로드
# X_train.to_csv('X_train.csv', index=False)
# from google.colab import files
# files.download('X_train.csv')

In [13]:
# submission = pd.DataFrame({
#     'id': range(len(y_pred_test)),
#     'OK': (y_pred_test == 1).astype(int),
#     'NG': (y_pred_test == 0).astype(int)
# })
# submission.to_csv('submission.csv', index=False)


# # Colab에서 다운로드
# from google.colab import files
# files.download('submission.csv')

In [14]:
# def apply_post_rules(X_test, y_pred_test):
#     y_pred_test = y_pred_test.copy()

#     # 조건 정의
#     condition = (
#         (X_test['장력1'] == 0) | (X_test['장력1'].isnull()) |
#         (X_test['스피드1'] == 0) | (X_test['스피드1'].isnull()) |
#         (X_test['장력2'] == 0) | (X_test['장력2'].isnull()) |
#         (X_test['스피드2'] == 0) | (X_test['스피드2'].isnull()) |
#         (X_test['장력3'] == 0) | (X_test['장력3'].isnull()) |
#         (X_test['스피드3'] == 0) | (X_test['스피드3'].isnull())
#     )

#     if '장력' in X_test.columns:
#         condition |= (X_test['장력'] == 0)
#     if '스피드' in X_test.columns:
#         condition |= (X_test['스피드'] == 0)
#     if '품명_FH' in X_test.columns:
#         condition |= (X_test['품명_FH'] == 1)

#     # 조건 만족하는 경우 무조건 불량품 (OK = 0) 처리
#     y_pred_test[condition] = 0

#     return y_pred_test

In [15]:
# y_pred_test = model.predict(X_test)

# # #파일 다운로드
# # X_train.to_csv('X_train.csv', index=False)
# # from google.colab import files
# # files.download('X_train.csv')

# submission = pd.DataFrame({
#     'id': range(len(y_pred_test)),
#     'OK': (y_pred_test == 1).astype(int),
#     'NG': (y_pred_test == 0).astype(int)
# })
# submission.to_csv('submission.csv', index=False)


# # Colab에서 다운로드
# from google.colab import files
# files.download('submission.csv')

In [16]:
# from google.colab import files
# import pandas as pd

# # 파일 업로드
# uploaded = files.upload()

# # 업로드된 파일 이름 추출
# file_names = list(uploaded.keys())
# if len(file_names) != 2:
#     raise ValueError("CSV 파일 2개를 업로드해주세요.")

# # 파일 불러오기
# df1 = pd.read_csv(file_names[0])
# df2 = pd.read_csv(file_names[1])

# # id 기준 정렬 (순서 차이 방지)
# df1 = df1.sort_values('id').reset_index(drop=True)
# df2 = df2.sort_values('id').reset_index(drop=True)

# # OK 값이 다른 행 찾기
# diff_mask = df1['OK'] != df2['OK']
# diff_ids = df1.loc[diff_mask, 'id']

# # 결과 출력
# print("OK 값이 다른 샘플 ID 목록:")
# print(diff_ids.tolist())
# print(f"총 {len(diff_ids)}개의 샘플에서 OK 값이 다릅니다.")

In [17]:
# from google.colab import files
# import pandas as pd

# # 파일 업로드
# uploaded = files.upload()

# # 업로드된 파일 이름 추출
# file_names = list(uploaded.keys())
# if len(file_names) != 2:
#     raise ValueError("CSV 파일 2개를 업로드해주세요.")

# # 파일 불러오기
# df1 = pd.read_csv(file_names[0])
# df2 = pd.read_csv(file_names[1])

# # id 기준 정렬 (순서 차이 방지)
# df1 = df1.sort_values('id').reset_index(drop=True)
# df2 = df2.sort_values('id').reset_index(drop=True)

# # OK 값이 다른 행 찾기
# diff_mask = df1['OK'] != df2['OK']
# diff_ids = df1.loc[diff_mask, 'id']

# # 결과 출력
# print("OK 값이 다른 샘플 ID 목록:")
# print(diff_ids.tolist())
# print(f"총 {len(diff_ids)}개의 샘플에서 OK 값이 다릅니다.")

In [18]:
# # Select only numeric columns for correlation calculation
# numeric_df = df.select_dtypes(include=np.number)

# # Calculate and print correlation for numeric columns
# print(numeric_df.corr()['target'].sort_values(ascending=False).head(10))

###TABNET

In [19]:
from sklearn.preprocessing import LabelEncoder

# 모든 범주형 변수에 대해 label encoding
for col in ['강종', '품명그룹', '품명', '재질', '도유']:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = le.fit_transform(X_test[col])

In [20]:
# from xgboost import XGBClassifier
# xgb = XGBClassifier(**xgb_best_params)
# xgb.fit(X_train, y)
# xgb_pred_proba = xgb.predict_proba(X_test)[:, 1]

In [22]:
# import lightgbm as lgb
# from sklearn.metrics import accuracy_score

# ⬇ 여기에 너가 찾은 최적 파라미터를 넣어
# best_params = {
#     'num_leaves': 150,
#     'max_depth': 15,
#     'learning_rate': 0.14443508046904366,
#     'min_child_samples': 29,
#     'subsample': 0.9142372374246703,
#     'colsample_bytree': 1.0,
#     'reg_alpha': 0.016064982198031878,
#     'reg_lambda': 0.41465475681763364,
#     'random_state': 42
# }


# model = lgb.LGBMClassifier(**best_params)

# model.fit(X_train, y)
# lgbm_pred_proba = model.predict_proba(X_test)[:, 1]

In [None]:
# # 단순 평균 앙상블
# ensemble_proba = (lgbm_pred_proba + xgb_pred_proba) / 2

# # 또는 가중 평균 앙상블
# ensemble_proba = 0.6 * lgbm_pred_proba + 0.4 * xgb_pred_proba

In [None]:
# import numpy as np
# from sklearn.model_selection import StratifiedKFold
# from sklearn.metrics import f1_score, roc_auc_score
# from lightgbm import LGBMClassifier
# from xgboost import XGBClassifier

# # 준비
# X_np = X_train.copy()
# y_np = y.loc[X_train.index].copy()
# n_samples = X_np.shape[0]

# # OOF 예측 확률 초기화
# oof_lgb = np.zeros(n_samples)
# oof_xgb = np.zeros(n_samples)

# # K-Fold 설정
# kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# # Fold 반복
# for fold, (train_idx, valid_idx) in enumerate(kf.split(X_np, y_np)):
#     print(f"\n🚀 Fold {fold+1}")

#     X_train, X_valid = X_np[train_idx], X_np[valid_idx]
#     y_train, y_valid = y_np[train_idx], y_np[valid_idx]

#     # 1️⃣ LightGBM 모델
#     lgb_model = LGBMClassifier(
#         **best_params,  # 너가 튜닝한 파라미터

#     )
#     lgb_model.fit(X_train, y_train)
#     oof_lgb[valid_idx] = lgb_model.predict_proba(X_valid)[:, 1]

#     # 2️⃣ XGBoost 모델
#     xgb_model = XGBClassifier(
#         **xgb_best_params,  # 너가 튜닝한 파라미터
#         use_label_encoder=False,
#         eval_metric='logloss',
#         random_state=42
#     )
#     xgb_model.fit(X_train, y_train)
#     oof_xgb[valid_idx] = xgb_model.predict_proba(X_valid)[:, 1]

# # 3️⃣ 앙상블 (단순 평균)
# oof_ensemble = (oof_lgb + oof_xgb) / 2
# y_pred_final = (oof_ensemble > 0.5).astype(int)

# # 4️⃣ 성능 평가
# f1 = f1_score(y_np, y_pred_final)
# auc = roc_auc_score(y_np, oof_ensemble)
# combined = 0.4* f1 + 0.6 * auc

# print("\n🎯 앙상블 OOF 최종 점수")
# print(f"F1: {f1:.4f}")
# print(f"AUC: {auc:.4f}")
# print(f"혼합 점수 (0.4 * F1 + 0.6 * AUC): {combined:.4f}")

In [None]:
# import numpy as np
# from lightgbm import LGBMClassifier
# from xgboost import XGBClassifier
# from sklearn.model_selection import StratifiedKFold

# # X: 전체 학습 feature, y: 전체 학습 정답
# # X_test: 예측하고 싶은 테스트셋 (DataFrame)

# X_np = X_train.copy()
# y_np = y.copy()
# X_test_np = X_test

# # 앙상블 예측을 저장할 공간
# test_preds_lgb = np.zeros(X_test_np.shape[0])
# test_preds_xgb = np.zeros(X_test_np.shape[0])

# # K-Fold 설정
# kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# for fold, (train_idx, valid_idx) in enumerate(kf.split(X_np, y_np)):
#     print(f"\n📦 Fold {fold+1} - 모델 학습 및 테스트셋 예측")

#     X_train, y_train = X_np[train_idx], y_np[train_idx]

#     # LGBM
#     lgb_model = LGBMClassifier(**best_params, random_state=42)
#     lgb_model.fit(X_train, y_train)
#     test_preds_lgb += lgb_model.predict_proba(X_test_np)[:, 1] / kf.n_splits

#     # XGBoost
#     xgb_model = XGBClassifier(**xgb_best_params,
#                               use_label_encoder=False,
#                               eval_metric='logloss',
#                               random_state=42)
#     xgb_model.fit(X_train, y_train)
#     test_preds_xgb += xgb_model.predict_proba(X_test_np)[:, 1] / kf.n_splits

# # 최종 앙상블 (단순 평균)
# test_preds_ensemble = (test_preds_lgb + test_preds_xgb) / 2

# # 이진 분류 결과 생성
# test_final_preds = (test_preds_ensemble > 0.5).astype(int)

# # 결과 보기
# print("\n🎯 최종 테스트셋 예측 완료!")
# print(test_final_preds[:10])  # 앞부분만 확인

In [23]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
from skopt import BayesSearchCV
from skopt.space import Real, Integer



# 3. LightGBM 모델 정의
lgbm = lgb.LGBMClassifier(random_state=42)

# 4. 튜닝할 파라미터 공간 정의
param_space = {
    'num_leaves': Integer(20, 150),
    'max_depth': Integer(3, 15),
    'learning_rate': Real(0.01, 0.3, prior='log-uniform'),
    'min_child_samples': Integer(5, 50),
    'subsample': Real(0.5, 1.0),
    'colsample_bytree': Real(0.5, 1.0),
    'reg_alpha': Real(0.0, 1.0),
    'reg_lambda': Real(0.0, 1.0),
}

#### 베이지안 옵티마이저 코드

from sklearn.metrics import make_scorer, f1_score, roc_auc_score

def f1_auc_combined(y_true, y_pred_proba, **kwargs):
    y_pred = (y_pred_proba > 0.5).astype(int)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred_proba)
    return 0.4 * f1 + 0.6 * auc  # 가중 평균 (비율 조정 가능)

scorer = make_scorer(f1_auc_combined, needs_proba=True)
opt = BayesSearchCV(
    estimator=lgbm,
    search_spaces=param_space,
    n_iter=30,
    scoring=scorer,
    cv=3,
    random_state=42
)
'''
opt = BayesSearchCV(
    estimator=lgbm,
    search_spaces=param_space,
    n_iter=30,
    scoring='f1',   # 여기서 f1 하나만!
    cv=3,
    random_state=42,
    verbose=0
)
'''

# 6. 탐색 실행
opt.fit(X_train, y)

# 7. 결과 출력
print("Best Parameters:", opt.best_params_)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
[LightGBM] [Info] Number of positive: 481, number of negative: 746
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000127 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6402
[LightGBM] [Info] Number of data points in the train set: 1227, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.392013 -> initscore=-0.438858
[LightGBM] [Info] Start training from score -0.438858
[LightGBM] [Info] Number of positive: 480, number of negative: 746
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000616 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6275
[LightGBM] [Info] Number of data points in the train set: 1226, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]:

In [24]:
from xgboost import XGBClassifier
from skopt import BayesSearchCV
from skopt.space import Real, Integer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, make_scorer


# 2. 평가 함수 (F1 + AUC 혼합 점수)
def f1_auc_score(y_true, y_pred_proba, **kwargs):
    y_pred = (y_pred_proba > 0.5).astype(int)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred_proba)
    return 0.4 * f1 + 0.6 * auc

scorer = make_scorer(f1_auc_score, needs_proba=True)

# 3. 탐색할 파라미터 공간
param_space = {
    'n_estimators': Integer(100, 500),
    'max_depth': Integer(3, 10),
    'learning_rate': Real(0.01, 0.3, prior='log-uniform'),
    'subsample': Real(0.6, 1.0),
    'colsample_bytree': Real(0.6, 1.0),
    'min_child_weight': Integer(1, 10),
    'gamma': Real(0, 5),
    'reg_alpha': Real(0.0, 1.0),
    'reg_lambda': Real(0.0, 1.0),
}

# 4. 모델 정의
xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

# 5. BayesSearchCV 실행
opt = BayesSearchCV(
    estimator=xgb,
    search_spaces=param_space,
    n_iter=30,
    scoring=scorer,
    cv=3,
    random_state=42,
    verbose=0
)

opt.fit(X_train, y)

# 6. 결과 출력
print("Best Parameters:")
print(opt.best_params_)

# 7. 최적 모델 저장
xgb_best_params = opt.best_params_
best_model = opt.best_estimator_

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best Parameters:
OrderedDict([('colsample_bytree', 0.9349553422213137), ('gamma', 4.416576386904311), ('learning_rate', 0.02806554771929606), ('max_depth', 10), ('min_child_weight', 9), ('n_estimators', 125), ('reg_alpha', 0.13830853827857517), ('reg_lambda', 0.3535873976284181), ('subsample', 0.8542916407516681)])


Parameters: { "use_label_encoder" } are not used.



In [25]:
import numpy as np
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score

# 준비: numpy 변환
X_np = X_train
y_np = y
X_test_np = X_test

# OOF 예측 초기화
oof_lgb = np.zeros(X_np.shape[0])
oof_xgb = np.zeros(X_np.shape[0])

# 테스트 예측 누적값
test_preds_lgb = np.zeros(X_test_np.shape[0])
test_preds_xgb = np.zeros(X_test_np.shape[0])

# 5-Fold 설정
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, valid_idx) in enumerate(kf.split(X_np, y_np)):
    print(f"\n🚀 Fold {fold+1}")

    X_traindd, X_validdd = X_np.iloc[train_idx], X_np.iloc[valid_idx]
    y_traindd, y_validdd = y_np.iloc[train_idx], y_np.iloc[valid_idx]

    # 1️⃣ LightGBM
    lgb_model = LGBMClassifier(
        **opt.best_params_,

    )
    lgb_model.fit(X_traindd, y_traindd)
    oof_lgb[valid_idx] = lgb_model.predict_proba(X_validdd)[:, 1]
    test_preds_lgb += lgb_model.predict_proba(X_test_np)[:, 1] / kf.n_splits

    # 2️⃣ XGBoost
    xgb_model = XGBClassifier(
        **xgb_best_params,
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42
    )
    xgb_model.fit(X_traindd, y_traindd)
    oof_xgb[valid_idx] = xgb_model.predict_proba(X_validdd)[:, 1]
    test_preds_xgb += xgb_model.predict_proba(X_test_np)[:, 1] / kf.n_splits

# 앙상블 (단순 평균)
oof_ensemble = 0.8*oof_lgb + 0.2*oof_xgb
test_preds_ensemble = 0.8*test_preds_lgb + 0.2*test_preds_xgb

# OOF 최종 평가
y_pred_final = (oof_ensemble > 0.5).astype(int)
f1 = f1_score(y_np, y_pred_final)
auc = roc_auc_score(y_np, oof_ensemble)
combined = 0.4 * f1 + 0.6 * auc

print("\n✅ OOF 평가 결과")
print(f"F1: {f1:.4f}")
print(f"AUC: {auc:.4f}")
print(f"혼합 점수 (0.4*F1 + 0.6*AUC): {combined:.4f}")

# 테스트셋 최종 결과 (0/1)
test_final_preds = (test_preds_ensemble > 0.5).astype(int)
print("\n🎯 테스트셋 예측 결과 샘플:", test_final_preds[:10])

Parameters: { "use_label_encoder" } are not used.




🚀 Fold 1
[LightGBM] [Info] Number of positive: 577, number of negative: 895
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000208 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6513
[LightGBM] [Info] Number of data points in the train set: 1472, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.391984 -> initscore=-0.438981
[LightGBM] [Info] Start training from score -0.438981

🚀 Fold 2
[LightGBM] [Info] Number of positive: 577, number of negative: 895
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000189 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6562
[LightGBM] [Info] Number of data points in the train set: 1472, number of used features: 41
[LightGB

Parameters: { "use_label_encoder" } are not used.




🚀 Fold 3
[LightGBM] [Info] Number of positive: 577, number of negative: 895
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000186 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6520
[LightGBM] [Info] Number of data points in the train set: 1472, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.391984 -> initscore=-0.438981
[LightGBM] [Info] Start training from score -0.438981

🚀 Fold 4
[LightGBM] [Info] Number of positive: 577, number of negative: 895
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000281 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6571
[LightGBM] [Info] Number of data points in the train set: 1472, number of used features: 41
[LightGB

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




🚀 Fold 5
[LightGBM] [Info] Number of positive: 576, number of negative: 896
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000185 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6643
[LightGBM] [Info] Number of data points in the train set: 1472, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.391304 -> initscore=-0.441833
[LightGBM] [Info] Start training from score -0.441833

✅ OOF 평가 결과
F1: 0.9972
AUC: 1.0000
혼합 점수 (0.4*F1 + 0.6*AUC): 0.9989

🎯 테스트셋 예측 결과 샘플: [0 1 0 0 0 1 1 0 0 1]


Parameters: { "use_label_encoder" } are not used.



In [26]:
y_pred_test = test_final_preds

submission = pd.DataFrame({
    'id': range(len(y_pred_test)),
    'OK': (y_pred_test == 1).astype(int),
    'NG': (y_pred_test == 0).astype(int)
})
submission.to_csv('submission.csv', index=False)


# Colab에서 다운로드
from google.colab import files
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [27]:
from google.colab import files
import pandas as pd

# 파일 업로드
uploaded = files.upload()

# 업로드된 파일 이름 추출
file_names = list(uploaded.keys())
if len(file_names) != 2:
    raise ValueError("CSV 파일 2개를 업로드해주세요.")

# 파일 불러오기
df1 = pd.read_csv(file_names[0])
df2 = pd.read_csv(file_names[1])

# id 기준 정렬 (순서 차이 방지)
df1 = df1.sort_values('id').reset_index(drop=True)
df2 = df2.sort_values('id').reset_index(drop=True)

# OK 값이 다른 행 찾기
diff_mask = df1['OK'] != df2['OK']
diff_ids = df1.loc[diff_mask, 'id']

# 결과 출력
print("OK 값이 다른 샘플 ID 목록:")
print(diff_ids.tolist())
print(f"총 {len(diff_ids)}개의 샘플에서 OK 값이 다릅니다.")

Saving 1시1분.csv to 1시1분.csv
Saving 16일 최종제출.csv to 16일 최종제출.csv
OK 값이 다른 샘플 ID 목록:
[2, 17, 20, 41, 122, 389]
총 6개의 샘플에서 OK 값이 다릅니다.


In [None]:
#파일 다운로드
X_train.to_csv('X_train.csv', index=False)
from google.colab import files
files.download('X_train.csv')