In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold

In [2]:
from google.colab import files
uploaded = files.upload()

Saving track2_test_participant.csv to track2_test_participant (1).csv
Saving track2_train_participant.csv to track2_train_participant (1).csv


In [3]:
train = pd.read_csv('track2_train_participant.csv')
test = pd.read_csv('track2_test_participant.csv')

In [4]:
# 1) 문자열로 된 "0.6  < = t < = 0.7" 같은 구간에서 숫자만 뽑아주는 함수
def parse_range(s):
    nums = re.findall(r"[\d\.]+", s)
    return float(nums[0]), float(nums[1])

# 2) 두 그룹(GI 외, GI 오일링)에 대한 암페어 매트릭스를 숫자구간으로 정의
amp_matrix = {
    'GI 외': {
        (0.4, 0.5):    {(800,1850):200},
        (0.5, 0.6):    {(800,1600):200, (1600,1700):210, (1700,1800):220, (1800,1850):230},
        (0.6, 0.7):    {(800,1400):200, (1400,1500):220, (1500,1600):230, (1600,1700):250, (1700,1800):260, (1800,1850):270},
        (0.7, 0.8):    {(800,1200):200, (1200,1300):220, (1300,1400):230, (1400,1500):250, (1500,1600):260, (1600,1700):290, (1700,1800):300, (1800,1850):310},
        (0.8, 0.9):    {(800,1000):200, (1000,1100):210, (1100,1200):230, (1200,1300):250, (1300,1400):270, (1400,1500):280, (1500,1600):290, (1600,1700):310, (1700,1800):330, (1800,1850):340},
        (0.9, 1.0):    {(800,900):200, (900,1000):210, (1000,1100):230, (1100,1200):250, (1200,1300):270, (1300,1400):290, (1400,1500):310, (1500,1600):320, (1600,1700):340, (1700,1800):360, (1800,1850):370},
        (1.0, 1.1):    {(800,900):210, (900,1000):230, (1000,1100):250, (1100,1200):270, (1200,1300):290, (1300,1400):320, (1400,1500):340, (1500,1600):350, (1600,1700):360, (1700,1800):390, (1800,1850):400},
        (1.1, 1.2):    {(800,900):230, (900,1000):240, (1000,1100):280, (1100,1200):290, (1200,1300):310, (1300,1400):330, (1400,1500):350, (1500,1600):360, (1600,1700):390, (1700,1800):410, (1800,1850):420},
        (1.2, 1.3):    {(800,900):250, (900,1000):250, (1000,1100):300, (1100,1200):310, (1200,1300):320, (1300,1400):340, (1400,1500):370, (1500,1600):380, (1600,1700):400, (1700,1800):420, (1800,1850):430},
        (1.3, 1.4):    {(800,900):270, (900,1000):270, (1000,1100):310, (1100,1200):320, (1200,1300):330, (1300,1400):350, (1400,1500):380, (1500,1600):390, (1600,1700):410, (1700,1800):430, (1800,1850):430},
        (1.4, 1.5):    {(800,900):280, (900,1000):290, (1000,1100):320, (1100,1200):340, (1200,1300):350, (1300,1400):360, (1400,1500):390, (1500,1600):420, (1600,1700):430, (1700,1800):430, (1800,1850):430},
        (1.5, 1.6):    {(800,900):300, (900,1000):310, (1000,1100):330, (1100,1200):350, (1200,1300):360, (1300,1400):390, (1400,1500):410, (1500,1600):430, (1600,1700):430, (1700,1800):430, (1800,1850):430},
        (1.6, 1.7):    {(800,900):310, (900,1000):320, (1000,1100):340, (1100,1200):360, (1200,1300):380, (1300,1400):410, (1400,1500):430, (1500,1600):430, (1600,1700):430, (1700,1800):430, (1800,1850):430},
        (1.7, 1.8):    {(800,900):330, (900,1000):340, (1000,1100):350, (1100,1200):370, (1200,1300):400, (1300,1850):430},
        (1.8, 1.9):    {(800,900):340, (900,1100):360, (1100,1200):390, (1200,1850):430},
        (1.9, 2.0):    {(800,900):350, (900,1100):380, (1100,1200):410, (1200,1850):430}
    },
    'GI 오일링': {
        (0.6, 0.7):    {(800,1100):200, (1100,1200):210, (1200,1300):220, (1300,1400):240, (1400,1500):260, (1500,1600):280, (1600,1700):300, (1700,1800):320},
        (0.7, 0.8):    {(800,900):200, (900,1000):200, (1000,1100):220, (1100,1200):240, (1200,1300):260, (1300,1400):280, (1400,1500):300, (1500,1600):320, (1600,1700):330, (1700,1800):340},
        (0.8, 0.9):    {(800,900):200, (900,1000):220, (1000,1100):250, (1100,1200):270, (1200,1300):290, (1300,1400):320, (1400,1500):340, (1500,1600):360, (1600,1700):370, (1700,1800):380},
        (0.9, 1.0):    {(800,900):220, (900,1000):250, (1000,1100):280, (1100,1200):300, (1200,1300):330, (1300,1500):360, (1500,1600):370, (1600,1700):380, (1700,1800):390},
        (1.0, 1.1):    {(800,900):250, (900,1000):280, (1000,1100):300, (1100,1200):330, (1200,1300):360, (1300,1500):370, (1500,1600):380, (1600,1700):400, (1700,1800):430},
        (1.1, 1.2):    {(800,900):270, (900,1000):300, (1000,1100):330, (1100,1200):370, (1200,1400):370, (1400,1500):390, (1500,1600):410, (1600,1700):440},
        (1.2, 1.3):    {(800,900):290, (900,1000):330, (1000,1100):340, (1100,1200):370, (1200,1300):380, (1300,1400):390, (1400,1500):420, (1500,1700):450},
        (1.3, 1.4):    {(800,900):320, (900,1000):350, (1000,1100):360, (1100,1200):390, (1200,1300):390, (1300,1400):420, (1400,1700):450},
        (1.4, 1.5):    {(800,900):340, (900,1000):360, (1000,1100):370, (1100,1200):400, (1200,1300):420, (1300,1700):450},
        (1.5, 1.6):    {(800,900):370, (900,1000):370, (1000,1100):380, (1100,1200):410, (1200,1300):450, (1300,1700):450},
        (1.6, 1.7):    {(800,900):370, (900,1000):380, (1000,1100):400, (1100,1200):440, (1200,1300):450, (1300,1700):450},
        (1.7, 1.8):    {(800,900):370, (900,1000):390, (1000,1100):420, (1100,1200):450, (1200,1700):450},
        (1.8, 1.9):    {(800,900):380, (900,1000):410, (1000,1100):450, (1100,1200):450, (1200,1400):450, (1400,1700):450},
        (1.9, float('inf')):{(800,900):390, (900,1000):430, (1000,1100):450, (1100,1200):450, (1200,1400):450, (1400,1700):450}
    }
}

In [5]:
def preprocess(df):
    df = df.copy()
    # 결측치 처리
    most_freq = train['도유'].mode()[0]
    df['도유'].fillna(most_freq, inplace=True)
    df['강종'].fillna('일반', inplace=True)
    df['소재길이'].fillna(train['소재길이'].mean(), inplace=True)
    for col in ['장력1','스피드1','장력2','스피드2','장력3','스피드3']:
        df[col].fillna(train[col].median(), inplace=True)

    # 기준 암페어 계산
    def get_amp(r):
        lo_t, hi_t = parse_range(r['두께그룹'])
        lo_w, hi_w = parse_range(r['폭그룹'])
        mat = amp_matrix.get(r['품명그룹'], {})
        for (t_lo, t_hi), wmap in mat.items():
            if lo_t >= t_lo and hi_t <= t_hi:
                for (w_lo, w_hi), amp in wmap.items():
                    if lo_w >= w_lo and hi_w <= w_hi:
                        return amp
        return None

    df['기준_암페어'] = df.apply(get_amp, axis=1)
    df['기준_암페어'].fillna(0, inplace=True)
    df['가공불가_플래그'] = (df['기준_암페어'] == 0).astype(int)

    # 불필요 컬럼 제거
    drop_cols = ['OK','NG','라인','작업장','소유','유형','수량','제품구분','수요가','월','일자','No.','제품번호']
    df.drop(columns=[c for c in drop_cols if c in df.columns], inplace=True)

    # 범주형 인코딩
    for c in df.select_dtypes(include=['object']).columns:
        df[c] = df[c].astype('category')
    cat_cols = df.select_dtypes(include=['category']).columns.tolist()
    oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    df[cat_cols] = oe.fit_transform(df[cat_cols])
    df = pd.get_dummies(df, columns=cat_cols, drop_first=True)
    return df

In [6]:
# 전처리 적용
X_train = preprocess(train)
X_test  = preprocess(test)

# train/test 컬럼 정렬
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
y = train['OK']

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['도유'].fillna(most_freq, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['강종'].fillna('일반', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves a

In [7]:
# 모델 학습 및 예측
model = LGBMClassifier(random_state=42)
model.fit(X_train, y)

y_pred_test = model.predict(X_test)

[LightGBM] [Info] Number of positive: 721, number of negative: 1119
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000391 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4389
[LightGBM] [Info] Number of data points in the train set: 1840, number of used features: 95
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.391848 -> initscore=-0.439552
[LightGBM] [Info] Start training from score -0.439552


In [8]:
submission = pd.DataFrame({
    'id': range(len(y_pred_test)),
    'OK': (y_pred_test == 1).astype(int),
    'NG': (y_pred_test == 0).astype(int)
})
submission.to_csv('submission.csv', index=False)

# Colab에서 다운로드
from google.colab import files
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>