## Import

In [11]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split



In [12]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) # Seed 고정

## Data Load

In [13]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

In [14]:
train_df

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,TRAIN_000,1,0.533433,2022-06-13 5:14,T050304,A_31,,,,,...,39.34,40.89,32.56,34.09,77.77,,,,,
1,TRAIN_001,2,0.541819,2022-06-13 5:22,T050307,A_31,,,,,...,38.89,42.82,43.92,35.34,72.55,,,,,
2,TRAIN_002,1,0.531267,2022-06-13 5:30,T050304,A_31,,,,,...,39.19,36.65,42.47,36.53,78.35,,,,,
3,TRAIN_003,2,0.537325,2022-06-13 5:39,T050307,A_31,,,,,...,37.74,39.17,52.17,30.58,71.78,,,,,
4,TRAIN_004,1,0.531590,2022-06-13 5:47,T050304,A_31,,,,,...,38.70,41.89,46.93,33.09,76.97,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,TRAIN_593,1,0.526546,2022-09-08 14:30,T100306,T_31,2.0,95.0,0.0,45.0,...,,,,,,,,,,
594,TRAIN_594,0,0.524022,2022-09-08 22:38,T050304,A_31,,,,,...,49.47,53.07,50.89,55.10,66.49,1.0,,,,
595,TRAIN_595,0,0.521289,2022-09-08 22:47,T050304,A_31,,,,,...,,,,,,1.0,,,,
596,TRAIN_596,1,0.531375,2022-09-08 14:38,T100304,O_31,40.0,94.0,0.0,45.0,...,,,,,,,,,,


In [15]:
train_x = train_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
train_y = train_df[['Y_Class']]

test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

## Data Pre-processing

In [30]:
train_x = train_x.fillna(train_x.mean())
test_x = test_x.fillna(test_x.mean())
train_x

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,2,0,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,39.3400,40.8900,32.5600,34.0900,77.7700,1.0,,,,
1,3,0,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,38.8900,42.8200,43.9200,35.3400,72.5500,1.0,,,,
2,2,0,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,39.1900,36.6500,42.4700,36.5300,78.3500,1.0,,,,
3,3,0,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,37.7400,39.1700,52.1700,30.5800,71.7800,1.0,,,,
4,2,0,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,38.7000,41.8900,46.9300,33.0900,76.9700,1.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,5,2,2.000000,95.000000,0.0,45.0,10.00000,0.0,50.000000,10.000000,...,50.8073,53.6077,49.6062,51.6598,66.6497,1.0,,,,
594,2,0,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,49.4700,53.0700,50.8900,55.1000,66.4900,1.0,,,,
595,2,0,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,50.8073,53.6077,49.6062,51.6598,66.6497,1.0,,,,
596,4,1,40.000000,94.000000,0.0,45.0,11.00000,0.0,45.000000,10.000000,...,50.8073,53.6077,49.6062,51.6598,66.6497,1.0,,,,


In [17]:
test_x

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,T100306,T_31,2.000000,94.00000,0.0,45.0,10.000000,0.0,51.000000,10.000000,...,52.066471,53.014706,49.994412,54.202353,63.806176,0.974359,247.988889,251.888889,243.3,228.844444
1,T100304,T_31,2.000000,93.00000,0.0,45.0,11.000000,0.0,45.000000,10.000000,...,52.066471,53.014706,49.994412,54.202353,63.806176,0.974359,247.988889,251.888889,243.3,228.844444
2,T100304,T_31,2.000000,95.00000,0.0,45.0,11.000000,0.0,45.000000,10.000000,...,52.066471,53.014706,49.994412,54.202353,63.806176,0.974359,247.988889,251.888889,243.3,228.844444
3,T010305,A_31,4.337449,94.44856,0.0,45.0,10.271605,0.0,48.485597,10.012346,...,52.066471,53.014706,49.994412,54.202353,63.806176,0.974359,247.988889,251.888889,243.3,228.844444
4,T010306,A_31,4.337449,94.44856,0.0,45.0,10.271605,0.0,48.485597,10.012346,...,52.066471,53.014706,49.994412,54.202353,63.806176,0.974359,247.988889,251.888889,243.3,228.844444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,T100306,T_31,2.000000,91.00000,0.0,45.0,10.000000,0.0,51.000000,10.000000,...,52.066471,53.014706,49.994412,54.202353,63.806176,0.974359,247.988889,251.888889,243.3,228.844444
306,T100304,T_31,2.000000,96.00000,0.0,45.0,11.000000,0.0,45.000000,10.000000,...,52.066471,53.014706,49.994412,54.202353,63.806176,0.974359,247.988889,251.888889,243.3,228.844444
307,T100306,T_31,2.000000,91.00000,0.0,45.0,10.000000,0.0,50.000000,10.000000,...,52.066471,53.014706,49.994412,54.202353,63.806176,0.974359,247.988889,251.888889,243.3,228.844444
308,T100306,T_31,2.000000,95.00000,0.0,45.0,10.000000,0.0,51.000000,10.000000,...,52.066471,53.014706,49.994412,54.202353,63.806176,0.974359,247.988889,251.888889,243.3,228.844444


In [18]:
# train_x = train_x.fillna(0)
# test_x = test_x.fillna(0)

In [19]:
# #nan -> mean
# for i in range(len(train_x.columns)-2):
#     #if np.isnan(train_x[train_x.columns[i+2]].mean())==False:
#        float_mean = float(train_x[train_x.columns[i+2]].sum()/train_x[train_x.columns[i+2]].notnull().sum())
#        train_x[train_x.columns[i+2]] = train_x[train_x.columns[i+2]].fillna(train_x[train_x.columns[i+2]].mean())

# for i in range(len(test_x.columns)-2):
#     #if np.isnan(test_x[test_x.columns[i+2]].mean())==False:      
#         float_mean = float(test_x[test_x.columns[i+2]].sum()/test_x[test_x.columns[i+2]].notnull().sum())
#         test_x[test_x.columns[i+2]] = test_x[test_x.columns[i+2]].fillna(train_x[train_x.columns[i+2]].mean())

In [27]:
# qualitative to quantitative
qual_col = ['PRODUCT_CODE','LINE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    
    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_x[i] = le.transform(test_x[i]) 
print('Done.')

Done.


In [21]:
# all_0_columns = train_x.loc[:,(train_x == 0 ).all()].columns

# train_x_del_zeros = train_x.drop(columns=all_0_columns)
# test_x_del_zeros =test_x.drop(columns=all_0_columns)

In [22]:
# d = {
#     'columns': train_x_del_zeros.columns.values,
#     'importances': RF.feature_importances_,
# }

# temp_df = pd.DataFrame(d)
# temp_df = temp_df[temp_df['importances'] != 0]
# temp_df.sort_values(by='importances', ascending=False)
# temp_df[temp_df['importances'] > 1/1704]['columns'].values

In [23]:
# import plotly.express as px

# px.line(data_frame=temp_df,
#         x='columns',
#         y='importances')

In [24]:
# train_x_v2 = train_df[temp_df[temp_df['importances'] > 1/1704]['columns'].values]
# train_x_v2 = train_x_v2.fillna(0)

# test_x_v2 = test_df[temp_df[temp_df['importances'] > 1/1704]['columns'].values]
# test_x_v2 = test_x_v2.fillna(0)


# Classifier Model Fit

# GaussianNB Model Fit

In [28]:
target = train_df['Y_Class']
target = [0 if item == 2 else item for item in target]
features = train_x
scores=[]
for i in range(3):
    X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(features, 
                                                        target, 
                                                        test_size=0.2, 
                                                        )

    clf = GaussianNB()
    clf.fit(X_train_rand, y_train_rand)

    scores.append(clf.score(X_test_rand, y_test_rand))
    
print(f"평균: {np.average(scores)}")
print(f"최소: {min(scores)}")
print(f"최대: {max(scores)}")



ValueError: Input X contains NaN.
GaussianNB does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

# RandomForest Classifier Model Fit

In [None]:
target = train_df['Y_Class']
target = [0 if item == 2 else item for item in target]
features = train_x
scores=[]
for i in range(3):
    X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(features, 
                                                        target, 
                                                        test_size=0.2, 
                                                        )

    RF_rand = RandomForestClassifier()
    RF_rand.fit(X_train_rand, y_train_rand)

    scores.append(RF_rand.score(X_test_rand, y_test_rand))
    
print(f"평균: {np.average(scores)}")
print(f"최소: {min(scores)}")
print(f"최대: {max(scores)}")



평균: 0.7758333333333333
최소: 0.6833333333333333
최대: 0.825


In [None]:
train_y.loc[train_y['Y_Class']==2, 'Y_Class']


1      2
3      2
5      2
7      2
9      2
      ..
563    2
564    2
570    2
572    2
578    2
Name: Y_Class, Length: 103, dtype: int64

# CatBoost Classifier Model Fit

In [None]:
target = train_df['Y_Class']
target = [0 if item == 2 else item for item in target]

features = train_x
scores=[]
for i in range(3):
    X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(features, 
                                                                            target, 
                                                                            test_size=0.2, 
                                                                            )

    catboost_clf = CatBoostClassifier(iterations=40,
                           depth=2,
                           learning_rate=1,
                           loss_function='Logloss',
                           verbose=True)
    
    catboost_clf.fit(X_train_rand, y_train_rand)

    scores.append(catboost_clf.score(X_test_rand, y_test_rand))
    
print(f"평균: {np.average(scores)}")
print(f"최소: {min(scores)}")
print(f"최대: {max(scores)}")

0:	learn: 0.5420729	total: 16.4ms	remaining: 6.53s
1:	learn: 0.5222961	total: 26.4ms	remaining: 5.26s
2:	learn: 0.5118994	total: 36.7ms	remaining: 4.85s
3:	learn: 0.4969984	total: 46.9ms	remaining: 4.64s
4:	learn: 0.4761095	total: 57ms	remaining: 4.5s
5:	learn: 0.4614268	total: 66.8ms	remaining: 4.39s
6:	learn: 0.4379969	total: 77.3ms	remaining: 4.34s
7:	learn: 0.4168454	total: 87.3ms	remaining: 4.28s
8:	learn: 0.3988042	total: 97.2ms	remaining: 4.22s
9:	learn: 0.3775261	total: 107ms	remaining: 4.17s
10:	learn: 0.3568400	total: 117ms	remaining: 4.13s
11:	learn: 0.3350835	total: 127ms	remaining: 4.11s
12:	learn: 0.3228550	total: 137ms	remaining: 4.08s
13:	learn: 0.3033711	total: 147ms	remaining: 4.06s
14:	learn: 0.2916732	total: 159ms	remaining: 4.07s
15:	learn: 0.2872114	total: 171ms	remaining: 4.09s
16:	learn: 0.2749441	total: 182ms	remaining: 4.11s
17:	learn: 0.2643738	total: 195ms	remaining: 4.14s
18:	learn: 0.2537936	total: 209ms	remaining: 4.19s
19:	learn: 0.2452018	total: 222ms	r

# XGB Classifier Model Fit

In [None]:
# import xgboost as xgb

# # XGBoost를 사용하기 위해서는 DMatrix 형태로 변환해 주어야 합니다
# dtrain = xgb.DMatrix(train_x, train_y)
# dtest = xgb.DMatrix(test_x)

# # 모델 생성
# # num_boost_round 만큼 반복하는데 early_stopping_rounds 만큼 성능 향상이 없으면 중단
# # early_stopping_rounds를 사용하려면 eval 데이터 셋을 명기해야함
# param = {파라미터 설정}
# xgb_model = xgb.train(params = params, dtrain = dtrain, num_boost_round = 400, 
#                         early_stopping_rounds = 100, evals=[(dtrain,'train'),(dval,'eval')])

# # 예측하기, 확률값으로 반환됨
# y_pre_probs = xgb_model.predict(dtest)

# # 0또는 1로 변경
# y_preds = [1 if x>0.5 else 0 for x in y_pre_probs]

# Voting Classifier Model Fit -> Merging Multiple Models

In [None]:
target = train_df['Y_Class']
target = [0 if item == 2 else item for item in target]
features = train_x
scores=[]
for i in range(3):
    X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(features, 
                                                        target, 
                                                        test_size=0.2, 
                                                        )

    Voting_Clf = VotingClassifier(estimators=[ 
                                              ('RandomForest', RF_rand) ,
                                              ('GaussianNB', clf),
                                                ('CatBoostClassifier', catboost_clf)],
                                  voting='soft')
    
    Voting_Clf.fit(X_train_rand, y_train_rand)

    scores.append(Voting_Clf.score(X_test_rand, y_test_rand))

    
print(f"평균: {np.average(scores)}")
print(f"최소: {min(scores)}")
print(f"최대: {max(scores)}")

0:	learn: 0.5398924	total: 24.3ms	remaining: 9.68s
1:	learn: 0.5319689	total: 38.2ms	remaining: 7.59s
2:	learn: 0.5238678	total: 50.2ms	remaining: 6.65s
3:	learn: 0.4997631	total: 62.9ms	remaining: 6.23s
4:	learn: 0.4933045	total: 75.5ms	remaining: 5.97s
5:	learn: 0.4673391	total: 88.6ms	remaining: 5.82s
6:	learn: 0.4580461	total: 103ms	remaining: 5.76s
7:	learn: 0.4381720	total: 113ms	remaining: 5.55s
8:	learn: 0.4234933	total: 131ms	remaining: 5.68s
9:	learn: 0.4026468	total: 141ms	remaining: 5.49s
10:	learn: 0.3818843	total: 151ms	remaining: 5.33s
11:	learn: 0.3662018	total: 160ms	remaining: 5.19s
12:	learn: 0.3539154	total: 170ms	remaining: 5.07s
13:	learn: 0.3373814	total: 180ms	remaining: 4.97s
14:	learn: 0.3263898	total: 190ms	remaining: 4.88s
15:	learn: 0.3108265	total: 200ms	remaining: 4.81s
16:	learn: 0.3009302	total: 210ms	remaining: 4.74s
17:	learn: 0.2903745	total: 221ms	remaining: 4.7s
18:	learn: 0.2727070	total: 231ms	remaining: 4.64s
19:	learn: 0.2647478	total: 242ms	re

In [None]:
preds = Voting_Clf.predict(test_x)

# F1_Score

In [None]:
f1_temp = train_y['Y_Class'].values
f1_temp[f1_temp==2]=0
f1_temp

array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,

In [None]:
# from sklearn.metrics import f1_score
# f1_score(f1_temp, preds)

0.9676646706586826

## Submit

In [None]:
submit = pd.read_csv('./sample_submission.csv')

In [None]:
submit['Y_Class'] = preds

In [None]:
submit.to_csv('./baseline_submission.csv', index=False)