## Import

In [38]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

In [39]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) # Seed 고정

## Data Load

In [40]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

In [41]:
train_df

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,TRAIN_000,1,0.533433,2022-06-13 5:14,T050304,A_31,,,,,...,39.34,40.89,32.56,34.09,77.77,,,,,
1,TRAIN_001,2,0.541819,2022-06-13 5:22,T050307,A_31,,,,,...,38.89,42.82,43.92,35.34,72.55,,,,,
2,TRAIN_002,1,0.531267,2022-06-13 5:30,T050304,A_31,,,,,...,39.19,36.65,42.47,36.53,78.35,,,,,
3,TRAIN_003,2,0.537325,2022-06-13 5:39,T050307,A_31,,,,,...,37.74,39.17,52.17,30.58,71.78,,,,,
4,TRAIN_004,1,0.531590,2022-06-13 5:47,T050304,A_31,,,,,...,38.70,41.89,46.93,33.09,76.97,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,TRAIN_593,1,0.526546,2022-09-08 14:30,T100306,T_31,2.0,95.0,0.0,45.0,...,,,,,,,,,,
594,TRAIN_594,0,0.524022,2022-09-08 22:38,T050304,A_31,,,,,...,49.47,53.07,50.89,55.10,66.49,1.0,,,,
595,TRAIN_595,0,0.521289,2022-09-08 22:47,T050304,A_31,,,,,...,,,,,,1.0,,,,
596,TRAIN_596,1,0.531375,2022-09-08 14:38,T100304,O_31,40.0,94.0,0.0,45.0,...,,,,,,,,,,


In [42]:
train_x = train_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality','LINE'])
train_y = train_df[['Y_Class']]

test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP','LINE'])

## Data Pre-processing

In [43]:
train_x = train_x.fillna(0)
test_x = test_x.fillna(0)

In [44]:
# qualitative to quantitative
qual_col = ['PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    
    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_x[i] = le.transform(test_x[i]) 
print('Done.')

Done.


In [46]:
all_0_columns = train_x.loc[:,(train_x == 0 ).all()].columns

train_x_del_zeros = train_x.drop(columns=all_0_columns)
test_x_del_zeros =test_x.drop(columns=all_0_columns)

In [56]:
d = {
    'columns': train_x_del_zeros.columns.values,
    'importances': RF.feature_importances_,
}

temp_df = pd.DataFrame(d)
temp_df = temp_df[temp_df['importances'] != 0]
temp_df.sort_values(by='importances', ascending=False)
temp_df[temp_df['importances'] > 1/1704]['columns'].values

array(['X_2', 'X_7', 'X_11', 'X_12', 'X_13', 'X_20', 'X_21', 'X_22',
       'X_45', 'X_57', 'X_62', 'X_63', 'X_73', 'X_98', 'X_101', 'X_102',
       'X_105', 'X_107', 'X_117', 'X_120', 'X_121', 'X_123', 'X_124',
       'X_127', 'X_130', 'X_131', 'X_141', 'X_149', 'X_189', 'X_240',
       'X_242', 'X_243', 'X_248', 'X_256', 'X_257', 'X_258', 'X_265',
       'X_266', 'X_267', 'X_285', 'X_287', 'X_289', 'X_294', 'X_297',
       'X_300', 'X_301', 'X_307', 'X_318', 'X_335', 'X_337', 'X_339',
       'X_345', 'X_353', 'X_354', 'X_358', 'X_367', 'X_368', 'X_373',
       'X_374', 'X_380', 'X_385', 'X_387', 'X_388', 'X_394', 'X_395',
       'X_397', 'X_398', 'X_399', 'X_400', 'X_401', 'X_404', 'X_413',
       'X_415', 'X_416', 'X_419', 'X_421', 'X_422', 'X_428', 'X_435',
       'X_437', 'X_439', 'X_440', 'X_442', 'X_443', 'X_448', 'X_452',
       'X_453', 'X_454', 'X_455', 'X_456', 'X_457', 'X_458', 'X_460',
       'X_461', 'X_462', 'X_463', 'X_465', 'X_468', 'X_469', 'X_471',
       'X_473', 'X

In [57]:
import plotly.express as px

px.line(data_frame=temp_df,
        x='columns',
        y='importances')

In [58]:
train_x_v2 = train_df[temp_df[temp_df['importances'] > 1/1704]['columns'].values]
train_x_v2 = train_x_v2.fillna(0)

test_x_v2 = test_df[temp_df[temp_df['importances'] > 1/1704]['columns'].values]
test_x_v2 = test_x_v2.fillna(0)


# Classifier Model Fit

# GaussianNB Model Fit

In [63]:
from sklearn.naive_bayes import GaussianNB



from sklearn.model_selection import train_test_split

target = train_df['Y_Class']
target = [0 if item == 2 else item for item in target]
features = train_x
scores=[]
for i in range(100):
    X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(features, 
                                                        target, 
                                                        test_size=0.2, 
                                                        )

    clf = GaussianNB()
    clf.fit(X_train_rand, y_train_rand)

    scores.append(clf.score(X_test_rand, y_test_rand))
    
print(f"평균: {np.average(scores)}")
print(f"최소: {min(scores)}")
print(f"최대: {max(scores)}")



평균: 0.7276666666666665
최소: 0.6166666666666667
최대: 0.825


# RandomFOrest Classifier Model Fit

In [67]:
from sklearn.model_selection import train_test_split

target = train_df['Y_Class']
target = [0 if item == 2 else item for item in target]
features = train_x_v2
scores=[]
for i in range(10):
    X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(features, 
                                                        target, 
                                                        test_size=0.2, 
                                                        )

    RF_rand = RandomForestClassifier()
    RF_rand.fit(X_train_rand, y_train_rand)

    scores.append(RF_rand.score(X_test_rand, y_test_rand))
    
print(f"평균: {np.average(scores)}")
print(f"최소: {min(scores)}")
print(f"최대: {max(scores)}")



평균: 0.7875
최소: 0.7166666666666667
최대: 0.8916666666666667


In [69]:
train_y.loc[train_y['Y_Class']==2, 'Y_Class']


1      2
3      2
5      2
7      2
9      2
      ..
563    2
564    2
570    2
572    2
578    2
Name: Y_Class, Length: 103, dtype: int64

# CatBoost Classifier Model Fit

In [70]:
from sklearn.ensemble import VotingClassifier


from sklearn.model_selection import train_test_split

target = train_df['Y_Class']

target = [0 if item == 2 else item for item in target]

features = train_x
scores=[]
for i in range(10):
    X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(features, 
                                                                            target, 
                                                                            test_size=0.2, 
                                                                            )

    catboost_clf = CatBoostClassifier(iterations=400,
                           depth=2,
                           learning_rate=1,
                           loss_function='Logloss',
                           verbose=True)
    
    catboost_clf.fit(X_train_rand, y_train_rand)

    scores.append(catboost_clf.score(X_test_rand, y_test_rand))
    
print(f"평균: {np.average(scores)}")
print(f"최소: {min(scores)}")
print(f"최대: {max(scores)}")

0:	learn: 0.5495523	total: 12.7ms	remaining: 5.07s
1:	learn: 0.5249758	total: 23ms	remaining: 4.58s
2:	learn: 0.5033082	total: 33ms	remaining: 4.36s
3:	learn: 0.4824982	total: 43ms	remaining: 4.26s
4:	learn: 0.4727101	total: 53.3ms	remaining: 4.21s
5:	learn: 0.4617525	total: 63.3ms	remaining: 4.16s
6:	learn: 0.4479368	total: 73.2ms	remaining: 4.11s
7:	learn: 0.4232482	total: 83.2ms	remaining: 4.08s
8:	learn: 0.3959367	total: 94.5ms	remaining: 4.11s
9:	learn: 0.3731081	total: 104ms	remaining: 4.07s
10:	learn: 0.3633664	total: 114ms	remaining: 4.04s
11:	learn: 0.3508813	total: 124ms	remaining: 4.01s
12:	learn: 0.3313074	total: 134ms	remaining: 3.98s
13:	learn: 0.3123792	total: 145ms	remaining: 4.01s
14:	learn: 0.3018227	total: 155ms	remaining: 3.98s
15:	learn: 0.2935227	total: 166ms	remaining: 3.98s
16:	learn: 0.2770588	total: 176ms	remaining: 3.96s
17:	learn: 0.2656530	total: 186ms	remaining: 3.94s
18:	learn: 0.2602199	total: 195ms	remaining: 3.91s
19:	learn: 0.2446563	total: 223ms	rema

# Voting Classifier Model Fit -> Merging Multiple Models

In [71]:
from sklearn.ensemble import VotingClassifier


from sklearn.model_selection import train_test_split

target = train_df['Y_Class']
target = [0 if item == 2 else item for item in target]
features = train_x
scores=[]
for i in range(10):
    X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(features, 
                                                        target, 
                                                        test_size=0.2, 
                                                        )

    Voting_Clf = VotingClassifier(estimators=[ 
                                              ('RandomForest', RF_rand) ,
                                              ('GaussianNB', clf),
                                                ('CatBoostClassifier', catboost_clf)],
                                  voting='soft')
    
    Voting_Clf.fit(X_train_rand, y_train_rand)

    scores.append(Voting_Clf.score(X_test_rand, y_test_rand))
    
print(f"평균: {np.average(scores)}")
print(f"최소: {min(scores)}")
print(f"최대: {max(scores)}")

0:	learn: 0.5638882	total: 27.9ms	remaining: 11.1s
1:	learn: 0.5268696	total: 40.5ms	remaining: 8.06s
2:	learn: 0.5129893	total: 50.9ms	remaining: 6.73s
3:	learn: 0.4952643	total: 60.8ms	remaining: 6.02s
4:	learn: 0.4796106	total: 71.7ms	remaining: 5.66s
5:	learn: 0.4653407	total: 82.4ms	remaining: 5.41s
6:	learn: 0.4569607	total: 95.7ms	remaining: 5.37s
7:	learn: 0.4369314	total: 107ms	remaining: 5.24s
8:	learn: 0.4144454	total: 154ms	remaining: 6.68s
9:	learn: 0.4030172	total: 169ms	remaining: 6.58s
10:	learn: 0.3692962	total: 184ms	remaining: 6.52s
11:	learn: 0.3505526	total: 195ms	remaining: 6.31s
12:	learn: 0.3420328	total: 205ms	remaining: 6.12s
13:	learn: 0.3287588	total: 216ms	remaining: 5.97s
14:	learn: 0.3110955	total: 227ms	remaining: 5.82s
15:	learn: 0.3015790	total: 238ms	remaining: 5.7s
16:	learn: 0.2951367	total: 254ms	remaining: 5.72s
17:	learn: 0.2801772	total: 269ms	remaining: 5.71s
18:	learn: 0.2693955	total: 282ms	remaining: 5.65s
19:	learn: 0.2520600	total: 295ms	r

In [75]:
preds = Voting_Clf.predict(test_x)

## Submit

In [76]:
submit = pd.read_csv('./sample_submission.csv')

In [77]:
submit['Y_Class'] = preds

In [78]:
submit.to_csv('./baseline_submission.csv', index=False)