## Import

In [13]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score



# Variables

In [14]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) # Seed 고정

## Data Load

In [15]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

In [16]:
train_x = train_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
train_y = train_df[['Y_Class']]

test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

f1_temp = train_y['Y_Class'].values
f1_temp[f1_temp==2]=0


## Data Pre-processing

In [17]:
all_nan_columns = train_x.loc[:,(train_x.isna() == True ).all(axis=0)].columns
# train_x = train_x -train_x.loc[:,(train_x.isna() == True ).all(axis=0)]
train_x = train_x.drop(columns=all_nan_columns )
test_x = test_x.drop(columns=all_nan_columns)
test_x

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871
0,T100306,T_31,2.0,94.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,
1,T100304,T_31,2.0,93.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
2,T100304,T_31,2.0,95.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
3,T010305,A_31,,,,,,,,,...,183.8,467.0,444.192308,423.0,,,,,,
4,T010306,A_31,,,,,,,,,...,179.7,465.0,443.211539,423.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,T100306,T_31,2.0,91.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,
306,T100304,T_31,2.0,96.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
307,T100306,T_31,2.0,91.0,0.0,45.0,10.0,0.0,50.0,10.0,...,,,,,,,,,,
308,T100306,T_31,2.0,95.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,


In [18]:
train_x_mean = train_x.fillna(train_x.mean())
test_x_mean = test_x.fillna(test_x.mean())

train_x_zero = train_x.fillna(0)
test_x_zero = test_x.fillna(0)

train_x_mean

  train_x_mean = train_x.fillna(train_x.mean())
  test_x_mean = test_x.fillna(test_x.mean())


Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871
0,T050304,A_31,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,189.000000,383.000000,368.296296,353.000000,39.3400,40.8900,32.5600,34.0900,77.7700,1.0
1,T050307,A_31,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,185.600000,383.000000,367.735849,353.000000,38.8900,42.8200,43.9200,35.3400,72.5500,1.0
2,T050304,A_31,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,165.500000,383.000000,367.320755,353.000000,39.1900,36.6500,42.4700,36.5300,78.3500,1.0
3,T050307,A_31,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,165.800000,384.000000,369.188679,353.000000,37.7400,39.1700,52.1700,30.5800,71.7800,1.0
4,T050304,A_31,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,182.600000,383.000000,367.351852,352.000000,38.7000,41.8900,46.9300,33.0900,76.9700,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,T100306,T_31,2.000000,95.000000,0.0,45.0,10.00000,0.0,50.000000,10.000000,...,163.290763,423.558233,406.088187,388.064257,50.8073,53.6077,49.6062,51.6598,66.6497,1.0
594,T050304,A_31,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,168.700000,384.000000,369.811321,353.000000,49.4700,53.0700,50.8900,55.1000,66.4900,1.0
595,T050304,A_31,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,156.600000,383.000000,367.018868,352.000000,50.8073,53.6077,49.6062,51.6598,66.6497,1.0
596,T100304,O_31,40.000000,94.000000,0.0,45.0,11.00000,0.0,45.000000,10.000000,...,163.290763,423.558233,406.088187,388.064257,50.8073,53.6077,49.6062,51.6598,66.6497,1.0


# Model Fit Variables

In [19]:
MODEL_TEST_REPEAT=10
TRAINING_DATA=train_x_mean
TESTING_DATA=test_x_mean

# Label Encoding

In [20]:
# qualitative to quantitative
qual_col = ['PRODUCT_CODE','LINE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(TRAINING_DATA[i])
    TRAINING_DATA[i] = le.transform(TRAINING_DATA[i])
    
    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    TESTING_DATA[i] = le.transform(TESTING_DATA[i]) 
print('Done.')

Done.


In [21]:
# d = {
#     'columns': train_x_del_zeros.columns.values,
#     'importances': RF.feature_importances_,
# }

# temp_df = pd.DataFrame(d)
# temp_df = temp_df[temp_df['importances'] != 0]
# temp_df.sort_values(by='importances', ascending=False)
# temp_df[temp_df['importances'] > 1/1704]['columns'].values

In [22]:
# import plotly.express as px

# px.line(data_frame=temp_df,
#         x='columns',
#         y='importances')

# Resampling by imortances

In [23]:
# train_x_v2 = train_df[temp_df[temp_df['importances'] > 1/1704]['columns'].values]
# train_x_v2 = train_x_v2.fillna(0)

# test_x_v2 = test_df[temp_df[temp_df['importances'] > 1/1704]['columns'].values]
# test_x_v2 = test_x_v2.fillna(0)


# Classifier Model Fit

# GaussianNB Model Fit

In [24]:
target = train_df['Y_Class']

features = TRAINING_DATA
scores=[]
for i in range(MODEL_TEST_REPEAT):
    X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(features, 
                                                        target, 
                                                        test_size=0.2, 
                                                        )

    clf = GaussianNB()
    clf.fit(X_train_rand, y_train_rand)

    preds = clf.predict(X_test_rand)
    scores.append(f1_score(y_test_rand, preds,average='macro'))
    
print(f"평균: {np.average(scores)}")
print(f"최소: {min(scores)}")
print(f"최대: {max(scores)}")



평균: 0.4913268066680442
최소: 0.4454501560383913
최대: 0.547460963395853


# RandomForest Classifier Model Fit

In [25]:
target = train_df['Y_Class']

features = TRAINING_DATA
scores=[]
for i in range(MODEL_TEST_REPEAT):
    X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(features, 
                                                        target, 
                                                        test_size=0.2, 
                                                        )

    RF_rand = RandomForestClassifier()
    RF_rand.fit(X_train_rand, y_train_rand)

    preds = RF_rand.predict(X_test_rand)
    scores.append(f1_score(y_test_rand, preds,average='macro'))
    
print(f"평균: {np.average(scores)}")
print(f"최소: {min(scores)}")
print(f"최대: {max(scores)}")



평균: 0.576536661861911
최소: 0.4422705314009661
최대: 0.665823216416772


# CatBoost Classifier Model Fit

In [35]:
target = train_df['Y_Class']


features = TRAINING_DATA
scores=[]
for i in range(MODEL_TEST_REPEAT):
    X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(features, 
                                                                            target, 
                                                                            test_size=0.2, 
                                                                            )

    catboost_clf = CatBoostClassifier(iterations=400,
                           depth=2,
                           learning_rate=1,
                           loss_function='MultiClass',
                           verbose=True,
                           )
    
    catboost_clf.fit(X_train_rand, y_train_rand)

    preds = catboost_clf.predict(X_test_rand)
    scores.append(f1_score(y_test_rand, preds,average='macro'))
    
print(f"평균: {np.average(scores)}")
print(f"최소: {min(scores)}")
print(f"최대: {max(scores)}")

0:	learn: 0.7959833	total: 211ms	remaining: 1m 24s
1:	learn: 0.7531970	total: 230ms	remaining: 45.8s
2:	learn: 0.7374645	total: 247ms	remaining: 32.6s
3:	learn: 0.7112807	total: 262ms	remaining: 26s
4:	learn: 0.6840399	total: 278ms	remaining: 21.9s
5:	learn: 0.6628841	total: 294ms	remaining: 19.3s
6:	learn: 0.6484422	total: 313ms	remaining: 17.6s
7:	learn: 0.6337892	total: 332ms	remaining: 16.3s
8:	learn: 0.6113160	total: 354ms	remaining: 15.4s
9:	learn: 0.5852092	total: 371ms	remaining: 14.5s
10:	learn: 0.5625538	total: 389ms	remaining: 13.8s
11:	learn: 0.5508000	total: 405ms	remaining: 13.1s
12:	learn: 0.5458212	total: 420ms	remaining: 12.5s
13:	learn: 0.5405913	total: 436ms	remaining: 12s
14:	learn: 0.5335003	total: 450ms	remaining: 11.6s
15:	learn: 0.5160125	total: 467ms	remaining: 11.2s
16:	learn: 0.5018473	total: 484ms	remaining: 10.9s
17:	learn: 0.4824149	total: 501ms	remaining: 10.6s
18:	learn: 0.4765251	total: 523ms	remaining: 10.5s
19:	learn: 0.4674986	total: 545ms	remaining:

# XGB Classifier Model Fit

In [None]:
# import xgboost as xgb

# # XGBoost를 사용하기 위해서는 DMatrix 형태로 변환해 주어야 합니다
# dtrain = xgb.DMatrix(train_x, train_y)
# dtest = xgb.DMatrix(test_x)

# # 모델 생성
# # num_boost_round 만큼 반복하는데 early_stopping_rounds 만큼 성능 향상이 없으면 중단
# # early_stopping_rounds를 사용하려면 eval 데이터 셋을 명기해야함
# param = {파라미터 설정}
# xgb_model = xgb.train(params = params, dtrain = dtrain, num_boost_round = 400, 
#                         early_stopping_rounds = 100, evals=[(dtrain,'train'),(dval,'eval')])

# # 예측하기, 확률값으로 반환됨
# y_pre_probs = xgb_model.predict(dtest)

# # 0또는 1로 변경
# y_preds = [1 if x>0.5 else 0 for x in y_pre_probs]

# Voting Classifier Model Fit -> Merging Multiple Models

In [None]:
target = train_df['Y_Class']
features = TRAINING_DATA
scores=[]
for i in range(MODEL_TEST_REPEAT):
    X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(features, 
                                                        target, 
                                                        test_size=0.2, 
                                                        )

    Voting_Clf = VotingClassifier(estimators=[ 
                                              ('RandomForest', RF_rand) ,
                                              ('GaussianNB', clf),
                                                ('CatBoostClassifier', catboost_clf)],
                                  voting='soft')
    
    Voting_Clf.fit(X_train_rand, y_train_rand)
    
    
    preds = Voting_Clf.predict(X_test_rand)
    scores.append(f1_score(y_test_rand, preds,average='macro'))

    
print(f"평균: {np.average(scores)}")
print(f"최소: {min(scores)}")
print(f"최대: {max(scores)}")

0:	learn: 0.5426655	total: 18.3ms	remaining: 7.3s
1:	learn: 0.5258627	total: 28.3ms	remaining: 5.63s
2:	learn: 0.5103263	total: 38.7ms	remaining: 5.12s
3:	learn: 0.4994389	total: 49.4ms	remaining: 4.89s
4:	learn: 0.4674044	total: 64.1ms	remaining: 5.07s
5:	learn: 0.4535009	total: 74.7ms	remaining: 4.91s
6:	learn: 0.4316013	total: 85.9ms	remaining: 4.82s
7:	learn: 0.4117869	total: 96ms	remaining: 4.71s
8:	learn: 0.3946341	total: 106ms	remaining: 4.61s
9:	learn: 0.3775368	total: 117ms	remaining: 4.55s
10:	learn: 0.3568401	total: 143ms	remaining: 5.07s
11:	learn: 0.3390582	total: 172ms	remaining: 5.55s
12:	learn: 0.3250588	total: 182ms	remaining: 5.41s
13:	learn: 0.3067973	total: 192ms	remaining: 5.3s
14:	learn: 0.2945372	total: 203ms	remaining: 5.2s
15:	learn: 0.2755800	total: 214ms	remaining: 5.13s
16:	learn: 0.2627545	total: 224ms	remaining: 5.06s
17:	learn: 0.2521317	total: 236ms	remaining: 5s
18:	learn: 0.2334792	total: 249ms	remaining: 4.99s
19:	learn: 0.2146281	total: 261ms	remaini

In [None]:
# preds = Voting_Clf.predict(train_x)

# F1_Score

In [None]:
# f1_temp = train_y['Y_Class'].values
# f1_temp[f1_temp==2]=0
# f1_temp

In [None]:
# f1_score(f1_temp, preds)

## Submit

In [None]:
submit = pd.read_csv('./sample_submission.csv')

In [None]:
submit['Y_Class'] = preds

In [None]:
submit.to_csv('./baseline_submission.csv', index=False)