## Import

In [34]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score



# Variables

In [11]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) # Seed 고정

## Data Load

In [12]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

In [13]:
train_x = train_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
train_y = train_df[['Y_Class']]

test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

f1_temp = train_y['Y_Class'].values
f1_temp[f1_temp==2]=0


## Data Pre-processing

In [14]:
all_nan_columns = train_x.loc[:,(train_x.isna() == True ).all(axis=0)].columns
# train_x = train_x -train_x.loc[:,(train_x.isna() == True ).all(axis=0)]
train_x = train_x.drop(columns=all_nan_columns )
test_x = test_x.drop(columns=all_nan_columns)
test_x

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871
0,T100306,T_31,2.0,94.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,
1,T100304,T_31,2.0,93.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
2,T100304,T_31,2.0,95.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
3,T010305,A_31,,,,,,,,,...,183.8,467.0,444.192308,423.0,,,,,,
4,T010306,A_31,,,,,,,,,...,179.7,465.0,443.211539,423.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,T100306,T_31,2.0,91.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,
306,T100304,T_31,2.0,96.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
307,T100306,T_31,2.0,91.0,0.0,45.0,10.0,0.0,50.0,10.0,...,,,,,,,,,,
308,T100306,T_31,2.0,95.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,


In [15]:
train_x_mean = train_x.fillna(train_x.mean())
test_x_mean = test_x.fillna(train_x.mean())

train_x_zero = train_x.fillna(0)
test_x_zero = test_x.fillna(0)

test_x_mean

  train_x_mean = train_x.fillna(train_x.mean())
  test_x_mean = test_x.fillna(train_x.mean())


Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871
0,T100306,T_31,2.000000,94.000000,0.0,45.0,10.00000,0.0,51.000000,10.000000,...,163.290763,423.558233,406.088187,388.064257,50.8073,53.6077,49.6062,51.6598,66.6497,1.0
1,T100304,T_31,2.000000,93.000000,0.0,45.0,11.00000,0.0,45.000000,10.000000,...,163.290763,423.558233,406.088187,388.064257,50.8073,53.6077,49.6062,51.6598,66.6497,1.0
2,T100304,T_31,2.000000,95.000000,0.0,45.0,11.00000,0.0,45.000000,10.000000,...,163.290763,423.558233,406.088187,388.064257,50.8073,53.6077,49.6062,51.6598,66.6497,1.0
3,T010305,A_31,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,183.800000,467.000000,444.192308,423.000000,50.8073,53.6077,49.6062,51.6598,66.6497,1.0
4,T010306,A_31,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,179.700000,465.000000,443.211539,423.000000,50.8073,53.6077,49.6062,51.6598,66.6497,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,T100306,T_31,2.000000,91.000000,0.0,45.0,10.00000,0.0,51.000000,10.000000,...,163.290763,423.558233,406.088187,388.064257,50.8073,53.6077,49.6062,51.6598,66.6497,1.0
306,T100304,T_31,2.000000,96.000000,0.0,45.0,11.00000,0.0,45.000000,10.000000,...,163.290763,423.558233,406.088187,388.064257,50.8073,53.6077,49.6062,51.6598,66.6497,1.0
307,T100306,T_31,2.000000,91.000000,0.0,45.0,10.00000,0.0,50.000000,10.000000,...,163.290763,423.558233,406.088187,388.064257,50.8073,53.6077,49.6062,51.6598,66.6497,1.0
308,T100306,T_31,2.000000,95.000000,0.0,45.0,10.00000,0.0,51.000000,10.000000,...,163.290763,423.558233,406.088187,388.064257,50.8073,53.6077,49.6062,51.6598,66.6497,1.0


In [16]:
test_x_mean.isna().values.any()

False

# Model Fit Variables

In [37]:
MODEL_TEST_REPEAT=10
TRAINING_DATA=train_x_zero
TESTING_DATA=test_x_zero

# Label Encoding

In [18]:
# qualitative to quantitative
qual_col = ['PRODUCT_CODE','LINE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(TRAINING_DATA[i])
    TRAINING_DATA[i] = le.transform(TRAINING_DATA[i])
    
    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    TESTING_DATA[i] = le.transform(TESTING_DATA[i]) 
print('Done.')

Done.


In [19]:
# d = {
#     'columns': train_x_del_zeros.columns.values,
#     'importances': RF.feature_importances_,
# }

# temp_df = pd.DataFrame(d)
# temp_df = temp_df[temp_df['importances'] != 0]
# temp_df.sort_values(by='importances', ascending=False)
# temp_df[temp_df['importances'] > 1/1704]['columns'].values

In [20]:
# import plotly.express as px

# px.line(data_frame=temp_df,
#         x='columns',
#         y='importances')

# Resampling by imortances

In [21]:
# train_x_v2 = train_df[temp_df[temp_df['importances'] > 1/1704]['columns'].values]
# train_x_v2 = train_x_v2.fillna(0)

# test_x_v2 = test_df[temp_df[temp_df['importances'] > 1/1704]['columns'].values]
# test_x_v2 = test_x_v2.fillna(0)


# Classifier Model Fit

# GaussianNB Model Fit

In [22]:
target = train_df['Y_Class']

features = TRAINING_DATA
scores=[]
for i in range(MODEL_TEST_REPEAT):
    X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(features, 
                                                        target, 
                                                        test_size=0.2, 
                                                        )

    clf = GaussianNB()
    clf.fit(X_train_rand, y_train_rand)

    preds = clf.predict(X_test_rand)
    scores.append(f1_score(y_test_rand, preds,average='macro'))
    
print(f"평균: {np.average(scores)}")
print(f"최소: {min(scores)}")
print(f"최대: {max(scores)}")



평균: 0.4913268066680442
최소: 0.4454501560383913
최대: 0.547460963395853


# RandomForest Classifier Model Fit

In [23]:
target = train_df['Y_Class']

features = TRAINING_DATA
scores=[]
for i in range(MODEL_TEST_REPEAT):
    X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(features, 
                                                        target, 
                                                        test_size=0.2, 
                                                        )

    RF_rand = RandomForestClassifier()
    RF_rand.fit(X_train_rand, y_train_rand)

    preds = RF_rand.predict(X_test_rand)
    scores.append(f1_score(y_test_rand, preds,average='macro'))
    
print(f"평균: {np.average(scores)}")
print(f"최소: {min(scores)}")
print(f"최대: {max(scores)}")



평균: 0.576536661861911
최소: 0.4422705314009661
최대: 0.665823216416772


# CatBoost Classifier Model Fit

In [24]:
target = train_df['Y_Class']


features = TRAINING_DATA
cat_features = TRAINING_DATA.columns[TRAINING_DATA.nunique() > 2].tolist()
scores=[]
for i in range(MODEL_TEST_REPEAT):
    X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(features, 
                                                                            target, 
                                                                            test_size=0.2, 
                                                                            )

    catboost_clf = CatBoostClassifier(iterations=40,
                           depth=2,
                           learning_rate=0.02,
                           cat_features=cat_features,
                           loss_function='MultiClass',
                           verbose=True)
    
    catboost_clf.fit(X_train_rand, y_train_rand)

    preds = catboost_clf.predict(X_test_rand)
    scores.append(f1_score(y_test_rand, preds,average='macro'))
    
print(f"평균: {np.average(scores)}")
print(f"최소: {min(scores)}")
print(f"최대: {max(scores)}")

0:	learn: 1.0846866	total: 182ms	remaining: 1m 12s
1:	learn: 1.0728091	total: 200ms	remaining: 39.8s
2:	learn: 1.0609184	total: 218ms	remaining: 28.9s
3:	learn: 1.0487776	total: 237ms	remaining: 23.5s
4:	learn: 1.0381589	total: 256ms	remaining: 20.2s
5:	learn: 1.0284681	total: 271ms	remaining: 17.8s
6:	learn: 1.0182840	total: 290ms	remaining: 16.3s
7:	learn: 1.0086609	total: 307ms	remaining: 15.1s
8:	learn: 1.0000822	total: 323ms	remaining: 14s
9:	learn: 0.9912464	total: 339ms	remaining: 13.2s
10:	learn: 0.9822355	total: 355ms	remaining: 12.6s
11:	learn: 0.9738514	total: 370ms	remaining: 11.9s
12:	learn: 0.9654918	total: 384ms	remaining: 11.4s
13:	learn: 0.9581111	total: 401ms	remaining: 11s
14:	learn: 0.9509606	total: 420ms	remaining: 10.8s
15:	learn: 0.9433115	total: 438ms	remaining: 10.5s
16:	learn: 0.9368073	total: 465ms	remaining: 10.5s
17:	learn: 0.9309062	total: 482ms	remaining: 10.2s
18:	learn: 0.9244682	total: 499ms	remaining: 10s
19:	learn: 0.9175352	total: 516ms	remaining: 9

In [39]:
# grid_parameters = {'depth'         : [6,8,10],
#                     'learning_rate' : [0.01, 0.05, 0.1],
#                     'iterations'    : [30, 50, 100]
#                     }

# grid = GridSearchCV(estimator=catboost_clf, param_grid = grid_parameters, cv = 2, n_jobs=-1)
# grid.fit(X_train_rand, y_train_rand)

# print(" Results from Grid Search " )
# print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
# print("\n The best score across ALL searched params:\n", grid.best_score_)
# print("\n The best parameters across ALL searched params:\n", grid.best_params_)


# XGB Classifier Model Fit

In [25]:
# import xgboost as xgb

# # XGBoost를 사용하기 위해서는 DMatrix 형태로 변환해 주어야 합니다
# dtrain = xgb.DMatrix(train_x, train_y)
# dtest = xgb.DMatrix(test_x)

# # 모델 생성
# # num_boost_round 만큼 반복하는데 early_stopping_rounds 만큼 성능 향상이 없으면 중단
# # early_stopping_rounds를 사용하려면 eval 데이터 셋을 명기해야함
# param = {파라미터 설정}
# xgb_model = xgb.train(params = params, dtrain = dtrain, num_boost_round = 400, 
#                         early_stopping_rounds = 100, evals=[(dtrain,'train'),(dval,'eval')])

# # 예측하기, 확률값으로 반환됨
# y_pre_probs = xgb_model.predict(dtest)

# # 0또는 1로 변경
# y_preds = [1 if x>0.5 else 0 for x in y_pre_probs]

# Voting Classifier Model Fit -> Merging Multiple Models

In [26]:
target = train_df['Y_Class']
features = TRAINING_DATA
scores=[]
for i in range(MODEL_TEST_REPEAT):
    X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(features, 
                                                        target, 
                                                        test_size=0.2, 
                                                        )

    Voting_Clf = VotingClassifier(estimators=[ 
                                              ('RandomForest', RF_rand) ,
                                              ('GaussianNB', clf),
                                                ('CatBoostClassifier', catboost_clf)],
                                  voting='soft')
    
    Voting_Clf.fit(X_train_rand, y_train_rand)
    
    
    preds = Voting_Clf.predict(X_test_rand)
    scores.append(f1_score(y_test_rand, preds,average='macro'))

    
print(f"평균: {np.average(scores)}")
print(f"최소: {min(scores)}")
print(f"최대: {max(scores)}")

0:	learn: 1.0849954	total: 33ms	remaining: 13.2s
1:	learn: 1.0733103	total: 48.9ms	remaining: 9.72s
2:	learn: 1.0621647	total: 64.9ms	remaining: 8.58s
3:	learn: 1.0514941	total: 79.8ms	remaining: 7.9s
4:	learn: 1.0406757	total: 95.6ms	remaining: 7.55s
5:	learn: 1.0310795	total: 114ms	remaining: 7.48s
6:	learn: 1.0213220	total: 132ms	remaining: 7.4s
7:	learn: 1.0119347	total: 147ms	remaining: 7.21s
8:	learn: 1.0028445	total: 164ms	remaining: 7.15s
9:	learn: 0.9940147	total: 184ms	remaining: 7.16s
10:	learn: 0.9851871	total: 201ms	remaining: 7.12s
11:	learn: 0.9775152	total: 217ms	remaining: 7.02s
12:	learn: 0.9709361	total: 236ms	remaining: 7.03s
13:	learn: 0.9639166	total: 254ms	remaining: 7s
14:	learn: 0.9564768	total: 273ms	remaining: 7s
15:	learn: 0.9494358	total: 300ms	remaining: 7.2s
16:	learn: 0.9429087	total: 342ms	remaining: 7.7s
17:	learn: 0.9367251	total: 360ms	remaining: 7.63s
18:	learn: 0.9304142	total: 373ms	remaining: 7.49s
19:	learn: 0.9242482	total: 387ms	remaining: 7.3

In [27]:
preds = Voting_Clf.predict(TESTING_DATA)

# F1_Score

In [28]:
# f1_temp = train_y['Y_Class'].values
# f1_temp[f1_temp==2]=0
# f1_temp

In [29]:
# f1_score(f1_temp, preds)

## Submit

In [30]:
submit = pd.read_csv('./sample_submission.csv')

In [31]:
submit['Y_Class'] = preds

In [32]:
submit.to_csv('./baseline_submission.csv', index=False)