## Import

In [1]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score



# Variables

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) # Seed 고정

## Data Load

In [3]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

In [4]:
train_x = train_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
train_y = train_df[['Y_Class']]

test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

f1_temp = train_y['Y_Class'].values
f1_temp[f1_temp==2]=0


## Data Pre-processing

In [5]:
all_nan_columns = train_x.loc[:,(train_x.isna() == True ).all(axis=0)].columns
# train_x = train_x -train_x.loc[:,(train_x.isna() == True ).all(axis=0)]
train_x = train_x.drop(columns=all_nan_columns )
test_x = test_x.drop(columns=all_nan_columns)
test_x

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871
0,T100306,T_31,2.0,94.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,
1,T100304,T_31,2.0,93.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
2,T100304,T_31,2.0,95.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
3,T010305,A_31,,,,,,,,,...,183.8,467.0,444.192308,423.0,,,,,,
4,T010306,A_31,,,,,,,,,...,179.7,465.0,443.211539,423.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,T100306,T_31,2.0,91.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,
306,T100304,T_31,2.0,96.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
307,T100306,T_31,2.0,91.0,0.0,45.0,10.0,0.0,50.0,10.0,...,,,,,,,,,,
308,T100306,T_31,2.0,95.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,


In [6]:
train_x_mean = train_x.fillna(train_x.mean())
test_x_mean = test_x.fillna(test_x.mean())

train_x_zero = train_x.fillna(0)
test_x_zero = test_x.fillna(0)

train_x_mean

  train_x_mean = train_x.fillna(train_x.mean())
  test_x_mean = test_x.fillna(test_x.mean())


Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871
0,T050304,A_31,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,189.000000,383.000000,368.296296,353.000000,39.3400,40.8900,32.5600,34.0900,77.7700,1.0
1,T050307,A_31,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,185.600000,383.000000,367.735849,353.000000,38.8900,42.8200,43.9200,35.3400,72.5500,1.0
2,T050304,A_31,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,165.500000,383.000000,367.320755,353.000000,39.1900,36.6500,42.4700,36.5300,78.3500,1.0
3,T050307,A_31,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,165.800000,384.000000,369.188679,353.000000,37.7400,39.1700,52.1700,30.5800,71.7800,1.0
4,T050304,A_31,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,182.600000,383.000000,367.351852,352.000000,38.7000,41.8900,46.9300,33.0900,76.9700,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,T100306,T_31,2.000000,95.000000,0.0,45.0,10.00000,0.0,50.000000,10.000000,...,163.290763,423.558233,406.088187,388.064257,50.8073,53.6077,49.6062,51.6598,66.6497,1.0
594,T050304,A_31,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,168.700000,384.000000,369.811321,353.000000,49.4700,53.0700,50.8900,55.1000,66.4900,1.0
595,T050304,A_31,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,156.600000,383.000000,367.018868,352.000000,50.8073,53.6077,49.6062,51.6598,66.6497,1.0
596,T100304,O_31,40.000000,94.000000,0.0,45.0,11.00000,0.0,45.000000,10.000000,...,163.290763,423.558233,406.088187,388.064257,50.8073,53.6077,49.6062,51.6598,66.6497,1.0


# Model Fit Variables

In [7]:
MODEL_TEST_REPEAT=10
TRAINING_DATA=train_x_zero
TESTING_DATA=test_x_zero

# Label Encoding

In [8]:
# qualitative to quantitative
qual_col = ['PRODUCT_CODE','LINE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(TRAINING_DATA[i])
    TRAINING_DATA[i] = le.transform(TRAINING_DATA[i])
    
    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    TESTING_DATA[i] = le.transform(TESTING_DATA[i]) 
print('Done.')

Done.


In [9]:
# d = {
#     'columns': train_x_del_zeros.columns.values,
#     'importances': RF.feature_importances_,
# }

# temp_df = pd.DataFrame(d)
# temp_df = temp_df[temp_df['importances'] != 0]
# temp_df.sort_values(by='importances', ascending=False)
# temp_df[temp_df['importances'] > 1/1704]['columns'].values

In [10]:
# import plotly.express as px

# px.line(data_frame=temp_df,
#         x='columns',
#         y='importances')

# Resampling by imortances

In [11]:
# train_x_v2 = train_df[temp_df[temp_df['importances'] > 1/1704]['columns'].values]
# train_x_v2 = train_x_v2.fillna(0)

# test_x_v2 = test_df[temp_df[temp_df['importances'] > 1/1704]['columns'].values]
# test_x_v2 = test_x_v2.fillna(0)


# Classifier Model Fit

# GaussianNB Model Fit

In [12]:
target = train_df['Y_Class']
target = [0 if item == 2 else item for item in target]
features = TRAINING_DATA
scores=[]
for i in range(MODEL_TEST_REPEAT):
    X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(features, 
                                                        target, 
                                                        test_size=0.2, 
                                                        )

    clf = GaussianNB()
    clf.fit(X_train_rand, y_train_rand)

    preds = clf.predict(X_test_rand)
    scores.append(f1_score(y_test_rand, preds))
    
print(f"평균: {np.average(scores)}")
print(f"최소: {min(scores)}")
print(f"최대: {max(scores)}")



평균: 0.8198674437797964
최소: 0.7602339181286549
최대: 0.8586956521739129


# RandomForest Classifier Model Fit

In [13]:
target = train_df['Y_Class']
target = [0 if item == 2 else item for item in target]
features = TRAINING_DATA
scores=[]
for i in range(MODEL_TEST_REPEAT):
    X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(features, 
                                                        target, 
                                                        test_size=0.2, 
                                                        )

    RF_rand = RandomForestClassifier()
    RF_rand.fit(X_train_rand, y_train_rand)

    preds = RF_rand.predict(X_test_rand)
    scores.append(f1_score(y_test_rand, preds))
    
print(f"평균: {np.average(scores)}")
print(f"최소: {min(scores)}")
print(f"최대: {max(scores)}")



평균: 0.8282744995256964
최소: 0.7683615819209039
최대: 0.8587570621468926


In [14]:
train_y.loc[train_y['Y_Class']==2, 'Y_Class']


Series([], Name: Y_Class, dtype: int64)

# CatBoost Classifier Model Fit

In [15]:
target = train_df['Y_Class']
target = [0 if item == 2 else item for item in target]

features = TRAINING_DATA
scores=[]
for i in range(MODEL_TEST_REPEAT):
    X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(features, 
                                                                            target, 
                                                                            test_size=0.2, 
                                                                            )

    catboost_clf = CatBoostClassifier(iterations=400,
                           depth=2,
                           learning_rate=1,
                           loss_function='Logloss',
                           verbose=True)
    
    catboost_clf.fit(X_train_rand, y_train_rand)

    preds = catboost_clf.predict(X_test_rand)
    scores.append(f1_score(y_test_rand, preds))
    
print(f"평균: {np.average(scores)}")
print(f"최소: {min(scores)}")
print(f"최대: {max(scores)}")

0:	learn: 0.5423236	total: 186ms	remaining: 1m 14s
1:	learn: 0.5124819	total: 210ms	remaining: 41.7s
2:	learn: 0.5046231	total: 223ms	remaining: 29.5s
3:	learn: 0.4952111	total: 234ms	remaining: 23.1s
4:	learn: 0.4761212	total: 245ms	remaining: 19.4s
5:	learn: 0.4683110	total: 255ms	remaining: 16.8s
6:	learn: 0.4562932	total: 265ms	remaining: 14.9s
7:	learn: 0.4316274	total: 274ms	remaining: 13.4s
8:	learn: 0.4056396	total: 284ms	remaining: 12.3s
9:	learn: 0.3795789	total: 295ms	remaining: 11.5s
10:	learn: 0.3635333	total: 304ms	remaining: 10.8s
11:	learn: 0.3563891	total: 314ms	remaining: 10.1s
12:	learn: 0.3355942	total: 323ms	remaining: 9.63s
13:	learn: 0.3251059	total: 333ms	remaining: 9.18s
14:	learn: 0.3149354	total: 343ms	remaining: 8.8s
15:	learn: 0.3062571	total: 354ms	remaining: 8.5s
16:	learn: 0.2986914	total: 364ms	remaining: 8.19s
17:	learn: 0.2852574	total: 373ms	remaining: 7.92s
18:	learn: 0.2680004	total: 383ms	remaining: 7.68s
19:	learn: 0.2608384	total: 394ms	remainin

# XGB Classifier Model Fit

In [16]:
# import xgboost as xgb

# # XGBoost를 사용하기 위해서는 DMatrix 형태로 변환해 주어야 합니다
# dtrain = xgb.DMatrix(train_x, train_y)
# dtest = xgb.DMatrix(test_x)

# # 모델 생성
# # num_boost_round 만큼 반복하는데 early_stopping_rounds 만큼 성능 향상이 없으면 중단
# # early_stopping_rounds를 사용하려면 eval 데이터 셋을 명기해야함
# param = {파라미터 설정}
# xgb_model = xgb.train(params = params, dtrain = dtrain, num_boost_round = 400, 
#                         early_stopping_rounds = 100, evals=[(dtrain,'train'),(dval,'eval')])

# # 예측하기, 확률값으로 반환됨
# y_pre_probs = xgb_model.predict(dtest)

# # 0또는 1로 변경
# y_preds = [1 if x>0.5 else 0 for x in y_pre_probs]

# Voting Classifier Model Fit -> Merging Multiple Models

In [17]:
target = train_df['Y_Class']
target = [0 if item == 2 else item for item in target]
features = TRAINING_DATA
scores=[]
for i in range(MODEL_TEST_REPEAT):
    X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(features, 
                                                        target, 
                                                        test_size=0.2, 
                                                        )

    Voting_Clf = VotingClassifier(estimators=[ 
                                              ('RandomForest', RF_rand) ,
                                              ('GaussianNB', clf),
                                                ('CatBoostClassifier', catboost_clf)],
                                  voting='soft')
    
    Voting_Clf.fit(X_train_rand, y_train_rand)
    
    
    preds = Voting_Clf.predict(X_test_rand)
    scores.append(f1_score(y_test_rand, preds))

    
print(f"평균: {np.average(scores)}")
print(f"최소: {min(scores)}")
print(f"최대: {max(scores)}")

0:	learn: 0.5455044	total: 15.6ms	remaining: 6.24s
1:	learn: 0.5200570	total: 29.8ms	remaining: 5.93s
2:	learn: 0.5044429	total: 45.9ms	remaining: 6.07s
3:	learn: 0.4850327	total: 61.9ms	remaining: 6.12s
4:	learn: 0.4691664	total: 76ms	remaining: 6s
5:	learn: 0.4502647	total: 88ms	remaining: 5.78s
6:	learn: 0.4388825	total: 98.2ms	remaining: 5.51s
7:	learn: 0.4232263	total: 109ms	remaining: 5.34s
8:	learn: 0.4074117	total: 119ms	remaining: 5.19s
9:	learn: 0.3880694	total: 131ms	remaining: 5.1s
10:	learn: 0.3638065	total: 141ms	remaining: 4.97s
11:	learn: 0.3535822	total: 151ms	remaining: 4.87s
12:	learn: 0.3373083	total: 161ms	remaining: 4.78s
13:	learn: 0.3267474	total: 177ms	remaining: 4.88s
14:	learn: 0.3130080	total: 187ms	remaining: 4.81s
15:	learn: 0.2869952	total: 198ms	remaining: 4.75s
16:	learn: 0.2750041	total: 209ms	remaining: 4.7s
17:	learn: 0.2666654	total: 219ms	remaining: 4.64s
18:	learn: 0.2544851	total: 230ms	remaining: 4.6s
19:	learn: 0.2395604	total: 241ms	remaining:

In [18]:
# preds = Voting_Clf.predict(test_x)

# F1_Score

In [19]:
# f1_temp = train_y['Y_Class'].values
# f1_temp[f1_temp==2]=0
# f1_temp

In [20]:
# f1_score(f1_temp, preds)

## Submit

In [21]:
submit = pd.read_csv('./sample_submission.csv')

In [22]:
submit['Y_Class'] = preds

ValueError: Length of values (120) does not match length of index (310)

In [None]:
submit.to_csv('./baseline_submission.csv', index=False)