## Import

In [1]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from catboost import CatBoostClassifier

from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, classification_report, recall_score, accuracy_score



# Variables

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) # Seed 고정

## Data Load

In [3]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

# Create Train&Test sets of data

In [4]:
train_x = train_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
train_y = train_df[['Y_Class']]

test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])



In [5]:
train_y.value_counts()

Y_Class
1          407
2          103
0           88
dtype: int64

## Data Pre-processing

In [6]:
all_nan_columns = train_x.loc[:,(train_x.isna() == True ).all(axis=0)].columns
# train_x = train_x -train_x.loc[:,(train_x.isna() == True ).all(axis=0)]
train_x = train_x.drop(columns=all_nan_columns )
test_x = test_x.drop(columns=all_nan_columns)
test_x

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871
0,T100306,T_31,2.0,94.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,
1,T100304,T_31,2.0,93.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
2,T100304,T_31,2.0,95.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
3,T010305,A_31,,,,,,,,,...,183.8,467.0,444.192308,423.0,,,,,,
4,T010306,A_31,,,,,,,,,...,179.7,465.0,443.211539,423.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,T100306,T_31,2.0,91.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,
306,T100304,T_31,2.0,96.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
307,T100306,T_31,2.0,91.0,0.0,45.0,10.0,0.0,50.0,10.0,...,,,,,,,,,,
308,T100306,T_31,2.0,95.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,


In [7]:
train_x_mean = train_x.fillna(train_x.mean())
test_x_mean = test_x.fillna(train_x.mean())

train_x_zero = train_x.fillna(0)
test_x_zero = test_x.fillna(0)

test_x_mean

  train_x_mean = train_x.fillna(train_x.mean())
  test_x_mean = test_x.fillna(train_x.mean())


Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871
0,T100306,T_31,2.000000,94.000000,0.0,45.0,10.00000,0.0,51.000000,10.000000,...,163.290763,423.558233,406.088187,388.064257,50.8073,53.6077,49.6062,51.6598,66.6497,1.0
1,T100304,T_31,2.000000,93.000000,0.0,45.0,11.00000,0.0,45.000000,10.000000,...,163.290763,423.558233,406.088187,388.064257,50.8073,53.6077,49.6062,51.6598,66.6497,1.0
2,T100304,T_31,2.000000,95.000000,0.0,45.0,11.00000,0.0,45.000000,10.000000,...,163.290763,423.558233,406.088187,388.064257,50.8073,53.6077,49.6062,51.6598,66.6497,1.0
3,T010305,A_31,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,183.800000,467.000000,444.192308,423.000000,50.8073,53.6077,49.6062,51.6598,66.6497,1.0
4,T010306,A_31,2.409742,95.123209,0.0,45.0,10.39255,0.0,48.802292,10.048711,...,179.700000,465.000000,443.211539,423.000000,50.8073,53.6077,49.6062,51.6598,66.6497,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,T100306,T_31,2.000000,91.000000,0.0,45.0,10.00000,0.0,51.000000,10.000000,...,163.290763,423.558233,406.088187,388.064257,50.8073,53.6077,49.6062,51.6598,66.6497,1.0
306,T100304,T_31,2.000000,96.000000,0.0,45.0,11.00000,0.0,45.000000,10.000000,...,163.290763,423.558233,406.088187,388.064257,50.8073,53.6077,49.6062,51.6598,66.6497,1.0
307,T100306,T_31,2.000000,91.000000,0.0,45.0,10.00000,0.0,50.000000,10.000000,...,163.290763,423.558233,406.088187,388.064257,50.8073,53.6077,49.6062,51.6598,66.6497,1.0
308,T100306,T_31,2.000000,95.000000,0.0,45.0,10.00000,0.0,51.000000,10.000000,...,163.290763,423.558233,406.088187,388.064257,50.8073,53.6077,49.6062,51.6598,66.6497,1.0


In [8]:
test_x_mean.isna().values.any()

False

# Model Fit Variables

In [9]:
MODEL_TEST_REPEAT=5
TRAINING_DATA=train_x_mean
TESTING_DATA=test_x_mean

# Label Encoding

In [10]:
# qualitative to quantitative
qual_col = ['PRODUCT_CODE','LINE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(TRAINING_DATA[i])
    TRAINING_DATA[i] = le.transform(TRAINING_DATA[i])
    
    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    TESTING_DATA[i] = le.transform(TESTING_DATA[i]) 
print('Done.')

Done.


# Checking if imbalanced

In [11]:
print(f"yclass 0 in tarin_df : {train_df[train_df['Y_Class']==0].shape[0]}")
print(f"yclass 1 in tarin_df : {train_df[train_df['Y_Class']==1].shape[0]}")
print(f"yclass 2 in tarin_df : {train_df[train_df['Y_Class']==2].shape[0]}")

yclass 0 in tarin_df : 88
yclass 1 in tarin_df : 407
yclass 2 in tarin_df : 103


In [12]:
from sklearn.neighbors import KNeighborsClassifier as KNN
KNN_model = KNN(n_neighbors=11)
KNN_model.fit(TRAINING_DATA, train_y)
pred_knn = KNN_model.predict(TRAINING_DATA)
# print(recall_score(train_y, pred_knn))
# print(accuracy_score(train_y, pred_knn))
print(classification_report(train_y, pred_knn))

  return self._fit(X, y)


              precision    recall  f1-score   support

           0       0.56      0.33      0.41        88
           1       0.77      0.93      0.84       407
           2       0.46      0.22      0.30       103

    accuracy                           0.72       598
   macro avg       0.59      0.50      0.52       598
weighted avg       0.68      0.72      0.69       598



# Data OverSampling by imblearn

In [13]:
from imblearn.over_sampling import SMOTE

oversampling_instance = SMOTE(k_neighbors=3)

o_Train_X, o_Train_Y = oversampling_instance.fit_resample(TRAINING_DATA, train_y)

o_Train_X = pd.DataFrame(o_Train_X, columns=o_Train_X.columns)
# o_Train_Y = pd.Series(o_Train_Y)

o_Train_Y.value_counts()

Y_Class
0          407
1          407
2          407
dtype: int64

In [14]:
# d = {
#     'columns': train_x_del_zeros.columns.values,
#     'importances': RF.feature_importances_,
# }

# temp_df = pd.DataFrame(d)
# temp_df = temp_df[temp_df['importances'] != 0]
# temp_df.sort_values(by='importances', ascending=False)
# temp_df[temp_df['importances'] > 1/1704]['columns'].values

In [15]:
# import plotly.express as px

# px.line(data_frame=temp_df,
#         x='columns',
#         y='importances')

# Resampling by imortances

In [16]:
# train_x_v2 = train_df[temp_df[temp_df['importances'] > 1/1704]['columns'].values]
# train_x_v2 = train_x_v2.fillna(0)

# test_x_v2 = test_df[temp_df[temp_df['importances'] > 1/1704]['columns'].values]
# test_x_v2 = test_x_v2.fillna(0)


# Classifier Model Fit

# Gradient Boost Model Fit

In [17]:
# target = train_df['Y_Class']

# features = TRAINING_DATA
# scores=[]
# for i in range(MODEL_TEST_REPEAT):
#     X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(features, 
#                                                         target, 
#                                                         test_size=0.2, 
#                                                         )

#     GB = GradientBoostingClassifier(learning_rate = 0.05)
#     GB.fit(X_train_rand, y_train_rand)

#     preds = GB.predict(X_test_rand)
#     scores.append(f1_score(y_test_rand, preds,average='macro'))
    
# print(f"평균: {np.average(scores)}")
# print(f"최소: {min(scores)}")
# print(f"최대: {max(scores)}")



In [18]:
# testing0oversampled dataset
# target = o_Train_Y['Y_Class']

# features = o_Train_X
# scores=[]
# for i in range(MODEL_TEST_REPEAT):
#     X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(features, 
#                                                         target, 
#                                                         test_size=0.2, 
#                                                         )

#     GB = GradientBoostingClassifier(learning_rate = 0.05)
#     GB.fit(X_train_rand, y_train_rand)

#     preds = GB.predict(X_test_rand)
#     scores.append(f1_score(y_test_rand, preds,average='macro'))
    
# print(f"평균: {np.average(scores)}")
# print(f"최소: {min(scores)}")
# print(f"최대: {max(scores)}")


In [19]:
# print(classification_report(y_test_rand, preds))

# Check if Overfitted

In [20]:
# import matplotlib.pyplot as plt

# def learning_curve(history, epoch):
#   plt.figure(figsize = (10,5))
#   epoch_range = np.arange(1, epoch + 1)

#   plt.subplot(1, 2, 1)

#   plt.plot(epoch_range, history.history['accuracy'])
#   plt.plot(epoch_range, history.history['val_accuracy'])
#   plt.title('Model Accuracy')
#   plt.xlabel('epoch')
#   plt.ylabel('Accuracy')
#   plt.legend(['Train', 'Val'])

#   plt.subplot(1,2,2)

#   plt.plot(epoch_range, history.history['loss'])
#   plt.plot(epoch_range, history.history['val_loss'])
#   plt.title('Model loss')
#   plt.xlabel('epoch')
#   plt.ylabel('loss')
#   plt.legend(['Train', 'Val'])
#   plt.show()

# history = GB

# learning_curve(history, 50)

# GaussianNB Model Fit

In [21]:
# target = train_df['Y_Class']

# features = TRAINING_DATA
# scores=[]
# for i in range(MODEL_TEST_REPEAT):
#     X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(features, 
#                                                         target, 
#                                                         test_size=0.2, 
#                                                         )

#     clf = GaussianNB()
#     clf.fit(X_train_rand, y_train_rand)

#     preds = clf.predict(X_test_rand)
#     scores.append(f1_score(y_test_rand, preds,average='macro'))
    
# print(f"평균: {np.average(scores)}")
# print(f"최소: {min(scores)}")
# print(f"최대: {max(scores)}")



# RandomForest Classifier Model Fit

In [22]:
# target = train_df['Y_Class']

# features = TRAINING_DATA
# scores=[]
# for i in range(MODEL_TEST_REPEAT):
#     X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(features, 
#                                                         target, 
#                                                         test_size=0.2, 
#                                                         )

#     RF_rand = RandomForestClassifier()
#     RF_rand.fit(X_train_rand, y_train_rand)

#     preds = RF_rand.predict(X_test_rand)
#     scores.append(f1_score(y_test_rand, preds,average='macro'))
    
# print(f"평균: {np.average(scores)}")
# print(f"최소: {min(scores)}")
# print(f"최대: {max(scores)}")



# CatBoost Classifier Model Fit

In [23]:
# target = train_df['Y_Class']


# features = TRAINING_DATA
# scores=[]
# for i in range(MODEL_TEST_REPEAT):
#     X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(features, 
#                                                                             target, 
#                                                                             test_size=0.2, 
#                                                                             )

#     catboost_clf = CatBoostClassifier(iterations=40,
#                            depth=2,
#                            learning_rate=0.02,
#                            loss_function='MultiClass',
#                            verbose=True)
    
#     catboost_clf.fit(X_train_rand, y_train_rand)

#     preds = catboost_clf.predict(X_test_rand)
#     scores.append(f1_score(y_test_rand, preds,average='macro'))
    
# print(f"평균: {np.average(scores)}")
# print(f"최소: {min(scores)}")
# print(f"최대: {max(scores)}")

In [24]:
# print(classification_report(y_test_rand, preds))

In [25]:
# grid_parameters = {'depth'         : [6,8,10],
#                     'learning_rate' : [0.01, 0.05, 0.1],
#                     'iterations'    : [30, 50, 100]
#                     }

# grid = GridSearchCV(estimator=catboost_clf, param_grid = grid_parameters, cv = 2, n_jobs=-1)
# grid.fit(X_train_rand, y_train_rand)

# print(" Results from Grid Search " )
# print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
# print("\n The best score across ALL searched params:\n", grid.best_score_)
# print("\n The best parameters across ALL searched params:\n", grid.best_params_)


# XGB Classifier Model Fit

In [26]:
# import xgboost as xgb

# # XGBoost를 사용하기 위해서는 DMatrix 형태로 변환해 주어야 합니다
# dtrain = xgb.DMatrix(train_x, train_y)
# dtest = xgb.DMatrix(test_x)

# # 모델 생성
# # num_boost_round 만큼 반복하는데 early_stopping_rounds 만큼 성능 향상이 없으면 중단
# # early_stopping_rounds를 사용하려면 eval 데이터 셋을 명기해야함
# param = {파라미터 설정}
# xgb_model = xgb.train(params = params, dtrain = dtrain, num_boost_round = 400, 
#                         early_stopping_rounds = 100, evals=[(dtrain,'train'),(dval,'eval')])

# # 예측하기, 확률값으로 반환됨
# y_pre_probs = xgb_model.predict(dtest)

# # 0또는 1로 변경
# y_preds = [1 if x>0.5 else 0 for x in y_pre_probs]

# Voting Classifier Model Fit -> Merging Multiple Models

In [27]:
# target = train_df['Y_Class']
# features = TRAINING_DATA
# scores=[]
# for i in range(MODEL_TEST_REPEAT):
#     X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(features, 
#                                                         target, 
#                                                         test_size=0.2, 
#                                                         )

#     Voting_Clf = VotingClassifier(estimators=[ 
#                                               ('RandomForest', RF_rand) ,
#                                               ('GaussianNB', clf),
#                                                 ('CatBoostClassifier', catboost_clf)],
#                                   voting='soft')
    
#     Voting_Clf.fit(X_train_rand, y_train_rand)
    
    
#     preds = Voting_Clf.predict(X_test_rand)
#     scores.append(f1_score(y_test_rand, preds,average='macro'))

    
# print(f"평균: {np.average(scores)}")
# print(f"최소: {min(scores)}")
# print(f"최대: {max(scores)}")

In [28]:
vot_esti = []
for i in range(10):
    globals()["gb_test_".format(i)] = GradientBoostingClassifier(  n_estimators = 150 ,
                                                                    learning_rate = 0.05 ,
                                                                    random_state = 37
                                                                    )
    globals()["gb_test_".format(i)].fit(o_Train_X, o_Train_Y['Y_Class'])
    vot_esti.append((f"gb_{i}", globals()["gb_test_".format(i)]))
    



In [None]:
# voting with 10 gbs
voting_clf_10gb = VotingClassifier(estimators=vot_esti,
                                  voting='soft')

voting_clf_10gb.fit(o_Train_X, o_Train_Y['Y_Class'])

preds = voting_clf_10gb.predict(TESTING_DATA)

In [None]:
# preds = GB.predict(TESTING_DATA)

# F1_Score

In [None]:
# f1_temp = train_y['Y_Class'].values
# f1_temp[f1_temp==2]=0
# f1_temp

In [None]:
# f1_score(f1_temp, preds)

## Submit

In [None]:
submit = pd.read_csv('./sample_submission.csv')

In [None]:
submit['Y_Class'] = preds

In [None]:
submit.groupby(['Y_Class']).count()

Unnamed: 0_level_0,PRODUCT_ID
Y_Class,Unnamed: 1_level_1
0,51
1,226
2,33


In [None]:
submit.to_csv('./baseline_submission.csv', index=False)