In [697]:
# Mount Google Drive into the Colab runtime; the dataset and model paths used
# below all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [698]:
import csv
import numpy as np
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import os

from tqdm import tqdm

import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression, HuberRegressor
from sklearn.preprocessing import StandardScaler,PowerTransformer
from imblearn.over_sampling import SMOTE

In [699]:
# Input CSVs and the output directory, all located on the mounted Google Drive.
TRAIN_PATH = "/content/drive/MyDrive/CS_CS20024/Final/train.csv"
TEST_PATH = "/content/drive/MyDrive/CS_CS20024/Final/test.csv"
# Per-fold scaled test features and the fitted fold models are written here.
SAVE_PATH = "/content/drive/MyDrive/CS_CS20024/Final/model"

In [700]:
# Load the labelled training set and preview its first rows.
train_df = pd.read_csv(TRAIN_PATH)
train_df.head()
# 26570 rows with 26 columns (including `id` and the `failure` target)

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
0,0,A,80.1,material_7,material_8,9,5,7,8,4,...,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.1,0
1,1,A,84.89,material_7,material_8,9,5,14,3,3,...,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0
2,2,A,82.43,material_7,material_8,9,5,12,1,5,...,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0
3,3,A,101.07,material_7,material_8,9,5,13,2,6,...,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0
4,4,A,188.06,material_7,material_8,9,5,9,2,8,...,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0


In [701]:
# Load the unlabelled test set (same columns as train, minus `failure`) and preview it.
test_df = pd.read_csv(TEST_PATH)
test_df.head()

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17
0,26570,F,119.57,material_5,material_6,6,4,6,9,6,...,18.654,10.802,15.909,18.07,13.772,13.659,16.825,13.742,17.71,634.612
1,26571,F,113.51,material_5,material_6,6,4,11,8,0,...,19.368,12.032,13.998,,12.473,17.468,16.708,14.776,14.102,537.037
2,26572,F,112.16,material_5,material_6,6,4,8,12,4,...,17.774,11.743,17.046,18.086,10.907,13.363,15.737,17.065,16.021,658.995
3,26573,F,112.72,material_5,material_6,6,4,8,11,10,...,18.948,11.79,18.165,16.163,10.933,15.501,15.667,12.62,16.111,594.301
4,26574,F,208.0,material_5,material_6,6,4,14,16,8,...,19.141,12.37,14.578,17.849,11.941,16.07,16.183,13.324,17.15,801.044


## Preprocess ##


### Approach 1 ##


In [702]:
# # preprocess
# x_train_df = train_df.drop(columns='failure').to_numpy()
# y_train_df = train_df['failure'].to_numpy().astype('float64')
# x_test_df = test_df.to_numpy()

# # create a label encoder to turn string column into label
# encoder = preprocessing.OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
# for i in range(x_train_df.shape[1]):
#     if(isinstance(x_train_df[0, i], str)):
#         encoder.fit(x_train_df[:, i].reshape(-1, 1))
#         x_train_df[:, i] = encoder.transform(x_train_df[:, i].reshape(-1, 1)).reshape(1, -1)
#         x_test_df[:, i] = encoder.transform(x_test_df[:, i].reshape(-1, 1)).reshape(1, -1)

In [703]:
# # Impute missing values: a HuberRegressor fills rows with exactly one null, then KNNImputer fills the rest
# x_tmp_train = [] 
# for row in x_train_df:
#     if(pd.isnull(row).sum() == 0):
#         x_tmp_train.append(row)
# for row in x_test_df:
#     if(pd.isnull(row).sum() == 0):
#         x_tmp_train.append(row)
# x_tmp_train = np.array(x_tmp_train)

# for col in range(x_train_df.shape[1]):
#     model = HuberRegressor(epsilon=1.9, max_iter=2000)
#     model.fit(np.delete(x_tmp_train, col, 1), x_tmp_train[:, col])
#     for i, row in enumerate(x_train_df):
#         # if only one of the column is null, fill the null with huber regressor
#         if(pd.isnull(row).sum() == 1 and pd.isnull(row[col])):
#             row[col] = model.predict([np.delete(row, col, 0)])[0]
#             # print("modify success")
#     for i, row in enumerate(x_test_df):
#         if(pd.isnull(row).sum() == 1 and pd.isnull(row[col])):
#             row[col] = model.predict([np.delete(row, col, 0)])[0]
#             # print("modify success")

# #imp = SimpleImputer(missing_values=np.nan, strategy='median')
# imp = KNNImputer(n_neighbors=3)
# imp = imp.fit(x_train_df)
# x_train_df = imp.transform(x_train_df)
# x_test_df = imp.transform(x_test_df)

# # split train data into train and valid part
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=22)

In [704]:
# with open('x_train_df.csv', 'w', newline='') as csvfile:
#     writer = csv.writer(csvfile)
#     for row in x_train_df:
#         writer.writerow(row)

### Approach 2 ###

In [705]:
# Stack train and test so feature engineering and imputation see both sets.
# ignore_index=True gives every row a unique label: without it the train and
# test indices both start at 0, leaving duplicate labels that would silently
# misalign any label-based assignment. Row order (train first, then test) is
# unchanged, so the positional iloc split further down still applies.
data = pd.concat([train_df, test_df], ignore_index=True)

In [706]:
# Numeric columns that get imputed: 'loading' plus every measurement_* column.
feature = [
    col_name
    for col_name in test_df.columns
    if col_name.startswith('measurement') or col_name == 'loading'
]

In [707]:
# Engineered features, added in place to the combined train+test frame.
# NOTE: this cell is not idempotent -- re-running it applies the log to
# 'loading' a second time; re-create `data` first if it must be re-run.

# Indicator flags for rows whose measurement_3 / measurement_5 were missing.
data['m3_missing'] = data['measurement_3'].isnull().astype(np.int8)
data['m5_missing'] = data['measurement_5'].isnull().astype(np.int8)
# Product of the two numeric attributes.
data['area'] = data['attribute_2'] * data['attribute_3']
# Natural log of the loading value.
data['loading'] = np.log(data['loading'])
# Per-row count of missing values. The target is excluded: 'failure' is NaN for
# every test row after the concat, so counting it would add 1 to all test rows
# and make the feature inconsistent between train and test.
data['count_null'] = data.drop(columns='failure', errors='ignore').isnull().sum(axis=1)

In [708]:
# Sanity check: list the columns of the combined frame after feature engineering.
print(data.columns)

Index(['id', 'product_code', 'loading', 'attribute_0', 'attribute_1',
       'attribute_2', 'attribute_3', 'measurement_0', 'measurement_1',
       'measurement_2', 'measurement_3', 'measurement_4', 'measurement_5',
       'measurement_6', 'measurement_7', 'measurement_8', 'measurement_9',
       'measurement_10', 'measurement_11', 'measurement_12', 'measurement_13',
       'measurement_14', 'measurement_15', 'measurement_16', 'measurement_17',
       'failure', 'm3_missing', 'm5_missing', 'area', 'count_null'],
      dtype='object')


In [709]:
# Filled in a later cell: {target measurement -> {product_code -> predictor columns}}.
full_fill_dict = {}

# Columns excluded from the correlation analysis: every test column that is not
# a measurement (id, product_code, loading, attribute_0..3). 'loading' already
# matches this filter, so it does not need to be appended a second time.
col_to_drop = [col for col in test_df.columns if 'measurement' not in col]

# Absolute correlation matrix over the remaining columns (the measurements plus
# 'failure' and the engineered columns). It does not depend on the loop
# variable, so it is computed once instead of once per target column.
abs_corr = data.drop(col_to_drop, axis=1).corr().abs()

corr_total = []
tar_column = []
# measurement_0..2 are skipped -- presumably because they need no imputation;
# TODO confirm.
for x in range(3, 18):
    target = f'measurement_{x}'
    # Drop the target's self-correlation explicitly rather than assuming it
    # sorts into position 0, then rank the other columns (NaNs sort last).
    ranked = abs_corr[target].drop(target).sort_values(ascending=False)
    # Score = sum of the 4 strongest absolute correlations.
    corr_total.append(np.round(ranked.iloc[:4].sum(), 3))
    tar_column.append(target)

# One row per candidate target column with its correlation score.
c = pd.DataFrame({'Selected columns': tar_column, 'correlation total': corr_total})

In [710]:
# Rank the candidate columns from highest to lowest correlation total,
# renumbering the rows 0..n-1, and show the table.
c = c.sort_values('correlation total', ascending=False, ignore_index=True)
print(c)

   Selected columns  correlation total
0    measurement_17              1.431
1    measurement_11              0.501
2     measurement_8              0.465
3     measurement_5              0.394
4     measurement_6              0.374
5    measurement_10              0.360
6    measurement_15              0.359
7     measurement_7              0.346
8     measurement_4              0.339
9    measurement_16              0.302
10   measurement_14              0.267
11   measurement_13              0.227
12    measurement_9              0.209
13   measurement_12              0.183
14    measurement_3              0.100


In [711]:
# For each selected measurement column, find its 4 most correlated columns
# separately within every product code.
# Targets are chosen by the stated rule (correlation total > 0.3) instead of a
# hard-coded row count, so the selection stays right if the table changes.
CORR_TOTAL_THRESHOLD = 0.3
selected_cols = c.loc[c['correlation total'] > CORR_TOTAL_THRESHOLD, 'Selected columns'].tolist()

# Create the outer keys first so they keep the order of `c`; the imputation
# cell iterates over this dict in that order.
for measurement_col in selected_cols:
    full_fill_dict[measurement_col] = {}

# NOTE(review): the candidate predictors include every column left after
# dropping col_to_drop, i.e. also 'failure' and the engineered columns --
# confirm that using them as imputation predictors is intended.
for code in data.product_code.unique():
    # One absolute correlation matrix per product code, shared by all targets.
    code_corr = data[data.product_code == code].drop(col_to_drop, axis=1).corr().abs()
    for measurement_col in selected_cols:
        # Remove the self-correlation, rank the rest (NaNs sort last), keep the top 4.
        ranked = code_corr[measurement_col].drop(measurement_col).sort_values(ascending=False)
        full_fill_dict[measurement_col][code] = ranked.iloc[:4].index.tolist()

In [712]:
# Show which measurement columns were selected as HuberRegressor imputation targets.
print(full_fill_dict.keys())

dict_keys(['measurement_17', 'measurement_11', 'measurement_8', 'measurement_5', 'measurement_6', 'measurement_10', 'measurement_15', 'measurement_7', 'measurement_4', 'measurement_16'])


In [713]:
# Two-stage imputation, run independently for each product code:
#   1. For each selected target column, rows whose target is missing but whose
#      correlated predictor columns are all present are filled by a
#      HuberRegressor fitted on the fully observed rows of that product code.
#   2. Every value still missing in `feature` is filled by a KNNImputer.
# Targets are handled in the order of full_fill_dict, and values filled for one
# target are visible when the next target is processed.

for code in data.product_code.unique():
    code_mask = data.product_code == code

    for tar_col, predictors_by_code in full_fill_dict.items():
        column = predictors_by_code[code]

        # Training rows: this product code, predictors and target all present.
        hr_train = data.loc[code_mask, column + [tar_col]].dropna(how='any')
        # Rows to fill: this product code, all predictors present, target missing.
        # The same mask selects the prediction inputs and the assignment target.
        fill_mask = code_mask & data[column].notnull().all(axis=1) & data[tar_col].isnull()

        # Nothing to fit on or nothing to fill: skip, since scikit-learn raises
        # on zero-sample input. Leftover gaps are handled by the KNN step.
        if hr_train.empty or not fill_mask.any():
            continue

        hr_model = HuberRegressor(epsilon=2, max_iter=1000)
        hr_model.fit(hr_train[column], hr_train[tar_col])
        data.loc[fill_mask, tar_col] = hr_model.predict(data.loc[fill_mask, column])

    # Fill whatever is still missing in the numeric feature columns.
    knn_imp = KNNImputer(n_neighbors=3)
    data.loc[code_mask, feature] = knn_imp.fit_transform(data.loc[code_mask, feature])

In [714]:
# Undo the concat: the first len(train_df) rows are the training set and the
# remaining rows are the test set.
n_train_rows = train_df.shape[0]
df_train, df_test = data.iloc[:n_train_rows, :], data.iloc[n_train_rows:, :]

## Train ##

In [715]:
def scale(train_data, val_data, test_data, feats):
    """Standardize the `feats` columns of three frames using training statistics.

    A StandardScaler is fitted on `train_data[feats]` only and then applied to
    the training, validation and test frames.

    Args:
        train_data: DataFrame the scaler is fitted on.
        val_data: validation DataFrame, transformed with the fitted scaler.
        test_data: test DataFrame, transformed with the fitted scaler.
        feats: list of column names to standardize.

    Returns:
        Tuple (new_train, new_val, new_test) of copies of the inputs whose
        `feats` columns hold the scaled values; the inputs are not modified.
    """
    scaler = StandardScaler().fit(train_data[feats])

    scaled_frames = []
    for frame in (train_data, val_data, test_data):
        scaled = frame.copy()
        scaled[feats] = scaler.transform(frame[feats])
        scaled_frames.append(scaled)

    return tuple(scaled_frames)

In [716]:
# Columns fed to the logistic-regression model (a subset of the engineered frame).
use_feature = ['loading', 'measurement_17', 'm3_missing', 'm5_missing']

In [717]:
# Stratified K-fold training of a logistic-regression model on `use_feature`.
# Per fold: standardize with statistics from the training part only, oversample
# the training part with SMOTE, fit, score the validation part with ROC AUC and
# save the fold's scaled copy of the test features under SAVE_PATH.
N_FOLDS = 5
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=22)
y_oof = np.zeros(len(df_train))  # out-of-fold predicted probabilities
logistic_auc = 0                 # mean validation ROC AUC across folds
log_model = []                   # fitted classifier of every fold

sm = SMOTE(random_state=79, n_jobs=-1) #35

# Make sure the output directory exists before the first fold writes to it.
os.makedirs(SAVE_PATH, exist_ok=True)

# The target is passed as a 1-D Series; stratification only needs the labels.
for ix, (train_ind, val_ind) in enumerate(skf.split(df_train[feature], df_train['failure'])):
    print("Fold ", ix, ": ")

    x_train = df_train[use_feature].iloc[train_ind].reset_index(drop=True)
    x_valid = df_train[use_feature].iloc[val_ind].reset_index(drop=True)

    y_train = df_train['failure'].iloc[train_ind].reset_index(drop=True)
    y_valid = df_train['failure'].iloc[val_ind].reset_index(drop=True)

    # Scale first (scaler fitted on the training part), then oversample the
    # training part only, so validation and test rows stay untouched.
    x_train, x_valid, x_test = scale(x_train, x_valid, df_test[use_feature], use_feature)
    x_train, y_train = sm.fit_resample(x_train, y_train)

    clf = LogisticRegression(max_iter=500, C=0.0001, penalty='l2',solver='newton-cg') # 0.0001

    # train model
    clf.fit(x_train, y_train)
    log_model.append(clf)

    # Validate: the reported metric is ROC AUC (it was previously printed as "accuracy").
    y_pred = clf.predict_proba(x_valid)[:,1]
    roc_score = roc_auc_score(y_valid, y_pred)
    logistic_auc += roc_score/N_FOLDS
    y_oof[val_ind] = y_pred
    print('ROC AUC:', round(roc_score, 5))

    # Save this fold's scaled test features; each fold has its own scaler, so
    # the files differ between folds.
    with open(f'{SAVE_PATH}/processed_test{ix}.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(x_test.columns)
        writer.writerows(x_test.to_numpy())

Fold  0 : 
accuracy: 0.58788
Fold  1 : 
accuracy: 0.59394
Fold  2 : 
accuracy: 0.59568
Fold  3 : 
accuracy: 0.58915
Fold  4 : 
accuracy: 0.58263


In [718]:
# Overall out-of-fold ROC AUC: every training row is scored by the fold model
# that did not see it during training.
roc_auc_score(df_train[['failure']],y_oof)

0.5898464825600702

## Save Model ##

In [719]:
# Persist each fold's fitted LogisticRegression under SAVE_PATH as log_model_<fold>.pt.
# NOTE(review): torch.save is used here on a scikit-learn estimator rather than
# a torch module; whatever loads these files must use the matching torch.load
# call -- confirm against the inference code.
for i, model in enumerate(log_model):
    torch.save(model, f'{SAVE_PATH}/log_model_{i}.pt')