In [1]:
import sys; sys.path.append("kuma_utils/")
import os
import sys
import joblib
import numpy as np
import pandas as pd
import gc; gc.enable()
from lightgbm import LGBMClassifier
from sklearn.impute import KNNImputer
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import GaussianNB
from feature_engine.encoding import WoEEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from kuma_utils.preprocessing.imputer import LGBMImputer
from sklearn.linear_model import LogisticRegression, HuberRegressor
import warnings; warnings.filterwarnings("ignore")

In [2]:
# Directory holding the competition CSV files; later cells build file names as `path + name`.
path = '/home/centos/suhyeok/TPS_8_22/data/'

# Read the training table, the test table and the sample submission in one pass.
df_train, df_test, sub = (
    pd.read_csv(path + file_name)
    for file_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# `groups` keeps the product code of every training row.
groups = df_train['product_code']
# Detach the label column from the training frame; `target` holds the labels.
target = df_train.pop('failure')

In [3]:
def preprocessing(df_train, df_test):
    """Impute missing measurements and build the modelling features.

    Train and test rows are stacked into one frame, missing-value indicator
    columns and an `area` column are added, and missing measurement values are
    filled per product code: first with a HuberRegressor fitted on the most
    correlated measurement columns, then with a 3-neighbour KNNImputer for
    whatever is still missing. Finally `attribute_0` is WoE-encoded.

    Note: the WoE encoder is fitted against the notebook-level name `target`,
    which is not a parameter of this function and must exist before the call.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows (without the label column).
    df_test : pandas.DataFrame
        Test rows.

    Returns
    -------
    (df_train, df_test, features)
        The processed training rows, the processed test rows, and the list of
        column names selected as model inputs.
    """
    n_train = df_train.shape[0]
    data = pd.concat([df_train, df_test])

    data['m3_missing'] = data['measurement_3'].isnull().astype(np.int8)
    data['m5_missing'] = data['measurement_5'].isnull().astype(np.int8)
    data['area'] = data['attribute_2'] * data['attribute_3']

    # Dictionary of dictionaries: target measurement column -> product code ->
    # predictor columns used to fill it. Only 'measurement_17' gets a manual
    # selection; the other entries are derived from correlations below.
    full_fill_dict = {}
    full_fill_dict['measurement_17'] = {
        'A': ['measurement_5','measurement_6','measurement_8'],
        'B': ['measurement_4','measurement_5','measurement_7'],
        'C': ['measurement_5','measurement_7','measurement_8','measurement_9'],
        'D': ['measurement_5','measurement_6','measurement_7','measurement_8'],
        'E': ['measurement_4','measurement_5','measurement_6','measurement_8'],
        'F': ['measurement_4','measurement_5','measurement_6','measurement_7'],
        'G': ['measurement_4','measurement_6','measurement_8','measurement_9'],
        'H': ['measurement_4','measurement_5','measurement_7','measurement_8','measurement_9'],
        'I': ['measurement_3','measurement_7','measurement_8']
    }

    # Columns left out of the correlation matrices.
    excluded_cols = [c_name for c_name in df_test.columns if 'measurement' not in c_name] + ['loading','m3_missing','m5_missing']

    # Rank measurement_3..16 by the sum of their 3 strongest absolute
    # correlations. The matrix does not change inside the loop, so it is
    # computed once.
    abs_corr_all = data.drop(excluded_cols, axis=1).corr().abs()
    selected_names = []
    correlation_totals = []
    for x in range(3, 17):
        ranked = abs_corr_all[f'measurement_{x}'].sort_values(ascending=False)
        # Position 0 is the column's correlation with itself, hence 1:4.
        correlation_totals.append(np.round(np.sum(ranked.iloc[1:4]), 3))
        selected_names.append(f'measurement_{x}')
    c = pd.DataFrame()
    c['Selected columns'] = selected_names
    c['correlation total'] = correlation_totals
    c = c.sort_values(by = 'correlation total',ascending=False).reset_index(drop = True)
    print(f'Columns selected by correlation sum of the 3 first rows : ')
    display(c.head(10))

    # For each of the 10 best-ranked columns, pick per product code the 4 most
    # correlated columns as predictors. One correlation matrix per product
    # code is enough, because `data` is not modified until the fill loop.
    product_codes = data.product_code.unique()
    abs_corr_by_code = {
        code: data[data.product_code == code].drop(excluded_cols, axis=1).corr().abs()
        for code in product_codes
    }
    for i in range(10):
        measurement_col = c.iloc[i, 0]
        full_fill_dict[measurement_col] = {
            code: abs_corr_by_code[code][measurement_col]
            .sort_values(ascending=False)
            .iloc[1:5]
            .index.tolist()
            for code in product_codes
        }

    feature = [f for f in data.columns if f.startswith('measurement') or f=='loading']
    nullValue_cols = [c_name for c_name in df_train.columns if df_train[c_name].isnull().sum()!=0]

    for code in product_codes:
        total_na_filled_by_linear_model = 0
        print(f'\n-------- Product code {code} ----------\n')
        print(f'filled by linear model :')
        is_code = data.product_code == code
        for measurement_col, predictors_by_code in full_fill_dict.items():
            column = predictors_by_code[code]
            # Re-slice on every iteration: earlier iterations fill values in `data`.
            tmp = data[is_code]
            tmp_train = tmp[column+[measurement_col]].dropna(how='any')
            # Rows of this product code whose target is missing while all
            # predictors are present. The same mask selects the prediction
            # inputs and the cells to write, so both stay aligned.
            fill_mask = is_code & data[column].notnull().all(axis=1) & data[measurement_col].isnull()
            n_to_fill = int(fill_mask.sum())

            # Skip fitting when nothing can be filled: predicting on zero rows
            # is rejected by scikit-learn's input validation.
            if n_to_fill:
                model = HuberRegressor(epsilon=1.9)
                model.fit(tmp_train[column], tmp_train[measurement_col])
                data.loc[fill_mask, measurement_col] = model.predict(data.loc[fill_mask, column])
            print(f'{measurement_col} : {n_to_fill}')
            total_na_filled_by_linear_model += n_to_fill

        # Remaining missing values: count them (over the columns that have
        # missing values in the training frame) and fill with KNN.
        NA = data.loc[is_code, nullValue_cols].isnull().sum().sum()
        model1 = KNNImputer(n_neighbors=3)
        data.loc[is_code, feature] = model1.fit_transform(data.loc[is_code, feature])
        print(f'\n{total_na_filled_by_linear_model} filled by linear model ') 
        print(f'{NA} filled by KNN ')

    data['measurement_avg'] = data[[f'measurement_{i}' for i in range(3, 17)]].mean(axis=1)
    # Split the stacked frame back by position: training rows come first.
    df_train = data.iloc[:n_train, :]
    df_test = data.iloc[n_train:, :]

    # Encode the categorical column `attribute_0` with a WoE encoder fitted on
    # the training rows and the notebook-level `target`.
    woe_encoder = WoEEncoder(variables=['attribute_0'])
    woe_encoder.fit(df_train, target)
    df_train = woe_encoder.transform(df_train)
    df_test = woe_encoder.transform(df_test)

    features = ['loading', 'attribute_0', 'measurement_17', 'measurement_0', 'measurement_1', 'measurement_2', 'area', 'm3_missing', 'm5_missing', 'measurement_avg']

    return df_train, df_test, features

def scale(train_data, val_data, test_data, feats):
    """Standardise the `feats` columns of three frames with one shared scaler.

    The StandardScaler is fitted on `train_data[feats]` only and then applied
    to all three frames. The inputs are not modified: each returned frame is a
    copy of its input whose `feats` columns hold the scaled values.

    Returns the tuple (scaled train, scaled validation, scaled test).
    """
    fitted_scaler = StandardScaler().fit(train_data[feats])

    def _with_scaled_feats(frame):
        # Copy first so the caller's frame keeps its original values.
        scaled_frame = frame.copy()
        scaled_frame[feats] = fitted_scaler.transform(frame[feats])
        return scaled_frame

    return (
        _with_scaled_feats(train_data),
        _with_scaled_feats(val_data),
        _with_scaled_feats(test_data),
    )

# Run the imputation / feature pipeline, then attach the labels to the
# processed training frame as the `failure` column.
df_train, df_test, features = preprocessing(df_train=df_train, df_test=df_test)
df_train = df_train.assign(failure=target)

Columns selected by correlation sum of the 3 first rows : 


Unnamed: 0,Selected columns,correlation total
0,measurement_8,0.448
1,measurement_11,0.395
2,measurement_5,0.376
3,measurement_6,0.359
4,measurement_7,0.33
5,measurement_4,0.328
6,measurement_15,0.301
7,measurement_10,0.3
8,measurement_16,0.252
9,measurement_14,0.225



-------- Product code A ----------

filled by linear model :
measurement_17 : 386
measurement_8 : 167
measurement_11 : 225
measurement_5 : 113
measurement_6 : 146
measurement_7 : 153
measurement_4 : 79
measurement_15 : 273
measurement_10 : 209
measurement_16 : 293
measurement_14 : 237

2281 filled by linear model 
1568 filled by KNN 

-------- Product code B ----------

filled by linear model :
measurement_17 : 418
measurement_8 : 165
measurement_11 : 220
measurement_5 : 83
measurement_6 : 106
measurement_7 : 174
measurement_4 : 80
measurement_15 : 294
measurement_10 : 197
measurement_16 : 358
measurement_14 : 330

2425 filled by linear model 
1550 filled by KNN 

-------- Product code C ----------

filled by linear model :
measurement_17 : 391
measurement_8 : 189
measurement_11 : 231
measurement_5 : 141
measurement_6 : 150
measurement_7 : 140
measurement_4 : 108
measurement_15 : 319
measurement_10 : 262
measurement_16 : 343
measurement_14 : 330

2604 filled by linear model 
1740 fill

In [4]:
# Baseline without validation splits: a logistic regression (weight 0.8) blended
# with a LightGBM classifier (weight 0.2), both fitted on the full training set.

output = pd.read_csv(path+'sample_submission.csv')

# The training features are passed as both the "train" and the "validation"
# argument, so the scaler is fitted on the whole training set.
x_train, x_val, x_test = scale(df_train[features], df_train[features], df_test[features], features)

model = LogisticRegression(max_iter=200, C=0.0001, penalty='l2', solver='newton-cg')
model.fit(x_train, target)
lr_test_proba = model.predict_proba(x_test)[:, 1]

model = LGBMClassifier(
    seed=42,
    n_jobs=-1,
    lambda_l2=2,
    metric="auc",
    max_depth=-1,
    num_leaves=100,
    boosting='gbdt',
    bagging_freq=10,
    learning_rate=0.01,
    objective='binary',
    min_data_in_leaf=40,
    num_boost_round=70,
    feature_fraction=0.90,
    bagging_fraction=0.90,
)
model.fit(x_train, target)
lgbm_test_proba = model.predict_proba(x_test)[:, 1]

# Weighted average of the two positive-class probabilities.
output['failure'] = lr_test_proba * 0.8 + lgbm_test_proba * 0.2

output.to_csv('submission_no_splits_simple_baseline.csv', index=False)



In [5]:
# Stratified K-fold cross-validation of three models: logistic regression,
# LightGBM and Gaussian naive Bayes. With the blend weights below only the
# logistic regression contributes to the out-of-fold and test predictions.

params = {"max_iter": 200, "C": 0.0001, "penalty": "l2", "solver": "newton-cg"}

# LightGBM parameters are the same for every fold, so they are defined once.
lgb_params = {
    'seed': 42,
    'n_jobs': -1,
    'lambda_l2': 2,
    'metric': "auc",
    'max_depth': -1,
    'num_leaves': 100,
    'boosting': 'gbdt',
    'bagging_freq': 10,
    'learning_rate': 0.01,
    'objective': 'binary',
    'min_data_in_leaf': 40,
    'num_boost_round': 1000,
    'feature_fraction': 0.90,
    'bagging_fraction': 0.90,
}

# Number of folds; also the divisor that turns per-fold test predictions into
# a fold average.
N_SPLITS = 5
# Blend weights for (logistic regression, LightGBM, Gaussian naive Bayes).
W_LR, W_LGB, W_NB = 1.0, 0.0, 0.0

oof = np.zeros(len(df_train))
test_preds = np.zeros(len(df_test))
cv = StratifiedKFold(n_splits = N_SPLITS, shuffle = True, random_state = 0)
for fold, (train_idx, val_idx) in enumerate(cv.split(df_train, df_train["failure"])):
    # The splitter yields positional indices, so select rows by position.
    x_train, y_train = df_train.iloc[train_idx][features], df_train.iloc[train_idx]["failure"]
    x_val, y_val = df_train.iloc[val_idx][features], df_train.iloc[val_idx]["failure"]

    # The scaler is fitted on this fold's training rows only. `x_test` is a
    # copy of the whole `df_test` frame whose feature columns are scaled.
    x_train, x_val, x_test = scale(x_train, x_val, df_test, features)

    model = LogisticRegression(**params)
    model.fit(x_train, y_train)
    y_pred_1 = model.predict_proba(x_val)[:, 1]
    # Predict on the scaled test features: the model was fitted on scaled data.
    test_preds_1 = model.predict_proba(x_test[features])[:, 1] / N_SPLITS

    model = LGBMClassifier(**lgb_params)
    # NOTE(review): newer LightGBM releases may expect early stopping to be
    # configured through callbacks instead of this argument - confirm against
    # the installed version.
    model.fit(x_train, y_train, eval_set = [(x_val, y_val)], early_stopping_rounds = 30)
    y_pred_2 = model.predict_proba(x_val)[:, 1]
    test_preds_2 = model.predict_proba(x_test[features])[:, 1] / N_SPLITS

    # Class priors are the class frequencies of this fold's training labels.
    model = GaussianNB(var_smoothing=0.5, priors=[len(y_train[y_train == 0]) / len(y_train), len(y_train[y_train == 1])/len(y_train)])
    model.fit(x_train, y_train)
    y_pred_3 = model.predict_proba(x_val)[:, 1]
    test_preds_3 = model.predict_proba(x_test[features])[:, 1] / N_SPLITS

    oof[val_idx] = (W_LR * y_pred_1) + (W_LGB * y_pred_2) + (W_NB * y_pred_3)
    # Accumulate: each fold adds its 1/N_SPLITS share, so the final value is
    # the average over all folds rather than the last fold alone.
    test_preds += (W_LR * test_preds_1) + (W_LGB * test_preds_2) + (W_NB * test_preds_3)

    print(f"Val score: {roc_auc_score(y_val, oof[val_idx]):.7f}")

print(f"Val score: {roc_auc_score(df_train['failure'], oof):.7f}")

[1]	valid_0's auc: 0.569433
[2]	valid_0's auc: 0.563908
[3]	valid_0's auc: 0.567755
[4]	valid_0's auc: 0.568923
[5]	valid_0's auc: 0.56636
[6]	valid_0's auc: 0.567797
[7]	valid_0's auc: 0.568454
[8]	valid_0's auc: 0.571446
[9]	valid_0's auc: 0.571361
[10]	valid_0's auc: 0.572187
[11]	valid_0's auc: 0.574881
[12]	valid_0's auc: 0.576576
[13]	valid_0's auc: 0.577995
[14]	valid_0's auc: 0.57954
[15]	valid_0's auc: 0.579745
[16]	valid_0's auc: 0.579683
[17]	valid_0's auc: 0.580048
[18]	valid_0's auc: 0.580346
[19]	valid_0's auc: 0.581234
[20]	valid_0's auc: 0.581745
[21]	valid_0's auc: 0.581478
[22]	valid_0's auc: 0.580936
[23]	valid_0's auc: 0.5804
[24]	valid_0's auc: 0.580054
[25]	valid_0's auc: 0.579773
[26]	valid_0's auc: 0.58033
[27]	valid_0's auc: 0.579957
[28]	valid_0's auc: 0.580498
[29]	valid_0's auc: 0.580742
[30]	valid_0's auc: 0.581153
[31]	valid_0's auc: 0.581623
[32]	valid_0's auc: 0.581782
[33]	valid_0's auc: 0.581691
[34]	valid_0's auc: 0.581397
[35]	valid_0's auc: 0.580919

[64]	valid_0's auc: 0.564943
[65]	valid_0's auc: 0.565067
[66]	valid_0's auc: 0.565033
[67]	valid_0's auc: 0.564865
[68]	valid_0's auc: 0.564768
[69]	valid_0's auc: 0.564655
[70]	valid_0's auc: 0.564596
[71]	valid_0's auc: 0.564825
[72]	valid_0's auc: 0.565012
[73]	valid_0's auc: 0.564831
[74]	valid_0's auc: 0.564769
[75]	valid_0's auc: 0.564507
[76]	valid_0's auc: 0.564673
[77]	valid_0's auc: 0.564503
[78]	valid_0's auc: 0.564344
[79]	valid_0's auc: 0.564199
[80]	valid_0's auc: 0.564374
[81]	valid_0's auc: 0.564421
[82]	valid_0's auc: 0.56472
[83]	valid_0's auc: 0.564794
[84]	valid_0's auc: 0.564923
[85]	valid_0's auc: 0.565322
[86]	valid_0's auc: 0.565499
[87]	valid_0's auc: 0.565661
[88]	valid_0's auc: 0.565749
[89]	valid_0's auc: 0.565836
[90]	valid_0's auc: 0.565764
[91]	valid_0's auc: 0.56554
[92]	valid_0's auc: 0.565697
[93]	valid_0's auc: 0.565649
[94]	valid_0's auc: 0.565664
[95]	valid_0's auc: 0.565602
[96]	valid_0's auc: 0.565667
[97]	valid_0's auc: 0.565923
[98]	valid_0's a

[23]	valid_0's auc: 0.559398
[24]	valid_0's auc: 0.559973
[25]	valid_0's auc: 0.560595
[26]	valid_0's auc: 0.560384
[27]	valid_0's auc: 0.561003
[28]	valid_0's auc: 0.560781
[29]	valid_0's auc: 0.561502
[30]	valid_0's auc: 0.561873
[31]	valid_0's auc: 0.562301
[32]	valid_0's auc: 0.56299
[33]	valid_0's auc: 0.563277
[34]	valid_0's auc: 0.562924
[35]	valid_0's auc: 0.562417
[36]	valid_0's auc: 0.5627
[37]	valid_0's auc: 0.562888
[38]	valid_0's auc: 0.562863
[39]	valid_0's auc: 0.562968
[40]	valid_0's auc: 0.56292
[41]	valid_0's auc: 0.56321
[42]	valid_0's auc: 0.563487
[43]	valid_0's auc: 0.563622
[44]	valid_0's auc: 0.563427
[45]	valid_0's auc: 0.563236
[46]	valid_0's auc: 0.563351
[47]	valid_0's auc: 0.56373
[48]	valid_0's auc: 0.563408
[49]	valid_0's auc: 0.56333
[50]	valid_0's auc: 0.563868
[51]	valid_0's auc: 0.563943
[52]	valid_0's auc: 0.563569
[53]	valid_0's auc: 0.564013
[54]	valid_0's auc: 0.56406
[55]	valid_0's auc: 0.563879
[56]	valid_0's auc: 0.564022
[57]	valid_0's auc: 0.

In [6]:
# Put the cross-validation test predictions into the sample-submission layout
# and write them to disk.
output = pd.read_csv(path + 'sample_submission.csv').assign(failure=test_preds)
output.to_csv('submission_simple_baseline.csv', index=False)

In [9]:
# Read back the submission file of the no-split baseline and display it.
data_root = '/home/centos/suhyeok/TPS_8_22/'
p_sub = pd.read_csv(f'{data_root}submission_no_splits_simple_baseline.csv')
p_sub

Unnamed: 0,id,failure
0,26570,0.207435
1,26571,0.197102
2,26572,0.199230
3,26573,0.198658
4,26574,0.254901
...,...,...
20770,47340,0.222869
20771,47341,0.185111
20772,47342,0.188905
20773,47343,0.213448


In [18]:
# Set the prediction to 1 where the probability is above 0.27.
# The assignment goes through `.loc` on the `failure` column only; a bare
# `sub[mask] = 1.0` would write 1.0 into every column of the matching rows,
# including `id`.
# NOTE(review): the threshold is inspected on `p_sub` but applied to `sub`
# (loaded from sample_submission.csv) - confirm which frame is meant to be
# post-processed.
sub.loc[sub['failure'] > 0.27, 'failure'] = 1.0

# Kept as the last expression so the notebook renders the inspected rows.
p_sub[p_sub['failure'] > 0.27]

Unnamed: 0,id,failure
66,26636,0.273101
276,26846,0.273529
357,26927,0.270338
455,27025,0.278561
503,27073,0.273345
...,...,...
20457,47027,0.302485
20504,47074,0.270885
20594,47164,0.277961
20615,47185,0.271746


In [17]:
# Set the prediction to 0 where the probability is below 0.17.
# The assignment goes through `.loc` on the `failure` column only; a bare
# `sub[mask] = 0.` would write 0 into every column of the matching rows,
# including `id`.
# NOTE(review): the threshold is inspected on `p_sub` but applied to `sub`
# (loaded from sample_submission.csv) - confirm which frame is meant to be
# post-processed.
sub.loc[sub['failure'] < 0.17, 'failure'] = 0.

# Kept as the last expression so the notebook renders the inspected rows.
p_sub[p_sub['failure'] < 0.17]

Unnamed: 0,id,failure
2769,29339,0.169121
3042,29612,0.169813
6011,32581,0.168953
6247,32817,0.169461
7200,33770,0.169516
8161,34731,0.168226
8606,35176,0.169361
9263,35833,0.168502
9578,36148,0.16514
11065,37635,0.169728


In [13]:
# Show the rows of the no-split submission whose prediction is below 0.24.
p_sub.loc[p_sub['failure'].lt(0.24)]

Unnamed: 0,id,failure
0,26570,0.207435
1,26571,0.197102
2,26572,0.199230
3,26573,0.198658
5,26575,0.193120
...,...,...
20770,47340,0.222869
20771,47341,0.185111
20772,47342,0.188905
20773,47343,0.213448
