In [1]:
import itertools
import pickle
import warnings
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from colorama import Fore, Back, Style
from IPython.display import display
from scipy.stats import pearsonr, spearmanr, rankdata
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression, HuberRegressor
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm

# Suppress every warning category so cell outputs stay free of library noise.
warnings.filterwarnings('ignore')

# Remove pandas' row/column display limits (None = show everything).
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [2]:
# Input files, relative to the notebook's working directory.
test_path = './input/test.csv'
sample_submission_path = './input/sample_submission.csv'

# `test` holds the rows to score; `submission` is the template that the
# prediction columns are written into further down.
test = pd.read_csv(test_path)
submission = pd.read_csv(sample_submission_path)

## Pre-processing

In [3]:
# Build the working frame from `test` without modifying `test` itself:
#   * m3_missing / m5_missing flag rows where measurement_3 / measurement_5 is NaN
#     (appended as int8 indicator columns),
#   * loading is replaced in place by log(1 + loading).
data = test.assign(
    m3_missing=test['measurement_3'].isnull().astype(np.int8),
    m5_missing=test['measurement_5'].isnull().astype(np.int8),
    loading=np.log1p(test['loading']),
)

In [4]:
# Columns handed to the KNN imputer: every measurement_* column plus loading.
feature = [f for f in data.columns if f.startswith('measurement') or f=='loading']

# Per product code: the measurement columns used as predictors when regressing
# measurement_17. A product code that is absent from this table raises KeyError below.
fill_dict = {
    'A': ['measurement_5','measurement_6','measurement_8'],
    'B': ['measurement_4','measurement_5','measurement_7'],
    'C': ['measurement_5','measurement_7','measurement_8','measurement_9'],
    'D': ['measurement_5','measurement_6','measurement_7','measurement_8'],
    'E': ['measurement_4','measurement_5','measurement_6','measurement_8'],
    'F': ['measurement_4','measurement_5','measurement_6','measurement_7'],
    'G': ['measurement_4','measurement_6','measurement_8','measurement_9'],
    'H': ['measurement_4','measurement_5','measurement_7','measurement_8','measurement_9'],
    'I': ['measurement_3','measurement_7','measurement_8']
}

for code in data.product_code.unique():
    code_mask = data.product_code == code
    column = fill_dict[code]

    # Step 1: regress measurement_17 on this product's predictor columns.
    # Fit rows: predictors and measurement_17 all present.
    tmp_train = data.loc[code_mask, column + ['measurement_17']].dropna(how='any')
    # Fill rows: predictors all present but measurement_17 missing. The mask is
    # built once and reused for both the prediction input and the assignment.
    fill_mask = code_mask & data[column].notnull().all(axis=1) & data['measurement_17'].isnull()
    n_fill = int(fill_mask.sum())
    print(f"code {code} has {n_fill} samples to fill nan")

    # Skip the regression when there is nothing to fill or nothing to fit on:
    # scikit-learn estimators reject empty inputs instead of returning an empty result.
    if n_fill > 0 and len(tmp_train) > 0:
        model = HuberRegressor()
        model.fit(tmp_train[column], tmp_train['measurement_17'])
        data.loc[fill_mask, 'measurement_17'] = model.predict(data.loc[fill_mask, column])

    # Step 2: fill every remaining NaN in the feature columns with a KNN imputer
    # fitted on this product's rows only.
    # NOTE(review): KNNImputer may drop a column that is entirely NaN within a
    # product, which would make this assignment fail on a shape mismatch - confirm
    # against the installed scikit-learn version if such a column can occur.
    print(f"KNN imputing code {code}")
    model2 = KNNImputer(n_neighbors=5)
    data.loc[code_mask, feature] = model2.fit_transform(data.loc[code_mask, feature])

code F has 420 samples to fill nan
KNN imputing code F
code G has 373 samples to fill nan
KNN imputing code G
code H has 361 samples to fill nan
KNN imputing code H
code I has 377 samples to fill nan
KNN imputing code I


In [6]:
# Rebind `test` to an independent copy of the pre-processed/imputed frame; the
# inference cell further down passes `test` to the loaded models.
# NOTE(review): once this has run, `test` already holds log1p(loading), so
# re-running the pre-processing cell without reloading the CSV would apply
# log1p a second time. Use Restart Kernel -> Run All.
test = data.copy()

## Model and utility functions

In [8]:
def get_scaler(train_data, feats):
    """Fit a StandardScaler on the selected columns of ``train_data``.

    Only fitting happens here; ``train_data`` is not modified and no scaled
    values are produced. Use ``apply_transform`` to scale a frame with the
    returned scaler.

    Args:
        train_data: DataFrame containing the columns listed in ``feats``.
        feats: list of column names to fit the scaler on.

    Returns:
        The fitted ``StandardScaler``.
    """
    scaler = StandardScaler()
    # fit() returns the estimator itself rather than scaled data, so its return
    # value is deliberately not written back into any DataFrame.
    scaler.fit(train_data[feats])
    return scaler

def apply_transform(data, scaler, feats):
    """Return a copy of ``data`` whose ``feats`` columns are scaled by ``scaler``.

    Args:
        data: DataFrame containing the columns listed in ``feats``.
        scaler: object exposing ``transform(frame)``; its output replaces the
            ``feats`` columns.
        feats: list of column names to transform.

    Returns:
        A new DataFrame with the same number of rows; columns outside ``feats``
        are carried over unchanged and ``data`` itself is left untouched.
    """
    transformed_values = scaler.transform(data[feats])

    result = data.copy()
    result[feats] = transformed_values

    assert len(result) == len(data)
    return result

In [9]:
class TPSSolver:
    """Stratified K-fold logistic-regression ensemble with one scaler per fold.

    Attributes (names are part of the pickled state of saved instances, so they
    are left unchanged):
        models: list of ``(fitted LogisticRegression, feature list)`` tuples,
            one entry per trained fold.
        model_feature_importance_lists: flattened coefficient vector of each
            fold's model, aligned with ``models``.
        standScalers: scaler fitted on each fold's training split, aligned
            with ``models``.
    """

    def __init__(self):
        self.models = []
        self.model_feature_importance_lists = []
        self.standScalers = []

    def train(self, X, y, k_fold_split_nums = 5, select_feature = None):
        """Train one scaled logistic regression per stratified fold.

        Each call appends its fold models and scalers to the instance; it does
        not clear models left by an earlier call.

        Args:
            X: feature DataFrame.
            y: binary target Series aligned with ``X``.
            k_fold_split_nums: number of stratified folds.
            select_feature: columns to scale and train on; ``None`` means all
                columns of ``X``.

        Prints the fold-averaged and out-of-fold AUC / accuracy.
        """
        if select_feature is None:
            select_feature = list(X.columns)

        # Out-of-fold buffers are sized from X itself, not from a notebook global.
        lr_oof_1 = np.zeros(len(X))  # predicted probabilities
        lr_oof_2 = np.zeros(len(X))  # predicted class labels
        lr_auc = 0
        lr_acc = 0
        kf = StratifiedKFold(n_splits=k_fold_split_nums, shuffle=True, random_state=0)
        for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X, y)):
            print("Fold:", fold_idx+1)
            x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            # The scaler is fitted on the training split only and reused for
            # the validation split (and later at inference time).
            scaler = get_scaler(x_train, select_feature)
            self.standScalers.append(scaler)
            x_train = apply_transform(x_train, scaler, select_feature)
            x_val = apply_transform(x_val, scaler, select_feature)

            model = LogisticRegression(max_iter=1000, C=0.0001, penalty='l2', solver='newton-cg')
            model.fit(x_train[select_feature], y_train)
            self.models.append((model, select_feature))
            self.model_feature_importance_lists.append(model.coef_.ravel())

            val_preds = model.predict_proba(x_val[select_feature])[:, 1]
            y_preds = model.predict(x_val[select_feature])
            # Averages are divided by the actual fold count.
            lr_auc += roc_auc_score(y_val, val_preds) / k_fold_split_nums
            lr_acc += accuracy_score(y_val, y_preds) / k_fold_split_nums
            lr_oof_1[val_idx] = val_preds
            lr_oof_2[val_idx] = y_preds

        print(f"{Fore.GREEN}{Style.BRIGHT}Average auc = {round(lr_auc, 5)}, Average acc = {round(lr_acc, 5)}{Style.RESET_ALL}")
        print(f"{Fore.RED}{Style.BRIGHT}OOF auc = {round(roc_auc_score(y, lr_oof_1), 5)}, OOF acc = {round(accuracy_score(y, lr_oof_2), 5)}{Style.RESET_ALL}")

    def visualization_importance_scores(self):
        """Plot the 20 features with the largest |mean coefficient| across folds.

        Feature names are taken from the first stored model, so every stored
        model is expected to have been trained on the same feature list.

        Raises:
            RuntimeError: if no model has been trained or loaded yet.
        """
        if not self.models:
            raise RuntimeError("No trained models available; call train() first.")

        feature_names = list(self.models[0][1])
        importance_list = self.model_feature_importance_lists
        # Rows = features, columns = folds.
        importance_df = pd.DataFrame(np.array(importance_list).T, index=feature_names)
        importance_df['mean'] = importance_df.mean(axis=1).abs()
        importance_df['feature'] = feature_names
        importance_df = importance_df.sort_values('mean', ascending=False).reset_index().head(20)
        plt.barh(importance_df.index, importance_df['mean'], color='lightgreen')
        plt.gca().invert_yaxis()
        plt.yticks(ticks=importance_df.index, labels=importance_df['feature'])
        plt.title('LogisticRegression feature importances')
        plt.show()

    def inference(self, X):
        """Return the mean positive-class probability over all stored fold models.

        Args:
            X: DataFrame containing each model's feature columns (unscaled);
               each fold's own scaler is applied before predicting.

        Returns:
            1-D numpy array of length ``len(X)``.
        """
        lr_inference = np.zeros(len(X))
        for scaler, (model, select_feature) in zip(self.standScalers, self.models):
            x_inference = apply_transform(X, scaler, select_feature)
            lr_inference += model.predict_proba(x_inference[select_feature])[:, 1] / len(self.models)
        return lr_inference
        

## Load models from file and run inference

In [12]:
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so only
# load a checkpoint that you produced yourself or otherwise fully trust.
# Unpickling looks classes up by name, so the class definitions of the stored
# objects must already be defined in this kernel before this cell runs.
# NOTE(review): the cells below index the result as lr0..lr3, i.e. they expect
# four model objects that expose .inference() - confirm the checkpoint matches.
with open('checkpoint.obj', 'rb') as fileObj:
    new_models = pickle.load(fileObj)

In [13]:
# Score the pre-processed test frame with every loaded model; predictions are
# stored in the submission frame as columns lr0, lr1, ... in checkpoint order.
for model_idx, solver in enumerate(new_models):
    fold_averaged_proba = solver.inference(test)
    submission[f"lr{model_idx}"] = fold_averaged_proba

In [14]:
# (prediction column, rank column, blend weight) for each of the four models.
# Weights sum to 1.0.
blend_plan = [
    ('lr0', 'rank0', 0.2),
    ('lr1', 'rank1', 0.25),
    ('lr2', 'rank2', 0.25),
    ('lr3', 'rank3', 0.3),
]

# Convert each model's probabilities to ranks so the blend depends only on the
# ordering each model produces, not on the scale of its probabilities.
for proba_col, rank_col, _ in blend_plan:
    submission[rank_col] = rankdata(submission[proba_col])

# Weighted sum of the rank columns, accumulated left to right in plan order.
blended = None
for _, rank_col, weight in blend_plan:
    weighted_rank = submission[rank_col] * weight
    blended = weighted_rank if blended is None else blended + weighted_rank
submission['failure'] = blended

submission[['id', 'failure']].to_csv('submission.csv', index=False)