In [47]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv
/kaggle/input/icr-identify-age-related-conditions/greeks.csv
/kaggle/input/icr-identify-age-related-conditions/train.csv
/kaggle/input/icr-identify-age-related-conditions/test.csv


In [48]:
import os, random, json
import itertools
from collections import OrderedDict

import numpy as np
import pandas as pd

from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler, PowerTransformer
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score

import seaborn as sns
sns.set_theme()
import matplotlib.pyplot as plt
%matplotlib inline

In [49]:
import tensorflow as tf
import tensorflow_probability as tfp

tfpl = tfp.layers
tfpd = tfp.distributions
tfpb = tfp.bijectors

print('TF version:', tf.__version__)
print('TFP version:', tfp.__version__)

TF version: 2.11.0
TFP version: 0.19.0


In [50]:
def build_preprocessor():
    """Only numerical"""
    
    columns = ['AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ', 'BC', 'BD ', 'BN', 'BP', 'BQ', 'BR', 'BZ', 'CB', 
                'CC', 'CD ', 'CF', 'CH', 'CL', 'CR', 'CS', 'CU', 'CW ', 'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 
                'DU', 'DV', 'DY', 'EB', 'EE', 'EG', 'EH', 'EL', 'EP', 'EU', 'FC', 'FD ', 'FE', 'FI', 'FL', 'FR', 
                'FS', 'GB', 'GE', 'GF', 'GH', 'GI', 'GL']
    
    mask = {'0': False, '1': True, '10': True, '11': False, '12': True, '13': False, '14': True, '15': True, '16': True, 
            '17': True, '18': False, '19': True, '2': False, '20': False, '21': False, '22': False, '23': True, '24': False, 
            '25': False, '26': True, '27': True, '28': False, '29': True, '3': True, '30': False, '31': True, '32': True, 
            '33': True, '34': True, '35': True, '36': True, '37': True, '38': False, '39': True, '4': False, '40': True, 
            '41': False, '42': False, '43': False, '44': True, '45': True, '46': True, '47': True, '48': False, '49': True, 
            '5': True, '50': True, '51': True, '52': True, '53': True, '54': False, '6': False, '7': False, '8': True, '9': False}
    
    columns = [col for i, col in enumerate(columns) if mask[str(i)]]
    
    extractor = FunctionTransformer(lambda df: df[columns].values)
    
    #scaler = StandardScaler()
    scaler = make_pipeline(
        FunctionTransformer(lambda vals: vals - np.nanmin(vals, axis=0, keepdims=True) + 1),
        PowerTransformer(method='box-cox', standardize=True)
    )
    
    missing = FunctionTransformer(
        lambda x: np.concatenate([np.nan_to_num(x), np.isnan(x)], axis=1).astype('float32')
    )  

    preprocessor  = make_pipeline(extractor, scaler, missing)
    
    return preprocessor


In [51]:
def prediction_loss(targets, y_pred, eps=1e-5):

    y_true = targets[:,-1:]
    
    y = tf.cast(tf.math.maximum(y_true, 0), tf.float32) 
    p = tf.keras.backend.clip(y_pred, eps, 1-eps)

    mask0 = tf.cast(y_true == 0, tf.float32)
    mask1 = tf.cast(y_true == 1, tf.float32)
    
    n0 = tf.math.reduce_sum(mask0) + eps
    n1 = tf.math.reduce_sum(mask1) + eps

    loss_y0 = tf.math.reduce_sum( mask0 * (1 - y) * tf.math.log(1 - p) ) / n0
    loss_y1 = tf.math.reduce_sum( mask1 * y * tf.math.log(p) ) / n1
    loss_nll = -0.5 * (loss_y0 + loss_y1)
     
    return loss_nll


def entropy_loss(targets, y_pred, eps=1e-5):

    y_true = targets[:,-1:]

    p = tf.keras.backend.clip(y_pred, eps, 1-eps)

    mask_1 = tf.cast(y_true == -1, tf.float32)    

    n_1 = tf.math.reduce_sum(mask_1) + eps
    
    entropy = -p * tf.math.log(p)
    loss_entropy = tf.math.reduce_sum(mask_1 * entropy) / n_1
     
    return loss_entropy

In [52]:
def cutmix_fn(t, pmix=0.7):
    shape = tf.shape(t)
    mask = tf.random.uniform(shape) < pmix
    n = tf.random.uniform([1], minval=0, maxval=shape[1], dtype=tf.dtypes.int32)[0] #-1
    return tf.where(mask, t, tf.roll(t, -n, axis=0))

def contrastive_activation(h, h1, tau=0.7):
    dist = tf.matmul(h, h1, transpose_b=True) / tau
    dist_scaled = tf.keras.activations.softmax(dist)
    dist_scaled_diag = tf.linalg.diag_part(dist_scaled)
    return tf.reshape(dist_scaled_diag, [-1,1])


class SemisupervisedNet(tf.keras.Model):
    
    def __init__(self, n_features, n_outputs=1, n_hidden=64, rate=0.3, l2=1e-3, semi_dim=8, pmix=0.7, tau=0.7):
        super().__init__()             

        self.embedding = tf.keras.layers.Embedding(input_dim=1, output_dim=n_features)        
        
        self.dense = tf.keras.layers.Dense(n_hidden, activation='relu', kernel_regularizer=tf.keras.regularizers.L2(l2))
        self.dropout = tf.keras.layers.Dropout(rate)
        self.prediction = tf.keras.layers.Dense(n_outputs, activation='sigmoid')

        self.dense1 = tf.keras.layers.Dense(semi_dim)
        self.cutmix = tf.keras.layers.Lambda(lambda t: cutmix_fn(t, pmix=pmix), name='cutmix')        
        self.contrastive = tf.keras.layers.Lambda(lambda lst: contrastive_activation(lst[0], lst[1], tau=tau), name='contrastive')  

    def treat_missing(self, x, mask):
        """treat missing values"""
        batch_size = tf.shape(x)[0] 
        idx = tf.zeros([batch_size], tf.int32)          
        x_feat = (1-mask) * x + mask * self.embedding(idx) 
        return x_feat
        
    def call(self, inputs):  

        feat, mask = tf.split(inputs, num_or_size_splits=2, axis=1)

        x = self.treat_missing(feat, mask)        
        x1 = self.cutmix(x)

        h =  self.dense(x)
        h1 =  self.dense(x1) 

        y = self.prediction(self.dropout(h))
        y1 = self.contrastive([self.dense1(h), self.dense1(h1)])

        outputs = {
            'prediction': y,
            'entropy': y,
            'contrastive': y1,
        }

        return outputs  

In [53]:
def fit_semisupervised(params, x_train, y_train, validation_data=None):

    optimizer = tf.optimizers.Adam(learning_rate=params['lr'])
    
    loss = {
        'prediction': prediction_loss,
        'entropy': entropy_loss,
        'contrastive': lambda t, p: -tf.math.log(tf.keras.backend.clip(p, 1e-5, 1)),
    }
    
    loss_weights = {key: params[key] for key in ['prediction', 'entropy', 'contrastive']}

    callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_prediction_loss', patience=params['patience'], restore_best_weights=True)]

    model = SemisupervisedNet(n_features=x_train.shape[1]//2, n_hidden=params['n_hidden'], rate=params['rate'], l2=params['l2'])

    model.compile(optimizer=optimizer, loss=loss, loss_weights=loss_weights) 

    model.fit(x_train, y_train, validation_data=validation_data, callbacks=callbacks, verbose=params['verbose'], 
                epochs=params['epochs'], batch_size=params['batch_size'])

    return model

In [54]:
import os, random

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold

import tensorflow as tf


PATH_DATA = '/kaggle/input/icr-identify-age-related-conditions/'
SEED = 4241


def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed) 


def load_data(folder):
    """Load data"""
    greeks_df = pd.read_csv(folder + 'greeks.csv')
    train_df = pd.read_csv(folder + 'train.csv')    
    test_df = pd.read_csv(folder + 'test.csv')
    test_df['Class'] = -1
    df = pd.concat([train_df, test_df])
    return df, train_df, test_df, greeks_df   


def get_data():
    df, train_df, test_df, greeks_df = load_data(PATH_DATA)

    preprocessor = build_preprocessor()
    XX = preprocessor.fit_transform(df)
    YY = df[['Class']].values

    X, X_unlabel = XX[:len(train_df)], XX[len(train_df):]
    Y, Y_unlabel = YY[:len(train_df)], YY[len(train_df):] 

    return X, X_unlabel, Y, Y_unlabel


def balanced_log_loss(y_true, y_pred, eps=1e-5): 
    y_true, y_pred = y_true.ravel(), y_pred.ravel()
    # calculate the number of observations for each class
    n0, n1 = np.bincount(y_true.astype(int))
    # clip probabilities    
    y_pred = np.clip(y_pred, eps, 1-eps)
    # calculate balanced logarithmic loss
    log_loss = np.sum((1-y_true) * np.log(1-y_pred)) / n0 + np.sum(y_true * np.log(y_pred)) / n1
    return -0.5 * log_loss


PARAMS = {'epochs': 200, "patience": 20, "verbose": 0, 
          'prediction': 0.9422988091592843, 'contrastive': 0.3933141976436182, 'entropy': 0,# loss weights
          'n_hidden': 64, 'batch_size': 28, 'lr': 0.01, 'l2': 0.0012, 'rate': 0.25}
    
def evaluate(params_, X, X_unlabel, Y, Y_unlabel): 
    """Computes CV score for semisupervised net"""

    params = {**PARAMS, **params_}

    tf.keras.backend.clear_session()
    set_seed(SEED)
    scores = []    
    models = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=242).split(X, Y.ravel())    
    for k, (train_index, test_index) in enumerate(skf):
        
        x_test = X[test_index]  
        y_test = Y[test_index]
        
        x_train = np.concatenate([X[train_index], X_unlabel, x_test], axis=0)
        y_train = np.concatenate([Y[train_index], Y_unlabel, -1*np.ones(y_test.shape)], axis=0)
        
        model = fit_semisupervised(params, x_train, y_train, validation_data=(x_test, y_test))
        
        y_pred = model.predict(x_test)['prediction']
        
        score = balanced_log_loss(y_test, y_pred)
        scores.append(score)
        
        models.append(model)
        
        #print('Fold:', k, 'Score:', score)
    #print('CV score:', np.mean(scores))

    return np.mean(scores), models

In [55]:
X, X_unlabel, Y, Y_unlabel = get_data()
params = {}

score, models = evaluate(params, X, X_unlabel, Y, Y_unlabel)
print(score)

0.1951446683219618


In [56]:
preds = [model.predict(X_unlabel)['prediction'] for model in models]
y_pred = np.concatenate(preds, axis=1).mean(axis=1, keepdims=True)



In [69]:
test_df = pd.read_csv(PATH_DATA + 'test.csv')
submission = pd.DataFrame(test_df["Id"], columns=["Id"])
submission["class_0"] = 1 - y_pred
submission["class_1"] = y_pred
display(submission)

submission.to_csv('submission.csv',index=False)

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.999634,0.000366
1,010ebe33f668,0.999634,0.000366
2,02fa521e1838,0.999634,0.000366
3,040e15f562a2,0.999634,0.000366
4,046e85c7cc7f,0.999634,0.000366
