In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv
/kaggle/input/icr-identify-age-related-conditions/greeks.csv
/kaggle/input/icr-identify-age-related-conditions/train.csv
/kaggle/input/icr-identify-age-related-conditions/test.csv


In [2]:
import xgboost
import lightgbm as lgb
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import RandomOverSampler
from tqdm.notebook import tqdm
from sklearn.model_selection import KFold as KF
from sklearn.metrics import log_loss
from datetime import datetime

train = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
test = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
greeks = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/greeks.csv')

first_category = train.EJ.unique()[0]
train.EJ = train.EJ.eq(first_category).astype('int')
test.EJ = test.EJ.eq(first_category).astype('int')

ID_for_sub = test['Id']

x = train.drop(['Id','Class'], axis=1)
y = train['Class']

test = test.drop(['Id'], axis=1)

def balanced_log_loss(y_true, y_pred):
    N_0 = np.sum(1 - y_true)
    N_1 = np.sum(y_true)

    w_0 = 1 / N_0
    w_1 = 1 / N_1

    p_1 = np.clip(y_pred, 1e-15, 1 - 1e-15)
    p_0 = 1 - p_1

    log_loss_0 = -np.sum((1 - y_true) * np.log(p_0))
    log_loss_1 = -np.sum(y_true * np.log(p_1))

    balanced_log_loss = 2 * (w_0 * log_loss_0 + w_1 * log_loss_1) / (w_0 + w_1)

    return balanced_log_loss / (N_0 + N_1)

class Ensemble():
    def __init__(self):
        self.imputer = SimpleImputer(missing_values=np.nan, strategy='median')

        self.classifiers = [
            xgboost.XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.2, subsample=0.9, colsample_bytree=0.85),
            lgb.LGBMClassifier(n_estimators=100, max_depth=3, learning_rate=0.2, subsample=0.9, colsample_bytree=0.85),
            xgboost.XGBClassifier(),
            lgb.LGBMClassifier()
        ]

    def fit(self, X, y):
        X = self.imputer.fit_transform(X)

        for classifier in self.classifiers:
            if classifier in (self.classifiers[2], self.classifiers[3]):
                classifier.fit(X, y)
            else:
                classifier.fit(X, y)

    def predict_proba(self, x):
        x = self.imputer.transform(x)

        probabilities = np.stack([classifier.predict_proba(x) for classifier in self.classifiers])
        averaged_probabilities = np.mean(probabilities, axis=0)

        class_0_est_instances = averaged_probabilities[:, 0].sum()
        others_est_instances = len(averaged_probabilities) - class_0_est_instances

        new_probabilities = averaged_probabilities * np.array([[1/(class_0_est_instances if i==0 else others_est_instances) for i in range(averaged_probabilities.shape[1])]])
        return new_probabilities / np.sum(new_probabilities, axis=1, keepdims=1)

def training(model, x, y, y_meta):
    outer_results = list()
    best_loss = np.inf
    split = 0
    splits = 5
    cv_inner = KF(n_splits=splits, shuffle=True, random_state=42)

    for train_idx, val_idx in tqdm(cv_inner.split(x), total=splits):
        split += 1
        x_train, x_val = x.iloc[train_idx], x.iloc[val_idx]
        y_train, y_val = y_meta[train_idx], y.iloc[val_idx]
        model.fit(x_train, y_train)
        y_pred = model.predict_proba(x_val)
        p0 = y_pred[:, 0]
        p0 = np.where(p0 >= 0.5, 0, 1)
        p0 = p0.reshape(len(p0))
        loss = balanced_log_loss(y_val, p0)
        if loss < best_loss:
            best_model = model
            best_loss = loss
            print('best_model_saved')
        outer_results.append(loss)
        print('>val_loss=%.5f, split=%.1f' % (loss, split))

    print('LOSS: %.5f' % (np.mean(outer_results)))
    return best_model


times = greeks.Epsilon.copy()
times[greeks.Epsilon != 'Unknown'] = greeks.Epsilon[greeks.Epsilon != 'Unknown'].map(lambda x: datetime.strptime(x,'%m/%d/%Y').toordinal())
times[greeks.Epsilon == 'Unknown'] = np.nan

train['Epsilon'] = times
test['Epsilon'] = max(times) + 1

ros = RandomOverSampler(random_state=42)
train_ros, y_ros = ros.fit_resample(train, greeks.Alpha)

_, y_ros = np.unique(y_ros, return_inverse=True)

x_ros = train_ros.drop(['Class', 'Id'], axis=1)
y_ = train_ros.Class

yt = Ensemble()
m = training(yt, x_ros, y_, y_ros)
y_pred = m.predict_proba(test)

p0 = y_pred[:, 0]

p0[p0 > 0.62] = 1
p0[p0 < 0.26] = 0

submission = pd.DataFrame(ID_for_sub, columns=["Id"])
submission["class_0"] = p0
submission["class_1"] = 1 - p0

submission.to_csv('submission.csv', index=False)




  0%|          | 0/5 [00:00<?, ?it/s]

best_model_saved
>val_loss=0.24567, split=1.0
>val_loss=0.24771, split=2.0
best_model_saved
>val_loss=0.00000, split=3.0
>val_loss=0.26439, split=4.0
>val_loss=0.26773, split=5.0
LOSS: 0.20510
