In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load, in order: the training data with precomputed fold assignments,
# the competition test set, and the sample submission template.
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/playgroundkfolds/pg_train_folds.csv",
        "../input/tabular-playground-series-sep-2021/test.csv",
        "../input/tabular-playground-series-sep-2021/sample_solution.csv",
    )
)

In [None]:
# Candidate model features: every column of `train` stored as float64.
useful = [name for name, dtype in train.dtypes.items() if dtype == 'float64']
print(len(useful))

In [None]:
# Add two row-wise engineered features to both train and test:
#   num_missing - number of missing values across the base feature columns
#   std_dev     - standard deviation of the feature values in the row
# NOTE: run this before the imputation cell; once NaNs are filled the missing
# counts would all be zero.

# Exclude the engineered columns from the inputs so that re-running this cell
# neither feeds them back into the statistics nor duplicates names in `useful`.
engineered = ['num_missing', 'std_dev']
base_features = [col for col in useful if col not in engineered]

train['num_missing'] = train[base_features].isna().sum(axis=1)
# Standard deviation of the values themselves (NaNs are skipped by default).
# Taking the std of the boolean isna() mask instead would only yield a value
# fully determined by num_missing, i.e. a redundant feature.
train['std_dev'] = train[base_features].std(axis=1)

test['num_missing'] = test[base_features].isna().sum(axis=1)
test['std_dev'] = test[base_features].std(axis=1)

useful = base_features + engineered

In [None]:
# Per-feature summary table: mean, median, and the absolute mean-median gap
# expressed as a fraction of the feature's observed range (max - min).
train[useful].pipe(
    lambda feats: pd.DataFrame(
        {
            'Mean': feats.mean(),
            'Median': feats.median(),
            'Ratio': (feats.mean() - feats.median()).abs() / (feats.max() - feats.min()),
        },
        index=useful,
    )
)

In [None]:
# Fill missing feature values with the per-column means of the TRAINING data.
# The same training statistics are applied to the test set so that both sets are
# imputed consistently with what the scaler and models are fitted on (filling
# test with its own means shifts its imputed values relative to train).
# Restricting the operation to the `useful` feature columns also avoids calling
# mean() over any non-numeric column, which raises on pandas >= 2.0.
feature_means = train[useful].mean()
train[useful] = train[useful].fillna(feature_means)
test[useful] = test[useful].fillna(feature_means)

In [None]:
def model_predict(model, number_of_folds=10):
    """Fit `model` once per fold and collect test-set probabilities.

    Reads the notebook-level frames ``train`` and ``test`` and the feature list
    ``useful``. For every fold index in ``range(number_of_folds)`` the rows of
    ``train`` whose ``kfold`` value differs from the index are used for fitting
    and the rows whose ``kfold`` equals it for validation. Features are
    standardised with a scaler fitted on the fold's training rows only. The
    per-fold validation ROC AUC is printed, followed by the mean and standard
    deviation over all folds.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y, verbose=False)`` and ``predict_proba(X)``.
        The same instance is refitted on every fold.
    number_of_folds : int, default 10
        Number of fold indices to iterate over; must not exceed the number of
        distinct values present in ``train.kfold``.

    Returns
    -------
    list of numpy.ndarray
        One array of positive-class probabilities for ``test`` per fold.

    Raises
    ------
    ValueError
        If a fold index has no validation rows in ``train``.
    """
    fold_auc = []
    test_predictions = []

    # The raw test features are the same for every fold; only the scaler differs.
    test_features = test[useful]

    for fold in range(number_of_folds):
        # Split by the precomputed fold id: with k folds, roughly (k-1)/k of the
        # rows are used for training and 1/k for validation.
        is_valid = train.kfold == fold
        train_part = train[~is_valid].reset_index(drop=True)
        valid_part = train[is_valid].reset_index(drop=True)

        if valid_part.empty:
            raise ValueError(
                f"No rows with kfold == {fold}; number_of_folds={number_of_folds} "
                "exceeds the folds present in train."
            )

        # Set target columns
        y_train = train_part.claim
        y_valid = valid_part.claim

        # Keep only the model features (drops id, target and kfold columns)
        # and standardise using statistics from the training rows of this fold.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(train_part[useful])
        X_valid = scaler.transform(valid_part[useful])
        X_test = scaler.transform(test_features)

        # Fit the model
        # NOTE(review): `verbose` is passed to fit() for every estimator; confirm
        # the installed LightGBM/XGBoost/CatBoost versions all accept it.
        model.fit(X_train, y_train, verbose=False)

        valid_preds = model.predict_proba(X_valid)[:, 1]
        auc = roc_auc_score(y_valid, valid_preds)
        fold_auc.append(auc)
        print(fold, auc)

        test_predictions.append(model.predict_proba(X_test)[:, 1])

    print('Mean AUC:', np.mean(fold_auc), 'STD:', np.std(fold_auc))
    return test_predictions

In [None]:
%%time
# XGBClassifier with GPU
model = XGBClassifier(random_state=42, tree_method='gpu_hist', verbosity = 0)

# Calculate Test predictions
test_predictions = model_predict(model)
sample['claim'] = np.mean(np.column_stack(test_predictions), axis=1)
sample.to_csv('xgb1.csv', index=False)
sample.head()

In [None]:
%%time
# LGBMClassifier without GPU
model = LGBMClassifier(random_state=42)

# Calculate Test predictions
test_predictions = model_predict(model)
sample['claim'] = np.mean(np.column_stack(test_predictions), axis=1)
sample.to_csv('lgbm1.csv', index=False)
sample.head()

In [None]:
%%time
# CatBoostClassifier without GPU
model = CatBoostClassifier(random_state=42)

# Calculate Test predictions
test_predictions = model_predict(model)
sample['claim'] = np.mean(np.column_stack(test_predictions), axis=1)
sample.to_csv('cat1.csv', index=False)
sample.head()

In [None]:
# Column names shared by all submission files
ID = 'id'
target = 'claim'

# Reload the three single-model submissions written by the earlier cells
df1 = pd.read_csv('./xgb1.csv')
df2 = pd.read_csv('./lgbm1.csv')
df3 = pd.read_csv('./cat1.csv')

# Equal-weight blend: sum the three prediction columns and divide by three.
# Ids are taken from the first file.
combined = df1[target].add(df2[target]).add(df3[target]).div(3.0)
submit = pd.DataFrame({ID: df1[ID], target: combined})
submit.to_csv('combined_try1.csv', index=False)