In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import datatable as dt
import optuna
import gc

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
from xgboost import XGBClassifier

In [None]:
# Read the three competition CSVs with datatable, in order, and convert each
# result to a pandas DataFrame.
train_df, test_df, sample_df = (
    dt.fread(csv_path).to_pandas()
    for csv_path in (
        '/kaggle/input/tabular-playground-series-oct-2021/train.csv',
        '/kaggle/input/tabular-playground-series-oct-2021/test.csv',
        '/kaggle/input/tabular-playground-series-oct-2021/sample_submission.csv',
    )
)

As in the September TPS, I've chosen datatable to read the CSV files (converting the results to pandas DataFrames), as it should be a bit faster.

In [None]:
train_df.head()

In [None]:
train_df.shape

In [None]:
# Shape and total number of missing values for each loaded frame.
summary_lines = [
    f'train_df shape: {train_df.shape}',
    f'NaNs in train_df: {train_df.isna().sum().sum()}\n',
    f'test_df shape: {test_df.shape}',
    f'NaNs in test_df: {test_df.isna().sum().sum()}\n',
    f'sample_df shape: {sample_df.shape}',
    f'NaNs in sample_df: {sample_df.isna().sum().sum()}',
]
print('\n'.join(summary_lines))

In [None]:
def reduce_memory_usage(df, verbose=True):
    """Downcast each numeric column of ``df`` to the narrowest dtype covering its range.

    Integer columns are tried against int8, int16, int32 and int64, in that
    order. Float columns are tried against float16 and float32 and fall back to
    float64. A candidate dtype is accepted when the column's minimum and
    maximum both lie within its representable range, bounds included.

    Only the value range is checked for floats, not precision, so a float
    column may lose significant digits when it is narrowed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compress. Its columns are reassigned in place.
    verbose : bool, default True
        When true, print the resulting memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2  # initial footprint, in MiB

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column whose max is exactly 127 still fits int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (or min/max is NaN/inf): use float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2  # footprint after downcasting

    if verbose:
        # Guard the percentage against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Memory usage decreased to: {:.2f} Mb - {:.1f}% reduction".format(
                end_mem, reduction
            )
        )
    return df

In [None]:
# Convert every boolean column of the training frame to integers.
train_bool_columns = [name for name in train_df.columns if train_df[name].dtypes == bool]
for name in train_bool_columns:
    train_df[name] = train_df[name].astype(int)

In [None]:
# Convert every boolean column of the test frame to integers.
test_bool_columns = [name for name in test_df.columns if test_df[name].dtypes == bool]
for name in test_bool_columns:
    test_df[name] = test_df[name].astype(int)

In [None]:
train_df.dtypes

In [None]:
test_df.dtypes

In [None]:
# Downcast the numeric columns of both frames. reduce_memory_usage reassigns
# the columns of the frame it receives and returns that same frame, so the
# variable names keep pointing at the compressed data.
print("X_train reduction:")
train_df = reduce_memory_usage(train_df)
print("X_test reduction:")
test_df = reduce_memory_usage(test_df)

import warnings
warnings.filterwarnings("ignore")

fig = plt.figure(figsize = (30,60))
ax = fig.gca()
hist = train_df.hist(bins = 50, layout = (30,10), color='k', alpha=0.5,  ax = ax)

In [None]:
from sklearn.cluster import KMeans

In [None]:
def generate_cluster_columns(dataframe, random_state=42):
    """Replace selected feature columns with 1-D KMeans cluster ids, in place.

    For every ``(feature, n_clusters)`` pair in the hard-coded list below, a
    KMeans model is fitted on that single column and the column is overwritten
    with each row's cluster id.

    KMeans numbers its clusters arbitrarily, so separate calls (for example one
    on the training frame and one on the test frame) could give the same region
    of a feature different ids. To keep ids comparable between calls, clusters
    are renumbered by ascending centre: id 0 is the cluster with the smallest
    centre and id ``n_clusters - 1`` the one with the largest. The cluster
    boundaries themselves are still estimated independently on each call.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the listed feature columns; modified in place.
    random_state : int or None, default 42
        Seed forwarded to ``KMeans`` so repeated runs give the same clusters.
        Pass ``None`` for unseeded initialisation.

    Returns
    -------
    None
    """
    # List of feature name with suggested number of clusters
    for_kmeans = [('f33', 2), ('f34', 2), ('f44', 2), ('f49', 2), ('f51', 2), ('f72', 2), ('f95', 2), ('f110', 2),
                  ('f117', 2), ('f126', 2), ('f127', 2), ('f130', 2), ('f134', 2), ('f138', 2), ('f139', 2), ('f140', 2),
                  ('f141', 2), ('f142', 2), ('f143', 2), ('f144', 2), ('f146', 2), ('f150', 2), ('f152', 2), ('f153', 2),
                  ('f157', 2), ('f158', 2), ('f208', 2), ('f11', 3), ('f23', 3), ('f28', 3), ('f68', 3), ('f94', 3),
                  ('f124', 3), ('f125', 3), ('f136', 3), ('f196', 3), ('f198', 3), ('f30', 4), ('f84', 4), ('f98', 4),
                  ('f104', 4), ('f118', 4), ('f175', 4), ('f188', 4), ('f222', 4), ('f239', 4), ('f2', 5), ('f10', 5),
                  ('f70', 5), ('f105', 5), ('f120', 5), ('f122', 5), ('f178', 5), ('f200', 5), ('f73', 6),]

    for f, n_clusters in for_kmeans:
        data = dataframe[[f]].values
        km = KMeans(n_clusters=n_clusters, n_init=50, random_state=random_state)
        raw_labels = km.fit_predict(data)

        # The data is one-dimensional, so each centre is a scalar. Map KMeans'
        # arbitrary label of each cluster to the rank of its centre.
        centre_order = np.argsort(km.cluster_centers_.ravel())
        rank_of_label = np.empty(n_clusters, dtype=raw_labels.dtype)
        rank_of_label[centre_order] = np.arange(n_clusters)

        dataframe[f] = rank_of_label[raw_labels]

In [None]:
# Overwrite the listed feature columns of train_df with KMeans cluster ids
# (the function modifies the frame in place), then preview the result.
generate_cluster_columns(dataframe=train_df)
train_df.head()

In [None]:
test_df.head()

In [None]:
# Apply the same transformation to test_df. Note that this call fits its own
# KMeans models on the test data; it does not reuse models fitted on train_df.
generate_cluster_columns(dataframe=test_df)
test_df.head()

In [None]:
# Separate the label from the predictors; `id` is removed from both feature matrices.
y_train = train_df['target'].copy()
X_train = train_df.drop(columns=['id', 'target'])

X_test = test_df.drop(columns='id')

In [None]:
# Drop the names of the raw frames now that the feature matrices exist,
# then ask the garbage collector to run.
del train_df, test_df
gc.collect()

{
    'max_depth': trial.suggest_categorical('max_depth', [4, 6, 8, 10, 15, 20]),
    'n_estimators': trial.suggest_categorical('n_estimators', [5000, 10000,15000]),
    'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
    'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.2,0.4,0.6,0.8,1.0]),
    'colsample_bylevel': trial.suggest_categorical('colsample_bylevel', [0.2,0.4,0.6,0.8,1.0]),
    'min_child_weight': trial.suggest_int('min_child_weight', 1, 100),
    'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 100.0),
    'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
    'gamma': trial.suggest_float('gamma', 1, 100),
    'booster': 'gbtree',
    'eval_metric': 'auc',
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'use_label_encoder': False
}

In [None]:
def objective(trial, data=X_train, target=y_train):
    """Optuna objective: fit an XGBoost classifier and return its hold-out ROC AUC.

    The data is split 80/20 with a fixed seed. A classifier is trained on the
    larger part with early stopping monitored on the smaller part, and the
    ROC AUC on that smaller part is returned.

    Fixed settings (tree method, GPU id, predictor, seed, booster, eval metric,
    label-encoder flag) are registered through single-choice
    ``suggest_categorical`` calls so that they are recorded in ``trial.params``
    next to the tuned values; the notebook later builds the final model
    directly from ``study.best_trial.params``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    data : feature matrix, default ``X_train``
        Bound to the global ``X_train`` object at the time this function is defined.
    target : labels, default ``y_train``
        Bound to the global ``y_train`` object at the time this function is defined.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the hold-out split.
    """
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2, random_state=42)

    param = {
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'n_estimators': trial.suggest_categorical('n_estimators', [4000, 5000, 6000]),
        'subsample': trial.suggest_float('subsample', 0.15, 0.995, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.15, 0.995, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 100),
        # suggest_float(..., log=True) is the non-deprecated spelling of suggest_loguniform.
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 100.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'gamma': trial.suggest_categorical('gamma', [0, 0.25, 0.5, 1.0]),
        # Constants below go through `trial` only so they appear in trial.params.
        'tree_method': trial.suggest_categorical('tree_method', ['gpu_hist']),
        'gpu_id': trial.suggest_categorical('gpu_id', [0]),
        'predictor': trial.suggest_categorical('predictor', ['gpu_predictor']),
        'random_state': trial.suggest_categorical('random_state', [42]),
        'booster': trial.suggest_categorical('booster', ['gbtree']),
        'eval_metric': trial.suggest_categorical('eval_metric', ['auc']),
        'use_label_encoder': trial.suggest_categorical('use_label_encoder', [False]),
    }
    model = XGBClassifier(**param)

    model.fit(train_x, train_y, eval_set=[(test_x, test_y)], early_stopping_rounds=100, verbose=False)

    # ROC AUC measures how well continuous scores rank the samples, so it must
    # be given positive-class probabilities. Hard 0/1 labels from predict()
    # collapse the ranking to a single threshold and distort the score.
    preds = model.predict_proba(test_x)[:, 1]

    roc_auc = roc_auc_score(test_y, preds)

    return roc_auc

In [None]:
# Search for the hyper-parameters that maximise the value returned by
# `objective`, running 50 trials, then report the best parameter set.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, show_progress_bar=True)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Parameters of the best trial. Besides the tuned values this dict also holds
# the fixed settings that `objective` registered via single-choice
# suggest_categorical calls (tree_method, eval_metric, ...).
params= study.best_trial.params

In [None]:
# Hold out 20% of the training data for early stopping of the final model;
# test_size and random_state match the split made inside `objective`.
x_tra, x_val, y_tra, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, random_state=42)

In [None]:
# Remove the names of the unsplit training data and trigger a collection.
# NOTE: `objective` captured these objects as default argument values when it
# was defined, so deleting the names alone may not release their memory.
del X_train, y_train
gc.collect()

In [None]:
# Build the final classifier from the best trial's parameters.
model = XGBClassifier(**params)  

# Train on the 80% split with early stopping monitored on the 20% hold-out.
# NOTE(review): eval_metric is passed here and is also present in `params`
# (registered by `objective`) — confirm the installed xgboost accepts both.
model.fit(x_tra,y_tra,eval_set=[(x_val,y_val)], eval_metric='auc',early_stopping_rounds=200,verbose=250)

In [None]:
# Per-class probability estimates for the test features.
y_pred = model.predict_proba(X_test)

In [None]:
# Submit the second probability column (index 1) as the target score.
# assumes sample_df rows are in the same order as X_test — TODO confirm
sample_df['target'] = y_pred[:,1]

In [None]:
# Write the submission file, omitting the DataFrame index.
sample_df.to_csv('submission_73.csv', index=False)