Install ax-platform, which is used for the Bayesian hyperparameter search.

In [None]:
!pip install ax-platform

## Import Packages

In [None]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, log_loss, roc_auc_score
import seaborn as sns

## Import datasets

In [None]:
# Read the competition files using the first CSV column as the index, and turn
# +/-inf into NaN so infinities are treated as missing values downstream.
infinities = [np.inf, -np.inf]
train = pd.read_csv(
    "../input/tabular-playground-series-sep-2021/train.csv", index_col=0
).replace(infinities, np.nan)
test = pd.read_csv(
    "../input/tabular-playground-series-sep-2021/test.csv", index_col=0
).replace(infinities, np.nan)


Fraction of training rows that contain at least one NaN

In [None]:
# Count rows with no missing value, count all rows, and show the fraction of
# rows that contain at least one NaN.
df = train
n_rows = df.shape[0]
n_complete_rows = df.dropna().shape[0]
print(n_complete_rows)
print(n_rows)
(n_rows - n_complete_rows) / n_rows

In [None]:
# Number of NaN entries in each column of the test set.
test_nan_mask = np.isnan(test)
test_nan_mask.sum()

## Feature Engineering

Generate extra row-wise summary features (maximum, minimum, standard deviation, and number of missing values) to give the model additional signal.

In [None]:
# Raw feature columns: every train column whose name starts with "f".
features = [col for col in train.columns.values if col[0] == "f"]

# Add the same row-wise summary columns to both frames. Each statistic is
# computed over the raw feature columns only.
for frame in (train, test):
    raw_block = frame[features]
    frame['max_row'] = raw_block.max(axis=1)
    frame['min_row'] = raw_block.min(axis=1)
    frame['std'] = raw_block.std(axis=1)
    frame['n_missing'] = raw_block.isna().sum(axis=1)

In [None]:
from sklearn.impute import KNNImputer
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer,SimpleImputer

# Impute only the predictor columns present in both frames, so the target
# column ("claim", which exists in train only) never goes through the imputer.
shared_cols = [col for col in train.columns if col in test.columns]

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
#imputer = KNNImputer(n_neighbors= 1)

# Learn the column means from the training data and reuse them for the test
# data, so both sets are filled with the same values.
train[shared_cols] = imputer.fit_transform(train[shared_cols])
test[shared_cols] = imputer.transform(test[shared_cols])

In [None]:
# Submission template; its "claim" column is overwritten with predictions later.
sample_solution = pd.read_csv(
    "../input/tabular-playground-series-sep-2021/sample_solution.csv"
)

## EDA & Data Preprocessing

In [None]:
# Preview the first rows of the training frame.
train.head(n=5)

In [None]:
# Class balance of the target column.
claim_column = train["claim"]
claim_column.value_counts()

In [None]:
# Preview the first rows of the test frame.
test.head(n=5)

In [None]:
# Preview the submission template.
sample_solution.head(n=5)

### Get Train data Targets

In [None]:
# Split the target off the training frame: keep it as a Series and remove the
# column from `train` in place, so `train` holds predictors only from here on.
train_targets = train["claim"]
del train["claim"]

In [None]:
import matplotlib.pyplot as plt

# Compare the train and test distribution of every raw feature column.
# Columns are selected by name (same rule as the `features` list: name starts
# with "f") rather than by a positional slice, which does not track where the
# raw feature columns sit among the frame's columns.
cols = [name for name in train.columns.tolist() if name[0] == "f"]

n_plot_cols = 5
# Ceiling division so the grid always has room for every selected column.
n_plot_rows = max(1, -(-len(cols) // n_plot_cols))

# Set the style once, before any axes are created, so all panels share it.
sns.set_style("white")
fig = plt.figure(figsize = (15, 71))
for position, col in enumerate(cols, start=1):
    a = plt.subplot(n_plot_rows, n_plot_cols, position)
    plt.title(col, size = 12, fontname = 'monospace')
    sns.kdeplot(train[col], color = '#f9ba32', linewidth = 1.3, ax = a)
    sns.kdeplot(test[col], color = '#426e86', linewidth = 1.3, ax = a)
    plt.ylabel('')
    plt.xlabel('')
    plt.xticks(fontname = 'monospace')
    plt.yticks([])
    # Keep only a slightly thicker bottom axis line.
    for side in ['right', 'left', 'top']:
        a.spines[side].set_visible(False)
    a.spines['bottom'].set_linewidth(1.2)

fig.tight_layout(h_pad = 3)

# Figure-level title and colour legend, placed just above the subplot grid.
plt.figtext(0.335, 1.02, 'Distribution of features', color = '#2f3131', fontname = 'monospace', size = 25)
plt.figtext(0.3, 1.01, 'train', color = '#f9ba32', fontname = 'monospace', size = 18)
plt.figtext(0.66, 1.01, 'test', color = '#426e86', fontname = 'monospace', size = 18)

plt.show()

In [None]:
# Preview the extracted target values.
train_targets.head(n=5)

## Feature Scaling

In [None]:
def _signed_log2(values):
    """Return sign(values) * log2(|values| + 1), compressing large magnitudes."""
    return np.sign(values) * np.log2(np.abs(values) + 1)


for item in train.columns:
    # Columns whose |max| is more than 20x their |min| (on the training data)
    # are log-compressed first; the small constant avoids division by zero.
    abs_max = abs(train[item].max())
    abs_min = abs(train[item].min())
    if abs_max / (abs_min + 10e-10) > 20:
        train[item] = _signed_log2(train[item])
        test[item] = _signed_log2(test[item])

    # Standardise both frames with the training mean and standard deviation.
    train_mean, train_std = train[item].mean(), train[item].std()
    train[item] = (train[item] - train_mean) / train_std
    test[item] = (test[item] - train_mean) / train_std
    # Original author's note: replacing NaNs with the column mean at this
    # point seemed to hurt the final results, so it is not done here.


### Train Validation Split

In [None]:
# Fixed seed so the 99% / 1% split, and every metric computed on it, is
# reproducible across runs.
SPLIT_SEED = 42

# NOTE: `train_targets` is rebound to the training subset here, so this cell
# should be re-run only together with the cells above it.
train_features, valid_features, train_targets, valid_targets = train_test_split(
    train, train_targets, test_size=0.01, random_state=SPLIT_SEED
)
train_features.shape, train_targets.shape, valid_features.shape, valid_targets.shape

## Model Development & Evaluation


### Evaluation Method

In [None]:
def evaluate(valid_targets, probs, name):
    """Print and return validation metrics for predicted positive-class probabilities.

    Args:
        valid_targets: True binary labels of the validation rows.
        probs: Predicted probability of the positive class, one value per row.
        name: Label stored under "name" in the returned summary.

    Returns:
        dict with the keys "name", "accuracy_score", "log_loss" and "auc".
    """
    probs = np.asarray(probs)
    # Hard 0/1 predictions (threshold 0.5) are used only for the
    # accuracy score and the classification report.
    y_pred = np.array(probs > 0.5, dtype=int)
    acc = accuracy_score(valid_targets, y_pred)
    # Log loss is a probabilistic metric: score the predicted probabilities,
    # not the thresholded labels.
    loss = log_loss(valid_targets, probs)
    auc = roc_auc_score(valid_targets, probs)
    print("Accuracy score: %.2f"%(acc))
    print("Log loss: %.2f"%(loss))
    print("AUC score:", auc)
    print("Classification report:")
    print(classification_report(valid_targets, y_pred))
    return {
        "name": name,
        "accuracy_score": acc,
        "log_loss": loss,
        "auc": auc
    }

In [None]:
# Shape of the training split as a (rows, columns) tuple.
train_features.shape

## Using CatBoost

 <font size="5">Hyperparameter tuning</font>


In [None]:
def hyperparameter(params):
    """Train CatBoost for one hyperparameter trial and return its validation AUC.

    Args:
        params: Mapping of trial values. The optional keys "depth", "lr",
            "bc" and "leaf" are read, defaulting to 7, 0.03, 32 and 3.

    Returns:
        ROC AUC of the positive-class probabilities on the validation split.
    """
    tree_depth = params.get('depth', 7)
    step_size = params.get('lr', 0.03)
    n_borders = params.get('bc', 32)
    leaf_penalty = params.get('leaf', 3)

    model = CatBoostClassifier(
        iterations=5000,
        loss_function='Logloss',
        depth=tree_depth,
        task_type='GPU',
        use_best_model=True,
        eval_metric='AUC',
        early_stopping_rounds=500,
        learning_rate=step_size,
        border_count=n_borders,
        l2_leaf_reg=leaf_penalty,
        verbose=500,
    )
    # The validation split is the eval set for early stopping and best-model
    # selection, and is also the data the returned score is computed on.
    model.fit(train_features, train_targets, eval_set=[(valid_features, valid_targets)])
    positive_probs = model.predict_proba(valid_features)[:, 1]
    return roc_auc_score(valid_targets, positive_probs)



## Bayesian hyperparameter search

In [None]:
# Disabled hyperparameter search: the Ax `optimize` call below is wrapped in a
# string literal, so running this cell only evaluates (and displays) the string
# and performs no search. The wrapped code would maximise the value returned by
# `hyperparameter` over the ranges given for "lr", "depth", "bc" and "leaf".
'''from ax.plot.contour import plot_contour
from ax.plot.trace import optimization_trace_single_method
from ax.service.managed_loop import optimize
from ax.utils.notebook.plotting import render
best_parameters, values, experiment, model=optimize(
    parameters=[
        {"name": "lr", "type": "range", "bounds": [1e-6, 0.1], "log_scale": True},
        {"name": 'depth', "type": "range", "bounds": [5, 10]},
        {"name": "bc", "type": "range", "bounds": [30, 40]},
        {"name": "leaf","type":"range","bounds":[0,5]},

    ],
     
    evaluation_function=hyperparameter,
    minimize=False
)'''

In [None]:
#best_parameters

In [None]:
# Final model configuration: fixed hyperparameters, a larger iteration budget,
# and early stopping on validation AUC.
cat_params = dict(
    iterations=20000,
    loss_function='Logloss',
    depth=8,
    task_type='GPU',
    use_best_model=True,
    eval_metric='AUC',
    early_stopping_rounds=1000,
    learning_rate=0.01,
    border_count=33,
    l2_leaf_reg=1,
    verbose=500,
)
cat = CatBoostClassifier(**cat_params)
cat.fit(
    train_features,
    train_targets,
    eval_set=[(valid_features, valid_targets)],
)

In [None]:
# Positive-class probability (second column of predict_proba) for each
# validation row; show the first ten.
valid_proba = cat.predict_proba(valid_features)
probs = valid_proba[:, 1]
probs[:10]

In [None]:
# Score the validation predictions and keep the metric summary.
result_cat = evaluate(valid_targets=valid_targets, probs=probs, name="catboost")
result_cat

## Submission

In [None]:
# Predict the positive-class probability for every test row and write it into
# the submission template.
# NOTE(review): the assignment is positional, so it assumes `test` and
# `sample_solution` list the rows in the same order. Confirm before relying on it.
test_proba = cat.predict_proba(test)
claim = test_proba[:, 1]
sample_solution["claim"] = claim
sample_solution.to_csv("submission.csv", index=False)