In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import os,gc

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier

from scipy.stats import chi2_contingency, chisquare

In [7]:
# Directories holding the train / test feature parts (parquet files).
# Both paths are relative, so they resolve against the current working
# directory at the moment the files are read.
TRAIN_FEATURES_PATH = "kaggle/datafest/data/train_data_fe/"
TEST_FEATURES_PATH = "kaggle/datafest/data/test_data_fe/"

In [3]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Switch the working directory to the Drive root so that the relative
# 'kaggle/datafest/...' paths used in this notebook resolve there.
os.chdir('/content/drive/MyDrive')

## Preprocessing


In [6]:
# Load the target table and report the share of rows with flag == 1.

train_target = pd.read_csv('kaggle/datafest/data/train_target.csv')

positive_share = (train_target['flag'] == 1).mean()
print(f"Proportion of positive class {np.round(positive_share, 2)}")

Proportion of positive class 0.04


In [9]:
# Load every train part and check that each one has the expected number of
# columns before it is kept for concatenation.

all_trains = list()
number_of_features = 450
for num in range(12):
    train = pd.read_parquet(TRAIN_FEATURES_PATH + f"train_fe_{num}.pq")
    # The message is a single-line literal: a backslash continuation inside
    # the string would keep the next line's leading whitespace in the text.
    assert train.shape[1] == number_of_features, (
        f"Train part {num} has different number of features"
    )
    all_trains.append(train)

# The last loaded part is still referenced by `train`; drop that extra name.
del train
gc.collect()

0

In [10]:
# Load every test part and check that each one has the same number of
# columns as the train parts.

all_tests = list()
for num in range(2):
    test = pd.read_parquet(TEST_FEATURES_PATH + f"test_fe_{num}.pq")
    # The message is a single-line literal: a backslash continuation inside
    # the string would keep the next line's leading whitespace in the text.
    assert test.shape[1] == number_of_features, (
        f"Test part {num} has different number of features"
    )
    all_tests.append(test)

# The expected column count is not used after this check.
del test, number_of_features
gc.collect()

0

In [11]:
# Stitch the loaded parts into one train frame and one test frame.
whole_train = pd.concat(all_trains)
whole_test = pd.concat(all_tests)

# The lists of parts are no longer needed once the frames are built.
del all_trains, all_tests
gc.collect()

print(f"Whole test shape: {whole_test.shape}\n")
print(f"Whole train shape: {whole_train.shape}")

Whole test shape: (500000, 450)

Whole train shape: (3000000, 450)


Check the dependence of each feature on the target variable:


In [12]:
def chi2_feature_selection(dataframe: pd.DataFrame, cat_features: list,
                           target: np.array, alpha: float) -> list:
    """
    Collect the features for which a chi-square test of independence against
    the target does not reject independence.

    For every feature a contingency table (feature values x target values)
    is built and tested:

    H0 - feature and target variable are independent (feature is not important)
    H1 - feature and target variable are dependent (feature is important)

    A feature is reported when its p-value is strictly greater than ``alpha``.

    :param dataframe: pd.DataFrame holding the feature columns
    :param cat_features: iterable of column names to test
    :param target: target values, one per row of ``dataframe``
    :param alpha: significance level
    :return: list of features for which H0 could not be rejected
    """
    independent = []
    for feature in cat_features:
        contingency = pd.crosstab(dataframe[feature], target)
        # Only the p-value (second element of the result) drives the decision.
        p_value = chi2_contingency(contingency)[1]
        if p_value > alpha:
            independent.append(feature)

    gc.collect()
    print(f"There are(is) {len(independent)} unimportant cols.")

    return independent

In [13]:
# Run the chi-square screen over every column of the train frame at the 5%
# level and keep the names of the columns judged independent of the target.

cols_to_del = chi2_feature_selection(
    dataframe=whole_train,
    cat_features=whole_train.columns,
    target=train_target['flag'],
    alpha=0.05,
)

There are(is) 31 unimportant cols.


In [14]:
# Remove the columns flagged as independent of the target from both frames.

whole_train = whole_train.drop(columns=cols_to_del)
whole_test = whole_test.drop(columns=cols_to_del)

In [15]:
# The columns that survived the screen are the features used for modeling.

FTS = whole_train.columns

# The selection helper and its result are not used past this point.
del chi2_feature_selection, cols_to_del
gc.collect()

n_selected = len(FTS)
print(f"Finally we have {n_selected} features.")

Finally we have 419 features.


## Modeling


In [29]:
def chi2_model_eval(data: np.array, alpha: float) -> None:
    """
    Run a chi-square test of independence on a model's confusion matrix and
    print the statistic, the p-value, alpha and the resulting verdict.

      H0 - variables in contingency table are independent
           (model is a random guesser)
      H1 - variables in contingency table are dependent
           (model is not a random guesser)

    H0 is rejected when the p-value is strictly smaller than ``alpha``.

    :param data: model's confusion matrix
    :param alpha: significance level
    :return: None; the result is only printed
    """

    # The degrees of freedom and the expected frequencies are not reported.
    stat, p, _dof, _expected = chi2_contingency(data)
    print(f"Chi-square statistic = {stat}")
    print(f"P-value = {p}")
    print(f"Alpha = {alpha}")

    if p < alpha:
        print('\nModel is Not a random guesser (reject H0).')
    else:
        print('\nModel is a random guesser (fail to reject H0).')

In [21]:
# Selecting columns with `whole_train[FTS]` already builds a new frame, so no
# additional `.copy()` is taken: X is only read below, and a second copy of
# the full training matrix would only cost memory.
X = whole_train[FTS]
y = train_target['flag']

# number of folds
n_splits = 6
# cv splitter (KFold with shuffling; the folds are not stratified by class)
skf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
# fitted classifiers, one per fold
clfs = []

# cv training
for train_index, val_index in skf.split(X, y):
    # split data (positional indexing: X and y must be in the same row order)
    X_train = X.iloc[train_index]
    X_val = X.iloc[val_index]
    y_train = y.iloc[train_index]
    y_val = y.iloc[val_index]
    del train_index, val_index
    # model
    clf = CatBoostClassifier(iterations=2000,
                             max_depth=8,
                             verbose=0,
                             eval_metric='AUC',
                             early_stopping_rounds=200,
                             class_weights=[0.5, 10],
                             use_best_model=True,
                             task_type='GPU',
                             random_state=42)
    # train the model; the validation fold doubles as the eval set used for
    # early stopping and best-model selection
    clf.fit(X_train, y_train, eval_set=(X_val, y_val))
    # best score on validation
    print(f"Best score is {clf.best_score_['validation']['AUC']}")
    # check if the model is a random guesser
    val_prediction = clf.predict(X_val)
    conf_matrix = confusion_matrix(y_val, val_prediction)
    # test of independence
    print("\nChecking if a model is a random guesser.")
    chi2_model_eval(conf_matrix, 0.05)
    # add the model to the list of classifiers
    clfs.append(clf)

    # release the per-fold objects before the next split is materialised
    del X_train, X_val, y_train, y_val, clf, val_prediction, conf_matrix

Default metric period is 5 because AUC is/are not implemented for GPU


Best score is 0.7634491622447968

Checking if a model is a random guesser.

Chi-square statistic = 13240.687246181918
P-value = 0.0
Alpha = 0.05

Model is Not a random guesser (reject H0).


Default metric period is 5 because AUC is/are not implemented for GPU


Best score is 0.7659362256526947

Checking if a model is a random guesser.

Chi-square statistic = 13487.165824821632
P-value = 0.0
Alpha = 0.05

Model is Not a random guesser (reject H0).


Default metric period is 5 because AUC is/are not implemented for GPU


Best score is 0.7625938057899475

Checking if a model is a random guesser.

Chi-square statistic = 12991.855103903044
P-value = 0.0
Alpha = 0.05

Model is Not a random guesser (reject H0).


Default metric period is 5 because AUC is/are not implemented for GPU


Best score is 0.7611766755580902

Checking if a model is a random guesser.

Chi-square statistic = 12645.048874905262
P-value = 0.0
Alpha = 0.05

Model is Not a random guesser (reject H0).


Default metric period is 5 because AUC is/are not implemented for GPU


Best score is 0.7655340433120728

Checking if a model is a random guesser.

Chi-square statistic = 13664.572574473237
P-value = 0.0
Alpha = 0.05

Model is Not a random guesser (reject H0).


Default metric period is 5 because AUC is/are not implemented for GPU


Best score is 0.7615915536880493

Checking if a model is a random guesser.

Chi-square statistic = 13131.777303250652
P-value = 0.0
Alpha = 0.05

Model is Not a random guesser (reject H0).


In [22]:
# Average the best validation AUC of the fold models.

fold_aucs = [model.best_score_['validation']['AUC'] for model in clfs]

avg_score = 0
for fold_auc in fold_aucs:
    # Each fold contributes an equal 1 / n_splits share.
    avg_score += fold_auc / n_splits

avg_score

0.7633802443742753

In [23]:
# predict the test: average the positive-class probability over all fold
# models. Note that `oof` holds test-set predictions here.

# Select the feature columns once; doing it inside the loop would rebuild the
# same frame for every model.
test_features = whole_test[FTS]

oof = np.zeros(shape=whole_test.shape[0])
for model in clfs:
    oof += model.predict_proba(test_features)[:, 1] / len(clfs)

del test_features

In [26]:
# Assemble the submission table: the test frame's index values as "id" and
# the averaged prediction as "score".

submission_columns = {
    "id": whole_test.index.values,
    "score": oof,
}
kf_submission = pd.DataFrame(submission_columns)

In [27]:
# save submission
# index=False keeps the DataFrame index out of the CSV, so the file contains
# only the frame's own columns.

kf_submission.to_csv('kaggle/datafest/data/kf_submission_00.csv', index=False)

In [28]:
# Remove the prediction array and the submission frame from the namespace.
del oof, kf_submission