In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, accuracy_score, log_loss, roc_auc_score
from warnings import filterwarnings
from sklearn.metrics import roc_curve, auc
filterwarnings('ignore')


import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk("/kaggle/input"):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

In [None]:
# Read the train, test and sample-submission CSV files, in that order.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-oct-2021/train.csv",
        "../input/tabular-playground-series-oct-2021/test.csv",
        "../input/tabular-playground-series-oct-2021/sample_submission.csv",
    )
)

## EDA

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Print column dtypes, non-null counts and memory footprint of the training frame.
train.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the training columns.
train.describe()

In [None]:
# Number of missing values in each training column.
train.isnull().sum()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Number of missing values in each test column.
test.isnull().sum()

## Feature Engineering

**Compute some useful row-wise statistics (mean, std, min, max) now; useless features will be removed afterwards.**

In [None]:
# Row-wise summary statistics over the feature columns shared by train and test.
# `id` is a row identifier and `target` is the label, so neither belongs in the
# statistics; the derived columns are excluded too so that re-running this cell
# gives the same result instead of folding earlier statistics into new ones.
stat_names = ["mean", "std", "min", "max"]
excluded_cols = {"id", "target", *stat_names}
number_of_cols = [col for col in test.columns if col not in excluded_cols]

for frame in (train, test):
    # Take the feature view once, before any derived column is added.
    features = frame[number_of_cols]
    frame["mean"] = features.mean(axis=1)
    frame["std"] = features.std(axis=1)
    frame["min"] = features.min(axis=1)
    frame["max"] = features.max(axis=1)

***Our data is too large and takes a long time to load. This is a very helpful post on dealing with large datasets:***
https://towardsdatascience.com/how-to-work-with-million-row-datasets-like-a-pro-76fb5c381cdd

In [None]:
def _is_whole_number_column(values):
    """Return True when every entry of a numeric Series is a finite whole number."""
    if values.isna().any():
        # Missing values cannot be stored in a plain integer dtype.
        return False
    if not pd.api.types.is_float_dtype(values.dtype):
        # Integer and boolean dtypes hold whole numbers by construction.
        return True
    arr = values.to_numpy(dtype=np.float64)
    if not np.isfinite(arr).all():
        return False
    # Beyond the int64 range an integer cast would overflow, so keep such columns as floats.
    if arr.size and np.abs(arr).max() >= 2.0**63:
        return False
    # Compare element-wise: summing the differences instead would let positive
    # and negative fractional parts cancel out (e.g. 0.5 and -0.5).
    return bool((arr == np.trunc(arr)).all())


def _smallest_int_dtype(mn, mx):
    """Pick the narrowest integer dtype for the range [mn, mx]; None means "leave as is"."""
    if mn >= 0:
        if mx < 255:
            return np.uint8
        if mx < 65535:
            return np.uint16
        if mx < 4294967295:
            return np.uint32
        return np.uint64
    for candidate in (np.int8, np.int16, np.int32, np.int64):
        bounds = np.iinfo(candidate)
        if mn > bounds.min and mx < bounds.max:
            return candidate
    return None


def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` to smaller dtypes, in place.

    Boolean, integer and float columns whose values are all finite whole
    numbers are cast to the narrowest unsigned/signed integer dtype that
    contains the column's minimum and maximum. Every other float column
    (fractional values, NaN or infinities) is cast to ``float32``. Columns of
    any other dtype (object, string, categorical, datetime, ...) are left
    untouched.

    The memory usage before and after the conversion is printed.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``props`` object, so the call can be used in an assignment.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    for col in props.columns:
        values = props[col]
        dtype = values.dtype
        is_plain_numeric = not isinstance(dtype, pd.CategoricalDtype) and (
            pd.api.types.is_bool_dtype(dtype)
            or pd.api.types.is_integer_dtype(dtype)
            or pd.api.types.is_float_dtype(dtype)
        )
        if not is_plain_numeric:
            continue

        if _is_whole_number_column(values):
            new_dtype = _smallest_int_dtype(values.min(), values.max())
        else:
            new_dtype = np.float32

        if new_dtype is not None:
            props[col] = values.astype(new_dtype)

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props

*Apply the function above to see its impact, and assign the returned dataframes back to train and test.*

In [None]:
# Downcast both frames (train first, then test) and keep the returned frames.
train, test = reduce_mem_usage(train), reduce_mem_usage(test)

In [None]:
# Preview the training frame again after the dtype downcast.
train.head()

Select the Boolean-typed columns in both train and test.

In [None]:
# Positional indices of the training columns whose dtype is bool.
bol_col_train = [
    position
    for position, column_dtype in enumerate(train.dtypes)
    if column_dtype == bool
]

In [None]:
# Positional indices of the test columns whose dtype is bool.
bol_col_test = [
    position
    for position, column_dtype in enumerate(test.dtypes)
    if column_dtype == bool
]

In [None]:
# Cast every boolean column to int by replacing the column outright.
# Assigning through `.iloc[:, positions] = ...` writes into the existing
# columns, and whether that changes their dtype depends on the pandas version;
# replacing the column by name always yields the new dtype.
for frame, positions in ((train, bol_col_train), (test, bol_col_test)):
    for name in [frame.columns[pos] for pos in positions]:
        frame[name] = frame[name].astype(int)

In [None]:
# Preview the training frame again after the boolean-to-int conversion.
train.head()

In [None]:
# Training features and label: `id` is a row identifier and `target` is the
# label, so neither is used as a model input.
X = train.drop(columns=['id', 'target']).copy()
y = train['target'].copy()
# Select exactly the training feature columns, in the same order, so the test
# matrix lines up with what the model is fitted on. Copying `test` as-is would
# keep `id` as an extra leading column that the training features do not have.
X_test = test[X.columns].copy()

In [None]:
# Keyword arguments unpacked into CatBoostClassifier for every fold.
# NOTE(review): the training loop's fit() call also passes
# early_stopping_rounds and verbose with different values — confirm which
# settings are intended.
params = dict(
    iterations=15000,
    loss_function='Logloss',
    depth=8,
    task_type='GPU',
    use_best_model=True,
    eval_metric='AUC',
    early_stopping_rounds=1000,
    learning_rate=0.03,
    border_count=32,
    l2_leaf_reg=3,
    verbose=1000,
)

In [None]:
# Stratified 5-fold cross-validation: fit one CatBoost model per fold, record
# its validation AUC, and keep its test-set probabilities for later averaging.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

preds, scores = [], []

for fold, (train_rows, valid_rows) in enumerate(kf.split(X, y)):
    X_train, X_valid = X.iloc[train_rows], X.iloc[valid_rows]
    y_train, y_valid = y.iloc[train_rows], y.iloc[valid_rows]

    model = CatBoostClassifier(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        early_stopping_rounds=100,
        verbose=False,
    )

    # Score the fold on the probability predicted for class 1.
    valid_proba = model.predict_proba(X_valid)[:, 1]
    fpr, tpr, _ = roc_curve(y_valid, valid_proba)
    score = auc(fpr, tpr)
    scores.append(score)

    print(f"Fold: {fold + 1} Score: {score}" "\n")
    print('||'*40, "\n")

    # Class-1 probabilities on the test set from this fold's model.
    preds.append(model.predict_proba(X_test)[:, 1])

print(f"Overall Validation Score: {np.mean(scores)}")

In [None]:
# Put each fold's test probabilities in its own column, then average per row.
fold_matrix = np.column_stack(preds)
predictions = fold_matrix.mean(axis=1)

# Store the averaged probabilities and write the submission file.
submission['target'] = predictions
submission.to_csv('./catboost.csv', index=False)
submission.head()

### NOW CLICK UPVOTE BUTTON......Cheers