# Loading data

In [None]:
! nvidia-smi
!pip3 install catboost 

In [None]:
# Data Preprocessing
import pandas as pd
import numpy as np 

# Data Visualization
import matplotlib.pyplot as plt


# ML Models
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,roc_curve 
from catboost import CatBoostClassifier

# Miscellaneous
import time 

In [None]:
# Read the raw training table and keep it untouched in `train`.
train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv')

# Working frame: same rows, without the 'id' column (drop returns a new frame).
train_df = train.drop(columns=['id'])
train_df.head()

In [None]:
# Read the raw test table.
test = pd.read_csv("../input/tabular-playground-series-sep-2021/test.csv")

# Independent copy that still carries 'id'; read later when the submission is built.
test_id = test.copy()

# Working frame: same rows, without the 'id' column (drop returns a new frame).
test_df = test.drop(columns=['id'])
test_df.head()

In [None]:
# Per-column count of missing values: training frame first, then test frame.
print(train_df.isna().sum())
print(test_df.isna().sum())

In [None]:
# Detach the target: pop() returns the 'claim' column and removes it from
# train_df in place, leaving only feature columns behind.
y = train_df.pop('claim')

# Extra features: 'missing', 'std', 'var'

In [None]:
# Row-wise summary features for the training frame.
# assign() evaluates its keyword arguments in order and each lambda receives the
# frame including the columns created before it, so:
#   - 'std' is taken over the feature columns plus 'missing'
#   - 'var' is taken over the feature columns plus 'missing' and 'std'
# NOTE(review): confirm that including the derived columns is intended.
train_df = train_df.assign(
    missing=lambda frame: frame.isna().sum(axis='columns'),
    std=lambda frame: frame.std(axis='columns'),
    var=lambda frame: frame.var(axis='columns'),
)
train_df.head()

In [None]:
# Row-wise summary features for the test frame, built in the same order as for
# the training frame: each later column is computed over a frame that already
# contains the earlier derived columns ('std' sees 'missing'; 'var' sees both).
test_df = test_df.assign(
    missing=lambda frame: frame.isna().sum(axis='columns'),
    std=lambda frame: frame.std(axis='columns'),
    var=lambda frame: frame.var(axis='columns'),
)

# Replace missing values with the mean value of each column


In [None]:
# Impute both frames with the *training* column means. The scaler below is also
# fitted on the training frame only, so every preprocessing statistic applied to
# the test rows now comes from the same source instead of from the test set itself.
train_means = train_df.mean()
train_df = train_df.fillna(train_means)
test_df = test_df.fillna(train_means)

In [None]:
# Learn per-column mean/scale from the training frame only ...
scaler = StandardScaler().fit(train_df)

# ... then apply that same transform to both the training and the test frame.
scaled_features = scaler.transform(train_df)
scaled_features1 = scaler.transform(test_df)

In [None]:
# Hold out 30% of the scaled training rows for evaluation.
# A fixed random_state makes the split, and therefore the reported AUC,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_features, y, test_size=0.30, random_state=42
)

# CatBoost Classifier



In [None]:
# Keyword arguments forwarded to CatBoostClassifier(**cat_params).
cat_params = dict(
    # Boosting schedule and loss
    iterations=15585,
    objective='CrossEntropy',
    bootstrap_type='Bernoulli',
    learning_rate=0.023575206684596582,
    # Regularisation and tree shape
    reg_lambda=36.30433203563295,
    random_strength=43.75597655616195,
    depth=8,
    min_data_in_leaf=11,
    leaf_estimation_iterations=1,
    subsample=0.8227911142845009,
    # Runtime, reporting and stopping
    task_type='GPU',
    eval_metric='AUC',
    verbose=1000,
    early_stopping_rounds=500,
)

In [None]:
# Train the classifier with the tuned parameters.
cat = CatBoostClassifier(**cat_params)

# early_stopping_rounds (set in cat_params) only takes effect when a validation
# set is supplied; without eval_set every configured iteration is trained.
# NOTE: the hold-out split now also selects the stopping iteration, so the AUC
# reported on it afterwards is a slightly optimistic estimate.
cat.fit(X_train, y_train, eval_set=(X_test, y_test))

# Hard class predictions on the hold-out split.
test_predict = cat.predict(X_test)

In [None]:
# Evaluate on the hold-out split: ROC curve and AUC from predicted probabilities.
start_time = time.time()

# Second column of predict_proba = probability of the second class in cat.classes_.
y_pred_proba = cat.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
auc = roc_auc_score(y_test, y_pred_proba)
print(auc)

plt.plot(fpr, tpr, label="data 1, auc=" + str(auc))
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend(loc=4)
plt.show()

# time.time() differences are measured in seconds, so label them as such.
print("Execution time: " + str((time.time() - start_time)) + ' s')

In [None]:
# The model was fitted on StandardScaler-transformed features, so it must be
# scored on the scaled test matrix (scaled_features1), not on the raw test_df.
# Column 1 holds the probability of the second class in cat.classes_.
y_predi = cat.predict_proba(scaled_features1)[:, 1]

In [None]:
# Identifier column of the test rows, taken from the copy that kept 'id'.
# NOTE(review): this name shadows the built-in id(); renaming it requires
# updating every later use in the notebook as well.
id = test_id.loc[:, 'id']

# Test Data Predictions



In [None]:
# Assemble the submission column-wise instead of zipping into per-row Python
# tuples: this avoids materialising one tuple per test row and raises on a
# length mismatch between ids and predictions rather than silently truncating.
sub = pd.DataFrame({'id': test_id['id'], 'claim': y_predi})
sub.to_csv('submission.csv', index=False)
sub.head(20)

In [None]:
# Write the submission frame to submission.csv without the index column.
sub.to_csv(path_or_buf='submission.csv', index=False)