In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file found under the Kaggle input directory, one full path per line.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import statistics
import sklearn
import sklearn.metrics as sm
import lightgbm as lgb

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import catboost as cb
from catboost import CatBoostClassifier
# Raise pandas' display limits to 1500 rows and 130 columns.
pd.set_option("display.max_rows", 1500)
pd.set_option("display.max_columns", 130)


# Read Datasets

In [None]:
# Load the competition's train and test CSV files into DataFrames, then preview
# the first rows of the training data.
train_set, test_set = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-sep-2021/train.csv",
        "../input/tabular-playground-series-sep-2021/test.csv",
    )
)
train_set.head()

In [None]:
# Dimensions of the training frame as a (rows, columns) tuple.
train_set.shape

In [None]:
# Collect the distinct values of the `claim` target column, converted to strings.
target_category = [str(label) for label in train_set["claim"].unique()]
print(target_category)

# Check Null Values

In [None]:
# Number of missing values per training column, largest count first.
train_set.isna().sum().sort_values(ascending=False)

In [None]:
# Separate the target from the predictors; `id` is dropped as well so it is not
# used as a feature. The printed number is the remaining feature count.
y = train_set['claim']
x = train_set.drop(['claim', 'id'], axis=1)
print(x.shape[1])

In [None]:
# Hold out 30% of the rows for validation; the fixed random_state keeps the
# shuffled split repeatable. The two printed numbers are the row counts of the
# training and hold-out parts.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=60,
    shuffle=True,
)
print(len(X_train), len(X_test), sep="\n")


# Use SimpleImputer to replace missing values with the mean value of each column

In [None]:
from sklearn.impute import SimpleImputer

# Mean-impute missing values (SimpleImputer's default strategy). The imputer is
# fitted on the training split only and then applied unchanged to the hold-out
# split, so no hold-out statistics leak into the fill values.
my_imputer = SimpleImputer()

# The imputer returns bare NumPy arrays, so the column labels have to be put
# back. The original row index is restored too: train_test_split shuffled the
# rows, and a fresh 0..n-1 index would no longer line up with Y_train / Y_test
# in any index-aligned pandas operation (concat, join, boolean masks, ...).
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
imputed_X_test = pd.DataFrame(
    my_imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

# CatBoost Classifier

In [None]:
# Hyper-parameters passed to CatBoostClassifier as keyword arguments.
cat_params = dict(
    # Boosting schedule
    iterations=15585,
    learning_rate=0.023575206684596582,
    early_stopping_rounds=500,
    # Objective and reported metric
    objective='CrossEntropy',
    eval_metric='AUC',
    # Tree shape and regularisation
    depth=8,
    min_data_in_leaf=11,
    leaf_estimation_iterations=1,
    reg_lambda=36.30433203563295,
    random_strength=43.75597655616195,
    # Row sampling
    bootstrap_type='Bernoulli',
    subsample=0.8227911142845009,
    # Runtime: train on GPU, log every 1000 iterations
    task_type='GPU',
    verbose=1000,
)


In [None]:
# Build the classifier from the `cat_params` hyper-parameter dictionary.
cat = CatBoostClassifier(**cat_params)

# `early_stopping_rounds` and the `eval_metric` tracking configured in
# `cat_params` need a validation set to act on. Without `eval_set` the
# overfitting detector never engages and all 15585 iterations are run, so the
# hold-out split is passed here.
# NOTE: the model is trained on the raw (un-imputed) features. Because the
# hold-out split now also drives early stopping, the AUC measured on it later is
# a slightly optimistic estimate.
cat.fit(X_train, Y_train, eval_set=(X_test, Y_test))

# Hard class predictions for the hold-out split.
test_predict = cat.predict(X_test)

In [None]:
# Score the hold-out split using column 1 of predict_proba (the second class,
# presumably claim == 1 -- confirm against cat.classes_).
holdout_proba = cat.predict_proba(X_test)
y_pred_proba = holdout_proba[:, 1]

# Area under the ROC curve plus the curve's points for plotting.
auc = metrics.roc_auc_score(Y_test, y_pred_proba)
fpr, tpr, _ = metrics.roc_curve(Y_test, y_pred_proba)
print(auc)

# Draw the ROC curve with the AUC value in the legend (bottom-right corner).
roc_label = "data 1, auc=" + str(auc)
plt.plot(fpr, tpr, label=roc_label)
plt.legend(loc="lower right")
plt.show()

# Test set

In [None]:
# Number of missing values per test column, largest count first.
test_set.isna().sum().sort_values(ascending=False)

In [None]:
# Keep the ids for the submission file and treat every other column as a
# feature. The printed number is the feature count.
test_id = test_set['id']
test_x = test_set.drop('id', axis=1)
print(test_x.shape[1])

In [None]:
# Impute the competition test features with the imputer already fitted on the
# training split (`my_imputer`), so the fill values are the same training-set
# means used for the train / hold-out frames.
# Previously a new imputer was re-fitted on the test data, the result overwrote
# `imputed_X_train` with test rows, and the column labels were taken from
# `X_test` rather than from the frame actually being imputed.
imputed_test_x = pd.DataFrame(
    my_imputer.transform(test_x),
    columns=test_x.columns,
    index=test_x.index,
)


In [None]:
# Column 1 of predict_proba (probability of the second class) for every test row.
# The model is given the raw, un-imputed `test_x`, matching the raw features it
# was fitted on.
y_prdict =cat.predict_proba(test_x)[:, 1]

In [None]:
# Assemble the submission directly from the two columns instead of zipping them
# into a Python list of row tuples: no per-row Python objects are created, and a
# length mismatch between ids and predictions raises an error instead of being
# silently truncated by zip().
# `.to_numpy()` drops the Series index so the frame gets a plain 0..n-1 index.
submission = pd.DataFrame({'id': test_id.to_numpy(), 'claim': y_prdict})
submission.head(20)

In [None]:
# Write the submission to `submission.csv` in the current working directory,
# omitting the DataFrame index column.
submission.to_csv('submission.csv', index=False)