### Goal :
 The goal of the task is to predict, from the given variables, whether or not a claim will be made.
 
##### Here we are going to use RAPIDS (cuDF and cuML) so that the whole notebook runs within a few minutes. We are going to try out an ensemble of Random Forest and XGBoost.

In [None]:
import numpy as np 
import pandas as pd 
import math
import glob
import os
import gc
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import time
%matplotlib inline


In [None]:
# Raise both display limits so wide/long frames are shown in full.
for option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option, 500)

#### Read Data and Identify data patterns and trends

In [None]:
# Load the training set, the test set and the sample submission in one pass.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-sep-2021/train.csv',
        '../input/tabular-playground-series-sep-2021/test.csv',
        '../input/tabular-playground-series-sep-2021/sample_solution.csv',
    )
)

In [None]:
# Report how many rows each split contains.
n_train_rows = len(train)
n_test_rows = len(test)
print("Train has", n_train_rows, "elements")
print("Test has", n_test_rows, "elements")


#### Let's take a look at the train data to check NA values and the data distribution

In [None]:
# Preview the first ten training rows (rendered as the cell output).
train.head(10)

In [None]:
# Fraction of missing values per column (0-1 scale, despite the column name),
# shown transposed so the columns of `train` run across the page.
dd = (
    train.isna()
    .sum()
    .div(train.shape[0])
    .rename('percentage_na')
    .rename_axis('index')
    .to_frame()
)
dd.T

In [None]:
# Collect the feature columns. Presumably these are named f1, f2, ... (verify
# against the CSV header); matching on the prefix instead of "contains the
# letter f" keeps any unrelated column that merely has an "f" somewhere in its
# name from being treated as a feature.
features=[f for f in train.columns.tolist() if f.startswith('f')]

Most of the columns have no more than 1–2% missing data.

### Let's try out Random Forest and XGBoost in RAPIDS

In [None]:
import cudf
import pandas as pd

import pynvml
import numpy as np
import xgboost as xgb


# Copy the training frame to the GPU and cast every feature column to float32.
data = cudf.from_pandas(train)
for col in features:
    data[col]=data[col].astype('float32')
## splitting training and test set
from cuml import train_test_split
X=data[features]
# Select the target by name rather than by a hard-coded column position, so the
# split does not silently pick the wrong column if the column order changes.
y=data['claim']
# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [None]:
from cuml.ensemble import RandomForestClassifier

# Replace missing feature values with 0 before fitting.
X_train.fillna(value=0, inplace=True)

# Forest size and depth are kept together so they are easy to tune.
rf_params = {'n_estimators': 140, 'max_depth': 10}
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

In [None]:

from cuml.metrics import accuracy_score
from cupy import asnumpy

# The hold-out rows get the same zero-imputation as the training rows.
X_test.fillna(value=0, inplace=True)

# Probabilities for the hold-out split; column 1 is presumably the
# positive-class ("claim") score - confirm against the cuML output layout.
rf_prediction = model.predict_proba(X_test)
threshold = 0.38
positive_score = rf_prediction[1]
rf_predictions = (positive_score >= threshold).astype(int)

# Accuracy of the thresholded labels against the held-out targets.
cu_score = accuracy_score(y_test, rf_predictions)
print("cuml Accuracy: %.2f%%" % (cu_score * 100.0))


In [None]:
start_rapids = time.time()

# Wrap the train / hold-out splits (features + labels) for XGBoost.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

## Train the model
# Only genuine XGBoost booster parameters are passed here. The original dict
# also carried scikit-learn style keys ('nround', 'loss', 'n_estimators',
# 'max_features', 'criterion', 'silent') that XGBoost does not recognise, and
# 'max_leavs' was a typo for 'max_leaves'. With max_depth=4 a tree has at most
# 16 leaves, so the 2**8 leaf cap is not expected to bind.
xgb_params = {
    'objective': 'binary:logistic',
    'max_depth': 4,
    'max_leaves': 2**8,
    'grow_policy': 'lossguide',
    'tree_method': 'gpu_hist',
}

# The number of boosting rounds is set by num_boost_round; verbose_eval is an
# argument of xgb.train, not a booster parameter, so it is passed here.
trained_model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=400,
    evals=[(dtrain, 'train')],
    verbose_eval=True,
)

# Wall-clock time for DMatrix construction plus training.
end_rapids = time.time()
print("Total time taken", end_rapids-start_rapids, "seconds")

In [None]:

from cuml.metrics import accuracy_score
from cupy import asnumpy

# Score the hold-out DMatrix, then round each score to the nearest integer
# label (0 or 1) before measuring accuracy.
holdout_scores = trained_model.predict(dtest)
prediction = np.rint(holdout_scores).astype(int)

cu_score = accuracy_score(y_test, prediction)
print("cuml Accuracy: %.2f%%" % (cu_score * 100.0))


In [None]:
# Prepare the competition test set the same way as the training data:
# zero-fill missing values, cast the features to float32, move them to the GPU.
test.fillna(value=0, inplace=True)
test[features] = test[features].astype('float32')
test_data = cudf.from_pandas(test[features])

# Random-forest class probabilities for the test rows.
rf_pred = model.predict_proba(test_data)

# XGBoost scores for the same rows.
dtest_actual = xgb.DMatrix(test_data)
boost_pred = trained_model.predict(dtest_actual)

In [None]:
# Blend the two models with a weighted average. The raw weights (0.3 for the
# random forest, 0.8 for XGBoost) sum to 1.1, so they are normalised by their
# total; otherwise the blended score can exceed 1 and rounding would apply an
# unintended decision threshold of about 0.45 instead of 0.5.
rf_weight, xgb_weight = 0.3, 0.8
weight_total = rf_weight + xgb_weight
final_preds = (
    rf_weight * asnumpy(rf_pred[1]) + xgb_weight * asnumpy(boost_pred)
) / weight_total

# Round the blended score to a hard 0/1 label for the submission.
test['claim'] = np.round(final_preds).astype(int)

In [None]:
# Write only the id and the predicted label, without the pandas index column.
submission_columns = ['id', 'claim']
test[submission_columns].to_csv('submission.csv', index=False)

### Please upvote if you find the notebook useful. I will be working on adding new features and visualizations.