In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load the necessary Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn import metrics # to use roc_curve and auc value
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

%matplotlib inline
# Apply the whitegrid style and the Spectral palette globally.
# sns.color_palette(...) only *returns* a palette/colormap object; calling it as a
# bare statement applies nothing, so the palette is passed to sns.set instead.
sns.set(style='whitegrid', palette='Spectral')
# Default figure size (width, height) in inches for every plot in the notebook
plt.rcParams['figure.figsize'] = (16, 9)
# Maps a model name to its validation score
myScore = {}

# EDA

In [None]:
# Read the train and test CSV files into DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-sep-2021/train.csv',
        '/kaggle/input/tabular-playground-series-sep-2021/test.csv',
    )
)
# Keep a handle on the test ids; they are needed when writing the submission
test_id = test['id']

In [None]:
# Report (rows, columns) of both frames
for label, frame in (("Train Shape:", train), ("Test Shape:", test)):
    print(label, frame.shape)

In [None]:
# Every training column except the row identifier and the target is a model feature
features = list(train.columns)
for non_feature in ('id', 'claim'):
    features.remove(non_feature)

In [None]:
def missing_values(df, title='Missing Values in Percent'):
    """Summarise and plot the missing values of every column in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    title : str, optional
        Title of the bar plot. Pass e.g. ``'Train Missing Values in Percent'``
        to label the dataset being shown.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with two columns: ``'Total'`` (number of
        missing cells) and ``'Percent'`` (missing cells as a percentage of
        all rows, on a 0-100 scale).
    """
    is_missing = df.isna()
    total = is_missing.sum()
    # mean() of the boolean mask is missing / all rows; scale to 0-100 so the
    # values match the 'Percent' labels (missing / non-missing would overstate it).
    percent = is_missing.mean() * 100
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

    figure, ax = plt.subplots()
    # Draw explicitly on the axes created above rather than the implicit current axes
    sns.barplot(x=missing_data.index, y=missing_data['Percent'], ax=ax)
    # Rotate the existing tick labels instead of re-assigning them
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_xlabel('Features')
    ax.set_ylabel('Percent of Missing Values')
    ax.set_title(title)

    return missing_data
    

In [None]:
# Missing-value summary (and bar plot) for the training frame
train_missing_data = missing_values(df=train)

In [None]:
# Missing-value summary (and bar plot) for the test frame
test_missing_data = missing_values(df=test)

#### The results show that about 1.7% of the data is missing in both train and test, so the missing values need to be filled in.
#### Since this is a baseline prediction, the missing values are imputed with the column mean.

In [None]:
# Impute BOTH frames with the training-set column means: preprocessing statistics
# should be learned from the training data only, so train and test are transformed
# identically (the same way the scaler is fitted on train and applied to test).
train_feature_means = train[features].mean()
train[features] = train[features].fillna(train_feature_means)
test[features] = test[features].fillna(train_feature_means)

In [None]:
# Count the columns that still contain at least one missing value
for label, frame in (("Train Missing Value", train), ("Test Missing Value", test)):
    missing_per_column = frame.isna().sum()
    print(label, missing_per_column[missing_per_column > 0].count())

In [None]:
# Summary statistics of the feature columns
train.loc[:, features].describe()

In [None]:
# The feature values are spread from below zero to above 1e16 (as seen in the data),
# so they are standardized with a StandardScaler before modelling.
# The scaler is fitted on the training features only and then applied to both frames.

scaler = StandardScaler().fit(train[features])
train[features] = scaler.transform(train[features])
test[features] = scaler.transform(test[features])

# Baseline Prediction

In [None]:
def oof(model,train,y,test, nfold):
    """Fit ``model`` with stratified K-fold CV and collect out-of-fold predictions.

    Parameters
    ----------
    model : estimator exposing ``fit(X=..., y=...)`` and ``predict_proba``
        Re-fitted on the training part of every fold.
    train : table supporting ``.iloc`` and ``.shape`` (e.g. a pandas DataFrame)
        Training features.
    y : object supporting ``.iloc`` and ``.values`` (e.g. a pandas Series)
        Training labels, row-aligned with ``train``.
    test : table with ``.shape`` accepted by ``model.predict_proba``
        Features to predict on.
    nfold : int
        Number of stratified folds.

    Returns
    -------
    test_pred : numpy.ndarray, shape (n_test, 2)
        ``predict_proba`` output on ``test`` averaged over the folds
        (two probability columns are assumed, i.e. binary classification).
    train_pred : numpy.ndarray, shape (n_train,)
        Out-of-fold probability of class 1 for every training row, stored at
        the row's original position so it lines up with ``y``.
    """
    folds=StratifiedKFold(n_splits=nfold,shuffle=True,random_state=42)
    test_pred=np.zeros((test.shape[0],2),float)
    # One slot per training row. Appending fold by fold would leave the
    # predictions in shuffled fold order, misaligned with y.
    train_pred=np.zeros(train.shape[0],float)

    for train_indices,val_indices in folds.split(train,y.values):
        x_train,x_val=train.iloc[train_indices],train.iloc[val_indices]
        y_train,y_val=y.iloc[train_indices],y.iloc[val_indices]
        model.fit(X=x_train,y=y_train)
        # Probability of the positive class for the held-out rows of this fold
        val_pred=model.predict_proba(x_val)[:,1]
        fpr, tpr, thresholds = metrics.roc_curve(y_val.values, val_pred, pos_label=1)
        print("ROC AUC:",metrics.auc(fpr, tpr))
        # Write each prediction back to the position of the row it belongs to
        train_pred[val_indices]=val_pred

        # Each fold's model contributes an equal share to the test prediction
        test_pred +=model.predict_proba(test)/nfold

    return test_pred,train_pred

In [None]:
# Feature matrices for training and prediction, plus the training target
X, X_test = train[features], test[features]
y = train['claim']

In [None]:
%%time
# Shallow random forest used as the baseline classifier (all cores, fixed seed)
rf_model = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=5,
                                  n_jobs=-1, random_state=42)

print("Running RF Model")
# 5-fold run: returns averaged test probabilities and out-of-fold train predictions
rf_test_pred, rf_train_pred = oof(rf_model, X, y, X_test, 5)

#fpr, tpr, thresholds = metrics.roc_curve(y.values, rf_train_pred[:,1], pos_label=1)
#myScore['rf_model'] = metrics.auc(fpr,tpr)
#print("RF Model RMSE",myScore['rf_model'])

In [None]:
# Inspect the dimensions of the out-of-fold prediction array
np.shape(rf_train_pred)

In [None]:
# Overall ROC AUC of the out-of-fold predictions against the true labels.
# NOTE(review): this is only meaningful if rf_train_pred is row-aligned with y - confirm.
fpr, tpr, thresholds = metrics.roc_curve(y.values, rf_train_pred, pos_label=1)
myScore['rf_model'] = metrics.auc(fpr,tpr)
# The stored value is an area under the ROC curve, not an RMSE, so label it as such
print("RF Model ROC AUC",myScore['rf_model'])

In [None]:
# Submission: one row per test id with the averaged probability of class 1
submission_columns = {'id': test_id, 'claim': rf_test_pred[:, 1]}
output = pd.DataFrame(submission_columns)
output.to_csv('rf_baseline.csv', index=False)