In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# STEPS FOLLOWED

* Importing the Required libraries
* Importing the Datasets
* Learning more about the Dataset
* Exploratory Data Analysis
* Handling Missing Values
* Feature Scaling of Data
* Understanding Evaluation Metric
* Model Validation and Prediction
* Making the final submission

## Importing all the required libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Importing the Training and Test dataset

In [None]:
# Read the competition's training and test CSV files into DataFrames
# (training file first, test file second).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-sep-2021/train.csv",
        "/kaggle/input/tabular-playground-series-sep-2021/test.csv",
    )
)

## Getting info about the Training and Test Data

In [None]:
# Preview the first five rows of the training data.
train.head(n=5)

In [None]:
# Preview the first five rows of the test data.
test.head(n=5)

In [None]:
# Report (rows, columns) for the training frame first, then the test frame.
for frame in (train, test):
    print(frame.shape)

There are 957,919 rows and 120 columns in the training data.

There are 493,474 rows and 119 columns in the test data.

Using the features ranging from f1-f118, we were asked to predict the claim(0/1) - **CLASSIFICATION PROBLEM**

All the features are anonymized, so we won't be able to get many insights from the features themselves.

## Handling Null Values

![](https://miro.medium.com/max/1400/1*2L2lSHCYCYQTDoHTy2o9Kw.png)

In [None]:
# Impute missing values with the per-column means of the training data.
# The means are computed once and reused: filling a column's gaps with its own
# mean leaves that mean unchanged, so recomputing `train.mean()` for the test
# set was a second full pass over the training frame for the same values.
# The test set is deliberately filled with the *training* means so both frames
# are imputed with the same statistics.
train_means = train.mean()
train = train.fillna(train_means)
test = test.fillna(train_means)

## Feature Scaling of Data

Most of the values seem to be scaled already. But some columns like f9 do seem to have very high values.

![](https://www.gstatic.com/education/formulas2/355397047/en/z_score.svg)

In [None]:
# Remove the `id` column from both frames in place. The test ids are kept in
# `test_id` because the submission file needs them next to the predictions.
del train["id"]
test_id = test.pop("id")

In [None]:
# Split the target off the training frame: `pop` returns the `claim` column and
# removes it from `train` in place, so the remaining columns are the model inputs.
y = train.pop('claim')

# StandardScaler is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
ss = StandardScaler()

In [None]:
# Learn the scaling statistics from the training data only, then apply the
# same fitted scaler to both the training and the test frame.
ss.fit(train)
scaled_train, scaled_test = (ss.transform(frame) for frame in (train, test))

## Exploratory Data Analysis

In [None]:
# Pairwise correlation matrix of the columns currently in `train`, shown as the
# cell output. seaborn is already imported as `sns` in the imports cell at the
# top of the notebook, so it is not re-imported here.
train.corr()

## ROC-AUC Score

ROC and Precision-Recall curves are tools that help in interpreting how our classifier model is performing. ROC stands for `Receiver Operating Characteristic`. Generally, the ROC curve is used for datasets with balanced classes, and the precision-recall curve is used when the dataset is imbalanced.

For understanding how ROC and Precision Recall Curves work, first you need to understand about `Confusion Matrix`.

### Confusion Matrix

A confusion matrix, also referred to as an error matrix, allows visualization of the performance of an algorithm. It shows the ways in which our model is confused. It segregates the predictions into 4 categories - True Positive, True Negative, False Positive, False Negative.

This is what a confusion matrix generally looks like:

![](https://miro.medium.com/max/1400/1*85t6zbUiQA0fotnhDaJLaA.png)


So basically what ROC curve will contain is 

1. `False Positive Rate ` on x-axis
2. `True Positive Rate ` on y-axis
3. Variation of Probability Threshold(0-1)

`True Positive Rate ` gives us information about how good the model is predicting when the actual value is positive.

`False Positive Rate ` gives us information about how often a class is predicted as positive when it is actually negative. `FPR ` is referred to as ` 1 - specificity ` .

![](https://www.researchgate.net/profile/Md-Ashraful-Amin/publication/220176738/figure/fig4/AS:669969142534168@1536744499664/The-confusion-matrix-left-and-the-calculation-of-true-positive-rate-false-positive.png)

## Interpretations from ROC curve

1. ROC curve shows us the tradeoff between sensitivity(TPR) and specificity(1-FPR). 

2. Classifiers that give curves closer to the top-left corner indicate a better performance.

3. Smaller values on the x-axis of the ROC plot mean we have fewer false positives and more true negatives.

4. Higher values on the y-axis mean we have more true positives and fewer false negatives.

5. AUC(Area under ROC curve) is used to summarize the performance of the classification model. It is equivalent to the probability that a randomly chosen positive instance is ranked higher than a randomly chosen negative instance.

## Train Test Splitting

In [None]:
# Hold out 30% of the scaled training rows. `random_state` is fixed so the
# split is identical on every Restart & Run All (without it, each run draws a
# different random split).
# train_test_split is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
# NOTE(review): the model cell below fits on all of `scaled_train`, so this
# hold-out split is currently not used for validation.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_train, y, test_size=0.3, random_state=42
)

In [None]:
import catboost as cb

# Fit CatBoost on the full scaled training set (no rows are held out here).
cb_model = cb.CatBoostClassifier(learning_rate=0.19, iterations=1000)

# The test-set probabilities are computed in the submission cell, which calls
# predict_proba itself before using the result, so that prediction is not
# duplicated here. The trailing semicolon keeps the fitted model object from
# being echoed as the cell output.
cb_model.fit(scaled_train, y);

Make sure to use the `predict_proba()` function if you want the probability of the claim being 1.

If you use the `predict()` function, the model returns hard class labels (0 or 1) instead of probabilities.

The `predict()` function gave a score of around 0.75, whereas `predict_proba()` gave a score of around 0.79545.

In [None]:
# Score the test set; column 1 of the predict_proba output is used as the
# probability of `claim` being 1.
pred = cb_model.predict_proba(scaled_test)

# Pair each test id with its predicted probability and write the submission file.
submission_columns = {'id': test_id, 'claim': pred[:, 1]}
output = pd.DataFrame(submission_columns)
output.to_csv('submission_tp5.csv', index=False)

#### This submission gave a score of 0.79545

# Extras


## Trying out with Voting Classifier 

Used 3 different models (CatBoost, XGBoost and LightGBM) combined in a voting ensemble (`VotingRegressor`), which averages their predictions.

Got an accuracy around 0.78.

In [None]:
#from catboost import CatBoostRegressor
#from xgboost import XGBRegressor
#from lightgbm import LGBMRegressor
#from sklearn.ensemble import VotingRegressor

#estimator = []
#estimator.append(('cb', CatBoostRegressor()))
#estimator.append(('XGB', XGBRegressor()))
#estimator.append(('lgb', LGBMRegressor()))
  
# Voting regressor: averages the predictions of the base regressors
#vot_hard = VotingRegressor(estimators = estimator)
#vot_hard.fit(scaled_train, y)
#y_pred = vot_hard.predict(scaled_test)

## Next Step:

Parameter tuning of Catboost

Feature Selection Techniques