## Please select an option before submitting results to the competition

In [None]:
submit_flag = True #False #True
print(submit_flag)

In [None]:
available_encodings = ['No Encoding', 'Count Encoding', 'Target Encoding', 'Target Encoding No IP', 'CatBoost Encoding']
my_encoding = available_encodings[4]
print(my_encoding)

# TalkingData AdTracking Fraud Detection Challenge
# Can you detect fraudulent click traffic for mobile app ads?
# https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection

**This notebook is inspired by an exercise in the [Feature Engineering](https://www.kaggle.com/learn/feature-engineering) course.**  
**You can reference the tutorial at [this link](https://www.kaggle.com/matleonard/categorical-encodings)**  
**You can reference my notebook at [this link](http://www.kaggle.com/georgezoto/feature-engineering-categorical-encodings)**  

---


<center><a href="https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection"><img src="https://i.imgur.com/srKxEkD.png" width=600px></a></center>

# Introduction

In this exercise you'll apply more advanced encodings to the categorical variables to improve your classifier model. The encodings you will implement are:

- Count Encoding
- Target Encoding
- CatBoost Encoding

You'll refit the classifier after each encoding to check its performance on hold-out data. 

Begin by running the next code cell to set up the notebook.

In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing, metrics
import lightgbm as lgb

clicks = pd.read_parquet('../input/feature-engineering-data/baseline_data.pqt')

In [None]:
clicks.shape

In [None]:
clicks.head()

In [None]:
clicks['is_attributed'].value_counts()

In [None]:
clicks['is_attributed'].value_counts(normalize=True)

## Competition data

In [None]:
#Read only first limit rows
limit = 20_000_000

#Read only these columns - skip attributed_time 
usecols = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']

In [None]:
competition_data = pd.read_csv('../input/talkingdata-adtracking-fraud-detection/train.csv', nrows=limit, usecols=usecols, parse_dates=['click_time'])

In [None]:
competition_data['is_attributed'].value_counts()

In [None]:
competition_data['is_attributed'].value_counts(normalize=True)

In [None]:
competition_test_data = pd.read_csv('../input/talkingdata-adtracking-fraud-detection/test.csv', parse_dates=['click_time'])

In [None]:
# Break click_time into day / hour / minute / second columns, each stored as uint8.
# assign() returns a new frame, so the loaded test data is never modified in place.
competition_test_data = competition_test_data.assign(**{
    part: getattr(competition_test_data['click_time'].dt, part).astype('uint8')
    for part in ('day', 'hour', 'minute', 'second')
})

In [None]:
competition_test_data.shape

In [None]:
competition_test_data.head()

Next, we define a couple of functions that you'll use to test the encodings that you implement in this exercise.

In [None]:
def get_data_splits(dataframe, valid_fraction=0.1):
    """Splits a dataframe into train, validation, and test sets.

    The rows are first ordered by the column 'click_time', then cut
    positionally: the last `valid_fraction` of the rows become the test
    set, the `valid_fraction` before that the validation set, and
    everything earlier the training set.

    Parameters
    ----------
    dataframe : DataFrame with a 'click_time' column.
    valid_fraction : fraction of the rows given to the validation set and,
        separately, to the test set (both have the same size).

    Returns
    -------
    (train, valid, test) : three row-slices of the sorted dataframe.
    """

    dataframe = dataframe.sort_values('click_time')
    n_rows = len(dataframe)
    valid_rows = int(n_rows * valid_fraction)

    # Use explicit start positions instead of negative slices: when
    # valid_rows is 0, `[:-0]` is `[:0]` (empty train) and `[-0:]` is the
    # whole frame (everything lands in test). Clamp at 0 so an oversized
    # fraction still yields an empty train set rather than wrapping around.
    valid_start = max(n_rows - 2 * valid_rows, 0)
    test_start = max(n_rows - valid_rows, 0)

    train = dataframe.iloc[:valid_start]
    # valid size == test size, last two sections of the data
    valid = dataframe.iloc[valid_start:test_start]
    test = dataframe.iloc[test_start:]

    return train, valid, test

def train_model(train, valid, test=None, feature_cols=None):
    """Fit a LightGBM binary classifier and score it by ROC AUC.

    Parameters
    ----------
    train, valid : DataFrames holding the feature columns and the
        'is_attributed' label column.
    test : optional DataFrame with the same layout; when given, its AUC is
        computed as well.
    feature_cols : optional column labels to train on. Defaults to every
        column of `train` except 'click_time', 'attributed_time' and
        'is_attributed'.

    Returns
    -------
    (bst, valid_score, validation_metrics) when `test` is None, otherwise
    (bst, valid_score, test_score, validation_metrics). `validation_metrics`
    is the per-iteration evaluation history recorded during training.
    """

    if feature_cols is None:
        # errors='ignore' lets this work on frames that were loaded without
        # 'attributed_time' instead of raising KeyError.
        feature_cols = train.columns.drop(
            ['click_time', 'attributed_time', 'is_attributed'], errors='ignore')
    dtrain = lgb.Dataset(train[feature_cols], label=train['is_attributed'])
    dvalid = lgb.Dataset(valid[feature_cols], label=valid['is_attributed'])

    param = {'num_leaves': 64, 'objective': 'binary',
             'metric': 'auc', 'seed': 7}

    num_round = 1000

    # Record eval results for plotting
    validation_metrics = {}

    # The early_stopping_rounds / evals_result / verbose_eval keyword
    # arguments were removed from lgb.train in LightGBM 4.0; callbacks are the
    # supported equivalent. log_evaluation replaced print_evaluation in 3.3,
    # so fall back to the old name on older installs.
    log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
    callbacks = [
        lgb.early_stopping(stopping_rounds=20),
        lgb.record_evaluation(validation_metrics),
        log_evaluation(period=1),
    ]

    bst = lgb.train(param, dtrain, num_round, valid_sets=[dvalid],
                    callbacks=callbacks)

    valid_pred = bst.predict(valid[feature_cols])
    valid_score = metrics.roc_auc_score(valid['is_attributed'], valid_pred)
    print(f"Validation AUC score: {valid_score}")

    if test is not None:
        test_pred = bst.predict(test[feature_cols])
        test_score = metrics.roc_auc_score(test['is_attributed'], test_pred)
        return bst, valid_score, test_score, validation_metrics
    else:
        return bst, valid_score, validation_metrics

Run this cell to get a baseline score. 

In [None]:
my_own_metrics = {}

In [None]:
if my_encoding == 'No Encoding':
    print('No Encoding model') 
    train, valid, test = get_data_splits(clicks)
    bst, valid_score, validation_metrics = train_model(train, valid)

In [None]:
if my_encoding == 'No Encoding':
    my_own_metrics['No Encoding'] = valid_score
    my_own_metrics

In [None]:
clicks.head()

### 1) Categorical encodings and leakage

These encodings are all based on statistics calculated from the dataset like counts and means. 

Considering this, what data should you be using to calculate the encodings?  Specifically, can you use the validation data?  Can you use the test data?

### 2) Count encodings

Begin by running the next code cell to get started.

### Count Encoding
Count encoding replaces each categorical value with the number of times it appears in the dataset. For example, if the value "GB" occurred 10 times in the country feature, then each "GB" would be replaced with the number 10.

### Question: How about unseen values in the valid and test sets ???

In [None]:
if my_encoding != 'No Encoding':
    import category_encoders as ce
    cat_features = ['ip', 'app', 'device', 'os', 'channel']
    train, valid, test = get_data_splits(clicks)

Next, encode the categorical features `['ip', 'app', 'device', 'os', 'channel']` using the count of each value in the data set. 
- Using `CountEncoder` from the `category_encoders` library, fit the encoding using the categorical feature columns defined in `cat_features`. 
- Then apply the encodings to the train and validation sets, adding them as new columns with names suffixed `"_count"`.

In [None]:
if my_encoding == 'Count Encoding':
    # Build the encoder and learn the counts from the training split only
    count_enc = ce.CountEncoder(cols=cat_features)
    count_enc.fit(train[cat_features])

    def _append_counts(frame):
        """Return `frame` joined with its count-encoded columns, suffixed `_count`."""
        counts = count_enc.transform(frame[cat_features])
        return frame.join(counts.add_suffix("_count"))

    # Train and validation sets get the encoded values as new columns
    train_encoded = _append_counts(train)
    valid_encoded = _append_counts(valid)

    # The competition test set is encoded with the same train-fitted encoder
    competition_test_data = _append_counts(competition_test_data)

In [None]:
if my_encoding == 'Count Encoding':
    print('train_encoded.head()\n')
    print(train_encoded.head())
    print('\ncompetition_encoded.head()')
    print(competition_test_data.head())

Run the next code cell to see how count encoding changes the results.

In [None]:
if my_encoding == 'Count Encoding':
    # Train the model on the encoded datasets
    # This can take around 30 seconds to complete
    bst, valid_score, validation_metrics = train_model(train_encoded, valid_encoded)

In [None]:
if my_encoding == 'Count Encoding':
    my_own_metrics['Count Encoding'] = valid_score
    print(my_own_metrics)

Count encoding improved our model's score!

### 3) Why is count encoding effective?
At first glance, it could be surprising that count encoding helps make accurate models. 
Why do you think count encoding is a good idea, or how does it improve the model score?

### 4) Target encoding

Here you'll try some supervised encodings that use the labels (the targets) to transform categorical features. The first one is target encoding. 
- Create the target encoder from the `category_encoders` library. 
- Then, learn the encodings from the training dataset, apply the encodings to all the datasets, and retrain the model.

### Target encoding replaces a categorical value with the average value of the target for that value of the feature. For example, given the country value "CA", you'd calculate the average outcome for all the rows with country == 'CA', around 0.28. 

### This is often blended with the target probability over the entire dataset to reduce the variance of values with few occurrences.

### Data leakage ??? blended with the target probability over the entire dataset ???

In [None]:
if my_encoding == 'Target Encoding':
    # Create the target encoder. You can find this easily by using tab completion:
    # start typing `ce.` then press Tab to bring up a list of classes and functions.
    target_enc = ce.TargetEncoder(cols=cat_features)

    # Learn the encoding from the training split, with 'is_attributed' as the target
    target_enc.fit(train[cat_features], train['is_attributed'])

    def _append_target(frame):
        """Return `frame` joined with its target-encoded columns, suffixed `_target`."""
        encoded = target_enc.transform(frame[cat_features])
        return frame.join(encoded.add_suffix("_target"))

    # Train and validation sets get the encoded values as new columns
    train_encoded = _append_target(train)
    valid_encoded = _append_target(valid)

    # The competition test set is encoded with the same train-fitted encoder
    competition_test_data = _append_target(competition_test_data)

In [None]:
if my_encoding == 'Target Encoding': 
    print('train_encoded.head()\n')
    print(train_encoded.head())
    print('\ncompetition_encoded.head()')
    print(competition_test_data.head())

Run the next cell to see how target encoding affects your results.

In [None]:
if my_encoding == 'Target Encoding': 
    bst, valid_score, validation_metrics = train_model(train_encoded, valid_encoded)

In [None]:
if my_encoding == 'Target Encoding': 
    my_own_metrics['Target Encoding'] = valid_score
    print(my_own_metrics)

### 5) Try removing IP encoding

If you leave `ip` out of the encoded features and retrain the model with target encoding, you should find that the score increases and is above the baseline score! Why do you think the score is below baseline when we encode the IP address but above baseline when we don't?

In [None]:
if my_encoding == 'Target Encoding No IP':
    # Same target encoding as before, but `ip` is left out of the encoded features
    cat_features_no_ip = ['app', 'device', 'os', 'channel']

    # Fit on the training split only, with 'is_attributed' as the target
    target_enc = ce.TargetEncoder(cols=cat_features_no_ip)
    target_enc.fit(train[cat_features_no_ip], train['is_attributed'])

    def _append_target_no_ip(frame):
        """Return `frame` joined with its target-encoded columns, suffixed `_target`."""
        encoded = target_enc.transform(frame[cat_features_no_ip])
        return frame.join(encoded.add_suffix("_target"))

    train_encoded = _append_target_no_ip(train)
    valid_encoded = _append_target_no_ip(valid)

    # The competition test set is encoded with the same train-fitted encoder
    competition_test_data = _append_target_no_ip(competition_test_data)

In [None]:
if my_encoding == 'Target Encoding No IP': 
    print('train_encoded.head()\n')
    print(train_encoded.head())
    print('\ncompetition_encoded.head()')
    print(competition_test_data.head())

In [None]:
if my_encoding == 'Target Encoding No IP': 
    bst, valid_score, validation_metrics = train_model(train_encoded, valid_encoded)

In [None]:
if my_encoding == 'Target Encoding No IP': 
    my_own_metrics['Target Encoding No IP'] = valid_score
    print(my_own_metrics)

### 6) CatBoost Encoding

The CatBoost encoder is supposed to work well with the LightGBM model. Encode the categorical features with `CatBoostEncoder` and train the model on the encoded data again.

In [None]:
if my_encoding == 'CatBoost Encoding':
    # Remove IP from the encoded features. A separate name is used so the
    # five-feature `cat_features` list used by the other encoding cells is
    # not silently overwritten.
    cat_features_no_ip = ['app', 'device', 'os', 'channel']

    # Create the CatBoost encoder
    cb_enc = ce.CatBoostEncoder(cols=cat_features_no_ip, random_state=7)

    # Learn the encoding from the training set and encode the training rows
    # WITH their target. Passing y makes the encoder compute each training
    # row's value from the rows that precede it; transforming training rows
    # without y would instead use the full-data statistic, which includes the
    # row's own label.
    train_cb = cb_enc.fit_transform(train[cat_features_no_ip], train['is_attributed'])

    # Add the encodings as new columns suffixed `_cb`.
    # Validation rows are encoded without a target, using the fitted statistics.
    train_encoded = train.join(train_cb.add_suffix("_cb"))
    valid_encoded = valid.join(cb_enc.transform(valid[cat_features_no_ip]).add_suffix("_cb"))

    # Apply encoding to the competition test dataset
    competition_test_data = competition_test_data.join(
        cb_enc.transform(competition_test_data[cat_features_no_ip]).add_suffix("_cb"))

In [None]:
if my_encoding == 'CatBoost Encoding':
    print('train_encoded.head()\n')
    print(train_encoded.head())
    print('\ncompetition_encoded.head()')
    print(competition_test_data.head())

Run the next code cell to see how the CatBoost encoder changes your results.

In [None]:
if my_encoding == 'CatBoost Encoding':
    bst, valid_score, validation_metrics = train_model(train_encoded, valid_encoded)

In [None]:
if my_encoding == 'CatBoost Encoding':
    my_own_metrics['CatBoost Encoding'] = valid_score
    print(my_own_metrics)

# Keep Going

Now you are ready to **[generate completely new features](https://www.kaggle.com/matleonard/feature-generation)** from the data.

---




*Have questions or comments? Visit the [Learn Discussion forum](https://www.kaggle.com/learn-forum/161443) to chat with other Learners.*

## Model information

In [None]:
bst.num_trees()

## Plot model performance

In [None]:
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = [16,9]

ax = lgb.plot_metric(validation_metrics, metric='auc');

In [None]:
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (16,9)

def plot_my_own_metrics(my_own_metrics):
    """Draw a horizontal bar chart of the recorded scores.

    `my_own_metrics` maps a label to a score; each label becomes one bar and
    the score is also written as text at the end of its bar.
    """
    labels = list(my_own_metrics)
    scores = [my_own_metrics[label] for label in labels]
    plt.barh(labels, scores)

    # Bars are placed at positions 0..n-1, so the position doubles as the text's y coordinate
    for position, score in enumerate(scores):
        plt.text(score, position, str(score))

In [None]:
plot_my_own_metrics(my_own_metrics)

## ML Explainability and taking a closer look at feature importance, individual trees
Inspired by: https://github.com/Microsoft/LightGBM/blob/2e93cdab9eee02d4d7f5cb3b6b31128dec94e25e/examples/python-guide/plot_example.py

In [None]:
bst.num_trees()

In [None]:
tree_index = 0
print('Plot '+str(tree_index)+'th tree...')  # one tree use categorical feature to split
ax = lgb.plot_tree(bst, tree_index=tree_index, figsize=(64, 36), show_info=['split_gain'])
plt.show()

In [None]:
print('Plot feature importances...')
ax = lgb.plot_importance(bst, max_num_features=15)
plt.show()

## Submit test predictions to TalkingData AdTracking Fraud Detection Challenge competition using the limited train.csv records from this notebook

In [None]:
competition_test_data.head()

In [None]:
# Ask the trained booster which features it expects, in training order, instead
# of deriving them from whatever columns the test frame happens to have. Any
# extra column in the test frame is then ignored, and a missing one raises a
# clear KeyError at selection time rather than a shape mismatch inside predict.
feature_cols = bst.feature_name()
feature_cols

In [None]:
competition_predictions = bst.predict(competition_test_data[feature_cols])

In [None]:
competition_predictions

In [None]:
competition_predictions_df = pd.DataFrame(competition_predictions, columns=['is_attributed'])
competition_predictions_df

In [None]:
# Attach the click ids (aligned on the row index) and put them first
competition_predictions_df = (
    competition_predictions_df
    .assign(click_id=competition_test_data['click_id'])
    [['click_id', 'is_attributed']]
)
competition_predictions_df

In [None]:
pd.cut(competition_predictions_df['is_attributed'], bins=10).value_counts()

In [None]:
pd.cut(competition_predictions_df['is_attributed'], bins=10).value_counts().plot(kind='bar', rot=45);

In [None]:
# Hard-coded (private, public) scores for each encoding
# (presumably copied from earlier Kaggle submissions - confirm before relying on them).
leaderboard_scores = {
    'No Encoding': (0.88343, 0.89154),
    'Count Encoding': (0.77323, 0.77341),
    'Target Encoding': (0.55412, 0.55253),
    'Target Encoding No IP': (0.74309, 0.76522),
    'CatBoost Encoding': (0.79337, 0.81397),
}

# An encoding without a recorded entry leaves the metrics untouched
if my_encoding in leaderboard_scores:
    private_score, public_score = leaderboard_scores[my_encoding]
    my_own_metrics['private score'] = private_score
    my_own_metrics['public score'] = public_score

print(my_own_metrics)

In [None]:
# Write the submission file only when submission was enabled via submit_flag.
# A plain truthiness test replaces the `== True` comparison.
if submit_flag:
    competition_predictions_df.to_csv('submission.csv', index=False)
    print('submission.csv generated successfully :)')

## Submit csv to competition