## Please select an option before submitting results to the competition

In [None]:
# Submission toggle referred to by the heading above this cell.
# NOTE(review): no other cell in the visible notebook reads this flag — confirm where it is consumed.
submit_flag = True
print(submit_flag)

# TalkingData AdTracking Fraud Detection Challenge
# Can you detect fraudulent click traffic for mobile app ads?
# https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection

**This notebook is inspired by an exercise in the [Feature Engineering](https://www.kaggle.com/learn/feature-engineering) course**  
**You can reference the tutorial at [this link](https://www.kaggle.com/matleonard/feature-selection)**  
**You can reference my notebook at [this link](https://www.kaggle.com/georgezoto/feature-engineering-feature-selection/)**  

---


<center><a href="https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection"><img src="https://i.imgur.com/srKxEkD.png" width=600px></a></center>

# Introduction

In this exercise you'll use some feature selection algorithms to improve your model. Some methods take a while to run, so you'll write functions and verify they work on small samples.

To begin, run the code cell below to set up the exercise.

In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing, metrics
import lightgbm as lgb

import os

# Single source of truth for the dataset directory: the baseline table and
# every feature file below are read from here.
data_root = '../input/feature-engineering-data'

# Baseline click data; the pre-computed feature tables are joined onto it.
clicks = pd.read_parquet(os.path.join(data_root, 'baseline_data.pqt'))

# Feature tables produced earlier in the course, one parquet file each.
data_files = ['count_encodings.pqt',
              'catboost_encodings.pqt',
              'interactions.pqt',
              'past_6hr_events.pqt',
              'downloads.pqt',
              'time_deltas.pqt',
              'svd_encodings.pqt']

for file in data_files:
    features = pd.read_parquet(os.path.join(data_root, file))
    # DataFrame.join aligns on the index, so each file only adds columns.
    clicks = clicks.join(features)

def get_data_splits(dataframe, valid_fraction=0.1):
    """Split rows chronologically into train, validation and test sets.

    The frame is sorted by ``click_time``; the last ``valid_fraction`` of the
    rows becomes the test set, the ``valid_fraction`` before that becomes the
    validation set, and everything earlier is the training set.

    Args:
        dataframe: DataFrame with a ``click_time`` column.
        valid_fraction: Fraction of rows used for validation and, separately,
            for test.

    Returns:
        Tuple ``(train, valid, test)`` of DataFrames in chronological order.
    """
    dataframe = dataframe.sort_values('click_time')
    n_rows = len(dataframe)
    valid_rows = int(n_rows * valid_fraction)

    # Use positive cut points instead of negative offsets. With negative
    # offsets a holdout size of 0 (small frame or tiny fraction) turns into
    # ``[:-0]`` (empty) and ``[-0:]`` (everything), which silently produced
    # an empty training set and put every row into the test set.
    test_start = max(n_rows - valid_rows, 0)
    valid_start = max(n_rows - 2 * valid_rows, 0)

    train = dataframe.iloc[:valid_start]
    # valid size == test size, last two sections of the data
    valid = dataframe.iloc[valid_start:test_start]
    test = dataframe.iloc[test_start:]

    return train, valid, test

def train_model(train, valid, test=None, feature_cols=None, valid_name_model='Baseline Model'):
    """Fit a LightGBM binary classifier and score it with ROC AUC.

    Args:
        train: Training frame; must contain ``is_attributed``.
        valid: Validation frame used for early stopping and scoring.
        test: Optional frame; when given, a test AUC is computed as well.
        feature_cols: Columns fed to the model. Defaults to every column of
            ``train`` except ``click_time``, ``attributed_time`` and
            ``is_attributed``.
        valid_name_model: Label under which the validation metrics are
            recorded by LightGBM.

    Returns:
        ``(bst, valid_score, validation_metrics)`` when ``test`` is None,
        otherwise ``(bst, valid_score, test_score, validation_metrics)``.
    """
    target = 'is_attributed'
    if feature_cols is None:
        feature_cols = train.columns.drop(['click_time', 'attributed_time', target])

    dtrain = lgb.Dataset(train[feature_cols], label=train[target])
    dvalid = lgb.Dataset(valid[feature_cols], label=valid[target])

    # Filled in by LightGBM during training; used later for plotting.
    validation_metrics = {}

    print("Training model!")
    bst = lgb.train(
        {'num_leaves': 64, 'objective': 'binary', 'metric': 'auc', 'seed': 7},
        dtrain,
        1000,
        valid_sets=[dvalid],
        valid_names=valid_name_model,
        early_stopping_rounds=20,
        evals_result=validation_metrics,
        verbose_eval=False,
    )

    valid_score = metrics.roc_auc_score(valid[target], bst.predict(valid[feature_cols]))
    print(f"Validation AUC score: {valid_score}")

    if test is None:
        return bst, valid_score, validation_metrics

    test_score = metrics.roc_auc_score(test[target], bst.predict(test[feature_cols]))
    return bst, valid_score, test_score, validation_metrics

In [None]:
def my_own_train_plot_model(clicks, valid_name_model, my_own_metrics):
    """Split ``clicks``, train a model, record its validation AUC and plot it.

    Args:
        clicks: Full feature frame; split with ``get_data_splits``.
        valid_name_model: Model label, used as the key in ``my_own_metrics``.
        my_own_metrics: Dict of validation AUC per model name; updated in
            place with this model's score.

    Returns:
        The trained booster. Callers bind the result to ``bst``, so returning
        nothing left that name set to ``None``.
    """
    print(valid_name_model+' score')

    # The test split is not used here; only train and validation are needed.
    train, valid, _ = get_data_splits(clicks)
    bst, valid_score, validation_metrics = train_model(train, valid, valid_name_model=valid_name_model)

    my_own_metrics[valid_name_model] = valid_score
    print(my_own_metrics)
    # plot_model_information is defined in a later cell; that cell must have
    # been executed before this function is called.
    plot_model_information(bst, validation_metrics, my_own_metrics)

    return bst

In [None]:
def my_own_train_plot_model_v2(train, valid, valid_name_model, my_own_metrics):
    """Train on pre-split data, record the validation AUC and plot the model.

    Unlike ``my_own_train_plot_model`` this variant takes the train and
    validation frames directly, so callers can drop columns beforehand.

    Args:
        train: Training frame passed to ``train_model``.
        valid: Validation frame passed to ``train_model``.
        valid_name_model: Model label, used as the key in ``my_own_metrics``.
        my_own_metrics: Dict of validation AUC per model name; updated in
            place with this model's score.

    Returns:
        The trained booster. Callers bind the result to ``bst``, so returning
        nothing left that name set to ``None``.
    """
    print(valid_name_model+' score')

    bst, valid_score, validation_metrics = train_model(train, valid, valid_name_model=valid_name_model)

    my_own_metrics[valid_name_model] = valid_score
    print(my_own_metrics)
    # plot_model_information is defined in a later cell; that cell must have
    # been executed before this function is called.
    plot_model_information(bst, validation_metrics, my_own_metrics)

    return bst

## Model information

In [None]:
import matplotlib.pyplot as plt
# Default (width, height) for matplotlib figures created after this line;
# calls that pass their own figsize (e.g. the tree plot below) override it.
plt.rcParams["figure.figsize"] = (16,9)

def plot_model_information(bst, validation_metrics, my_own_metrics):
    """Show diagnostics for a trained booster as a sequence of figures.

    In order: the AUC curve recorded during training, the 15 most important
    features, a horizontal bar chart comparing every score stored in
    ``my_own_metrics``, and the first tree of the model.

    Args:
        bst: Trained booster (must provide ``num_trees()``).
        validation_metrics: Evaluation history recorded during training.
        my_own_metrics: Mapping of model name to validation score.
    """
    print('Number of trees:', bst.num_trees())

    print('Plot model performance')
    lgb.plot_metric(validation_metrics, metric='auc')
    plt.show()

    print('Plot feature importances...')
    lgb.plot_importance(bst, max_num_features=15)
    plt.show()

    print('plot_my_own_metrics')
    model_names = list(my_own_metrics.keys())
    scores = list(my_own_metrics.values())
    plt.barh(model_names, scores)
    # Write each score as text at the end of its bar.
    for row, score in enumerate(scores):
        plt.text(score, row, str(score))
    plt.show()

    tree_index = 0
    print(f'Plot {tree_index}th tree...')  # one tree use categorical feature to split
    lgb.plot_tree(bst, tree_index=tree_index, figsize=(64, 36), show_info=['split_gain'])
    plt.show()

In [None]:
# (rows, columns) of the baseline data after joining every feature file.
clicks.shape

In [None]:
# Preview the first rows of the joined feature table.
clicks.head()

In [None]:
# Number of rows per value of the target column.
clicks['is_attributed'].value_counts()

In [None]:
# Same target distribution, expressed as fractions of all rows.
clicks['is_attributed'].value_counts(normalize=True)

## Competition data

In [None]:
# Maximum number of rows to read from the competition train.csv
# (passed to read_csv as nrows in the next cell).
limit = 20_000_000

# Columns to load from train.csv; attributed_time is deliberately left out.
usecols = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']

In [None]:
# Load the first `limit` rows and only the `usecols` columns of the
# competition training set, parsing click_time as a datetime column.
competition_data = pd.read_csv('../input/talkingdata-adtracking-fraud-detection/train.csv',
                               nrows=limit,
                               usecols=usecols,
                               parse_dates=['click_time'])

In [None]:
# Number of rows per target value within the rows read from train.csv.
competition_data['is_attributed'].value_counts()

In [None]:
# Same target distribution, expressed as fractions of the rows read.
competition_data['is_attributed'].value_counts(normalize=True)

In [None]:
# Load the competition test set (no row or column limit here),
# parsing click_time as a datetime column.
competition_test_data = pd.read_csv('../input/talkingdata-adtracking-fraud-detection/test.csv',
                                    parse_dates=['click_time'])

In [None]:
# Derive day, hour, minute and second columns from click_time.
# assign() builds a new frame, so the frame loaded from test.csv is not
# modified in place; columns are added in the order listed.
competition_test_data = competition_test_data.assign(
    day=lambda frame: frame['click_time'].dt.day.astype('uint8'),
    hour=lambda frame: frame['click_time'].dt.hour.astype('uint8'),
    minute=lambda frame: frame['click_time'].dt.minute.astype('uint8'),
    second=lambda frame: frame['click_time'].dt.second.astype('uint8'),
)

In [None]:
# (rows, columns) of the test set, including the timestamp columns added above.
competition_test_data.shape

In [None]:
# Preview the first rows of the test set.
competition_test_data.head()

## Baseline Score

Let's look at the baseline score for all the features we've made so far.

In [None]:
# Validation AUC per model name. The same dict is passed to every later
# training call so the comparison bar chart accumulates all models.
my_own_metrics = {}
valid_name_model='Baseline LightGBM Model'
# Baseline: train on every feature column of the joined `clicks` frame.
bst = my_own_train_plot_model(clicks, valid_name_model, my_own_metrics)

### 1) Which data to use for feature selection?

Since many feature selection methods require calculating statistics from the dataset, should you use all the data for feature selection?

Now we have 91 features we're using for predictions. With all these features, there is a good chance the model is overfitting the data. We might be able to reduce the overfitting by removing some features. Of course, the model's performance might decrease. But at least we'd be making the model smaller and faster without losing much performance.

### 2) Univariate Feature Selection

Below, use `SelectKBest` with the `f_classif` scoring function to choose 40 features from the 91 features in the data. 

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
feature_cols = clicks.columns.drop(['click_time', 'attributed_time', 'is_attributed'])
train, valid, test = get_data_splits(clicks)

# Create the selector, keeping 40 features
selector = SelectKBest(f_classif, k=40)

# Fit on the training split only, so the validation and test rows play no
# part in choosing the features.
X_new = selector.fit_transform(train[feature_cols], train['is_attributed'])

# Read the dropped columns straight from the selector's boolean keep-mask.
# Inferring them from "variance == 0" after inverse_transform mislabels a
# kept feature that happens to be constant, and needs a full-size float copy
# of the training data just to recover column names.
dropped_columns = feature_cols[~selector.get_support()]

In [None]:
# Features not kept by the SelectKBest step above.
dropped_columns

In [None]:
# Retrain without the columns rejected by the univariate selector and add the
# score to the running comparison.
valid_name_model = 'Top40 f_classif Univariate Feature Selection'
bst = my_own_train_plot_model_v2(
    train.drop(columns=dropped_columns),
    valid.drop(columns=dropped_columns),
    valid_name_model,
    my_own_metrics,
)

### 3) The best value of K

With this method we can choose the best K features, but we still have to choose K ourselves. How would you find the "best" value of K? That is, you want it to be small so you're keeping the best features, but not so small that it's degrading the model's performance.

### 4) Use L1 regularization for feature selection

Now try a more powerful approach using L1 regularization. Implement a function `select_features_l1` that returns a list of features to keep.

Use a `LogisticRegression` classifier model with an L1 penalty to select the features. For the model, set:
- the random state to 7,
- the regularization parameter `C` (the inverse of the regularization strength) to 0.1,
- and the solver to `'liblinear'`.

Fit the model then use `SelectFromModel` to return a model with the selected features.

The checking code will run your function on a sample from the dataset to provide more immediate feedback.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

def select_features_l1(X, y):
    """Return selected features using logistic regression with an L1 penalty.

    Args:
        X: DataFrame of candidate features.
        y: Target values aligned with the rows of ``X``.

    Returns:
        ``pandas.Index`` with the names of the columns of ``X`` that
        ``SelectFromModel`` keeps for the fitted L1-penalised model.
    """
    logistic = LogisticRegression(C=0.1, penalty='l1', solver='liblinear', random_state=7).fit(X, y)
    model = SelectFromModel(logistic, prefit=True)

    # Use the selector's own boolean keep-mask. Detecting kept columns via
    # "variance != 0" after inverse_transform wrongly discards a selected
    # feature that is constant in X, and builds a full copy of X only to
    # recover column names.
    return X.columns[model.get_support()]

In [None]:
# Fit the L1 selector on the first n_samples training rows only, to keep the
# logistic regression fast.
n_samples = 10_000
X = train[feature_cols].iloc[:n_samples]
y = train['is_attributed'].iloc[:n_samples]
selected = select_features_l1(X, y)

# Everything the selector did not keep is dropped before retraining.
dropped_columns = feature_cols.drop(selected)

In [None]:
# Features not kept by the L1-penalised logistic regression above.
dropped_columns

In [None]:
# Retrain without the columns rejected by the L1 selector and add the score
# to the running comparison.
valid_name_model = 'L1 regularization LogisticRegression 0.1'
bst = my_own_train_plot_model_v2(
    train.drop(columns=dropped_columns),
    valid.drop(columns=dropped_columns),
    valid_name_model,
    my_own_metrics,
)

### 5) Feature Selection with Trees

Since we're using a tree-based model, using another tree-based model for feature selection might produce better results. What would you do differently to select the features using a tree-based classifier?

### 6) Top K features with L1 regularization

Here you've set the regularization parameter `C=0.1` which led to some number of features being dropped. However, by setting `C` you aren't able to choose a certain number of features to keep. What would you do to keep the top K important features using L1 regularization?

Congratulations on finishing this course! To keep learning, check out the rest of [our courses](https://www.kaggle.com/learn/overview). The machine learning explainability and deep learning courses are great next skills to learn!

---




*Have questions or comments? Visit the [Learn Discussion forum](https://www.kaggle.com/learn-forum/161443) to chat with other Learners.*