# Feature Engineering Workshop for Analytics Vidhya

This workshop walks through incrementally building features from transactional data for a customer behavior use case. The data is synthetic and was created to support a churn use case.

This notebook can be found [here](https://github.com/rasgointelligence/feature-engineering-tutorials/tree/main/workshops/analytics-vidhya/workshop_2022_02.ipynb)

## Packages

The documentation for each package used in this tutorial is linked below:
* [numpy](https://numpy.org/doc/stable/)
* [pandas](https://pandas.pydata.org/docs/)
* [scikit-learn](https://scikit-learn.org/stable/)
    * [sklearn.preprocessing](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing)
* [category_encoders](https://contrib.scikit-learn.org/category_encoders/)

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce

## Get customer and transaction data

Grab the customer and transaction data from the GitHub repo and load each file into a pandas DataFrame, parsing the date columns as datetimes.

In [None]:
# Raw CSV locations of the synthetic churn data set (customers and their transactions).
customer_url = "https://raw.githubusercontent.com/engelAnalytics/feature-engineering/main/data/churn/customer.csv"
transactions_url = "https://raw.githubusercontent.com/engelAnalytics/feature-engineering/main/data/churn/transactions.csv"

# Parse the date columns up front so the date arithmetic and time-based rolling windows below work.
customer = pd.read_csv(customer_url, parse_dates=['Birthdate', 'Churn_date'])
transactions = pd.read_csv(transactions_url, parse_dates=['Transaction_Date'])
# Cell output: the columns available on the customer table.
customer.columns

In [None]:
# Cell output: the columns available on the transactions table.
transactions.columns

## Create moving aggregates

In [None]:
# Per-customer 180-day rolling window over the transaction history.
# The frame is ordered by customer and date and indexed by date because a
# time-offset window ('180D') needs a datetime index in chronological order.
# NOTE(review): closed='both' includes the current transaction in its own window,
# whereas the product- and state-level windows use closed='left' — confirm this
# difference is intentional.
rolling_window_180 = (
    transactions
    .sort_values(by=['CustomerID', 'Transaction_Date'])
    .set_index('Transaction_Date')
    .groupby('CustomerID')
    .rolling('180D', closed='both')
)

In [None]:
# Customer-level rolling statistics of transaction cost over the 180-day window.
# Each statistic becomes a column named Cost_<stat>_180D, indexed by
# (CustomerID, Transaction_Date) as produced by the grouped rolling window.
agg_df = pd.DataFrame()
for stat in ('min', 'max', 'mean', 'std', 'sum'):
    agg_df[f'Cost_{stat}_180D'] = getattr(rolling_window_180["Cost"], stat)()

In [None]:
# Number of non-null Cost values (i.e. purchases) inside each customer's 180-day window.
agg_df['Purchase_count_180D'] = rolling_window_180["Cost"].count()

In [None]:
# Preview only (result is not stored): attach the customer-level rolling aggregates to the raw transactions.
# NOTE(review): the join key is (CustomerID, Transaction_Date). If a customer can have several
# transactions on the same date, agg_df carries repeated keys and this merge multiplies rows — confirm.
transactions.merge(agg_df.reset_index(), on=['CustomerID', 'Transaction_Date'], how='left').head(20)

### Product level aggregations

In [None]:
# Product-level rolling statistics: for every product, the mean / std of Cost and the
# number of transactions over the 180 days *before* each transaction date
# (closed='left' leaves the current timestamp out of its own window).
# Frames are collected in a list and concatenated once: concatenating onto a growing
# (initially empty) DataFrame inside the loop re-copies all earlier rows on every
# iteration and relies on pandas' deprecated handling of empty frames in concat.
product_frames = []
for prod in sorted(transactions['Product'].unique()):
    product_window_180 = (
        transactions[transactions.Product == prod]
        .sort_values(by=['Transaction_Date'])
        .set_index('Transaction_Date')
        .rolling('180D', closed='left')
    )

    tmpdf = pd.DataFrame()
    tmpdf['Product_Cost_mean_180D'] = product_window_180["Cost"].mean()
    tmpdf['Product_Cost_std_180D'] = product_window_180["Cost"].std()
    tmpdf['Product_count_180D'] = product_window_180["Product_Count"].count()
    tmpdf['Product'] = prod

    # Several transactions of one product can share a date. Keep a single row per date
    # (the one with the smallest window count) so (Product, Transaction_Date) is a
    # unique merge key.
    tmpdf = (
        tmpdf.reset_index()
        .sort_values(by=['Transaction_Date', 'Product_count_180D'])
        .drop_duplicates(subset=['Transaction_Date'], keep='first')
    )
    product_frames.append(tmpdf)

# pd.concat raises on an empty list, so fall back to an empty frame when there are no products.
prod_df = pd.concat(product_frames) if product_frames else pd.DataFrame()

prod_df.head()

In [None]:
# Preview only (result is not stored): product-level aggregates joined onto the transactions,
# ordered by product and date so each product's history reads top to bottom.
transactions.merge(prod_df, on=['Product', 'Transaction_Date'], how='left').sort_values(by=['Product', 'Transaction_Date']).head(20)

### State level aggregations

In [None]:
# Bring the customer attributes (including State) onto every transaction; the left join keeps all transactions.
full_transactions = transactions.merge(customer, on=['CustomerID'], how='left')

In [None]:
# State-level rolling statistics: for every state, the mean / std of Cost and the
# number of transactions over the 180 days *before* each transaction date
# (closed='left' leaves the current timestamp out of its own window).
# Frames are collected in a list and concatenated once: concatenating onto a growing
# (initially empty) DataFrame inside the loop re-copies all earlier rows on every
# iteration and relies on pandas' deprecated handling of empty frames in concat.
state_frames = []
for state in sorted(full_transactions['State'].unique()):
    state_window_180 = (
        full_transactions[full_transactions.State == state]
        .sort_values(by=['Transaction_Date'])
        .set_index('Transaction_Date')
        .rolling('180D', closed='left')
    )

    tmpdf = pd.DataFrame()
    tmpdf['State_Cost_mean_180D'] = state_window_180["Cost"].mean()
    tmpdf['State_Cost_std_180D'] = state_window_180["Cost"].std()
    tmpdf['State_count_180D'] = state_window_180["Product_Count"].count()
    tmpdf['State'] = state

    # Several transactions in one state can share a date. Keep a single row per date
    # (the one with the smallest window count) so (State, Transaction_Date) is a
    # unique merge key.
    tmpdf = (
        tmpdf.reset_index()
        .sort_values(by=['Transaction_Date', 'State_count_180D'])
        .drop_duplicates(subset=['Transaction_Date'], keep='first')
    )
    state_frames.append(tmpdf)

# pd.concat raises on an empty list, so fall back to an empty frame when there are no states.
state_df = pd.concat(state_frames) if state_frames else pd.DataFrame()

state_df.head()

## Merge data

In [None]:
# Start the modelling table: every transaction with its customer's attributes (left join keeps all transactions).
transdf = transactions.merge(customer, on=['CustomerID'], how='left')

In [None]:
# Attach the customer-level rolling aggregates.
# agg_df holds one row per *transaction*, so a customer with several transactions on the
# same date yields repeated (CustomerID, Transaction_Date) keys, and merging on them
# would multiply those transactions. Keep one aggregate row per key — the last one,
# which is expected to be the window covering all of that date's purchases
# (TODO confirm this is the desired choice) — and let validate= enforce key uniqueness.
customer_date_agg = agg_df.reset_index().drop_duplicates(subset=['CustomerID', 'Transaction_Date'], keep='last')
transdf = transdf.merge(customer_date_agg, on=['CustomerID', 'Transaction_Date'], how='left', validate='many_to_one')

In [None]:
# Attach the product-level rolling aggregates by product and transaction date.
transdf = transdf.merge(prod_df, on=['Product', 'Transaction_Date'], how='left')

In [None]:
# Attach the state-level rolling aggregates by state and transaction date.
transdf = transdf.merge(state_df, on=['State', 'Transaction_Date'], how='left')

# Cell output: first rows of the fully merged table.
transdf.head(20)

## Ratios, differences and distances

In [None]:
# How large this transaction is relative to the customer's 180-day rolling mean cost.
transdf['Cost_ratio'] = transdf['Cost'] / transdf['Cost_mean_180D']

In [None]:
# Signed distance from the customer's rolling mean, measured in rolling standard deviations (z-score style).
# NaN wherever the rolling std is NaN.
transdf['Cost_distance'] = (transdf['Cost'] - transdf['Cost_mean_180D']) / transdf['Cost_std_180D']

In [None]:
# Magnitude of the distance regardless of direction (unusually high and unusually low both score high).
transdf['Cost_distance_mag'] = np.abs(transdf['Cost_distance'])

In [None]:
# Same ratio / distance / magnitude features, but measured against the product-level rolling statistics.
transdf['Cost_ratio_product'] = transdf['Cost'] / transdf['Product_Cost_mean_180D']
transdf['Cost_distance_product'] = (transdf['Cost'] - transdf['Product_Cost_mean_180D']) / transdf['Product_Cost_std_180D']
transdf['Cost_distance_product_mag'] = np.abs(transdf['Cost_distance_product'])

In [None]:
# Same ratio / distance / magnitude features, but measured against the state-level rolling statistics.
transdf['Cost_ratio_state'] = transdf['Cost'] / transdf['State_Cost_mean_180D']
transdf['Cost_distance_state'] = (transdf['Cost'] - transdf['State_Cost_mean_180D']) / transdf['State_Cost_std_180D']
transdf['Cost_distance_state_mag'] = np.abs(transdf['Cost_distance_state'])

In [None]:
# Cell output: inspect the new ratio / distance columns.
transdf.head(20)

## Lags

In [None]:
# The lag features below use groupby(...).shift(k), which looks k *rows* back within each
# customer — not k transactions back in time. transdf is still in the row order of the raw
# transactions file, so order it chronologically per customer first; otherwise "previous
# transaction" depends on file order. The sort is stable, so same-day transactions keep
# their existing relative order. The index is reset so it follows the new row order.
transdf = transdf.sort_values(by=['CustomerID', 'Transaction_Date'], kind='mergesort').reset_index(drop=True)
custgroup = transdf.groupby('CustomerID')

In [None]:
# Cost and date of the same customer's previous row (missing for each customer's first row).
transdf['Cost_lag_1'] = custgroup['Cost'].shift(1)
transdf['Date_lag_1'] = custgroup['Transaction_Date'].shift(1)

In [None]:
# Whole days elapsed since the customer's previous transaction.
transdf['Date_Delta_1'] = (transdf['Transaction_Date'] - transdf['Date_lag_1']).dt.days

In [None]:
# Cost_distance of the customer's previous row.
transdf['Cost_distance_lag_1'] = custgroup['Cost_distance'].shift(1)

In [None]:
# Change in cost versus the customer's previous transaction.
transdf['Cost_delta_1'] = transdf['Cost'] - transdf['Cost_lag_1']

In [None]:
# Rate of change of cost per elapsed day between consecutive transactions.
# NOTE(review): two transactions on the same day give Date_Delta_1 == 0, so this division
# produces inf (or NaN for 0/0) — confirm whether downstream steps handle that.
transdf['Cost_velocity_days'] = transdf['Cost_delta_1'] / transdf['Date_Delta_1']

In [None]:
# Previous velocity for the same customer, and how much the velocity changed since then.
transdf['Cost_velocity_days_lag_1'] = custgroup['Cost_velocity_days'].shift(1)
transdf['Cost_velocity_days_delta_1'] = transdf['Cost_velocity_days'] - transdf['Cost_velocity_days_lag_1']

In [None]:
# Change in velocity per elapsed day (a second-difference, "acceleration"-style feature).
transdf['Cost_acceleration_days'] = transdf['Cost_velocity_days_delta_1'] / transdf['Date_Delta_1']

In [None]:
# The same lag / delta features, looking two rows back within each customer instead of one.
transdf['Cost_lag_2'] = custgroup['Cost'].shift(2)
transdf['Date_lag_2'] = custgroup['Transaction_Date'].shift(2)
transdf['Date_Delta_2'] = (transdf['Transaction_Date'] - transdf['Date_lag_2']).dt.days
transdf['Cost_distance_lag_2'] = custgroup['Cost_distance'].shift(2)
transdf['Cost_delta_2'] = transdf['Cost'] - transdf['Cost_lag_2']

In [None]:
# Two-step velocity expressed per elapsed day, and per transaction (the lag spans 2 transactions, hence / 2).
transdf['Cost_velocity_days_2'] = transdf['Cost_delta_2'] / transdf['Date_Delta_2']
transdf['Cost_velocity_transactions_2'] = transdf['Cost_delta_2'] / 2

In [None]:
# Deeper cost lags (three and four rows back per customer), used by the weighted average and trend string below.
transdf['Cost_lag_3'] = custgroup['Cost'].shift(3)
transdf['Cost_lag_4'] = custgroup['Cost'].shift(4)

In [None]:
# Linearly weighted moving average over the current cost and its four lags:
# the most recent value gets weight 5, the oldest weight 1; the weights sum to 15.
# The result is NaN whenever any of the four lags is missing.
weighted_cost_total = (
    5 * transdf['Cost']
    + 4 * transdf['Cost_lag_1']
    + 3 * transdf['Cost_lag_2']
    + 2 * transdf['Cost_lag_3']
    + transdf['Cost_lag_4']
)
transdf['Cost_moving_average_4'] = weighted_cost_total / 15

## Categorical trend variables

In [None]:
def trend_string(row, col='Cost', n_lags=4):
    """Encode the recent direction of change of a value as a short string.

    Character ``k`` (1-based) of the result compares the value at lag ``k - 1``
    with the value at lag ``k``, where lag 0 is ``row[col]`` and lag ``k`` is
    ``row[f'{col}_lag_{k}']``:

    '+' - value increasing,
    '-' - value decreasing,
    '=' - value remains the same,
    ' ' - the older value is missing, so there is nothing to compare against.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``col`` and ``{col}_lag_1`` ... ``{col}_lag_{n_lags}``.
    col : str, default 'Cost'
        Base column name whose lag columns are read.
    n_lags : int, default 4
        Number of lag columns to compare; also the length of the result.

    Returns
    -------
    str
        A string of ``n_lags`` characters, most recent comparison first.
    """
    symbols = []
    newer = row[col]
    for lag in range(1, n_lags + 1):
        older = row[f'{col}_lag_{lag}']
        # pd.isna (rather than np.isnan) also treats None / NaT as missing instead of raising.
        if pd.isna(older):
            symbols.append(' ')
        elif newer > older:
            symbols.append('+')
        elif newer < older:
            symbols.append('-')
        else:
            symbols.append('=')
        # Slide one step back in time: this lag is the "newer" side of the next comparison.
        newer = older

    return ''.join(symbols)

In [None]:
# Build the trend string row by row (axis=1 passes each row to trend_string).
transdf['Cost_trend'] = transdf.apply(trend_string, axis=1)

# Cell output: the cost lags next to the trend string derived from them.
transdf[['Cost', 'Cost_lag_1', 'Cost_lag_2', 'Cost_lag_3', 'Cost_lag_4', 'Cost_trend']].head(20)

## Encoding Categorical Variables

There are multiple ways to encode categorical variables. Before we explore these techniques, we need to set up our target and train-test split.

### Create target

In [None]:
# Days from each transaction until the customer's churn date (NaN for customers with no churn date).
transdf['Days_to_Churn'] = (transdf['Churn_date'] - transdf['Transaction_Date']).dt.days
# Target: 1 when the customer churns within 30 days of the transaction, else 0.
# Comparing NaN with 30 is False, so customers without a churn date get 0.
# NOTE(review): a negative Days_to_Churn (transaction after the churn date) is also labelled 1 — confirm.
transdf['Churn'] = transdf['Days_to_Churn'].le(30).astype('int64')
# Sanity check on customers that have a churn date.
has_churn_date = transdf['Churn_date'].notna()
tmpdf = transdf[has_churn_date]
tmpdf[['Transaction_Date', 'Churn_date', 'Churn']].tail()

### Train-test split

In [None]:
# Time span covered by the transactions; used to place the train/test cutoff.
min_date = transdf.Transaction_Date.min()
max_date = transdf.Transaction_Date.max()
print("Min:", min_date, "Max:", max_date)

In [None]:
# Time-based split: the cutoff sits 80% of the way through the covered time span
# (a fraction of elapsed time, not of the number of rows).
train_percent = .8
time_between = max_date - min_date
train_cutoff = min_date + train_percent*time_between
train_cutoff

In [None]:
# Transactions up to and including the cutoff train the encoders; later ones are held out.
train_df = transdf[transdf.Transaction_Date <= train_cutoff]
test_df = transdf[transdf.Transaction_Date > train_cutoff]
# Print each split's date range to confirm the two sets do not overlap in time.
print("Train:", train_df.Transaction_Date.min(), train_df.Transaction_Date.max())
print("Test:", test_df.Transaction_Date.min(), test_df.Transaction_Date.max())

### One-hot encoding

In [None]:
# Cell output: the object-dtype columns, i.e. the candidates for categorical encoding.
train_df.select_dtypes('object').columns

In [None]:
# Columns chosen for one-hot encoding; .copy() gives frames independent of train_df / test_df.
train_df_ohe = train_df[['Product', 'Gender']].copy()
test_df_ohe = test_df[['Product', 'Gender']].copy()
train_df_ohe.shape

In [None]:
# Learn the category set from the training data only.
# handle_unknown="ignore": categories not seen during fit are ignored at transform time instead of raising.
oh_encoder = OneHotEncoder(handle_unknown="ignore")
train_df_ohe_encoded = oh_encoder.fit_transform(train_df_ohe)
train_df_ohe_encoded.shape

In [None]:
# Apply the already-fitted encoder to the test data (transform only, no re-fitting).
test_df_ohe_encode = oh_encoder.transform(test_df_ohe)
test_df_ohe_encode.shape

### Target encoding

In [None]:
# Target encoding needs the training labels alongside the categorical inputs.
train_target = train_df['Churn']

# Columns chosen for target encoding; .copy() gives frames independent of train_df / test_df.
train_df_loo = train_df[['City', 'State', 'Cost_trend']].copy()
test_df_loo = test_df[['City', 'State', 'Cost_trend']].copy()
train_df_loo.shape

In [None]:
# Fit the leave-one-out target encoder on the training inputs and training target, and encode the training data.
loo_encoder = ce.LeaveOneOutEncoder(return_df=True)
train_df_loo_encode = loo_encoder.fit_transform(train_df_loo, train_target)
train_df_loo_encode.shape

In [None]:
# Encode the test data with the fitted encoder; no target is passed for the test set.
test_df_loo_encode = loo_encoder.transform(test_df_loo)
test_df_loo_encode.shape

In [None]:
# Cell output: the target-encoded training columns.
train_df_loo_encode.head()