# Libraries

In [None]:
import numpy as np
import pandas as pd
from utils.fraud_pre_proc import *
from utils.fraud_feat_engineering import *


# Raise pandas' display limits so long/wide frames are shown in full.
for opt_name, opt_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(opt_name, opt_value)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Select seaborn's "dark" style for the plots drawn after this call.
sns.set_style("dark")


# Tuning knobs 

### 1. Data Cleaning
A. Number of low NaN-rate features: nans_rate_cut_off (parameter) <br>
B. 
- Numerical_categorical split: min_categories (parameter)
- Methods to fill NaNs: Vasilis or Papes  <br>

### 2. Feature Engineering 
A. Which Datetime Feats to add: select manually <br>
B1. Which Card-Address Interaction Feats to add: select manually <br>
B2. Which Card-Address-Datetime Interaction Feats to add:<br>
$\quad$ i) period_feats (list)<br>
$\quad$ ii) card_addr_feats (list)<br>
C. Which Aggregated TransAmt Feats to add: select manually <br>
D. Which Frequency Feats to add: select manually <br>

### 3. Preprocessing and Feature Selection 
Numerical_categorical split: min_categories (parameter)<br>
A. Number of highly correlated features: corr_cut_off (parameter) <br>
B. Method of treating categorical feats: how = {'dummies','label_enc'} <br>

### 4. Stratified Split 
- Stratified split parameters: frac, n_splits<br>
- (PCA)

# 0. Import data 

In [None]:
# Load the transaction data from CSV into df_all.
# NOTE(review): the file name suggests transactions merged with identity data
# and with NaNs already imputed -- confirm against the upstream notebook.
df_all = pd.read_csv('./Data/df_trans_ids_imp.csv')
# Preview the first three rows.
df_all.head(3)

In [None]:
from pickleObjects import *

In [None]:
path = './Data/'

# Restore the three previously pickled column-name collections, in the same
# order as before: categorical, numerical, then all columns.
cat_cols, num_cols, cols = (
    loadObjects(path + stem)
    for stem in ('cat_cols_all', 'num_cols_all', 'cols_all')
)

# 2. Feature Engineering

### A. Datetime Features

In [None]:
# Build the datetime features from df_all and keep what the helper returns.
# NOTE(review): presumably the helper adds the columns to df_all in place and
# returns their names as a list (it is concatenated with other lists later) --
# confirm in utils.fraud_feat_engineering.
period_feats=addDatetimeFeats(df_all)

### B. Interaction Features

In [None]:
# #B.1 Add Interaction Features by ADDING the values of card_ and addr_ columns
# card_addr_interactions = addCardAddressInteractionFeats(df_all)

In [None]:
# df_all[card_addr_interactions].head(3)

In [None]:
# card_addr_feats = card_addr_interactions + ['card1','card2','card3','card5']

In [None]:
#B.2 Add interaction features by ADDING the values of card_addr_feats and period_feats
#   and computing value frequencies
# new_feats = addDatetimeInteractionFeats(df_all, cols=card_addr_feats, period_cols=period_feats);

### C. Aggregated Features

In [None]:
# Card columns used as the `cols` argument of the aggregated-feature helper and
# as the first entries of the frequency-feature column list.
cards = ['card1','card2','card3','card5']

In [None]:
# Add aggregated 'TransactionAmt' features grouped by the `cards` columns
# (the card/address interaction features are currently commented out).
# NOTE(review): the original comment states mean & STD per group -- confirm
# against addAggTransAmtFeats.
agg_feats = addAggTransAmtFeats(df_all,cols=cards);

### D. Indicator/Frequency Features

In [None]:
# Columns passed to the frequency-feature helper: the card columns followed by
# C1..C14, D1..D8, address, distance, e-mail domain, device and two id columns.
try_cols = (
    cards
    + ['C{}'.format(n) for n in range(1, 15)]
    + ['D{}'.format(n) for n in range(1, 9)]
    + ['addr1', 'addr2']
    + ['dist1', 'dist2']
    + ['P_emaildomain', 'R_emaildomain']
    + ['DeviceInfo', 'DeviceType']
    + ['id_30', 'id_33']
)

In [None]:
# Add indicator features by computing the value frequencies of try_cols.
# NOTE(review): presumably returns the names of the added columns (it is
# concatenated with the other feature-name lists below) -- confirm.
freq_feats = addFrequencyFeats(df_all,cols=try_cols);

In [None]:
# Sanity check after feature engineering.
# NOTE(review): presumably reports the columns of df_all that contain nulls --
# confirm in the utils module.
print_null_cols(df_all)

In [None]:
# Gather everything returned by the datetime, aggregation and frequency helpers;
# these are treated as numerical candidates in the correlation filter.
new_cols = period_feats + agg_feats + freq_feats

In [None]:
# Preview the first five rows after feature engineering.
df_all.head(5)

# 3. Preprocessing and Feature Selection 

In [None]:
# Read only the 'isFraud' column; its length is also used to slice the first
# len(df_fraud) rows out of df_all.
df_fraud = pd.read_csv('./Data/train_transaction.csv',usecols = ['isFraud'])

In [None]:
# Take the first len(df_fraud) rows of df_all (positional slice) as a copy.
# NOTE(review): assumes those rows are the training rows, in the same order as
# train_transaction.csv -- confirm how df_all was assembled.
df_train = df_all.iloc[:len(df_fraud),:].copy()

In [None]:
# Preview the first three rows of the training slice.
df_train.head(3)

In [None]:
# Attach the labels positionally: .values strips df_fraud's index, so rows are
# matched by order rather than by index label.
df_train['isFraud'] = df_fraud['isFraud'].values

In [None]:
# Wrap the labelled frame in a TableDescriptor; its `.variables` (objects with a
# `.name`) feed the correlation filter.
# NOTE(review): 'isFraud' is presumably the target-column argument -- confirm.
all_vars = TableDescriptor(df_train,'All_data','isFraud')

## A. Filter Features by Correlation to target



In [None]:
# Pick the variable objects for the numerical columns (original numerical
# columns plus the engineered ones). The name set is built once instead of
# concatenating the two lists again for every variable.
numeric_names = set(num_cols + new_cols)
numerical_vars = [var for var in all_vars.variables if var.name in numeric_names]
# Keep the numerical variables that pass the correlation cut-off
# (presumably correlation with the target -- confirm in getCorrelatedFeatures).
num_vars = getCorrelatedFeatures(numerical_vars,corr_cut_off=0.005)
# Names of the retained numerical columns.
new_num_cols = [var.name for var in num_vars]

In [None]:
# Pick the variable objects for the categorical columns. Membership is tested
# against a set built once rather than scanning the list for every variable.
categorical_names = set(cat_cols)
categorical_vars = [var for var in all_vars.variables if var.name in categorical_names]
# Keep the categorical variables that pass the correlation cut-off
# (presumably correlation with the target -- confirm in getCorrelatedFeatures).
cat_vars = getCorrelatedFeatures(categorical_vars,corr_cut_off=0.1)
# Names of the retained categorical columns.
new_cat_cols = [var.name for var in cat_vars]

## B. Convert categorical data to Dummies or Codes

In [None]:
# Final column selection: retained categorical columns first, then numerical.
all_cols = new_cat_cols + new_num_cols

In [None]:
#all_cols.remove('TransactionDT')

In [None]:
# Restrict df_all to the selected columns and encode the categorical ones with
# how='dummies'.
# NOTE(review): df_all is rebound to the encoded frame here, so re-running this
# cell without reloading df_all will likely fail or give a different result.
df_all = to_categorical(df=df_all[all_cols],cat_cols=new_cat_cols,how='dummies')
# Show the resulting (rows, columns) shape.
df_all.shape

In [None]:
# Sanity check on the encoded frame.
# NOTE(review): presumably reports the columns of df_all that contain nulls --
# confirm in the utils module.
print_null_cols(df_all)

# PCA

In [None]:
# Feature scaling followed by a PCA fit on the retained numerical columns.
from sklearn.preprocessing import StandardScaler

# Remove 'TransactionDT' from num_cols only when it is still present, so that
# re-running this cell no longer raises ValueError.
# NOTE(review): the matrix below is built from new_num_cols, not num_cols, so
# this removal does not exclude 'TransactionDT' from the PCA input -- confirm
# whether it was meant to.
if 'TransactionDT' in num_cols:
    num_cols.remove('TransactionDT')

# Despite its name, X_train covers every row of df_all at this point (the
# train/test row split only happens further down), so the scaler statistics
# come from all rows.
X_train = df_all[new_num_cols].values

# Standardise the columns, keeping the fitted scaler for later reuse.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

# Fit the project's PCA wrapper on the standardised matrix.
PCAT = PCATransformer(X_train_scaled)

In [None]:
# Store the output of PCAT.rec_error for the scaled matrix as a new column
# (presumably the per-row PCA reconstruction error -- confirm in PCATransformer).
# NOTE(review): reshape(-1,1) produces a 2-D single-column array; a 1-D array is
# the usual shape for a column assignment -- confirm the reshape is needed.
df_all['pca_error_2'] = PCAT.rec_error(X_train_scaled).reshape(-1,1)

# 4. Stratified Split training and validation data

In [None]:
# Copy of the first len(df_fraud) rows (positional slice) of the encoded frame;
# these are the rows that have labels in df_fraud.
df_train_dummy = df_all.iloc[:len(df_fraud),:].copy()

In [None]:
# Target column name, and the model inputs: every column of the encoded
# training frame other than the transaction id and the target.
y_col = 'isFraud'
x_cols = [
    name
    for name in df_train_dummy.columns.tolist()
    if name not in ('TransactionID', 'isFraud')
]

In [None]:
# Feature matrix from the encoded training rows; labels from df_train, which
# carries the 'isFraud' column.
X = df_train_dummy.loc[:, x_cols].values
y = df_train.loc[:, y_col].values

# Single stratified split with frac=0.2 and a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = getStratifiedTrainTestSplit(
    X,
    y,
    frac=0.2,
    n_splits=1,
    random_state=0,
)

In [None]:
# Print the shape of each split, in the order: train X, test X, train y, test y.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

In [None]:
# Rows after the first len(df_fraud) (the labelled ones) form the unlabelled
# set. Slice by position with .iloc, mirroring how the training rows are taken,
# so the result does not depend on df_all carrying a default 0..n-1 index.
df_test = df_all.iloc[len(df_fraud):].loc[:, x_cols].copy()
# Show the resulting (rows, columns) shape.
df_test.shape

## Save analyzed data

In [None]:
from pickleObjects import *

In [None]:
path = './Data/'

# Persist each split under ./Data/ using the file stem paired with it; the
# unlabelled frame is stored as a plain array under 'X_test_comp'.
for stem, payload in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
    ('X_test_comp', df_test.values),
):
    dumpObjects(payload, path + stem)