# Feature Engineering in Splice Machine
#### Let's start our Spark Session

In [None]:
# Setup
from pyspark.sql import SparkSession
from splicemachine.spark import PySpliceContext
from splicemachine.mlflow_support.utilities import get_user

# Reuse the active Spark session if there is one; otherwise build a new one.
spark = SparkSession.builder.getOrCreate()
# Wrap the session in a PySpliceContext; later cells call `splice.df(...)` to run SQL through it.
splice = PySpliceContext(spark)
# Value returned by get_user(); later interpolated as the schema qualifier in `{schema}.CC_FRAUD_DATA`.
schema = get_user()

## Ingesting Data
<blockquote><p class='quotation'><span style='font-size:15px'> Using the table created in <a href='./7.3 Data Exploration.ipynb'>7.3 Data Exploration</a>, we will first create features with <code>SQL</code> and then ingest the data into <code>PySpark</code> for further analysis.</span></p><footer>Splice Machine</footer>
</blockquote>

## Within the same platform, we're able to easily engineer features in a number of ways 

With native access to the Splice Database, we can engineer features using SQL. But we can also use PySpark, or external libraries, like Koalas.

#### Here we'll calculate a couple simple features in SQL 
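We compute a few scaled difference features: for each weekly and daily transaction metric, the relative deviation from its expected value, `(actual - expected) / expected`.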

In [None]:
%%sql
-- Preview (first 10 rows) of the raw columns plus four scaled-difference features.
-- Each *_DIFF is (actual - expected) / expected. NULLIF(expected, 0) yields NULL
-- for a zero baseline instead of risking a division-by-zero error.
SELECT
    time_offset,
    expected_weekly_trans_cnt,
    expected_weekly_trans_amnt,
    expected_daily_trans_cnt,
    expected_daily_trans_amnt,
    weekly_trans_cnt,
    weekly_trans_amnt,
    daily_trans_cnt,
    daily_trans_amnt,
    rolling_avg_weekly_trans_cnt,
    rolling_avg_weekly_trans_amnt,
    rolling_avg_daily_trans_cnt,
    rolling_avg_daily_trans_amnt,
    MACD_trans_amnt,
    MACD_trans_cnt,
    RSI_trans_amnt,
    RSI_trans_cnt,
    Aroon_trans_amnt,
    Aroon_trans_cnt,
    ADX_trans_amnt,
    ADX_trans_cnt,
    current_balance,
    rolling_avg_balance,
    MACD_balance,
    Aroon_balance,
    RSI_balance,
    ADX_balance,
    credit_score,
    credit_limit,
    amount,
    (weekly_trans_cnt - expected_weekly_trans_cnt) / NULLIF(expected_weekly_trans_cnt, 0) AS weekly_trans_cnt_DIFF,
    (weekly_trans_amnt - expected_weekly_trans_amnt) / NULLIF(expected_weekly_trans_amnt, 0) AS weekly_trans_amnt_DIFF,
    (daily_trans_cnt - expected_daily_trans_cnt) / NULLIF(expected_daily_trans_cnt, 0) AS daily_trans_cnt_DIFF,
    (daily_trans_amnt - expected_daily_trans_amnt) / NULLIF(expected_daily_trans_amnt, 0) AS daily_trans_amnt_DIFF
FROM CC_FRAUD_DATA
{LIMIT 10}

## Ingesting the data with these new features from Splice Machine into a Spark DataFrame

In [None]:
# Load the raw columns, four engineered *_DIFF features and the CLASS_RESULT
# label from the fraud table via splice.df.
# Each *_DIFF is the scaled difference (actual - expected) / expected, the same
# definition the SQL preview of these features uses earlier in this notebook.
# NULLIF(expected, 0) turns a zero baseline into NULL so that running over the
# whole table does not risk a division-by-zero error; those rows get a NULL feature.
sdf = splice.df(f"""
SELECT
    time_offset,
    expected_weekly_trans_cnt,
    expected_weekly_trans_amnt,
    expected_daily_trans_cnt,
    expected_daily_trans_amnt,
    weekly_trans_cnt,
    weekly_trans_amnt,
    daily_trans_cnt,
    daily_trans_amnt,
    rolling_avg_weekly_trans_cnt,
    rolling_avg_weekly_trans_amnt,
    rolling_avg_daily_trans_cnt,
    rolling_avg_daily_trans_amnt,
    MACD_trans_amnt,
    MACD_trans_cnt,
    RSI_trans_amnt,
    RSI_trans_cnt,
    Aroon_trans_amnt,
    Aroon_trans_cnt,
    ADX_trans_amnt,
    ADX_trans_cnt,
    current_balance,
    rolling_avg_balance,
    MACD_balance,
    Aroon_balance,
    RSI_balance,
    ADX_balance,
    credit_score,
    credit_limit,
    amount,
    (weekly_trans_cnt - expected_weekly_trans_cnt) / NULLIF(expected_weekly_trans_cnt, 0) AS weekly_trans_cnt_DIFF,
    (weekly_trans_amnt - expected_weekly_trans_amnt) / NULLIF(expected_weekly_trans_amnt, 0) AS weekly_trans_amnt_DIFF,
    (daily_trans_cnt - expected_daily_trans_cnt) / NULLIF(expected_daily_trans_cnt, 0) AS daily_trans_cnt_DIFF,
    (daily_trans_amnt - expected_daily_trans_amnt) / NULLIF(expected_daily_trans_amnt, 0) AS daily_trans_amnt_DIFF,
    CLASS_RESULT
FROM {schema}.CC_FRAUD_DATA""")
# Summary statistics for the last eight selected columns: credit_score,
# credit_limit, amount, the four *_DIFF features and CLASS_RESULT.
sdf.select(sdf.columns[-8:]).describe().show()

## Now engineering features with PySpark
Here, we calculate the z-score of the AMOUNT feature within each CLASS_RESULT group.

In [None]:
from beakerx.object import beakerx
# Let BeakerX take over how pandas DataFrames are displayed
# (presumably its interactive table widget — confirm against the BeakerX docs).
beakerx.pandas_display_table()

In [None]:
from pyspark.sql import functions as F

# Per-class population standard deviation and mean of AMOUNT.
amount_stats_by_class = sdf.groupBy("CLASS_RESULT").agg(
    F.stddev_pop("AMOUNT").alias("AMOUNT_sd"),
    F.avg("AMOUNT").alias("AMOUNT_avg"),
)

# z = (value - class mean) / class standard deviation
amount_z_score = (F.col("AMOUNT") - F.col("AMOUNT_avg")) / F.col("AMOUNT_sd")

# Attach the per-class statistics to every row (broadcast hint on the small
# aggregate), add the z-score column, and keep 150 rows for display.
df_zscore = (
    sdf.join(F.broadcast(amount_stats_by_class), "CLASS_RESULT")
    .withColumn("AMOUNT_Z_SCORE", amount_z_score)
    .limit(150)
)

zscore_preview_columns = [
    'AMOUNT',
    'WEEKLY_TRANS_CNT_DIFF',
    'WEEKLY_TRANS_AMNT_DIFF',
    'DAILY_TRANS_CNT_DIFF',
    'DAILY_TRANS_AMNT_DIFF',
    'AMOUNT_Z_SCORE',
]
df_zscore.select(*zscore_preview_columns).toPandas()

## We can do the same thing with Koalas if that is preferred

Now we calculate the z-score normalization of `current_balance` within each class group using Koalas and SciPy's `zscore`.

In [None]:
from scipy.stats import zscore
import databricks.koalas as ks

# Allow operations that combine different Koalas frames/series.
ks.set_option('compute.ops_on_diff_frames', True)

# Convert the 150-row Spark DataFrame from the previous step into a Koalas DataFrame.
kdf = df_zscore.to_koalas()


def _standardize(values):
    """Return the z-scores of one group's values using SciPy's zscore."""
    return zscore(values)


# Standardize CURRENT_BALANCE separately within each CLASS_RESULT group.
balance_by_class = kdf.groupby("CLASS_RESULT")["CURRENT_BALANCE"]
kdf['CURRENT_BALANCE_Z_SCORE'] = balance_by_class.transform(_standardize)

koalas_preview_columns = [
    'AMOUNT',
    'WEEKLY_TRANS_CNT_DIFF',
    'WEEKLY_TRANS_AMNT_DIFF',
    'DAILY_TRANS_CNT_DIFF',
    'DAILY_TRANS_AMNT_DIFF',
    'AMOUNT_Z_SCORE',
    'CURRENT_BALANCE_Z_SCORE',
]
kdf[koalas_preview_columns]

In [None]:
# Stop the Spark session created in the setup cell; this is the notebook's final code cell.
spark.stop()

# Fantastic!
<blockquote> 
Now you can start building basic and advanced feature engineering tasks in both SQL and PySpark! <br>
    Next Up: <a href='./7.5 Model Creation.ipynb'>Using MLManager to create basic machine learning models.</a>
<footer>Splice Machine</footer>
</blockquote>