# Feature Engineering in Splice Machine
#### Let's start our Spark Session

In [3]:
# Setup
from pyspark.sql import SparkSession
from splicemachine.spark import PySpliceContext
from splicemachine.mlflow_support.utilities import get_user

# Reuse the active Spark session if one exists, otherwise create one.
spark = SparkSession.builder.getOrCreate()
# Splice Machine context built on the Spark session; used below via splice.df(...).
splice = PySpliceContext(spark)
# Used below as the schema qualifier in `{schema}.CC_FRAUD_DATA`
# (presumably the current user's name -- confirm against get_user()).
schema = get_user()

## Ingesting Data
<blockquote><p class='quotation'><span style='font-size:15px'> Using the table created in <a href='./7.3 Data Exploration.ipynb'>7.3 Data Exploration</a>, we will first create features with <code>SQL</code> and subsequently ingest them into <code>PySpark</code> for further analysis.</span></p> <footer>Splice Machine</footer>
</blockquote>

## Within the same platform, we're able to easily engineer features in a number of ways 

With native access to the Splice Database, we can engineer features using SQL. But we can also use PySpark, or external libraries, like Koalas.

#### Here we'll calculate a couple of simple features in SQL
We're computing a few quick scaled-difference features: each actual value minus its expected value, divided by the expected value.

In [1]:
%%sql
-- Preview ten rows of CC_FRAUD_DATA together with four scaled-difference
-- features: (actual - expected) / expected.
-- NULLIF(expected, 0) turns a zero denominator into NULL, so a row whose
-- expected value is zero yields a NULL feature instead of aborting the whole
-- query with a divide-by-zero error.
SELECT
    time_offset,
    expected_weekly_trans_cnt,
    expected_weekly_trans_amnt,
    expected_daily_trans_cnt,
    expected_daily_trans_amnt,
    weekly_trans_cnt,
    weekly_trans_amnt,
    daily_trans_cnt,
    daily_trans_amnt,
    rolling_avg_weekly_trans_cnt,
    rolling_avg_weekly_trans_amnt,
    rolling_avg_daily_trans_cnt,
    rolling_avg_daily_trans_amnt,
    MACD_trans_amnt,
    MACD_trans_cnt,
    RSI_trans_amnt,
    RSI_trans_cnt,
    Aroon_trans_amnt,
    Aroon_trans_cnt,
    ADX_trans_amnt,
    ADX_trans_cnt,
    current_balance,
    rolling_avg_balance,
    MACD_balance,
    Aroon_balance,
    RSI_balance,
    ADX_balance,
    credit_score,
    credit_limit,
    amount,
    (weekly_trans_cnt - expected_weekly_trans_cnt) / NULLIF(expected_weekly_trans_cnt, 0) AS weekly_trans_cnt_DIFF,
    (weekly_trans_amnt - expected_weekly_trans_amnt) / NULLIF(expected_weekly_trans_amnt, 0) AS weekly_trans_amnt_DIFF,
    (daily_trans_cnt - expected_daily_trans_cnt) / NULLIF(expected_daily_trans_cnt, 0) AS daily_trans_cnt_DIFF,
    (daily_trans_amnt - expected_daily_trans_amnt) / NULLIF(expected_daily_trans_amnt, 0) AS daily_trans_amnt_DIFF
FROM CC_FRAUD_DATA
{LIMIT 10}

Sql started successfully



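## Ingesting the data with these new features from Splice Machine into PySpark

Here the four `_DIFF` features are the plain differences between the actual and expected values (without dividing by the expected value).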

In [4]:
# Query text: the base columns, four plain (actual - expected) difference
# features, and the CLASS_RESULT label, read from the user's schema.
fraud_query = f"""
SELECT     
    time_offset,
    expected_weekly_trans_cnt,
    expected_weekly_trans_amnt,
    expected_daily_trans_cnt,
    expected_daily_trans_amnt,
    weekly_trans_cnt,
    weekly_trans_amnt,
    daily_trans_cnt,
    daily_trans_amnt,
    rolling_avg_weekly_trans_cnt,
    rolling_avg_weekly_trans_amnt,
    rolling_avg_daily_trans_cnt,
    rolling_avg_daily_trans_amnt,
    MACD_trans_amnt,
    MACD_trans_cnt,
    RSI_trans_amnt,
    RSI_trans_cnt,
    Aroon_trans_amnt,
    Aroon_trans_cnt,
    ADX_trans_amnt,
    ADX_trans_cnt,
    current_balance,
    rolling_avg_balance,
    MACD_balance,
    Aroon_balance,
    RSI_balance,
    ADX_balance,
    credit_score,
    credit_limit,
    amount,
    (weekly_trans_cnt - expected_weekly_trans_cnt) AS weekly_trans_cnt_DIFF,
    (weekly_trans_amnt - expected_weekly_trans_amnt) AS weekly_trans_amnt_DIFF,
    (daily_trans_cnt - expected_daily_trans_cnt) AS daily_trans_cnt_DIFF,
    (daily_trans_amnt - expected_daily_trans_amnt) AS daily_trans_amnt_DIFF,
    CLASS_RESULT
FROM {schema}.CC_FRAUD_DATA"""

# Run the query through the Splice context; later cells refer to the result as `sdf`.
sdf = splice.df(fraud_query)

# Summary statistics for the trailing eight columns of the result
# (the last base columns, the four difference features, and the label).
trailing_columns = sdf.columns[-8:]
sdf.select(trailing_columns).describe().show()

  and should_run_async(code)


+-------+--------------------+--------------------+------------------+---------------------+----------------------+--------------------+---------------------+--------------------+
|summary|        CREDIT_SCORE|        CREDIT_LIMIT|            AMOUNT|WEEKLY_TRANS_CNT_DIFF|WEEKLY_TRANS_AMNT_DIFF|DAILY_TRANS_CNT_DIFF|DAILY_TRANS_AMNT_DIFF|        CLASS_RESULT|
+-------+--------------------+--------------------+------------------+---------------------+----------------------+--------------------+---------------------+--------------------+
|  count|              284806|              284806|            284806|               284806|                284806|              284806|               284806|              284806|
|   mean|-4.98388964018057...|-8.01132016626078...|         88.349756| -6.71131682984856...|  -7.48542045247061...|1.507051624149715...| -1.31641434096418...|0.001727491696101908|
| stddev| 0.40363311594252355| 0.33008356676031403|250.12053775374372|   2.3961589325940076|     2.1

## Now engineering features with PySpark
Here, we calculate the z-score of the `AMOUNT` feature within each `CLASS_RESULT` group.

In [5]:
# Presumably switches pandas DataFrame output to BeakerX's table widget for
# the cells that follow -- confirm against the BeakerX docs.
from beakerx.object import beakerx
beakerx.pandas_display_table()

In [7]:
from pyspark.sql import functions as F

# Per-class statistics of AMOUNT: population standard deviation and mean.
amount_stats_by_class = sdf.groupBy("CLASS_RESULT").agg(
    F.stddev_pop("AMOUNT").alias("AMOUNT_sd"),
    F.avg("AMOUNT").alias("AMOUNT_avg"),
)

# Attach the class-level statistics to every row; the stats frame is wrapped
# in a broadcast hint before joining on CLASS_RESULT.
sdf_with_stats = sdf.join(F.broadcast(amount_stats_by_class), "CLASS_RESULT")

# z = (value - class mean) / class standard deviation; keep 150 rows.
amount_z_expr = (F.col("AMOUNT") - F.col("AMOUNT_avg")) / F.col("AMOUNT_sd")
df_zscore = sdf_with_stats.withColumn("AMOUNT_Z_SCORE", amount_z_expr).limit(150)

# Show the engineered columns as a pandas frame.
preview_columns = [
    'AMOUNT',
    'WEEKLY_TRANS_CNT_DIFF',
    'WEEKLY_TRANS_AMNT_DIFF',
    'DAILY_TRANS_CNT_DIFF',
    'DAILY_TRANS_AMNT_DIFF',
    'AMOUNT_Z_SCORE',
]
df_zscore.select(*preview_columns).toPandas()

## We can do the same thing with Koalas if that is preferred

Now we calculate the z-score of `current_balance` within each class group, using Koalas together with SciPy's `zscore`.

In [17]:
from scipy.stats import zscore
import databricks.koalas as ks
# Permit operations that combine different Koalas frames; the column
# assignment below takes its values from a group-by result.
ks.set_option('compute.ops_on_diff_frames', True)

# Convert the 150-row Spark frame to a Koalas DataFrame.
kdf = df_zscore.to_koalas()

# z-score CURRENT_BALANCE separately within each CLASS_RESULT group.
balance_by_class = kdf.groupby("CLASS_RESULT")["CURRENT_BALANCE"]
kdf['CURRENT_BALANCE_Z_SCORE'] = balance_by_class.transform(lambda balances: zscore(balances))

# Display the engineered columns, including the new Koalas-derived one.
display_columns = [
    'AMOUNT',
    'WEEKLY_TRANS_CNT_DIFF',
    'WEEKLY_TRANS_AMNT_DIFF',
    'DAILY_TRANS_CNT_DIFF',
    'DAILY_TRANS_AMNT_DIFF',
    'AMOUNT_Z_SCORE',
    'CURRENT_BALANCE_Z_SCORE',
]
kdf[display_columns]

Unnamed: 0,AMOUNT,WEEKLY_TRANS_CNT_DIFF,WEEKLY_TRANS_AMNT_DIFF,DAILY_TRANS_CNT_DIFF,DAILY_TRANS_AMNT_DIFF,AMOUNT_Z_SCORE,CURRENT_BALANCE_Z_SCORE
26,0.89,-1.22995,0.194451,-0.70624,-0.159938,-0.349458,-0.355107
29,1.0,0.425071,3.914584,0.273388,1.862403,-0.349018,-0.148511
65,69.95,-2.212119,1.541944,-0.752285,1.595189,-0.073334,0.649144
19,3.7,-1.111782,-1.006202,2.939002,-0.377953,-0.338222,0.307776
54,19.67,1.285288,-1.663677,1.501788,0.844409,-0.274369,-0.070915
0,322.43,-1.403682,1.402521,-0.019114,-3.992129,0.936162,0.846967
112,23.95,-1.488407,-0.707994,-0.937223,-2.569157,-0.257257,1.007678
113,367.6,-0.501698,-0.07628,-0.569085,0.447015,1.116766,-0.409979
22,11.5,-2.084381,0.413663,-1.935784,-0.578093,-0.307036,-0.296832
130,143.0,2.653485,2.244061,-2.18472,-4.904833,0.218743,0.249188


In [19]:
# Stop the Spark session created in the setup cell; this is the last code cell.
spark.stop()

# Fantastic!
<blockquote> 
Now you can start building basic and advanced feature engineering tasks in both SQL and PySpark! <br>
    Next Up: <a href='./7.5 Model Creation.ipynb'>Using MLManager to create basic machine learning models.</a>
<footer>Splice Machine</footer>
</blockquote>