# Generate Data 

## Pandas dataframes 

In [5]:
import pandas as pd
import numpy as np
from datetime import datetime, timezone
from sklearn.datasets import make_hastie_10_2
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

def generate_entities(size, seed=None):
    """Return the integers ``0 .. size-1`` in random order, each exactly once.

    Args:
        size: Number of entity ids to generate.
        seed: Optional seed. With the default ``None`` the draw uses NumPy's
            global random state, exactly as before. When a seed is given, a
            private ``RandomState(seed)`` is used, so the result is
            reproducible and the global state is left untouched.

    Returns:
        1-D NumPy integer array of length ``size``.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    # Sampling `size` values out of `size` without replacement is a permutation,
    # so every id is unique.
    return rng.choice(size, size=size, replace=False)

def generate_data(entities, year=2021, month=10, day=1) -> pd.DataFrame:
    """Build one day's synthetic feature rows, one row per entity.

    Args:
        entities: Sequence of entity ids; its length sets the number of rows.
        year, month, day: Calendar day the rows are stamped with.

    Returns:
        DataFrame with columns ``f0``..``f9``, ``y``, ``entity_id``,
        ``datetime``, ``created`` and ``month_year``, in that order.
    """
    row_count = len(entities)

    # The Hastie generator is called with random_state pinned to 0; the event
    # times below use NumPy's global (unseeded) random state instead.
    features, labels = make_hastie_10_2(n_samples=row_count, random_state=0)

    frame = pd.DataFrame(features, columns=[f"f{i}" for i in range(10)])
    frame["y"] = labels
    frame["entity_id"] = entities

    # Event time: epoch seconds drawn between 00:00 (inclusive) and 22:00
    # (exclusive) UTC of the requested day. utc=True is not passed, so the
    # resulting column is timezone-naive.
    midnight_utc = datetime(year, month, day, 0, tzinfo=timezone.utc)
    window_end_utc = datetime(year, month, day, 22, tzinfo=timezone.utc)
    event_seconds = np.random.randint(
        midnight_utc.timestamp(),
        window_end_utc.timestamp(),
        size=row_count,
    )
    frame["datetime"] = pd.to_datetime(event_seconds, unit="s")

    # Creation time: the (timezone-naive) moment this function ran, same for every row.
    frame["created"] = pd.to_datetime(datetime.now())

    # Timezone-aware UTC midnight of the requested day, same for every row.
    frame["month_year"] = pd.to_datetime(midnight_utc, utc=True)
    return frame

# One million entity ids (0..999,999) in shuffled order.
entities = generate_entities(1_000_000)

# Entity frame: one row per id, all carrying the same timezone-aware
# timestamp (2021-01-14 23:59:42 UTC).
entity_df = pd.DataFrame(entities, columns=["entity_id"]).assign(
    event_timestamp=datetime(2021, 1, 14, 23, 59, 42, tzinfo=timezone.utc)
)

## Create Delta Lake 

In [6]:
import time

# Generate days 1..14 of January (year defaults to 2021 in generate_data) and
# append each one to the Delta table at ./dataset/all, partitioned by month_year.
# NOTE: the write uses mode("append"), so re-running this cell adds another
# 14 batches on top of whatever the table already contains.
for d in range(1, 15):
    print(f"DAY {d}")

    # perf_counter() is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()
    data = generate_data(entities, month=1, day=d)
    print(f"## GENERATED - {time.perf_counter() - start_time} s")

    start_time = time.perf_counter()
    (
        spark.createDataFrame(data)
        .write.format("delta")
        .mode("append")
        .partitionBy("month_year")
        .save("./dataset/all")
    )
    print(f"## DELTA CREATED - {time.perf_counter() - start_time} s")

DAY 1
## GENERATED - 1.9863653182983398 s
## DELTA CREATED - 118.46784734725952 s
DAY 2
## GENERATED - 2.2533488273620605 s
## DELTA CREATED - 113.56314516067505 s
DAY 3
## GENERATED - 2.090444326400757 s
## DELTA CREATED - 117.54949474334717 s
DAY 4
## GENERATED - 2.137775421142578 s
## DELTA CREATED - 113.69700503349304 s
DAY 5
## GENERATED - 2.0107674598693848 s
## DELTA CREATED - 112.49230170249939 s
DAY 6
## GENERATED - 2.04490327835083 s
## DELTA CREATED - 116.83132553100586 s
DAY 7
## GENERATED - 2.12314772605896 s
## DELTA CREATED - 114.3579614162445 s
DAY 8
## GENERATED - 2.1742141246795654 s
## DELTA CREATED - 115.68657755851746 s
DAY 9
## GENERATED - 2.001004695892334 s
## DELTA CREATED - 112.91505312919617 s
DAY 10
## GENERATED - 2.1537675857543945 s
## DELTA CREATED - 113.79394125938416 s
DAY 11
## GENERATED - 2.077458620071411 s
## DELTA CREATED - 116.54374861717224 s
DAY 12
## GENERATED - 2.2862818241119385 s
## DELTA CREATED - 119.25584959983826 s
DAY 13
## GENERATED - 

## Delta Lake history 

In [7]:
# DeltaTable (used below) is brought into scope by this wildcard import.
# NOTE(review): an explicit `from delta.tables import DeltaTable` would make the
# origin of the name obvious — confirm no other cell relies on the wildcard.
from delta.tables import *

# Open the Delta table stored at ./dataset/all using the notebook's `spark` session.
deltaTable = DeltaTable.forPath(spark, "./dataset/all")

# Fetch the table's history as a DataFrame and print it.
fullHistoryDF = deltaTable.history()
fullHistoryDF.show()

+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|version|           timestamp|userId|userName|operation| operationParameters| job|notebook|clusterId|readVersion|isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|     13|2022-02-11 01:08:...|  null|    null|    WRITE|{mode -> Append, ...|null|    null|     null|         12|  Serializable|         true|{numFiles -> 12, ...|        null|Apache-Spark/3.2....|
|     12|2022-02-11 01:06:...|  null|    null|    WRITE|{mode -> Append, ...|null|    null|     null|         11|  Serializable|         true|{numFiles -> 12, ...|        null|Apache-Spark/3.2....|
|     11|2

# Feast Apply 

In [12]:
import shutil
from feast.repo_operations import apply_total
from feast.repo_config import load_repo_config
from pathlib import Path

# Remove Jupyter's checkpoint folder from the working directory before applying
# (presumably so Feast does not pick up checkpoint copies — confirm).
# shutil.rmtree with ignore_errors=True does nothing when the folder is absent
# and needs no POSIX shell, whereas `!rm -r` prints an error in that case.
shutil.rmtree(".ipynb_checkpoints", ignore_errors=True)

# NOTE(review): absolute, machine-specific path; consider deriving it from a
# configurable base directory.
repo = Path('/home/jovyan/feast-spark/feature_repo/')

repo_config = load_repo_config(repo)
# Third positional argument is True — presumably skip_source_validation;
# confirm against the installed Feast version.
apply_total(repo_config, repo, True)

Created entity entity_id
Created feature view my_statistics

Created sqlite table repo_my_statistics



In [18]:
# Keep only the entities whose id is at most 500 (the bound is inclusive).
edf = entity_df.loc[entity_df["entity_id"] <= 500]
edf

Unnamed: 0,entity_id,event_timestamp
173,396,2021-01-14 23:59:42+00:00
2142,202,2021-01-14 23:59:42+00:00
2248,433,2021-01-14 23:59:42+00:00
4011,418,2021-01-14 23:59:42+00:00
6016,163,2021-01-14 23:59:42+00:00
...,...,...
991983,257,2021-01-14 23:59:42+00:00
992539,414,2021-01-14 23:59:42+00:00
995372,458,2021-01-14 23:59:42+00:00
996551,87,2021-01-14 23:59:42+00:00


In [20]:
from feast import FeatureStore
import pandas as pd

import time
from feast_custom_offline_store.spark import SparkOfflineStore

# Hand the notebook's Spark session to the custom offline store via a
# class-level attribute.
SparkOfflineStore.spark = spark
# NOTE(review): repo_path="." depends on the kernel's working directory —
# confirm it is the feature repository.
store = FeatureStore(repo_path=".")

# Feature references f0..f9 plus the label y, all from the my_statistics view.
feature_refs = [f"my_statistics:f{i}" for i in range(10)] + ["my_statistics:y"]

# perf_counter() is monotonic, so the elapsed time is not affected by system
# clock adjustments.
start_time = time.perf_counter()
training_df = store.get_historical_features(
    entity_df=edf,
    features=feature_refs,
).to_df()


print("--- %s seconds ---" % (time.perf_counter() - start_time))

training_df

--- 3.106114625930786 seconds ---


Unnamed: 0,entity_id,event_timestamp,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,y
0,26,2021-01-14 23:59:42,0.771540,-0.245521,0.831709,-0.188584,-1.310991,1.154804,1.090323,0.517180,-0.264450,1.119810,-1.0
1,474,2021-01-14 23:59:42,0.232990,0.620908,-1.637337,-0.642802,0.259246,1.477162,1.378601,-0.440399,-0.163920,1.073939,-1.0
2,29,2021-01-14 23:59:42,-1.121266,-0.335133,-0.111160,0.275948,0.640860,-0.909624,1.679997,0.016758,0.574729,-2.348436,1.0
3,65,2021-01-14 23:59:42,-0.220508,2.011135,0.196614,1.092113,0.985672,1.681704,-0.669689,1.291223,0.726470,-2.133538,1.0
4,191,2021-01-14 23:59:42,-0.236036,0.376173,-0.272282,0.956299,-0.772731,1.271947,-0.427319,0.394538,-0.273124,1.039412,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,458,2021-01-14 23:59:42,2.545778,-0.106637,0.257218,-0.129079,2.290062,0.759244,0.937336,0.429199,0.667794,0.103962,1.0
497,340,2021-01-14 23:59:42,0.065813,-1.895800,-0.129820,0.096086,0.953465,0.377935,-0.644957,0.110571,-1.973347,-0.111153,-1.0
498,456,2021-01-14 23:59:42,-0.830026,0.679147,1.658516,-0.863514,0.382954,0.280409,-0.755902,0.874343,0.749202,1.633818,1.0
499,469,2021-01-14 23:59:42,1.162467,-0.405825,0.884610,-0.948185,-0.131180,1.446960,-0.434682,-0.630037,-0.780209,1.401276,-1.0
