In [1]:
# dataset builder
from jenny.dataset_generation import BatchDatasetGenerator

# feature generators
from jenny.dataset_generation import (
    IdFeatureGenerator,
    TimestampFeatureGenerator,
    NumericFeatureGenerator,
    TimeSensitiveNumericFeatureGenerator,
    TextFeatureGenerator,
    CategoryFeatureGenerator
)

# time series modeling
from jenny.dataset_generation import (
    Trend, Seasonality, Noise, NormalPercentageDeviation
)

import pandas as pd

In [3]:

# dataset spec

# Generator handed to BatchDatasetGenerator as `events_generator`:
# a linear trend combined with seasonality factors and seeded noise.
# NOTE(review): "events" never shows up as an output column, so this
# presumably controls how many rows are produced per time step — confirm.
events_generator = TimeSensitiveNumericFeatureGenerator(
    name="events",
    trend=Trend(base_value=100, slope=1),
    seasonality=Seasonality(
        week_days=[0.85, 0.9, 1, 1, 0.9, 0.8, 0.8],
        month_period=[1, 0.9, 1],
        year_months=[1, 0.95, 0.9, 0.9, 0.85, 0.85, 0.85, 0.85, 0.9, 0.9, 0.95, 1],
    ),
    noise=Noise(var=0.05, seed=123),
)

# One generator per output column, in the order the columns should appear.
feature_generators = [
    IdFeatureGenerator(name="id", min_id=1000, monotonically_increase=True),
    TimestampFeatureGenerator(name="ts"),
    # Numeric column without any time dependence.
    NumericFeatureGenerator(
        name="numeric_feature",
        base_value=500,
        percentage_deviation_generator=NormalPercentageDeviation(var=0.8, seed=123),
    ),
    # Numeric column with trend and noise only.
    TimeSensitiveNumericFeatureGenerator(
        name="numeric_feature_with_trend",
        trend=Trend(base_value=500, slope=10),
        noise=Noise(var=0.05, seed=123),
    ),
    # Same trend and noise as above, plus weekly and yearly seasonality.
    TimeSensitiveNumericFeatureGenerator(
        name="numeric_feature_with_seasonality",
        trend=Trend(base_value=500, slope=10),
        noise=Noise(var=0.05, seed=123),
        seasonality=Seasonality(
            week_days=[0.9, 1.1, 1.1, 1, 0.9, 0.9, 0.9],
            year_months=[1.1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1.2],
        ),
    ),
    TextFeatureGenerator(name="description", max_base_length=120),
    CategoryFeatureGenerator(name="category", categories=["A", "B", "C", "D", "E"]),
]

dataset_generator = BatchDatasetGenerator(
    events_generator=events_generator,
    features=feature_generators,
)

In [16]:
from datetime import datetime

# Generation starts at midnight on 2018-01-01 (naive datetime).
start_ts = datetime(2018, 1, 1)

# NOTE(review): `n` appears to be the number of time steps (days) to
# generate — the output recorded for this call only shows rows dated
# 2018-01-01 — confirm against BatchDatasetGenerator.generate.
# NOTE(review): the Spark output recorded further down spans 2018-01-01 to
# 2018-04-10 with ids starting at 1000, while this cell's output starts at
# id 13143. The cells look like they were executed out of order with a
# larger `n`, and the id generator seems to keep state between calls —
# verify with Restart Kernel -> Run All.
dataset_pdf = dataset_generator.generate(n=1, start_ts=start_ts)

In [17]:
# Display the generated pandas DataFrame (the notebook truncates long frames).
dataset_pdf

Unnamed: 0,id,ts,numeric_feature,numeric_feature_with_trend,numeric_feature_with_seasonality,description,category
0,13143,2018-01-01 13:29:34+00:00,714.270555,513.391910,508.257991,Up piece measure writer return but. Cause cont...,D
1,13144,2018-01-01 10:42:16+00:00,555.606068,503.475379,498.440625,Break future beat admit morning protect. Deal ...,B
2,13145,2018-01-01 09:46:22+00:00,385.965710,444.627143,440.180872,Government eight too threat prevent political ...,C
3,13146,2018-01-01 01:09:38+00:00,616.508493,507.281781,502.208963,Under system rather pressure film care her. Qu...,D
4,13147,2018-01-01 22:25:59+00:00,659.733302,509.983331,504.883498,Girl school attack but. Customer agree himself...,C
...,...,...,...,...,...,...,...
77,13220,2018-01-01 13:35:22+00:00,161.320721,478.832545,474.044220,None him follow you or.,A
78,13221,2018-01-01 10:37:22+00:00,956.432657,528.527041,523.241771,Authority data know series modern reality will...,B
79,13222,2018-01-01 07:48:30+00:00,499.507559,499.969222,494.969530,Television region both fear form partner. Most...,A
80,13223,2018-01-01 09:44:19+00:00,273.409472,485.838092,480.979711,By dream ability realize. If political perform...,E


In [6]:
# Inspect the column dtypes of the generated frame before handing it to Spark.
dataset_pdf.dtypes

id                                                int64
ts                                  datetime64[ns, UTC]
numeric_feature                                 float64
numeric_feature_with_trend                      float64
numeric_feature_with_seasonality                float64
description                                      string
category                                         string
dtype: object

In [133]:
# Earliest generated timestamp, converted from a pandas Timestamp to a
# standard-library datetime; the trailing expression displays it.
a = dataset_pdf["ts"].min().to_pydatetime()
a

datetime.datetime(2018, 1, 1, 0, 12, 4, tzinfo=<UTC>)

In [7]:
# Create (or reuse) the Spark session used to convert the pandas frame
# into a Spark DataFrame.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('pandasToSparkDF')
    .getOrCreate()
)

In [8]:
# Disabled alternative: reduce `ts` to a calendar date before conversion.
# dataset_pdf["ts"] = dataset_pdf["ts"].dt.date
# Convert the pandas frame to a Spark DataFrame; the schema is inferred by Spark.
# NOTE(review): `ts` is timezone-aware (UTC) in pandas but the Spark output
# below shows no offset — confirm the session time zone gives the intended values.
dataset_df = spark.createDataFrame(dataset_pdf)

In [9]:
# Round-trip check: collect the whole Spark DataFrame back to pandas for display.
# This pulls every row to the driver, so it is only suitable for small data.
dataset_df.toPandas()

Unnamed: 0,id,ts,numeric_feature,numeric_feature_with_trend,numeric_feature_with_seasonality,description,category
0,1000,2018-01-01 06:51:36,104.351460,475.271966,470.519247,Of various identify unit economy. Go research ...,D
1,1001,2018-01-01 09:37:06,352.885339,490.805334,485.897280,Whom success vote. Name performance modern wro...,E
2,1002,2018-01-01 13:10:21,1015.170105,532.198132,526.876150,Feel to south reality film establish. Through ...,A
3,1003,2018-01-01 19:14:23,577.589768,504.849360,499.800867,Join past name course. Cultural look model cen...,A
4,1004,2018-01-01 16:40:36,868.092360,523.005772,517.775715,Medical her party certainly wide animal newspa...,A
...,...,...,...,...,...,...,...
12138,13138,2018-04-10 10:45:47,456.461046,1481.890870,1630.079957,Executive buy wide bad summer fight establish....,B
12139,13139,2018-04-10 01:27:19,668.562292,1521.394727,1673.534199,Stock executive teacher dark ability. Student ...,D
12140,13140,2018-04-10 06:51:50,859.209687,1556.902804,1712.593085,Election fly arrive mean field against develop...,E
12141,13141,2018-04-10 11:14:02,422.426797,1318.198009,1450.017810,Better pretty make they stop.,D


In [None]:
# profiling spec

from jenny.profiling import (
    ProfilingPipeline,
    GlobalProfiler,
    ColumnStatsProfiler,
    CompletenessProfiler,
)

# Aggregations requested from every ColumnStatsProfiler below, except the
# final one, which asks only for a distinct count.
metrics = ["max", "min", "sum", "avg"]

# Profilers are listed one per (column, kind of statistic). Every entry must
# be followed by a comma: two adjacent calls without one are a SyntaxError
# and leave `profiling_pipeline` undefined for the cells that follow.
profiling_pipeline = ProfilingPipeline(
    profilers=[
        GlobalProfiler(),
        ColumnStatsProfiler(
            from_column="id",
            metrics=metrics,
        ),
        ColumnStatsProfiler(
            from_column="numeric_feature",
            metrics=metrics,
        ),
        CompletenessProfiler(
            from_column="numeric_feature",
        ),
        ColumnStatsProfiler(
            from_column="numeric_feature_with_trend",
            metrics=metrics,
        ),
        ColumnStatsProfiler(
            from_column="numeric_feature_with_seasonality",
            metrics=metrics,
        ),
        # Text columns are flagged with not_numeric=True.
        ColumnStatsProfiler(
            from_column="description",
            metrics=metrics,
            not_numeric=True,
        ),
        ColumnStatsProfiler(
            from_column="category",
            metrics=metrics,
            not_numeric=True,
        ),
        # NOTE(review): unlike the other profiler on "category", this one does
        # not pass not_numeric=True — confirm that is intended for count_distinct.
        ColumnStatsProfiler(
            from_column="category",
            metrics=["count_distinct"],
        ),
    ]
)

In [None]:
# Run every configured profiler over the Spark DataFrame, using `ts` as the
# time column.
profiling_df = profiling_pipeline.run(input_df=dataset_df, time_column="ts")

In [None]:
# Show the profiling result ordered by `ts`, converted to pandas for display.
profiling_df.sort("ts").toPandas()