In [1]:
%run io_functions.ipynb

In [3]:
def temporal_selection(query, df):
    """Mask the values of one attribute that fail a comparison with a constant.

    ``query_parser(query)`` yields ``(attribute, value, operator)``.  The
    result contains only the ``timestamp`` column and the attribute column;
    no rows are removed, but every entry that does not satisfy the
    comparison is replaced by the placeholder ``'!'``.

    Supported operators:
        ``streq``  ``==`` on the column as-is (no casting); matches become
                   ``value``.
        ``nmeq``   numeric ``==``; matches become the float ``value``.
        ``greq``   numeric ``>=``; matches keep the column value.
        ``lseq``   numeric ``<=``; matches keep the column value.
        ``gr``     numeric ``>``;  matches keep the column value.
        ``ls``     numeric ``<``;  matches keep the column value.

    For the numeric operators ``value`` is converted with ``float`` and the
    DataFrame is passed through ``change_column_type`` before comparing.

    Raises:
        ValueError: if the parsed operator is not one of the above
            (previously this surfaced as an UnboundLocalError).
    """
    attribute, value, q = query_parser(query)

    if q == 'streq':
        # String equality works on the raw column, so no cast is applied.
        kept = when(col(attribute) == value, value)
        return df.select('timestamp', attribute).withColumn(attribute, kept.otherwise('!'))

    numeric_predicates = {
        'greq': lambda column, bound: column >= bound,
        'lseq': lambda column, bound: column <= bound,
        'nmeq': lambda column, bound: column == bound,
        'gr': lambda column, bound: column > bound,
        'ls': lambda column, bound: column < bound,
    }
    predicate = numeric_predicates.get(q)
    if predicate is None:
        raise ValueError('unsupported selection operator: {!r}'.format(q))

    value = float(value)
    df1 = change_column_type(df, attribute).select('timestamp', attribute)
    column = col(attribute)
    # Equality echoes the constant itself; the inequalities keep the
    # original column value for matching entries.
    replacement = value if q == 'nmeq' else column
    return df1.withColumn(attribute, when(predicate(column, value), replacement).otherwise('!'))

In [23]:
def window_selection(query, pythonDateFormat='%d/%m/%Y', path='./', readFromDisk=True, df=None):
    """Select the data that falls inside a [start, end] date window.

    ``query_parser(query)`` yields the start and end dates as strings; both
    are parsed with ``pythonDateFormat`` and converted to integer epoch
    seconds.  NOTE: ``datetime.timestamp()`` interprets a naive datetime in
    the machine's local time zone.

    Args:
        query: Query string understood by ``query_parser``.
        pythonDateFormat: ``strptime`` format of the two dates in the query.
        path: Directory listed for input files when reading from disk.
        readFromDisk: When exactly ``True``, the files in ``path`` accepted
            by ``test_time_interval`` are loaded through ``unpack``.
            Any other value selects the in-memory branch.
        df: DataFrame to filter on its ``timestamp`` column when not
            reading from disk.

    Returns:
        Whatever ``unpack`` returns for the selected files, or ``df``
        restricted to rows whose ``timestamp`` lies between the two epochs
        (``between`` is inclusive on both ends).

    Raises:
        ValueError: if the in-memory branch is selected but ``df`` is None
            (previously an AttributeError on ``None.filter``).
    """
    start, end, _ = query_parser(query)
    start_date = datetime.datetime.strptime(start, pythonDateFormat)
    end_date = datetime.datetime.strptime(end, pythonDateFormat)
    start_epoch = int(start_date.timestamp())
    end_epoch = int(end_date.timestamp())

    # Identity check kept on purpose: truthy non-bool values have always
    # selected the in-memory branch, and callers may rely on that.
    if readFromDisk is True:
        files = [f for f in os.listdir(path) if test_time_interval(f, start_epoch, end_epoch)]
        return unpack(path, files)

    if df is None:
        raise ValueError('window_selection needs a DataFrame in `df` when readFromDisk is not True')
    return df.filter(col('timestamp').between(start_epoch, end_epoch))

In [37]:
def temporal_projection(query, df):
    """Apply a binary operation with a constant to one attribute column.

    ``query_parser(query)`` yields ``(attribute, op, number)``.  The callable
    registered under ``op`` in ``ops`` is applied to the attribute column
    (after ``change_column_type``) and ``number``; the result overwrites the
    attribute column.  All other columns are returned unchanged.

    Raises:
        ValueError: if ``op`` has no entry in ``ops`` (previously this
            surfaced as "'NoneType' object is not callable").
    """
    attribute, op, number = query_parser(query)
    operation = ops.get(op)
    if operation is None:
        raise ValueError('unsupported projection operator: {!r}'.format(op))
    df = change_column_type(df, attribute)
    return df.withColumn(attribute, operation(col(attribute), number))

In [12]:
def shift(query, df):
    """Self-join the DataFrame with a copy of itself offset in time.

    ``query_parser(query)`` yields ``(n, duration, _)``; the offset is
    ``calculate_step_in_seconds(duration) * n``.  Every column of ``df``
    appears twice in the result, once suffixed ``_original`` and once
    suffixed ``_shifted``.  A row pair is kept (inner join) when
    ``timestamp_shifted == timestamp_original + offset``.
    """
    n, duration, _ = query_parser(query)
    offset = calculate_step_in_seconds(duration) * n

    originals = df.select([col(name).alias(name + "_original") for name in df.columns])
    shifted = df.select([col(name).alias(name + "_shifted") for name in df.columns])

    # Helper column holding the timestamp each original row should match;
    # it is removed again once the join is done.
    originals = originals.withColumn('shifted_timestamp', originals['timestamp_original'] + offset)
    match_on_offset = originals['shifted_timestamp'] == shifted['timestamp_shifted']
    return originals.join(shifted, match_on_offset, "inner").drop('shifted_timestamp')

In [40]:
def temporal_aggregation(query, df):
    """Bucket rows into fixed-width time groups and aggregate one attribute.

    ``query_parser(query)`` yields ``(duration, attribute, agg)``.  The
    bucket width comes from ``calculate_step_from_string(duration)``.  Each
    row gets an integer ``group`` computed as
    ``(timestamp - first_timestamp) / width`` cast to integer, where
    ``first_timestamp`` is the timestamp of the first row returned by
    ``df.select('timestamp').first()``.

    Aggregations:
        ``count``  rows per ``(group, attribute value)``, sorted by group.
        ``avg``    mean of the attribute per group (after
                   ``change_column_type``), sorted by group.
        other      the DataFrame with the added ``group`` column, unaggregated.
    """
    duration, attribute, agg = query_parser(query)
    bucket_width = calculate_step_from_string(duration)

    origin = df.select('timestamp').first()[0]
    # The origin is attached as a temporary column so the subtraction is a
    # plain column expression; it is dropped right after use.
    anchored = df.withColumn('first_timestamp', lit(origin))
    elapsed = anchored['timestamp'] - anchored['first_timestamp']
    bucketed = anchored.withColumn(
        'group', (elapsed / lit(bucket_width)).cast("integer")
    ).drop('first_timestamp')

    if agg == 'count':
        return bucketed.groupBy('group', attribute).count().sort('group')
    if agg == 'avg':
        numeric = change_column_type(bucketed, attribute)
        return numeric.groupBy('group').agg(mean(attribute)).sort('group')
    return bucketed

In [39]:
def window_aggregation(query, df):
    """Aggregate one attribute over a forward-looking range window per row.

    ``query_parser(query)`` yields ``(step, attribute, agg)``.  The window is
    ordered by ``timestamp`` and spans ``rangeBetween(0, step)``, i.e. from
    the current row's timestamp up to ``step`` beyond it.  No partitioning
    is applied to the window.

    Aggregations:
        ``count``  For each row, the attribute values inside the window are
                   collected, and the entries equal to the row's own value
                   are counted.  The result has the original columns plus
                   ``count``; the grouping step is over all original
                   columns, so fully identical rows are merged.
        ``avg``    Adds a column named ``avg(<attribute>)`` with the window
                   average, computed on the DataFrame returned by
                   ``change_column_type``.
        other      ``df`` is returned unchanged.
    """
    step, attribute, agg = query_parser(query)
    window = Window.orderBy('timestamp').rangeBetween(0, step)
    column_name = agg + '(' + attribute + ')'

    if agg == 'count':
        original_columns = df.columns
        collected = df.withColumn(column_name, collect_list(attribute).over(window))
        counted = (
            collected.select('*', explode(column_name).alias('exploded'))
            .filter(col(attribute) == col('exploded'))
            .groupBy(*original_columns, column_name)
            .count()
        )
        return counted.drop(column_name, 'exploded')

    if agg == 'avg':
        # Build on the cast DataFrame; the previous code cast the column and
        # then discarded the result by starting again from the uncast `df`.
        typed = change_column_type(df, attribute)
        return typed.withColumn(column_name, avg(attribute).over(window))

    return df