In [11]:
from pyspark.sql.functions import *
from pyspark.sql import Window
from pyspark.sql.types import *
import os
from functools import reduce
from operator import add
from pyspark.sql import DataFrame
import datetime
import operator
# Maps an arithmetic operator symbol to the function that implements it.
# Insertion order is significant: code that iterates this mapping tries the
# symbols in the order listed here.
ops = {
    '+': operator.add,
    '-': operator.sub,
    '*': operator.mul,
    '/': operator.truediv,
}

In [2]:
def calculate_step_in_seconds(step):
    """Return the length in seconds of the time unit named by ``step``.

    The unit is matched by prefix, so both singular and plural spellings
    (``'day'``, ``'days'``) are accepted. A year is counted as 365 days
    (31,536,000 s) and a month as 2,630,000 s.

    Args:
        step: Unit name starting with one of ``year``, ``month``, ``week``,
            ``day``, ``hour``, ``minute`` or ``second``.

    Returns:
        The number of seconds in one such unit.

    Raises:
        ValueError: If ``step`` does not start with a known unit name.
    """
    seconds_per_unit = (
        ('year', 31536000),
        ('month', 2630000),
        ('week', 604800),
        ('day', 86400),
        ('hour', 3600),
        ('minute', 60),
        ('second', 1),
    )
    for unit, seconds in seconds_per_unit:
        if step.startswith(unit):
            return seconds
    # Fail with a clear message instead of falling through to an unassigned
    # local variable when the unit is not recognised.
    raise ValueError('unknown time unit: %r' % (step,))

def calculate_step_from_string(string):
    """Convert a ``'<count> <unit>'`` string (e.g. ``'5 minutes'``) to seconds.

    The string is split on single spaces; the first token is parsed as an
    integer count and the second is passed to ``calculate_step_in_seconds``.

    Args:
        string: Text of the form ``'<integer> <unit>'``.

    Returns:
        The count multiplied by the unit's length in seconds.
    """
    tokens = string.split(' ')
    # Left-to-right evaluation: the count is parsed before the unit is read.
    return int(tokens[0]) * calculate_step_in_seconds(tokens[1])

In [3]:
def concat(*args):
    """Collect the positional arguments into a new list, in call order."""
    return list(args)

In [4]:
def change_column_names(columns):
    """Return the column names with every ``'.'`` replaced by ``'_'``.

    Args:
        columns: Iterable of column-name strings.

    Returns:
        A new list of the rewritten names, in the same order.
    """
    renamed = []
    for name in columns:
        renamed.append(name.replace('.', '_'))
    return renamed

In [10]:
def query_parser(query):
    """Dispatch ``query`` to the parser that matches its leading keyword.

    Recognised prefixes are ``TSel``, ``WSel``, ``Shift``, ``TAgg``, ``WAgg``
    and ``TProj`` (case-sensitive).

    Args:
        query: The query string.

    Returns:
        Whatever the selected parser returns, or ``None`` when the query
        starts with none of the recognised prefixes.
    """
    if query.startswith('TSel'):
        parser = temporal_selection_parser
    elif query.startswith('WSel'):
        parser = window_selection_parser
    elif query.startswith('Shift'):
        parser = shift_parser
    elif query.startswith('TAgg'):
        parser = temporal_aggregation_parser
    elif query.startswith('WAgg'):
        parser = window_aggregation_parser
    elif query.startswith('TProj'):
        parser = temporal_projection_parser
    else:
        return None
    return parser(query)
    
def temporal_projection_parser(query):
    """Parse a projection query into ``(left operand, operator, number)``.

    The first six characters (e.g. ``'TProj('``) and the last character of
    ``query`` are dropped. The remainder is split on each symbol of the
    module-level ``ops`` mapping in turn; the first symbol that yields
    exactly two pieces is used.

    Args:
        query: The projection query string.

    Returns:
        A tuple ``(left, symbol, number)`` where ``number`` is a ``float`` if
        the right-hand text contains ``'.'`` and an ``int`` otherwise, or
        ``None`` when no symbol splits the expression into two pieces.
    """
    expression = query[6:-1]
    for symbol in ops:
        pieces = expression.split(symbol)
        if len(pieces) != 2:
            continue
        left, literal = pieces
        convert = float if '.' in literal else int
        return left, symbol, convert(literal)
    return None
    
def temporal_selection_parser(query):
    """Parse a selection query of the form ``<prefix><lhs>==<rhs><suffix>``.

    The first five characters (e.g. ``'TSel('``) and the last character of
    ``query`` are dropped before parsing.

    Args:
        query: The selection query string.

    Returns:
        ``(lhs, rhs, 'streq')`` built from the first two pieces around
        ``'=='``, or ``None`` when the predicate contains no ``'=='``.
    """
    condition = query[5:-1]
    if '==' not in condition:
        return None
    operands = condition.split('==')
    return operands[0], operands[1], 'streq'
    
def window_selection_parser(query):
    """Parse a window-selection query into its first two comma-separated fields.

    The first five characters (e.g. ``'WSel('``) and the last character of
    ``query`` are dropped before splitting on ``','``.

    Args:
        query: The window-selection query string.

    Returns:
        ``(first, second, None)``; both fields are returned as strings.

    Raises:
        IndexError: If the predicate contains no comma.
    """
    fields = query[5:-1].split(',')
    first, second = fields[0], fields[1]
    return first, second, None

def shift_parser(query):
    """Parse a shift query of the form ``<prefix><count> <unit><suffix>``.

    The first six characters (e.g. ``'Shift('``) and the last character of
    ``query`` are dropped before splitting on single spaces.

    Args:
        query: The shift query string.

    Returns:
        ``(count, unit, None)`` where ``count`` is an ``int`` and ``unit`` is
        the second token as a string.
    """
    tokens = query[6:-1].split(' ')
    # Parse the count first so a malformed number is reported before a
    # missing unit.
    amount = int(tokens[0])
    unit = tokens[1]
    return amount, unit, None

def temporal_aggregation_parser(query):
    """Parse an aggregation query of the form ``<prefix><key>,<func>(<arg>)<suffix>``.

    The first five characters (e.g. ``'TAgg('``) and the last character of
    ``query`` are dropped. The remainder is split on ``','``; the second
    field is then split on ``'('`` to separate the function name from its
    argument, whose trailing character (the closing parenthesis) is removed.

    Args:
        query: The aggregation query string.

    Returns:
        ``(key, arg, func)`` as strings.
    """
    fields = query[5:-1].split(',')
    call_parts = fields[1].split('(')
    function_name = call_parts[0]
    argument = call_parts[1][:-1]
    return fields[0], argument, function_name

def window_aggregation_parser(query):
    """Parse a window-aggregation query of the form
    ``<prefix><count> <unit>,<func>(<arg>)<suffix>``.

    The first five characters (e.g. ``'WAgg('``) and the last character of
    ``query`` are dropped. The first comma-separated field gives the window
    length, converted to seconds via ``calculate_step_in_seconds``; the
    second gives the aggregate function and its argument.

    Args:
        query: The window-aggregation query string.

    Returns:
        ``(step_seconds, arg, func)`` where ``step_seconds`` is the window
        length in seconds and the other two items are strings.
    """
    fields = query[5:-1].split(',')
    window = fields[0].split(' ')
    # Resolve the unit before parsing the count, then read the aggregate.
    unit_seconds = calculate_step_in_seconds(window[1])
    step = unit_seconds * int(window[0])
    call_parts = fields[1].split('(')
    return step, call_parts[1][:-1], call_parts[0]

In [6]:
def test_time_interval(file, start, end):
    """Check whether the timestamp embedded in ``file`` lies in ``[start, end]``.

    The timestamp is the text between the first and second ``'='`` in
    ``file`` (or everything after the first ``'='`` if there is only one),
    parsed as an integer.

    Args:
        file: Name containing ``'=<timestamp>'``.
        start: Inclusive lower bound.
        end: Inclusive upper bound.

    Returns:
        ``True`` if the timestamp is within the bounds; ``False`` if it is
        outside them or if ``file`` contains no ``'='``.
    """
    if '=' not in file:
        return False
    timestamp = int(file.split("=")[1])
    return start <= timestamp <= end

In [13]:
def change_column_type(df, attribute, type=DoubleType()):
    """Return ``df`` with column ``attribute`` cast to ``type``.

    Args:
        df: DataFrame containing the column.
        attribute: Name of the column to cast; the cast column replaces the
            existing one under the same name.
        type: Target data type passed to ``Column.cast``. Defaults to
            ``DoubleType()``.

    Returns:
        The DataFrame produced by ``df.withColumn``.
    """
    # Cast to the requested type; previously this argument was ignored and
    # the column was always cast to DoubleType.
    return df.withColumn(attribute, col(attribute).cast(type))