In [1]:
import pandas as pd
from datetime import datetime

In [2]:
# there should be NLP))
def assign_label_number(label):
    labels = {
        "Sport": 0,
        "Food": 1,
        "Hobby Active": 2,
        "Hobby Passive": 3,
        "Studying": 4,
        "Work": 5,
        "Other": 6
    }
    return labels[label]

In [3]:
# change queue of tasks to dataframe
# it is assumed that input format is like "Sport-90-3-1-9:00-1/1/1970", two last arguments are optional
def tasks_to_df(tasks):
    df = pd.DataFrame(columns=['Label Number', 'Duration', 'Importance', 'Time_Min',
                               'Date_Day', 'Date_Month'])

    for task in tasks:
        task = task.split('-')

        label_num = assign_label_number(task[0])
        duration = task[1]
        importance = task[2]

        if len(task) == 6:
            time = datetime.strptime(task[4], '%H:%M')
            date = datetime.strptime(task[5], '%d/%m/%Y')
            minutes = time.minute + time.hour * 60
            day = date.day
            month = date.month
        elif len(task) == 5:
            if task[4].find('/') != -1:
                date = datetime.strptime(task[4], '%d/%m/%Y')
                minutes = 0
                day = date.day
                month = date.month
            else:
                time = datetime.strptime(task[4], '%H:%M')
                minutes = time.minute + time.hour * 60
                day = 0
                month = 0
        else:
            minutes = 0
            day = 0
            month = 0

        df.loc[len(df)] = {'Label Number': label_num,
                   'Duration': duration,
                   'Importance': importance,
                   'Time_Min': minutes,
                   'Date_Day': day, 'Date_Month': month
                   }

    df.sort_values(by=['Importance'], inplace=True, ascending=False)
    df.reset_index(drop=True, inplace=True)

    return df

In [4]:
# if input format "AML-11:00-90-2 Sunday 3"  or  "AML-11:00-90-2 31 3"  or  "AML-11:00-90-1/1/1970"
def events_to_arr_of_dict(events):
    arr_of_dict = []
    return arr_of_dict

In [5]:
from sklearn.preprocessing import MinMaxScaler
from feature_engine.creation import CyclicalFeatures


def preprocessor(data):

    if not isinstance(data, pd.DataFrame):
        # temporary solution for tasks/events distinction
        if len(data[0].split("-")) > 4:
            data = tasks_to_df(data)
        else:
            data = events_to_arr_of_dict(data)
            # special case to skip the neural network
            pass
    else:
        data["Start Time"] = pd.to_datetime(data["Start Time"], format="%H:%M")
        data["Time_Min"] = data["Start Time"].dt.minute + data["Start Time"].dt.hour * 60

        data["Date"] = pd.to_datetime(data["Date"], format="%d/%m/%Y")
        data["Date_Day"] = data["Date"].dt.day
        data["Date_Month"] = data["Date"].dt.month

        data.drop(columns=["Start Time", "Date"], inplace=True)


    scaler = MinMaxScaler()
    data['Duration'] = scaler.fit_transform(data[['Duration']])

    cyclical = CyclicalFeatures(variables=['Time_Min', 'Date_Day', 'Date_Month'], drop_original=True, )
    data = cyclical.fit_transform(data)

    print(data.head())
    return data

In [6]:
data = [
    "Sport-90-3-1-9:00-1/1/1970",
    "Food-60-2-1-10:00-1/1/1970",
    "Hobby Active-120-0-1-11:00-1/1/1970",
    "Hobby Passive-60-0-1-12:00-1/1/1970",
    "Studying-120-3-1-13:00",
    "Work-120-3-1-1/1/1970",
]
preprocessor(data)

   Label Number  Duration Importance  Time_Min_sin  Time_Min_cos  \
0             0       0.5          3 -9.350162e-01     -0.354605   
1             4       1.0          3 -2.449294e-16      1.000000   
2             5       1.0          3  0.000000e+00      1.000000   
3             1       0.0          2 -9.927089e-01      0.120537   
4             2       1.0          0 -8.229839e-01      0.568065   

   Date_Day_sin  Date_Day_cos  Date_Month_sin  Date_Month_cos  
0 -2.449294e-16           1.0   -2.449294e-16             1.0  
1  0.000000e+00           1.0    0.000000e+00             1.0  
2 -2.449294e-16           1.0   -2.449294e-16             1.0  
3 -2.449294e-16           1.0   -2.449294e-16             1.0  
4 -2.449294e-16           1.0   -2.449294e-16             1.0  


Unnamed: 0,Label Number,Duration,Importance,Time_Min_sin,Time_Min_cos,Date_Day_sin,Date_Day_cos,Date_Month_sin,Date_Month_cos
0,0,0.5,3,-0.9350162,-0.354605,-2.449294e-16,1.0,-2.449294e-16,1.0
1,4,1.0,3,-2.449294e-16,1.0,0.0,1.0,0.0,1.0
2,5,1.0,3,0.0,1.0,-2.449294e-16,1.0,-2.449294e-16,1.0
3,1,0.0,2,-0.9927089,0.120537,-2.449294e-16,1.0,-2.449294e-16,1.0
4,2,1.0,0,-0.8229839,0.568065,-2.449294e-16,1.0,-2.449294e-16,1.0
5,3,0.0,0,-0.4647232,0.885456,-2.449294e-16,1.0,-2.449294e-16,1.0


In [7]:
data = pd.read_csv('schedule_v3.csv')
preprocessor(data)

   Label Number  Duration  Importance  Time_Min_sin  Time_Min_cos  \
0             3  0.571429           1      0.988948     -0.148264   
1             5  0.714286           2      0.818625     -0.574329   
2             0  0.142857           0      0.475947     -0.879474   
3             1  0.142857           1      0.293250     -0.956036   
4             6  0.000000           0      0.099046     -0.995083   

   Date_Day_sin  Date_Day_cos  Date_Month_sin  Date_Month_cos  
0     -0.968077     -0.250653       -0.866025            -0.5  
1     -0.968077     -0.250653       -0.866025            -0.5  
2     -0.968077     -0.250653       -0.866025            -0.5  
3     -0.968077     -0.250653       -0.866025            -0.5  
4     -0.968077     -0.250653       -0.866025            -0.5  


Unnamed: 0,Label Number,Duration,Importance,Time_Min_sin,Time_Min_cos,Date_Day_sin,Date_Day_cos,Date_Month_sin,Date_Month_cos
0,3,0.571429,1,0.988948,-0.148264,-0.968077,-0.250653,-8.660254e-01,-0.5
1,5,0.714286,2,0.818625,-0.574329,-0.968077,-0.250653,-8.660254e-01,-0.5
2,0,0.142857,0,0.475947,-0.879474,-0.968077,-0.250653,-8.660254e-01,-0.5
3,1,0.142857,1,0.293250,-0.956036,-0.968077,-0.250653,-8.660254e-01,-0.5
4,6,0.000000,0,0.099046,-0.995083,-0.968077,-0.250653,-8.660254e-01,-0.5
...,...,...,...,...,...,...,...,...,...
967,4,0.714286,2,-0.993309,0.115485,0.201299,0.979530,-2.449294e-16,1.0
968,1,0.142857,1,-0.901991,0.431754,0.201299,0.979530,-2.449294e-16,1.0
969,6,0.285714,0,-0.712928,0.701237,0.201299,0.979530,-2.449294e-16,1.0
970,3,0.285714,1,-0.446609,0.894729,0.201299,0.979530,-2.449294e-16,1.0
