In [None]:
import os
import re
import json
import fnmatch
import numpy as np
import pandas as pd
from datetime import datetime
from itertools import product

In [None]:
# Filepath configuration
#
# Directory the notebook kernel is currently running from
CWD = os.getcwd()

# Both data directories sit under `data/`, two levels above the CWD
_DATA_DIR_STEPS = (os.pardir, os.pardir, "data")

# Raw data: path relative to the CWD, and the same path anchored at the CWD
RAW_DATA_RELPATH = os.path.join(*_DATA_DIR_STEPS, "raw")
RAW_DATA_ABSPATH = os.path.join(CWD, RAW_DATA_RELPATH)

# Clean data: path relative to the CWD, and the same path anchored at the CWD
CLEAN_DATA_RELPATH = os.path.join(*_DATA_DIR_STEPS, "clean")
CLEAN_DATA_ABSPATH = os.path.join(CWD, CLEAN_DATA_RELPATH)

In [None]:
# Find data files
# `os.listdir` returns names in arbitrary order; sort them so that the order
# in which files are loaded is the same on every run and every machine
datafiles = sorted(
    fnmatch.filter(os.listdir(RAW_DATA_ABSPATH), "*time_entries*.csv")
)
datafiles

In [None]:
# Concatenate all data files into single dataframe
if not datafiles:
    raise FileNotFoundError(
        f"No '*time_entries*.csv' files found in {RAW_DATA_ABSPATH}"
    )

# Read every file with the same options, then concatenate once (rather than
# growing the frame file-by-file inside a loop)
df = pd.concat(
    [
        pd.read_csv(
            os.path.join(RAW_DATA_ABSPATH, datafile),
            converters={"Duration": pd.to_timedelta},
            parse_dates={
                "dt_start": ["Start date", "Start time"],
                "dt_end": ["End date", "End time"],
            },
        )
        for datafile in datafiles
    ],
    ignore_index=True,
)

# Drop unwanted columns
df = df.drop(
    columns=["User", "Email", "Client", "Task", "Billable", "Tags", "Amount ()"]
)

# Add `day_of_week` and `week_number` columns
df["day_of_week"] = df["dt_start"].dt.day_name()
df["week_number"] = df["dt_start"].dt.isocalendar().week

# Drop rows relating to weekends.
# The filtered frame must be assigned back: previously the filter expression
# was evaluated and discarded, so weekend rows were never actually removed.
is_weekend = df["day_of_week"].isin(["Saturday", "Sunday"])

# Sort by `dt_start` column; `ignore_index=True` also renumbers the index 0..n-1
df = df.loc[~is_weekend].sort_values("dt_start", axis=0, ignore_index=True)

df.head()

In [None]:
# Anonymising
#
# Replaces each real project name with a generic "Project <letter>" label and
# derives a `Task` label from each row's `Description`, then removes the
# `Description` column so no free text is left in the frame.
import string
from collections import Counter

# Anonymise the `Project` column
# Counter is used only to collect the distinct project names (its keys keep
# first-seen order); the counts themselves are not used.
projects = Counter(df["Project"].values)
# NOTE(review): `string.ascii_uppercase` has 26 letters, so more than 26
# distinct projects would raise IndexError here.
project_converters = {
    project: f"Project {string.ascii_uppercase[i]}"
    for i, project in enumerate(projects.keys())
}

# Anonymise the `Description` column
# NOTE(review): the real mapping has been redacted. As written, the braces
# hold a single string and therefore build a *set*, whereas the loop below
# calls `.items()` and so needs a dict -- presumably
# {task label: collection of descriptions}; confirm before running.
task_converters = {
    """
    REDACTED
    """
}

# Add empty Tasks column
# The column is initialised with float zeros, so any row whose description
# matches no entry of `task_converters` keeps the value 0.0.
df["Task"] = np.zeros(len(df))

# Begin replacing columns
for i, row in df.iterrows():
    # Replace Project
    df.loc[i, "Project"] = project_converters[row["Project"]]

    # Insert Task
    # Every entry is checked (no break), so if a description matches more
    # than one entry the last matching key is the one that is kept.
    for key, values in task_converters.items():
        if row["Description"] in values:
            df.loc[i, "Task"] = key

# Drop the `Description` column
df.drop("Description", axis=1, inplace=True)

df.head()

In [None]:
def construct_filepath(list_of_files: list, filepath: str) -> str:
    """
    Construct the filename and path to output a CSV file to. This function
    collects every YYYY-MM-DD date found in the names of the data files that
    were used in constructing the pandas DataFrame and names the output file
    after the earliest and latest of them: `<earliest>_<latest>.csv`.

    Args
    ----
      list_of_files (list): The list of all the datafiles the DataFrame was
          constructed from
      filepath (str): The filepath to the output clean data dir

    Returns
    -------
      (str): The constructed filepath and name for the output CSV file

    Raises
    ------
      ValueError: If none of the filenames contains a YYYY-MM-DD date, or if
          a matched string is not a valid calendar date
    """
    date_pattern = re.compile(r"[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]")

    found_dates = [
        found_date
        for datafile in list_of_files
        for found_date in date_pattern.findall(datafile)
    ]

    # Without at least one date there is no range to name the file after;
    # fail with a clear message instead of an IndexError on `found_dates[0]`
    if not found_dates:
        raise ValueError(
            "No YYYY-MM-DD dates found in the data filenames; "
            "cannot construct an output filename"
        )

    # Sorting on the parsed date also validates each match (strptime raises
    # ValueError for strings such as "2020-13-45")
    found_dates.sort(key=lambda date: datetime.strptime(date, "%Y-%m-%d"))
    filename = f"{found_dates[0]}_{found_dates[-1]}.csv"

    return os.path.join(filepath, filename)


def save_to_csv(
    df_to_save: pd.DataFrame,
    list_of_files: list,
    filepath: str = CLEAN_DATA_ABSPATH,
) -> None:
    """
    Function to save a DataFrame to a CSV file. The user is asked to confirm
    first; the file is only written when the answer is "yes" (in any letter
    case, surrounding whitespace ignored). Any other answer is treated as "no".

    Args
    ----
      df_to_save (pd.DataFrame): The pandas DataFrame to save as a CSV file
      list_of_files (list): List of data filenames that the DataFrame has been
          constructed from
      filepath (str): The filepath to output to CSV file to
          Default: CLEAN_DATA_ABSPATH

    Returns
    -------
      None
    """
    resp = input("Would you like to save this file to the `data/clean` dir? [yes/no]: ")

    # Compare the whole answer. The previous pattern "[Y|y][E|e][S|s]" treated
    # "|" as a literal member of each character class and was not anchored at
    # the end, so inputs such as "|||" or "yesterday" also triggered a save.
    if resp.strip().lower() == "yes":
        filename = construct_filepath(list_of_files, filepath)
        df_to_save.to_csv(filename, index=False)

    return None


# Ask for confirmation and, if given, write the cleaned time-entry DataFrame
# to a CSV file named after the date range found in the `datafiles` names
save_to_csv(df, datafiles)

In [None]:
# Read in timeline data
timeline_path = os.path.join(RAW_DATA_ABSPATH, "timeline.json")

# Pin the encoding: without it `open` falls back to the platform's locale
# encoding, which can mis-decode non-ASCII window titles on some systems
with open(timeline_path, encoding="utf-8") as stream:
    timeline = json.load(stream)

In [None]:
# Parse timeline data into a DataFrame
#
# Collect one record per timeline item and build the frame in a single call.
# (Appending row-by-row copies the whole frame on every iteration, and
# `DataFrame.append` is no longer available in recent pandas releases.)
tl_records = [
    {
        "Tool": item["filename"],
        "Title": item["title"],
        # Timestamps are converted to naive datetimes in the local timezone
        "dt_start": datetime.fromtimestamp(item["start_time"]),
        "dt_end": datetime.fromtimestamp(item["end_time"]),
    }
    for item in timeline
]

# `idle` is kept in the column list for compatibility; no record supplies a
# value for it, so the column is entirely missing values
tl_df = pd.DataFrame(
    tl_records, columns=["Tool", "Title", "dt_start", "dt_end", "idle"]
)

In [None]:
# Drop unwanted rows
tool_to_be_dropped = ["Notion"]
title_to_be_dropped = ["WhatsApp", "Telegram Web", "YouTube", "Twitter", "Facebook"]

# A row is dropped when its Tool is in the tool list OR its Title is exactly
# one of the listed titles. One vectorised mask replaces the previous loop
# over every (tool, title) pair, which re-applied the same tool filter for
# each title and dropped nothing at all if either list was empty.
is_unwanted = tl_df["Tool"].isin(tool_to_be_dropped) | tl_df["Title"].isin(
    title_to_be_dropped
)

# `.copy()` so later cells can assign into `tl_df` without pandas treating it
# as a view of the unfiltered frame
tl_df = tl_df.loc[~is_unwanted].copy()

In [None]:
# Modify some tool names
#
# Maps a tool name to the text fragments that identify it: any row whose
# `Title` contains one of the fragments is relabelled with that tool.
tool_converters = {
    "GitHub": [
        "Pull Request",
        "GitHub",
        "Issue",
        "Branches",
        "Organization profile",
        "Billing",
        "alan-turing-institute",
    ],
    "Gitter": ["Gitter"],
    "Azure": ["Pipelines", "Microsoft Azure"],
    "Travis": ["Travis CI"],
    "Harvest": ["Harvest"],
}

# Tools are applied in dictionary order, so when a title matches fragments of
# several tools the last matching tool wins (same precedence as checking every
# tool for every row in turn).
for tool, title_fragments in tool_converters.items():
    title_matches = pd.Series(False, index=tl_df.index)
    for fragment in title_fragments:
        # Plain, case-sensitive substring test; `na=False` makes rows with a
        # missing or non-string title count as "no match" instead of raising
        title_matches |= tl_df["Title"].str.contains(
            fragment, regex=False, na=False
        )
    tl_df.loc[title_matches, "Tool"] = tool

In [None]:
# Relabel the tool reported as `Code` with the clearer name `VSCode`
is_code = tl_df["Tool"].eq("Code")
tl_df.loc[is_code, "Tool"] = "VSCode"

In [None]:
# Add a `Duration` column: end timestamp minus start timestamp for each row
tl_df["Duration"] = tl_df["dt_end"].sub(tl_df["dt_start"])

In [None]:
# Drop `Title` column, sort by `dt_start` column, and drop NAs
#
# Missing values are only checked in the columns this notebook populates.
# The `idle` column is created with the frame but never filled in, so an
# unrestricted `dropna()` would discard every single row.
tl_df = (
    tl_df.drop("Title", axis=1)
    .sort_values("dt_start")
    .dropna(subset=["Tool", "dt_start", "dt_end", "Duration"])
    .reset_index(drop=True)
)
tl_df.head()

In [None]:
# Write the cleaned timeline to `timeline.csv` in the clean data directory
timeline_csv_name = "timeline.csv"
out_path = os.path.join(CLEAN_DATA_ABSPATH, timeline_csv_name)
tl_df.to_csv(path_or_buf=out_path, index=False)