# Data Cleaning

In [1]:
import pandas as pd
from typing import Tuple
import numpy as np

## Step 0
- You will get a CSV file from us. Load it in your language/environment.
- Explore the data in it.

In [2]:
# Load the raw readings, parse the 'timestamp' column into datetimes and use
# it as the index, all in one chain so no half-prepared frame is left around.
data_frame = (
    pd.read_csv('data-cleaning.csv')
    .assign(timestamp=lambda raw: pd.to_datetime(raw['timestamp']))
    .set_index('timestamp')
)

# Last expression of the cell: rendered as the cell's output.
data_frame

Unnamed: 0_level_0,temperature
timestamp,Unnamed: 1_level_1
2018-08-10 21:20:00,16.5
2018-08-10 21:30:00,16.4
2018-08-10 21:40:00,16.1
2018-08-10 21:50:00,16.3
2018-08-10 22:00:00,16.3
...,...
2019-09-25 23:10:00,12.7
2019-09-25 23:20:00,12.8
2019-09-25 23:30:00,12.6
2019-09-25 23:40:00,12.5


## Step 1
- Implement an interquartile range (IQR) filter* and a z-score filter*.

In [3]:
def iqr(table: pd.DataFrame) -> Tuple[np.float64, np.float64]:
    """Return the interquartile-range outlier limits of the ``temperature`` column.

    The limits lie 1.5 times the interquartile range below the first
    quartile and above the third quartile.

    Args:
        table: Frame containing a numeric ``temperature`` column. Other
            columns are ignored.

    Returns:
        ``(lower_limit, upper_limit)`` as ``np.float64`` scalars.

    Raises:
        KeyError: If ``table`` has no ``temperature`` column.
    """
    # Select the column explicitly, exactly as z_score does. Calling
    # DataFrame.quantile would yield one value per column, which can only be
    # collapsed into a single float when the frame has exactly one column.
    temperature = table['temperature']
    q_25 = temperature.quantile(0.25)
    q_75 = temperature.quantile(0.75)
    iqr_value = q_75 - q_25
    lower_limit = q_25 - 1.5 * iqr_value
    upper_limit = q_75 + 1.5 * iqr_value
    return np.float64(lower_limit), np.float64(upper_limit)


def z_score(table: pd.DataFrame) -> Tuple[np.float64, np.float64]:
    """Return the three-standard-deviation limits of the ``temperature`` column.

    Args:
        table: Frame containing a numeric ``temperature`` column.

    Returns:
        ``(lower_limit, upper_limit)``: the column mean minus and plus three
        times the column's standard deviation (as computed by ``Series.std``).
    """
    temperatures = table['temperature']
    mean = temperatures.mean()
    std = temperatures.std()
    lower_limit = mean - 3 * std
    upper_limit = mean + 3 * std
    return lower_limit, upper_limit

- Find outliers in the data.

In [4]:
# Keep each filter's limits under its own name so neither result is lost.
iqr_lower, iqr_upper = iqr(data_frame)
print(f'IQR outliers < {iqr_lower} or > {iqr_upper}')

z_lower, z_upper = z_score(data_frame)
print(f'z-score outliers < {z_lower} or > {z_upper}')

# Downstream filtering reads `lower` / `upper`; bind them explicitly to the
# z-score limits so it is visible which filter gets applied.
lower, upper = z_lower, z_upper

IQR outliers < -14.150000000000002 or > 37.45
z-score outliers < -13.162215781558539 or > 36.95015993112411


- Replace outliers with NA values.


In [5]:
# Flag only real measurements outside [lower, upper] (bounds inclusive).
# Values that are already NaN are not outliers, so they must not inflate the
# reported count.
temperature_values = data_frame['temperature']
outlier_mask = temperature_values.notna() & ~temperature_values.between(lower, upper)
data_frame['temperature'] = temperature_values.mask(outlier_mask)
print(f'Replaced {outlier_mask.sum()} outlier(s) with NaN.')

Replaced 2 outlier(s) with NaN.


## Step 2
- Fill all missing data points with NA.


In [6]:
# Remember the row count before resampling so the number of inserted rows can
# be reported below.
old_size = len(data_frame['temperature'])
# Re-index onto a regular 10-minute grid. asfreq() does not compute any values,
# so grid slots that had no row in the data show up as rows holding NaN.
# NOTE: data_frame is rebound here, so a second run of this cell would find no
# new rows to add and would report 0 gaps.
data_frame = data_frame.resample('10Min').asfreq()
print(f'Filled {len(data_frame["temperature"]) - old_size} gaps with NaN.')

Filled 42 gaps with NaN.


- Implement a step interpolation* and a linear interpolation*.

In [1]:
def interpolate_linear(start, end, percent):
    """Blend ``start`` and ``end`` linearly.

    ``percent`` is the weight given to ``end`` and ``1 - percent`` the weight
    given to ``start``: 0 yields ``start`` and 1 yields ``end``.
    """
    start_weight = 1 - percent
    end_weight = percent
    return start_weight * start + end_weight * end


def interpolate_step(start, end, percent):
    """Pick the nearer endpoint instead of blending.

    Returns ``start`` while ``percent`` is below 0.5 and ``end`` from 0.5
    upwards.
    """
    if percent < 0.5:
        return start
    return end


def interpolate(data_frame: pd.DataFrame, interpolate_function, verbose: bool = True):
    """Fill NaN gaps in the ``temperature`` column between valid neighbours.

    Every run of consecutive NaN rows that has a valid value on both sides is
    filled by calling ``interpolate_function(start, end, percent)`` for each
    missing row and rounding the result to one decimal. A run at the very
    beginning (no valid value before it) or at the very end (no valid value
    after it) has only one anchor and is left as NaN.

    Args:
        data_frame: Frame with a ``temperature`` column. It is modified in
            place and also returned.
        interpolate_function: Callable ``(start, end, percent)`` returning the
            value for a missing row, e.g. ``interpolate_linear`` or
            ``interpolate_step``. ``percent`` is the row's relative position
            inside the gap, strictly between 0 and 1.
        verbose: When true (the default), print every filled gap and a
            summary line for it.

    Returns:
        The same ``data_frame`` object, with the interior gaps filled.
    """
    def fill_gap(start, end, timestamps):
        # Positions are spread evenly by row count: the k-th of n missing
        # rows sits at k / (n + 1). This assumes the rows are equally spaced
        # in time.
        size = len(timestamps)
        for position, timestamp in enumerate(timestamps, start=1):
            percent = position / (size + 1)
            value = round(interpolate_function(start, end, percent), 1)
            # Single label-based write; chained indexing
            # (frame[col][row] = ...) may write to a temporary copy.
            data_frame.at[timestamp, 'temperature'] = value

        if verbose:
            print(data_frame.loc[timestamps[0]:timestamps[-1]])

    last_valid_temperature = None
    gap_timestamps = []

    # Iterate over a copy of the column so the writes above cannot interfere
    # with the iteration, and so additional columns in the frame do not
    # matter.
    for timestamp, temperature in data_frame['temperature'].copy().items():
        if pd.isna(temperature):
            # Missing row: remember it until the end of the gap is known.
            gap_timestamps.append(timestamp)
            continue

        if gap_timestamps and last_valid_temperature is not None:
            # End of a gap with anchors on both sides reached: fill it.
            fill_gap(last_valid_temperature, temperature, gap_timestamps)
            if verbose:
                print(f'Interpolated between {gap_timestamps[0]} and {timestamp}.')

        # Always start afresh after a valid value. Without this reset a
        # leading gap would be carried along and filled between the wrong
        # pair of anchors.
        gap_timestamps = []
        last_valid_temperature = temperature

    return data_frame

NameError: name 'pd' is not defined

- Replace all NA values with the interpolated values.

In [8]:
# Count the NaNs up front so the number actually filled can be reported
# afterwards.
nan_count = data_frame['temperature'].isna().sum()
# interpolate() prints each filled gap and returns the frame it was given.
data_frame = interpolate(data_frame, interpolate_linear) # interpolate_linear or interpolate_step
# Difference between the NaN count before and after = number of values filled.
print(f'Interpolated {nan_count - data_frame["temperature"].isna().sum()} values.')

                     temperature
timestamp                       
2018-10-19 08:00:00         10.6
Interpolated between 2018-10-19 08:00:00 and 2018-10-19 08:10:00.
                     temperature
timestamp                       
2018-11-11 18:30:00          9.8
2018-11-11 18:40:00          9.8
2018-11-11 18:50:00          9.8
2018-11-11 19:00:00          9.9
2018-11-11 19:10:00          9.9
2018-11-11 19:20:00          9.9
2018-11-11 19:30:00          9.9
2018-11-11 19:40:00          9.9
2018-11-11 19:50:00          9.9
2018-11-11 20:00:00         10.0
2018-11-11 20:10:00         10.0
2018-11-11 20:20:00         10.0
Interpolated between 2018-11-11 18:30:00 and 2018-11-11 20:30:00.
                     temperature
timestamp                       
2019-01-27 23:40:00          4.3
2019-01-27 23:50:00          4.2
2019-01-28 00:00:00          4.2
2019-01-28 00:10:00          4.1
2019-01-28 00:20:00          4.0
2019-01-28 00:30:00          3.9
2019-01-28 00:40:00          3.9
2019-01-28