## 01 - Pandas Refresher & Missing Values Treatment

- Source notebook: https://github.com/PacktPublishing/Modern-Time-Series-Forecasting-with-Python/blob/main/notebooks/Chapter02/01%20-%20Pandas%20Refresher%20%26%20Missing%20Values%20Treatment.ipynb

In [1]:

import os
import sys
import time
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
# Silence every warning for the whole session to keep cell output compact.
# NOTE: this also hides deprecation and SettingWithCopy warnings raised by later cells.
import warnings; warnings.filterwarnings('ignore')
# Matplotlib style sheet (white background with a grid) used for any matplotlib figures.
plt.style.use("seaborn-v0_8-whitegrid")
# Render matplotlib figures directly below the cell that creates them.
%matplotlib inline

# Show up to 999 columns when a wide DataFrame is displayed instead of eliding the middle ones.
pd.options.display.max_columns = 999

In [2]:
#!pip install git+https://github.com/TimeSynth/TimeSynth.git

In [3]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates.default = "plotly_white"
#pio.kaleido.scope.mathjax = None
from pathlib import Path
from tqdm.autonotebook import tqdm
from itertools import cycle
%load_ext autoreload
%autoreload 2
# Pin NumPy's global RNG so any random draws repeat across "Restart & Run All".
# Calling np.random.seed() with no argument re-seeds from OS entropy and pins nothing.
np.random.seed(42)
tqdm.pandas()

### pandas datetime operations, indexing, and slicing

In [4]:
# Download the Excel workbook from the UCI Machine Learning Repository.
# skiprows=1 drops the first sheet row so that the second row supplies the column
# names (date, ISE, ISE.1, SP, ...); the repeated "ISE" header is de-duplicated to "ISE.1".
df = pd.read_excel("https://archive.ics.uci.edu/ml/machine-learning-databases/00247/data_akbilgic.xlsx", skiprows=1)

# Preview the first five rows.
df.head()

Unnamed: 0,date,ISE,ISE.1,SP,DAX,FTSE,NIKKEI,BOVESPA,EU,EM
0,2009-01-05,0.035754,0.038376,-0.004679,0.002193,0.003894,0.0,0.03119,0.012698,0.028524
1,2009-01-06,0.025426,0.031813,0.007787,0.008455,0.012866,0.004162,0.01892,0.011341,0.008773
2,2009-01-07,-0.028862,-0.026353,-0.030469,-0.017833,-0.028735,0.017293,-0.035899,-0.017073,-0.020015
3,2009-01-08,-0.062208,-0.084716,0.003391,-0.011726,-0.000466,-0.040061,0.028283,-0.005561,-0.019424
4,2009-01-09,0.00986,0.009658,-0.021533,-0.019873,-0.01271,-0.004474,-0.009764,-0.010989,-0.007802


In [5]:
# Check the inferred column types: `date` was parsed as datetime64[ns], every other column as float64.
df.dtypes

date       datetime64[ns]
ISE               float64
ISE.1             float64
SP                float64
DAX               float64
FTSE              float64
NIKKEI            float64
BOVESPA           float64
EU                float64
EM                float64
dtype: object

In [6]:
# Explicit conversion to datetime. `date` is already datetime64[ns] after read_excel, so this is
# a no-op on this data; it demonstrates the call needed when the column arrives as strings.
# yearfirst=True makes the parser read ambiguous date strings as year-first.
df['date'] = pd.to_datetime(df['date'], yearfirst=True)
# '<M8[ns]' is NumPy's notation for datetime64[ns].
df['date'].dtypes

dtype('<M8[ns]')

In [7]:
# Preview again after the conversion; the values are unchanged.
df.head()

Unnamed: 0,date,ISE,ISE.1,SP,DAX,FTSE,NIKKEI,BOVESPA,EU,EM
0,2009-01-05,0.035754,0.038376,-0.004679,0.002193,0.003894,0.0,0.03119,0.012698,0.028524
1,2009-01-06,0.025426,0.031813,0.007787,0.008455,0.012866,0.004162,0.01892,0.011341,0.008773
2,2009-01-07,-0.028862,-0.026353,-0.030469,-0.017833,-0.028735,0.017293,-0.035899,-0.017073,-0.020015
3,2009-01-08,-0.062208,-0.084716,0.003391,-0.011726,-0.000466,-0.040061,0.028283,-0.005561,-0.019424
4,2009-01-09,0.00986,0.009658,-0.021533,-0.019873,-0.01271,-0.004474,-0.009764,-0.010989,-0.007802


In [8]:
# Earliest and latest timestamp in the data, returned as a (min, max) tuple.
df.date.min(), df.date.max()

(Timestamp('2009-01-05 00:00:00'), Timestamp('2011-02-22 00:00:00'))

In [9]:
# The `.dt` accessor exposes calendar components for the whole datetime column;
# `.iloc[0]` then picks the value for the first row so a single example is printed.
# `dayofweek` counts from Monday=0, and the ISO week comes from `isocalendar()`.
print(f"""
Date: {df.date.iloc[0]}
Day of year: {df.date.dt.day_of_year.iloc[0]}
Day of week: {df.date.dt.dayofweek.iloc[0]}
Month: {df.date.dt.month.iloc[0]}
Month Name: {df.date.dt.month_name().iloc[0]}
Quarter: {df.date.dt.quarter.iloc[0]}
Year: {df.date.dt.year.iloc[0]}
ISO Week: {df.date.dt.isocalendar().week.iloc[0]}
""")


Date: 2009-01-05 00:00:00
Day of year: 5
Day of week: 0
Month: 1
Month Name: January
Quarter: 1
Year: 2009
ISO Week: 2



In [10]:
# Use the timestamps as the index so rows can be selected with date strings.
df = df.set_index("date")

# Label slices on a DatetimeIndex include BOTH endpoints, and a partial date string
# ("2010", "2010-06") stands for the whole period it names.
# Only the last expression of the cell is rendered; the others illustrate the syntax.

# Select all data from 2010-01-04 onwards (inclusive)
df.loc["2010-01-04":]
# Select all data from 2010-01-04 through 2010-02-06 (both inclusive)
df.loc["2010-01-04":"2010-02-06"]
# Select all data up to and including the end of 2010
df.loc[:"2010"]
# Select all data from 2010-01 through 2010-06 (both months inclusive)
df.loc["2010-01":"2010-06"]

Unnamed: 0_level_0,ISE,ISE.1,SP,DAX,FTSE,NIKKEI,BOVESPA,EU,EM
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-01-04,0.010229,0.014478,0.015916,0.000000,0.016018,0.000000,0.000000,0.016778,0.008399
2010-01-05,0.013898,0.024019,0.003111,-0.002722,0.004028,0.002535,0.002780,0.001316,0.008067
2010-01-06,0.007957,0.005706,0.000545,0.000409,0.001357,0.004635,0.006938,0.000586,0.005330
2010-01-07,0.007772,0.007498,0.003993,-0.002484,-0.000597,-0.004650,-0.003938,-0.000360,-0.006375
2010-01-08,-0.003189,0.000835,0.002878,0.003027,0.001356,0.010862,-0.002672,0.003554,0.001229
...,...,...,...,...,...,...,...,...,...
2010-06-24,0.000166,-0.008024,-0.016946,-0.014455,-0.015236,0.000467,-0.018948,-0.017801,-0.006717
2010-06-25,-0.000920,-0.003591,0.002855,-0.007366,-0.010585,-0.019411,0.013778,-0.007954,-0.002311
2010-06-28,0.010132,0.016951,-0.002036,0.014168,0.004981,-0.004481,-0.009283,0.009125,0.000540
2010-06-29,-0.021816,-0.028443,-0.031508,-0.033893,-0.031547,-0.012798,-0.035613,-0.034903,-0.021033


### Handling missing data

In [11]:
def format_plot(fig, legends, font_size=15, title_font_size=20):
    """Apply the notebook's common styling to a Plotly figure.

    Renames the traces and sets a fixed figure size, a centred title, a
    horizontal legend above the plot, and uniform axis fonts.

    Parameters
    ----------
    fig : plotly.graph_objects.Figure
        Figure to style. It is modified in place.
    legends : sequence of str
        Display names assigned to the traces, in trace order. The names are
        cycled, so a list shorter than the number of traces repeats. An empty
        or ``None`` value leaves the trace names untouched.
    font_size : int, default 15
        Font size for legend entries, axis titles and tick labels.
    title_font_size : int, default 20
        Font size for the figure title.

    Returns
    -------
    plotly.graph_objects.Figure
        The same ``fig`` object, so calls can be chained.
    """
    if legends:
        # Traces are renamed positionally; the caller must list names in trace order.
        names = cycle(legends)
        fig.for_each_trace(lambda t: t.update(name=next(names)))
    fig.update_layout(
        autosize=False,
        width=900,
        height=500,
        # Font sizes go under ``title.font``: the legacy ``titlefont`` shortcut
        # was removed from recent Plotly releases and raises on use.
        title={
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
            "font": {"size": title_font_size},
        },
        legend_title=None,
        legend=dict(
            font=dict(size=font_size),
            orientation="h",
            yanchor="bottom",
            y=0.98,
            xanchor="right",
            x=1,
        ),
        yaxis=dict(
            title=dict(text="Value", font=dict(size=font_size)),
            tickfont=dict(size=font_size),
        ),
        xaxis=dict(
            title=dict(text="Day", font=dict(size=font_size)),
            tickfont=dict(size=font_size),
        ),
    )
    return fig

In [12]:
# Download the air-quality readings CSV from the ACT open-data portal.
# NOTE(review): the preview below covers roughly two weeks, so the endpoint presumably serves a
# rolling window; the hard-coded 2023-10-12 dates used in later cells would then select nothing
# on a fresh download. Confirm, and pin a snapshot of the data if so.
df = pd.read_csv("https://www.data.act.gov.au/resource/94a5-zqnn.csv", sep=",")
# Keep a single station ("Monash") and its hourly PM2.5 column, parse the timestamps,
# and index the readings chronologically. Built as one chain so no slice is mutated in place.
df = (
    df.loc[df["name"] == "Monash", ["datetime", "pm2_5_1_hr"]]
    .assign(datetime=lambda frame: pd.to_datetime(frame["datetime"]))
    .sort_values("datetime")
    .set_index("datetime")
)
df.head()

Unnamed: 0_level_0,pm2_5_1_hr
datetime,Unnamed: 1_level_1
2023-10-11 18:00:00,2.55
2023-10-11 19:00:00,3.59
2023-10-11 20:00:00,3.66
2023-10-11 21:00:00,2.79
2023-10-11 22:00:00,2.72


In [13]:
# Last five rows, showing where the series ends.
df.tail()

Unnamed: 0_level_0,pm2_5_1_hr
datetime,Unnamed: 1_level_1
2023-10-25 12:00:00,5.92
2023-10-25 13:00:00,5.47
2023-10-25 14:00:00,4.78
2023-10-25 15:00:00,4.52
2023-10-25 16:00:00,2.45


In [14]:
# Introduce missing values artificially: blank out the readings from 07:00 through 09:00
# on 2023-10-12 (label slices include both endpoints).
df.loc["2023-10-12 7:00":"2023-10-12 09:00", "pm2_5_1_hr"] = np.nan
# head() shows rows from 2023-10-11, which precede the nulled window, so this preview looks unchanged.
df.head()

Unnamed: 0_level_0,pm2_5_1_hr
datetime,Unnamed: 1_level_1
2023-10-11 18:00:00,2.55
2023-10-11 19:00:00,3.59
2023-10-11 20:00:00,3.66
2023-10-11 21:00:00,2.79
2023-10-11 22:00:00,2.72


In [15]:
# Narrow the data to 2023-10-12 from midnight up to 12:00, the window that contains the nulled
# readings, and keep it as a single-column DataFrame.
# NOTE: this rebinds `df`, so every later cell only sees this half-day window.
df = df.loc["2023-10-12":"2023-10-12 12:00", "pm2_5_1_hr"].to_frame()
# Line chart of the raw series, with the datetime index on the x-axis.
fig = px.line(df, x=df.index, y="pm2_5_1_hr", title="Missing Values in PM2.5")
# Apply the shared styling and label the single trace "Original".
fig = format_plot(fig, ["Original"])
fig.show()

In [16]:
# Comparison frame: the observed series plus one column per simple imputation strategy.
_df = df.assign(
    # Forward fill: carry the last observed value forward into the gap
    ffill=df["pm2_5_1_hr"].ffill(),
    # Backward fill: pull the next observed value back into the gap
    bfill=df["pm2_5_1_hr"].bfill(),
    # Mean value fill: replace every gap with the mean of the observed values
    mean_value=df["pm2_5_1_hr"].fillna(df["pm2_5_1_hr"].mean()),
)

In [17]:
# Plotting
# Reshape to long format: one row per (datetime, series) pair, with the reading in the default
# "value" column, so each fill strategy can be drawn as its own dash style.
plot_df = pd.melt(_df.reset_index(), id_vars="datetime", var_name="series")
fig = px.line(plot_df, x="datetime", y=["value"], line_dash="series", title="Forward, Backward, and Mean Value Fill")
# format_plot renames traces positionally, so these labels must be listed in the same order as
# the traces (presumably the column order of _df; verify if columns are added or reordered).
fig = format_plot(fig, ['Original', 'Forward Fill', 'Backward Fill', "Mean Value Fill"])
fig.show()

### Interpolation

In [18]:
# Comparison frame: the observed series plus one column per interpolation method.
_df = df.assign(
    # Straight line between the observations on either side of the gap
    linear_interpolation=df["pm2_5_1_hr"].interpolate(method="linear"),
    # Each missing point takes the value of its closest observation
    nearest_interpolation=df["pm2_5_1_hr"].interpolate(method="nearest"),
)

In [19]:
# Plotting
# Reshape to long format: one row per (datetime, series) pair, with the reading in the default
# "value" column, so each interpolation method can be drawn as its own dash style.
plot_df = pd.melt(_df.reset_index(), id_vars="datetime", var_name="series")
fig = px.line(plot_df, x="datetime", y=["value"], line_dash="series", title="Linear and Nearest Interpolation")
# format_plot renames traces positionally, so these labels must be listed in the same order as
# the traces (presumably the column order of _df; verify if columns are added or reordered).
fig = format_plot(fig, ['Original', "Linear Interpolation", "Nearest Interpolation"])
fig.show()

In [20]:
# Comparison frame: the observed series plus two higher-order interpolation variants.
_df = df.assign(
    # Second-order spline through the observed points
    spline_interpolation=df["pm2_5_1_hr"].interpolate(method="spline", order=2),
    # Fifth-order polynomial through the observed points
    polynomial_interpolation=df["pm2_5_1_hr"].interpolate(method="polynomial", order=5),
)

In [21]:
# Plotting
# Reshape to long format: one row per (datetime, series) pair, with the reading in the default
# "value" column, so each interpolation method can be drawn as its own dash style.
plot_df = pd.melt(_df.reset_index(), id_vars="datetime", var_name="series")
fig = px.line(plot_df, x="datetime", y=["value"], line_dash="series", title="Spline and Polynomial Interpolation")
# format_plot renames traces positionally, so these labels must be listed in the same order as
# the traces (presumably the column order of _df; verify if columns are added or reordered).
fig = format_plot(fig, ['Original', "2nd Order Spline Interpolation", "5th Order Polynomial Interpolation"])
fig.show()