<a href="https://colab.research.google.com/github/ryuzuiin/waterlevelforecasting/blob/main/%E6%99%82%E7%B3%BB%E5%88%97%E5%88%86%E6%9E%90.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller, kpss, acf, pacf
from statsmodels.tsa.arima_model import ARIMA

from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15, 6
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

from statsmodels.stats.diagnostic import acorr_ljungbox
from scipy.stats import shapiro

# Step 1. 定常性検定



*   ADF test
*   KPSS test (trend stationarity)



In [None]:
def is_stationary(data, significance_level=0.05):
  '''
  Step 1: stationarity check with the Augmented Dickey-Fuller (ADF) test.

  The ADF null hypothesis is that the series has a unit root (is
  non-stationary), so a small p-value is evidence of stationarity.

  input : data - the time series to test (passed straight to `adfuller`)
          significance_level - p-value threshold for rejecting the null
  output : bool value, True when p-value <= significance_level
  '''
  # Use the `data` argument: the previous body referenced an undefined name
  # `ts`, which either raised NameError or tested an unrelated global.
  result = adfuller(data)
  print('ADF Statistic: %f' % result[0])
  print('p-value: %f' % result[1])
  return result[1] <= significance_level

def is_trend_stationary(ts, significance_level=0.05):
  '''
  Step 1: trend-stationarity check with the KPSS test.

  `kpss` is called with regression='ct', i.e. the null hypothesis is that
  the series is stationary around a deterministic trend, so a large
  p-value means the null is not rejected.

  input : ts - time series, significance_level - p-value threshold
  output : boolean value, True when p-value > significance_level
  '''
  # Only the first two entries of the kpss result are used here.
  statistic, p_value, *_ = kpss(ts, regression='ct')
  print('KPSS Statistic: %f' % statistic)
  print('p-value: %f' % p_value)
  null_not_rejected = p_value > significance_level
  return null_not_rejected


## 1.1 make stationary

# Step 2. White noise 検定（残差分析も同じ）

If a time series is white noise, it is a sequence of random numbers and cannot be predicted. If the series of forecast errors is not white noise, it suggests improvements could be made to the predictive model. A series is not white noise if the answer to any of the following questions is yes:
*   Is the mean/level non-zero?
*   Does the mean/level change over time?
*   Does the variance change over time?
*   Do values correlate with lag values?


In [None]:
def is_white_noise(ts):
  '''
  Step 2: white-noise check (currently only a descriptive summary).

  input : ts - time series exposing a `describe()` method
  output : None; the summary statistics are printed, nothing is returned
  '''
  summary = ts.describe()
  print(summary)






In [None]:
class TimeSeriesAnalyzer:
  '''
  Helper that prepares a timestamped DataFrame for time-series analysis.

  It converts the date column to datetime, sorts and indexes by it, reports
  and fills missing values, and rebuilds a regular (gap-free) time grid.

  Most methods modify `self.data` (the DataFrame passed in) in place.
  '''

  def __init__(self, data, date_column, value_column, freq='10min'):
    '''
    input: data - DataFrame holding the series
           date_column - name of the timestamp column
           value_column - name of the observed-value column
           freq - pandas frequency string of the expected regular grid
                  ('10min' is the non-deprecated spelling of '10T')
    '''
    self.data = data
    self.date_column = date_column
    self.value_column = value_column
    self.freq = freq
    # Filled by is_full_range_time_series().
    self.processed_data = None

  def is_datetime_column(self):
    '''
    Tell whether the date column already has a datetime64 dtype.

    output: bool
    raises: ValueError when the date column is not among data.columns
    '''
    if self.date_column in self.data.columns:
      return pd.api.types.is_datetime64_any_dtype(self.data[self.date_column])
    else:
      raise ValueError(f' column {self.date_column} is not found in data')

  def make_datetime(self):
    '''
    Convert the date column to datetime type if it is not already.

    input: data[date_column]
    output: data[date_column] (datetime dtype)
    '''
    if not self.is_datetime_column():
      self.data[self.date_column] = pd.to_datetime(self.data[self.date_column])
    return self.data[self.date_column]

  def is_time_ordered(self):
    '''
    Tell whether the timestamps are in non-decreasing order.

    The date column is converted to datetime first when necessary.

    input: data[date_column]
    output: bool
    '''
    return self.make_datetime().is_monotonic_increasing

  def set_date_index(self):
    '''
    Sort the data by time (only when needed) and use the date column as index.

    output: data, indexed by the date column
    '''
    if not self.is_time_ordered():
      self.data.sort_values(by=self.date_column, inplace=True)
    self.data.set_index(self.date_column, inplace=True)
    return self.data

  # handling missing values
  def check_missing_values(self, fill_method=None):
    '''
    Detect missing values in the value column of `data` and handle them.

    input: fill_method - one of 'ffill', 'bfill', 'mean' (mean of the two
           neighbours), 'drop', 'linear', 'time'; any other value
           (including None) only reports the count without changing data.
           'time' relies on pandas time interpolation, which needs a
           DatetimeIndex.
    output: boolean value, True when missing values were found
    '''
    return self._handle_missing(self.data, fill_method)

  def _handle_missing(self, frame, fill_method):
    '''Report missing values of the value column in `frame`, then fill them.'''
    missing_values = int(frame[self.value_column].isna().sum())

    if missing_values == 0:
      print('No missing data found')
      return False

    print(f'{missing_values} missing data found')
    self._fill_missing(frame, fill_method)
    return True

  def _fill_missing(self, frame, fill_method):
    '''Apply `fill_method` to the value column of `frame`, modifying `frame`.'''
    column = self.value_column
    # Results are assigned back to the column instead of calling
    # fillna/interpolate with inplace=True on a column selection: that
    # chained form does not reliably update the frame, and
    # fillna(method=...) is deprecated in favour of ffill()/bfill().
    if fill_method == 'ffill':
      frame[column] = frame[column].ffill()
    elif fill_method == 'bfill':
      frame[column] = frame[column].bfill()
    elif fill_method == 'mean':
      self._fill_mean_of_neighbors(frame)
    elif fill_method == 'drop':
      frame.dropna(subset=[column], inplace=True)
    elif fill_method == 'linear':
      frame[column] = frame[column].interpolate(method='linear')
    elif fill_method == 'time':
      frame[column] = frame[column].interpolate(method='time')

  def fill_with_mean_of_neighbor(self):
    '''
    Fill each missing value of `data` with the mean of its two neighbours.

    A gap is filled only when both the previous and the next row hold a
    value, so the first row, the last row and runs of consecutive missing
    values are left untouched.
    '''
    self._fill_mean_of_neighbors(self.data)

  def _fill_mean_of_neighbors(self, frame):
    '''Positional neighbour-mean fill of the value column of `frame`.'''
    values = frame[self.value_column]
    # shift() works by position, so this is independent of the index type
    # (integer-label lookups fail once the index is a DatetimeIndex).
    neighbor_mean = (values.shift(1) + values.shift(-1)) / 2
    fillable = values.isna() & neighbor_mean.notna()
    if fillable.any():
      frame[self.value_column] = values.mask(fillable, neighbor_mean)

  # constructing a full range time series
  def is_full_range_time_series(self, fill_method=None):
    '''
    Check that the index covers every step of `freq` between its first and
    last timestamp; if not, build such a full-range series.

    input: fill_method - how to fill the rows added for missing timestamps
           (same options as check_missing_values); None leaves them NaN
    output: processed_data - a copy of data when it is already full range,
            otherwise data reindexed onto the full range
    '''
    if not isinstance(self.data.index, pd.DatetimeIndex):
      self.set_date_index()

    full_range = pd.date_range(start=self.data.index.min(), end=self.data.index.max(), freq=self.freq)

    is_full_range = self.data.index.equals(full_range)

    if is_full_range:
      print('The time series is a full range time series')
      self.processed_data = self.data.copy()
    else:
      print('The time series is not a full range time series')
      # reindex() returns rows in full_range order, i.e. already sorted.
      self.processed_data = self.data.reindex(full_range)
      # Timestamps absent from data are now NaN rows; report and fill them.
      self._handle_missing(self.processed_data, fill_method)

    return self.processed_data

  def drop_minute_value(self, minutes_to_drop=(15, 45)):
    '''
    Drop the rows whose timestamp minute is one of `minutes_to_drop`.

    input: minutes_to_drop - iterable of minute values (0-59) to remove
    output: data without those rows (self.data is rebound to the result)
    '''
    if not isinstance(self.data.index, pd.DatetimeIndex):
      self.set_date_index()

    unwanted = self.data.index.minute.isin(list(minutes_to_drop))
    self.data = self.data[~unwanted]
    return self.data







