# Time Series Transformations

This notebook takes inspiration from the great notebook that [@pavelvod](https://www.kaggle.com/pavelvod) shared: [AMEX EDA: Revealing time patterns of features](https://www.kaggle.com/code/pavelvod/amex-eda-revealing-time-patterns-of-features/notebook).

The aim is to continue that data analysis by reviewing the time series characteristics of the features.

### Time series decompose
We are able to decompose the time series to understand four key components. 
1. Level: The average value in the series.
2. Trend: The increasing or decreasing value in the series.
3. Seasonality: The repeating short-term cycle in the series.
4. Noise: The random variation in the series.

With this notebook we will aim to review each of these to understand the temporal nature of the features.

### Stationarity
The aim is to assess whether the data points show stationary or non-stationary characteristics.

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import os
# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import gc

# Decompose time series
from statsmodels.tsa.seasonal import seasonal_decompose

# Stationarity
from statsmodels.tsa.stattools import adfuller, kpss

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the training data, parse the S_2 column as dates and average every
# numeric column over all rows that share the same date.
# numeric_only=True is passed explicitly: pandas >= 2.0 raises a TypeError when
# asked to average non-numeric columns (e.g. string identifiers, if present),
# whereas older versions dropped them from the result.
train_agg = (
    pd.read_parquet('../input/amex-data-integer-dtypes-parquet-format/train.parquet')
    .assign(S_2=lambda dx: pd.to_datetime(dx.S_2))
    .groupby('S_2')
    .mean(numeric_only=True)
)
# Latest date present in the training data
end_of_train = pd.to_datetime(train_agg.index).max()

In [None]:
# Number of feature columns read from the test file per pass
# (the file is read in column chunks, presumably to limit memory use)
N = 47
feature_cols = train_agg.columns.tolist()
test_agg = []
# Slice the column list rather than reshaping it: reshape(-1, N) raises a
# ValueError unless the number of columns is an exact multiple of N, while
# slicing simply makes the final chunk shorter.
for start in range(0, len(feature_cols), N):
    cols2use = feature_cols[start:start + N]
    test_agg.append(
        pd.read_parquet(
            '../input/amex-data-integer-dtypes-parquet-format/test.parquet',
            columns=cols2use + ['S_2'],
        )
        .assign(S_2=lambda dx: pd.to_datetime(dx.S_2))
        .groupby('S_2')
        .mean()
    )


# Join the per-chunk daily averages side by side into a single frame
test_agg = pd.concat(test_agg, axis=1)

In [None]:
# Stack the aggregated train rows and the aggregated test rows into one frame
agg_data = pd.concat([train_agg, test_agg], axis=0)
agg_data.shape

In [None]:
# Review a small list of features to understand the outputs
# Only the first ten columns are previewed, grouped by the part of the name
# that precedes the first underscore.
preview_features = agg_data.columns[:10]
# sorted() gives a reproducible group order; iterating a set of strings
# directly can yield a different order from one run to the next.
for first_letter in sorted({col.split('_')[0] for col in agg_data.columns}):
    print(first_letter)
    for feature in preview_features:
        # Compare the whole prefix (not just the first character) so that
        # prefixes longer than one character are matched as well
        if feature.split('_')[0] == first_letter:
            print(feature)

In [None]:
# Plot Partial Autocorrelation and Autocorrelation graphs
# Number of lags shown on both correlation plots (constant for every feature)
max_lags = 150
# sorted() keeps the plotting order reproducible between runs
for first_letter in sorted({col.split('_')[0] for col in agg_data.columns}):
    for feature_name in agg_data.columns[:10]:
        if feature_name.split('_')[0] != first_letter:
            continue
        s = agg_data.loc[:, feature_name]
        print(feature_name)  # for Ctrl + F
        fig = plt.figure(figsize=(16, 6))
        # Layout: PACF bottom-left, ACF bottom-right, raw series across the top.
        # The axis names now match what is drawn on them (they were swapped
        # before, although the resulting picture is the same).
        sub_pacf = fig.add_subplot(2, 2, 3)
        sub_acf = fig.add_subplot(2, 2, 4)
        mn = fig.add_subplot(2, 2, (1, 2))
        plot_pacf(s, lags=max_lags, ax=sub_pacf)
        plot_acf(s, lags=max_lags, ax=sub_acf)
        s.plot(color='green', ax=mn)
        # Dashed red line marks the last date of the training data
        mn.axvline(end_of_train, color='red', linestyle='--')
        mn.set_title(feature_name)
        plt.subplots_adjust(wspace=0.25, hspace=0.25)
        plt.show()

In [None]:
# Decompose the time series
# sorted() keeps the plotting order reproducible between runs
for first_letter in sorted({col.split('_')[0] for col in agg_data.columns}):
    for feature_name in agg_data.columns[:10]:
        if feature_name.split('_')[0] != first_letter:
            continue
        s = agg_data.loc[:, feature_name]
        print(feature_name)  # for Ctrl + F
        # First figure: the raw series with the end of the training data marked
        fig = plt.figure(figsize=(16, 6))
        mn = fig.add_subplot(2, 2, (1, 2))
        s.plot(color='green', ax=mn)
        mn.axvline(end_of_train, color='red', linestyle='--')
        mn.set_title(feature_name)

        # Additive and multiplicative decompositions, each titled accordingly.
        # The return value of suptitle() is a text object, not the decomposition
        # result, so it is intentionally not kept.
        # NOTE(review): statsmodels rejects the multiplicative model for series
        # containing zero or negative values - confirm this holds for all features.
        seasonal_decompose(s, extrapolate_trend='freq').plot().suptitle('Additive')
        seasonal_decompose(s, extrapolate_trend='freq', model='multiplicative').plot().suptitle('Multiplicative')

        plt.subplots_adjust(wspace=0.25, hspace=0.25)
        plt.show()

### Stationarity

In [None]:
# Test feature
test_feature = 'P_2'
# Drop missing values once and use the same series for both tests,
# mirroring what the Stationarity class does for every column
test_series = agg_data.loc[:, test_feature].dropna()

# ADF Test
# A p-value above 0.05 is reported as non-stationary
result = adfuller(test_series, autolag='AIC')
print(f'ADF Statistic: {result[0]}')
print(f'p-value: {result[1]}')

if result[1] > 0.05:
    print('Series is not Stationary')
else:
    print('Series is Stationary')


# KPSS Test
# The decision rule is the reverse of ADF: a p-value below 0.05 is reported as
# non-stationary. nlags='auto' matches the setting used in the Stationarity class.
stats, p, lags, critical_values = kpss(test_series, nlags='auto')
print(f'KPSS Test Statistics: {stats}')
print(f'p-value: {p}')

if p < 0.05:
    print('Series is not Stationary')
else:
    print('Series is Stationary')

In [None]:
# Create the class to be used on all features
class Stationarity:
    '''
    Performs a stationarity review (ADF and KPSS tests) of each of the columns
    contained within the DataFrame being reviewed.
    '''

    # Method - run the ADF and KPSS tests on every column
    def stationarity_test(self, df) -> pd.DataFrame:
        '''
        Run the ADF and KPSS tests on every column of df.

        df: DataFrame whose columns are the series to test; null values are
            removed from each column before it is tested.

        Returns a DataFrame with one row per column of df:
            variable   - column name
            obs        - number of non-null observations tested
            adfstat    - ADF test statistic
            adfpvalue  - ADF p-value
            kpssstat   - KPSS test statistic
            kpsspvalue - KPSS p-value
            adf_stat   - 'Non-Stationary' when adfpvalue > 0.05, else 'Stationary'
            kpss_stat  - 'Non-Stationary' when kpsspvalue < 0.05, else 'Stationary'
        An empty df produces an empty result with the same columns.
        '''
        out_cols = ['variable', 'obs', 'adfstat',
                    'adfpvalue', 'kpssstat', 'kpsspvalue',
                    'adf_stat', 'kpss_stat']
        # Collect one dict per column and build the frame once at the end:
        # DataFrame.append was removed in pandas 2.0 and copied the whole
        # frame on every call.
        rows = []
        for col in df.columns:
            # Keep only the not null values for review - method requires a series input
            series = df.loc[df[col].notnull(), col]

            # ADF test
            adf_result = adfuller(series, autolag='AIC')
            adf_stat = adf_result[0]
            adf_pval = adf_result[1]

            # KPSS test.
            # No regression argument is passed, so the library default is used.
            kpss_stat, kpss_pval, _, _ = kpss(series, nlags='auto')

            rows.append({'variable': col,
                         'obs': len(series),
                         'adfstat': adf_stat,
                         'adfpvalue': adf_pval,
                         'kpssstat': kpss_stat,
                         'kpsspvalue': kpss_pval,
                         'adf_stat': 'Non-Stationary' if adf_pval > 0.05 else 'Stationary',
                         'kpss_stat': 'Non-Stationary' if kpss_pval < 0.05 else 'Stationary',
                         })
        return pd.DataFrame(rows, columns=out_cols)

In [None]:
# Review the stationarity
# Run the ADF and KPSS tests on every column of the aggregated data and preview the results
stat = Stationarity()
stat_review = stat.stationarity_test(agg_data)
stat_review.head()

In [None]:
# Understand the quantum of stationarity features:
# count how many features fall into each combination of ADF and KPSS verdicts
stat_review.groupby(['adf_stat','kpss_stat'])['variable'].count()

Many of the features display non-stationary characteristics. The next step is to review variable transformations to understand whether the features become stationary once the time series element has been removed.

### Data transformations

In [None]:
from scipy.stats import norm

class Transformations:
    '''
    Column-wise time series transformations, lag creation and a
    missing/infinite value check for DataFrames.

    Examples of using the methods
    ---
    Applying basic transformations:
    df_ind1 = df_ind.copy()
    Transformations().basic_transformations(df_ind1, 1)
    # Drop initial columns after transformation
    orig_cols = list(df_ind.columns)
    df_ind1.drop(columns=orig_cols, inplace=True)
    ---
    Applying lags to the basic transformations:
    df_ind2 = df_ind1.copy()
    Transformations().create_lags(df_ind2)
    # Drop original columns from the final transformed list
    orig_cols = list(df_ind1.columns)
    df_ind2.drop(columns=orig_cols, inplace=True)
    ---
    Test for missing and/or infinity values for a variable after the transformations
    df_miss_inf = Transformations().missing_infinity_values(df_ind2)
    '''

    # Method - Create UDF for the basic transformations
    def basic_transformations(self, df, addTrans=None):
        """
        Add transformed versions of every column to df (in place) and return df.

        The new columns are named <column><suffix>.

        Basic Transformations
        > - _R Raw series
        > - _Y % change over 4 periods (y/y% changes for quarterly data)
        > - _Q % change over 1 period (q/q% changes for quarterly data)
        > - _D 1st difference
        > - _S 4th difference (seasonal difference)
        Additional Transformations (only when addTrans == 1)
        > - _L logit transformation
        > - _X 1st difference of logit
        > - _P probit transformation
        > - _Z 1st difference of probit
        ***
        - ToDo
        > - _M MA smoothing and then y/y% changes
        > - _G 1st difference of y/y% changes
        > - _J 1st difference of q/q% changes
        > - _B value at time t / value at time 2 years previous
        """
        # Snapshot the column names first: the loop adds columns to df as it
        # goes and only the columns present on entry should be transformed
        for col in list(df.columns):
            # Raw Series
            df[col + '_R'] = df[col]
            # % change over 4 periods
            df[col + '_Y'] = df[col].pct_change(4)
            # % change over 1 period
            df[col + '_Q'] = df[col].pct_change()
            # 1st difference
            df[col + '_D'] = df[col].diff()
            # 4th difference
            df[col + '_S'] = df[col].diff(4)
            # Completing the additional transformations
            if addTrans == 1:
                # Logit transformations
                df[col + '_L'] = np.log(df[col] / (1 - df[col]))
                df[col + '_X'] = df[col + '_L'].diff()
                # Probit transformations
                df[col + '_P'] = norm.ppf(df[col])
                df[col + '_Z'] = df[col + '_P'].diff()
        return df

    # Method - Create UDF for the creation of lags
    def create_lags(self, df, maxLag=4):
        """
        Add lagged copies of every column to df (in place) and return df.

        For each column present on entry, columns <column>_L0 ... <column>_L<maxLag>
        are added, where _L<k> is the column shifted by k rows (_L0 is an
        unshifted copy).

        maxLag: (int) largest lag to create.
        """
        # Snapshot the column names so that only the columns present on entry are lagged
        for col in list(df.columns):
            for lag in range(maxLag + 1):
                df[col + '_L' + str(lag)] = df[col].shift(lag)
        return df

    # Method - Test for the number of missing values in a column transformation
    def missing_infinity_values(self, df):
        """
        Count the missing and infinite values in every column of df.

        Returns a DataFrame with one row per column of df and the columns
        'Column', 'MissingVals' and 'InfinityVals'.
        """
        # Build all rows first and create the frame once; DataFrame.append
        # was removed in pandas 2.0.
        rows = [
            {"Column": col,
             "MissingVals": df[col].isnull().sum(),
             "InfinityVals": np.isinf(df[col]).values.sum(),
             }
            for col in df.columns
        ]
        return pd.DataFrame(rows, columns=['Column', 'MissingVals', 'InfinityVals'])

In [None]:
# Applying basic transformations
trans = Transformations()
# Remember the untransformed column names so they can be removed afterwards
orig_cols = list(agg_data.columns)
# Work on a copy so agg_data itself keeps only the original columns;
# basic_transformations adds the new columns to the frame it is given and returns it
agg_data1 = trans.basic_transformations(agg_data.copy())
# Drop initial columns after transformation
agg_data1 = agg_data1.drop(columns=orig_cols)

In [None]:
# Preview the first rows of the transformed columns
agg_data1.head()

In [None]:
# Applying lags to the basic transformations:
# Remember the un-lagged column names so they can be removed afterwards
orig_cols = list(agg_data1.columns)
# Work on a copy; create_lags adds the lag columns to the frame it is given and returns it
agg_data2 = trans.create_lags(agg_data1.copy())
# Drop original columns from the final transformed list
agg_data2 = agg_data2.drop(columns=orig_cols)

In [None]:
# Preview the first rows of the lagged columns
agg_data2.head()

In [None]:
# For each feature there are 25 variations, so the first 250 columns
# correspond to the first 10 features
feature_list = list(agg_data2.columns[:250])
agg_data3 = agg_data2.loc[:, feature_list]
agg_data3.head()

In [None]:
# Perform the stationarity assessment
# Review the stationarity for first 10 features
# (run on the transformed and lagged columns selected into agg_data3)
stat_review1 = stat.stationarity_test(agg_data3)
stat_review1.head()

In [None]:
# Understand the quantum of stationarity features after the transformations.
# This summarises stat_review1 (the results for the transformed columns);
# the untransformed results in stat_review were already summarised earlier.
stat_review1.groupby(['adf_stat','kpss_stat'])['variable'].count()

### Correlation review
Aiming to understand how the different transformations impact the feature correlations.

The feature 'P_2_R_L0' is taken as the dependent variable to highlight how the code works. In future, the actual target variable should be used as the dependent variable.

In [None]:
class Correlation:
    '''
    Perform a correlation analysis between the independent and dependent variables.
    :df_ind = (DataFrame) final list of independent variable transformations
    :df_dep = (DataFrame) final dependent variable transformation
    :dep = (str) string value of the dependent variable name

    Column names are expected to look like <variable>_<trans>_<lag>
    (for example 'P_2_R_L0'): the last two underscore-separated parts are
    read as the transformation and the lag, everything before them as the
    variable name.

    Application of the code
    # Run the correlation analysis
    df_ind_ = df_ind3[df_indep_list] #
    df_dep_ = df_dep.loc[:,['ODR_X']] # the final dependent variable transformation
    dep = 'ODR_X'
    # Run the Correlation class to create the correlation of independent variables with the dependent variable
    df_corr = Correlation(df_ind_, df_dep_, dep).main()
    df_corr # displays a pivot table of the correlation values

    sns.heatmap(df_corr, annot=True); # produces a heatmap of the pivot table
    '''

    # Constructor
    def __init__(self, df_ind, df_dep, dep):
        self.df_ind = df_ind
        self.df_dep = df_dep
        self.dep = dep

    # Method - create the merged DataFrame of dependent and independent variables
    def _merge_data(self):
        # Left join on the index: every row of the dependent frame is kept
        return pd.merge(self.df_dep,
                        self.df_ind,
                        how="left",
                        left_index=True,
                        right_index=True,
                        )

    # Method - create the correlation DataFrame
    def _correlation_test(self, df):
        # Correlation of every column with the dependent variable
        df1 = df.corr()[self.dep].to_frame()
        # Move the column names out of the index into a 'var_trans' column
        df1.reset_index(inplace=True)
        df1 = df1.rename(columns={'index': 'var_trans', self.dep: 'Dep_corr'})
        return df1

    # Method - add the required group by columns
    def _groupby(self, df):
        # Split from the right so that the last two parts are always the
        # transformation and the lag, however many underscores the variable
        # name itself contains
        parts = df.var_trans.str.rsplit('_', n=2, expand=True)
        # Add variable values back to the dataframe
        df['variable'] = parts[0]
        df['trans'] = parts[1]
        df['lag'] = parts[2]
        return df

    # Method - create the pivot table summary
    def _pivot(self, df):
        # Create a pivot table to display the range of correlation values for each variable by lag and transformation
        return pd.pivot_table(df,
                              values="Dep_corr",
                              index=["variable", "lag"],
                              columns=["trans"],
                              )

    # Method - run the methods from above
    def main(self):
        '''
        Merge the data, correlate every column with the dependent variable and
        return the correlations as a pivot table indexed by (variable, lag)
        with one column per transformation.
        '''
        merged = self._merge_data()
        corr_long = self._correlation_test(merged)
        corr_long = self._groupby(corr_long)
        return self._pivot(corr_long)

In [None]:
# Run the correlation analysis
dep = 'P_2_R_L0'
# Dependent variable as a one-column frame; every other column is independent
df_dep_ = agg_data3[[dep]]
df_ind_ = agg_data3.drop(columns=[dep])

# Run the Correlation class to create the correlation of independent variables with the dependent variable
corr = Correlation(df_ind_, df_dep_, dep)
df_corr = corr.main()
df_corr  # displays a pivot table of the correlation values

In [None]:
# produces a heatmap of the pivot table
# Create a large figure and draw the annotated heatmap on its axes
heatmap_ax = plt.figure(figsize=(20, 15)).gca()
sns.heatmap(df_corr, annot=True, ax=heatmap_ax);

Many of the features in this group show negative correlations with the variable 'P_2_R_L0'. It is interesting to see that the Raw versions show some of the largest correlations.