<a href="https://colab.research.google.com/github/rato42/linear_regression/blob/unstable-yeah-sure/CLASSBASED_new_linear_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports


In [94]:
import sys

# Detect Colab environment
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
    from google.colab import drive, auth
    from googleapiclient.discovery import build
    from googleapiclient.http import MediaIoBaseDownload

    auth.authenticate_user()
    drive.mount("/content/drive")
else:
    from oauth2client.service_account import ServiceAccountCredentials

    print("Running outside Colab (VS Code/local)")

try:
    import gspread
except ModuleNotFoundError:
    if IN_COLAB and "google.colab" in str(get_ipython()):
        %pip install gspread
    import gspread

from gspread_dataframe import set_with_dataframe

Running outside Colab (VS Code/local)


In [95]:

import os

import time

import math
import scipy.stats as stats
import pandas as pd
import numpy as np

import pwlf

from scipy.stats import linregress, ttest_ind, ttest_rel, ttest_1samp
import matplotlib.pyplot as plt
import re
from matplotlib.font_manager import FontProperties
import statsmodels.formula.api as smf
import statsmodels.api as sm

from datetime import datetime, timedelta

from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tools.tools import add_constant
from statsmodels.tsa.arima.model import ARIMA

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer

from dateutil.relativedelta import relativedelta

from sklearn import set_config
set_config(display="diagram")

import logging
from typing import Dict, Any

import io

# LOAD --- data from gsheet if colab or local


In [96]:
dados_brutos = None
output_wksheet = None

input_path = "https://docs.google.com/spreadsheets/d/1m3eEaxWT4Unb8jBZWKjiLivavfA0x3PT1F1Rz1eXwVE/edit?gid=0"
output_path = "https://docs.google.com/spreadsheets/d/1RzC3DfKNUwYA-qfUN0i53hUwRkWGMBOCoYQqdqG4MQY/edit?gid=0"

local_input_path = (
    #"D:\\CodeStuff\\Stats\\colab_linear_regression\\linear_regression\\input\\input.csv" ### poa
    "D:\\CodeStuff\\Stats\\colab_linear_regression\\linear_regression\\input\\input.xlsx" ### rs
)

if IN_COLAB:
    import gspread
    from google.auth import default

    auth.authenticate_user()
    creds, _ = default()
    gc = gspread.authorize(creds)
    workbook = gc.open_by_url(input_path)
    worksheets = workbook.worksheets()
    output_wksheet = gc.open_by_url(output_path)
    dados_brutos = worksheets[0].get_all_values()
else:
    try:
        dados_brutos = pd.read_csv(local_input_path, header=None).values.tolist()
    except:
        dados_brutos = pd.read_excel(local_input_path, header=None).values.tolist()

# ARGS --- Define Args


fit_kwargs = {
    "cov_type": "HAC",
    "cov_kwds": {"maxlags": 6}  # Account for ~6 month lagged effects
}

✓ Autocorrelation from policy inertia
✓ Seasonal patterns in mental health data
✓ Lagged treatment effects

In [None]:
INDEPENDENT_VAR = "periodo"  ### Y
DEPENDENT_VAR = "valores"  ### X
DATE_NAME = "date"
VARIABLE_COLUMN_NAME = "variavel"
LOCAL_OUTPUT_PATH = "D:\\CodeStuff\\Stats\\colab_linear_regression\\linear_regression\\output\\output.xlsx"



######### DUMMIES ##########

INCLUDE_MONTH_DUMMIES = True

### as quedas do primeiro lockdown nao são tão claras e a recuperação e queda parecem ser mais graduais
CHOQUE_INICIAL_START_DATE= "2020-01-01"
CHOQUE_INICIAL_END_DATE = "2020-07-01"#"2020-08-01" #month based 

### abril e maio tiveram quedas bem identificaveis nos atendimentos em todas variavieis
SECONDLOCKDOWN_START_DATE = "2021-04-01"
SECONDLOCKDOWN_END_DATE = '2021-05-01' #month based  #'2021-04-01'#quarter_based #

POS_PANDEMIA_START_DATE = "2020-08-01"#"2021-02-01"#"2021-06-01" ### talvez eu tenha q excluir o segundo lockdown dessa dummy
END_DATE = "2024-12-01"#"2024-12-01"  ### data final para o modelo, se for maior que a data do dataframe, o modelo vai dar erro

POS_PANDEMIA_EXCLUDE_RANGES = [(SECONDLOCKDOWN_START_DATE, SECONDLOCKDOWN_END_DATE)]




DUMMY_ARGS =        {
        'Step_PosPandemia': {"start_date": POS_PANDEMIA_START_DATE, "end_date": END_DATE,'exclude_ranges': POS_PANDEMIA_EXCLUDE_RANGES, 'plot':False, 'step': True,},
        'Choque_Inicial': {"start_date": CHOQUE_INICIAL_START_DATE, "end_date": CHOQUE_INICIAL_END_DATE},
        'Lockdown_2021': {"start_date": SECONDLOCKDOWN_START_DATE, "end_date": SECONDLOCKDOWN_END_DATE},
        'Trend_PosPandemia' : {"start_date": POS_PANDEMIA_START_DATE, "end_date": END_DATE,'exclude_ranges': POS_PANDEMIA_EXCLUDE_RANGES, 'plot': True, 'add_time_trend': True, 'keep_period_index': False, 'step': False}, 
        }

##############################


PREPROCESSOR_ARGS = {
    'DateRange': [2016, 2024],  
    'FrequencyDateGroup' : 'M',
    'DummyArgs' : DUMMY_ARGS,
}


###### PROCESSING ######
MODEL_FORMULA = DEPENDENT_VAR + "~" + INDEPENDENT_VAR 

if INCLUDE_MONTH_DUMMIES:
    month_dummies = []
    for i in range(1, 12):
        month_dummies.append("month_" + str(i+1))
    for month in month_dummies:
        MODEL_FORMULA += " + " + month

if DUMMY_ARGS:
    for key in DUMMY_ARGS.keys():
            MODEL_FORMULA += " + " + key


PROCESSOR_ARGS = {
    "formula": MODEL_FORMULA,#"valores ~ periodo",  # Example formula
    "model": smf.glsar,#smf.ols,  # Use OLS as the regression model for testing
    "find_best_rho": True,  # Disable best rho search for simplicity
    "find_best_rho_args": {
        "start": -1.0,
        "stop": 1.0,
        "step": 0.01,
        "criterio": "aic"
    },
    "rho": None,  # No rho value for OLS
    "cov_type": "HAC",  # Use nonrobust covariance for testing
    "cov_kwargs": {'maxlags': 4},  # No additional covariance arguments
    "iterative_fit": True,  # Enable iterative fitting if needed
    "max_iterations": 100,  # Set maximum iterations for iterative fitting
}





# Define Preprocessors


In [98]:
def BuildDataDF(data):
    string_dates = data[0][1:]  # Store original string dates

    # Converter as strings de datas em objetos datetime
    #date_list = [datetime.strptime(date, "%d/%m/%Y") for date in string_dates]
    date_list = [
    date if isinstance(date, datetime) else datetime.strptime(date, "%d/%m/%Y")
    for date in string_dates
    ]

    # Extrair os nomes das variáveis
    variavel = [row[0] for row in data[1:]]

    # Extrair os dados (sem agrupamento)
    dados = [row[1:] for row in data[1:]]
    dados = np.array(dados, dtype=int)  # Converter para tipo object para evitar warning

    # Criar o DataFrame with DatetimeIndex
    df = pd.DataFrame(data=dados, index=variavel, columns=pd.DatetimeIndex(date_list))

    return df

class DateFilterTransformer(BaseEstimator, TransformerMixin):
    def __init__(
        self, periodos_inclusivos=None, periodos_exclusivos=None, date_col=DATE_NAME
    ):
        self.periodos_inclusivos = periodos_inclusivos
        self.periodos_exclusivos = periodos_exclusivos
        self.date_col = date_col

    def get_date_range(self, Y1, Y2=None, M1=None, M2=None, D1=None, D2=None):
        Y2 = Y2 or Y1
        M1 = M1 or 1
        M2 = M2 or 12
        D1 = D1 or 1
        D2 = D2 or 31
        start_date = datetime(Y1, M1, D1)
        end_date = datetime(Y2, M2, D2)
        return pd.date_range(start=start_date, end=end_date, freq="D")

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        # Convert date column if it's not datetime yet
        if not pd.api.types.is_datetime64_any_dtype(X[self.date_col]):
            X[self.date_col] = pd.to_datetime(X[self.date_col], errors="coerce")

        if self.periodos_inclusivos:
            valid_dates = self.get_date_range(*self.periodos_inclusivos)
            return X[X[self.date_col].isin(valid_dates)].reset_index(drop=True)

        elif self.periodos_exclusivos:
            invalid_dates = self.get_date_range(*self.periodos_exclusivos)
            return X[~X[self.date_col].isin(invalid_dates)].reset_index(drop=True)

        return X

class MeltTransformer(BaseEstimator, TransformerMixin):
    def __init__(
        self,
        var_name=VARIABLE_COLUMN_NAME,
        value_name=DEPENDENT_VAR,
        date_name=DATE_NAME,
        y_name=INDEPENDENT_VAR,
    ):
        self.var_name = var_name
        self.value_name = value_name
        self.date_name = date_name
        self.y_name = y_name

    def fit(self, X, y=None):
        # Nothing to learn, so just return self
        return self

    def transform(self, X):
        # Transpose and reset index
        df_long = X.T.reset_index()

        # Rename index to date_name
        df_long = df_long.rename(columns={"index": self.date_name})

        # Melt the DataFrame
        df_long = pd.melt(
            df_long,
            id_vars=[self.date_name],
            var_name=self.var_name,
            value_name=self.value_name,
        )

        # Sort by category and date
        df_final = df_long.sort_values([self.var_name, self.date_name]).reset_index(
            drop=True
        )

        return df_final

class PeriodIndexAdder(BaseEstimator, TransformerMixin):
    def __init__(self, group_col=VARIABLE_COLUMN_NAME, new_col=INDEPENDENT_VAR):
        self.group_col = group_col
        self.new_col = new_col
        self.period_date_mapping = {}  # Dictionary to store period-date equivalences

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X[self.new_col] = X.groupby(self.group_col).cumcount() + 1

        #Store the mapping of periods to dates
        self.period_date_mapping = dict(zip(X[DATE_NAME], X[self.new_col]))
        #for group, group_data in X.groupby(self.group_col):
        #    self.period_date_mapping[group] = dict(zip(group_data[DATE_NAME], group_data[self.new_col]))

        return X

class DataFrameSorter(BaseEstimator, TransformerMixin):
    """
    Sorts a DataFrame by a specified column.

    Parameters:
    -----------
    sort_by : str
        Name of the column to sort by.
    ascending : bool, default=True
        Whether to sort in ascending order.
    """

    def __init__(self, sort_by, ascending=True):
        self.sort_by = sort_by
        self.ascending = ascending

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        if self.sort_by not in X.columns:
            raise ValueError(f"Column '{self.sort_by}' not found in DataFrame.")
        return X.sort_values(by=self.sort_by, ascending=self.ascending).reset_index(drop=True)

class MonthlyDummyAdder(BaseEstimator, TransformerMixin):
    """
    Adds dummy variables for each month to control for seasonality, leaving December as the baseline.

    Parameters:
    -----------
    date_column : str
        Name of the column containing dates (must be datetime).
    """

    def __init__(self, date_column=DATE_NAME):
        self.date_column = date_column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X[self.date_column] = pd.to_datetime(X[self.date_column])  # Ensure dates are in datetime format

        # Extract month from the date column
        X['month'] = X[self.date_column].dt.month

        # Create dummy variables for each month, excluding December as the baseline
        month_dummies = pd.get_dummies(X['month'], prefix="month", drop_first=True).astype(int)



        # Concatenate the dummy variables with the original DataFrame
        X = pd.concat([X, month_dummies], axis=1)

        # Drop the temporary 'month' column
        X.drop(columns=['month'], inplace=True)

        return X
    

class MultiDummyAdder(BaseEstimator, TransformerMixin):
	"""
	Adds multiple dummy variables based on date ranges or custom conditions.
	Allows each dummy to be binary or a local time trend via its own config.

	Parameters:
	-----------
	dummy_config : dict
		Dictionary defining dummy variables. Example:
		{
			'Pandemia': {
				'start_date': '2020-01-01',
				'end_date': '2020-12-31',
				'add_time_trend': True,
				'exclude_ranges': [('2020-04-01', '2020-05-01')]  # Optional
			}
		}
	date_column : str
		Name of the column containing dates (must be datetime).
	"""

	def __init__(self, dummy_config, date_column=DATE_NAME):
		self.dummy_config = dummy_config
		self.date_column = date_column

	def fit(self, X, y=None):
		return self

	def transform(self, X):
		X = X.copy()
		X[self.date_column] = pd.to_datetime(X[self.date_column])
		X = X.sort_values(self.date_column).reset_index(drop=True)

		for dummy_name, config in self.dummy_config.items():
			start_date = pd.to_datetime(config.get("start_date"))
			end_date = pd.to_datetime(config.get("end_date"))
			use_trend = config.get("add_time_trend", False)
			keep_index = config.get("keep_period_index", False)
			exclude_ranges = config.get("exclude_ranges", [])

			if not start_date:
				raise ValueError(f"Missing 'start_date' for dummy '{dummy_name}'.")

			if end_date is not None:
				mask = (X[self.date_column] >= start_date) & (X[self.date_column] <= end_date)
			else:
				mask = X[self.date_column] >= start_date

			# Excluir ranges dentro do período
			for excl_start, excl_end in exclude_ranges:
				excl_start = pd.to_datetime(excl_start)
				excl_end = pd.to_datetime(excl_end)
				mask &= ~((X[self.date_column] >= excl_start) & (X[self.date_column] <= excl_end))

			if use_trend:
				if keep_index:
					# estilo artigo: período global multiplicado pela dummy
					X[dummy_name] = 0
					X.loc[mask, dummy_name] = X.loc[mask, INDEPENDENT_VAR]
				else:
					start_periodo = X.loc[mask, INDEPENDENT_VAR].min()
					X[dummy_name] = 0
					X.loc[mask, dummy_name] = (X.loc[mask, INDEPENDENT_VAR] - start_periodo + 1).clip(lower=0)
			else:
				X[dummy_name] = mask.astype(int)

		return X


class FrequencyGroupingTransformer(BaseEstimator, TransformerMixin):
    """
    ## only datetime supported groupings
    Transformer for grouping data by standard datetime frequencies.

    This transformer takes a DataFrame with datetime columns and groups
    the data by a specified frequency (e.g., 'Q' for quarterly, 'M' for monthly).

    ARGS:
        frequency (str): The frequency to group by. Defaults to 'Q' (quarterly).
                        Supported frequencies are those accepted by pd.Grouper (e.g., 'Q', 'M', 'Y').
    """

    def __init__(self, frequency="Q"):  # Default to quarterly grouping
        self.frequency = frequency

    def fit(self, X, y=None):
        """
        This transformer does not require fitting.

        ARGS:
            X (pd.DataFrame): The input DataFrame.
            y (Any, optional): Ignored. Defaults to None.

        Returns:
            self: Returns the transformer instance.
        """
        return self

    def transform(self, X):
        """
        Groups the data by the specified frequency and sums the values.

        ARGS:
            X (pd.DataFrame): The input DataFrame with datetime columns.

        Returns:
            pd.DataFrame: The grouped DataFrame with summed values.
                          Columns are datetime objects representing the end of each group.
        """
        if self.frequency == None or self.frequency == 'M':
            return X
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Input must be a pandas DataFrame.")



        # Convert columns to datetime if they are not already
        if not isinstance(X.columns[0], pd.Timestamp):
            X.columns = pd.to_datetime(X.columns, errors="coerce")

        # Group by the specified frequency using pd.Grouper
        grouped_df = X.T.groupby(pd.Grouper(freq=self.frequency)).sum().T

        return grouped_df



# Instantiate Pipe and get RegressionDF

In [99]:
pipe = Pipeline(
    [
        ("grouping", FrequencyGroupingTransformer(frequency=PREPROCESSOR_ARGS['FrequencyDateGroup'])),
        ("melt", MeltTransformer()),
        ("date_filter", DateFilterTransformer(periodos_inclusivos=PREPROCESSOR_ARGS['DateRange'])),
        ("period_index_adder", PeriodIndexAdder()),
        ("dummy_adder", MultiDummyAdder(dummy_config=PREPROCESSOR_ARGS['DummyArgs'])),
        ("monthly_dummy_adder", MonthlyDummyAdder()),
        ("sorter", DataFrameSorter(sort_by=VARIABLE_COLUMN_NAME, ascending=True)),
    ]
)

RawDataDF = BuildDataDF(dados_brutos)
RegressionDF = pipe.fit_transform(RawDataDF)
RegressionDF

Unnamed: 0,date,variavel,valores,periodo,Step_PosPandemia,Choque_Inicial,Lockdown_2021,Trend_PosPandemia,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,2016-01-01,"F00-F09 - Transtornos mentais orgânicos, inclu...",109,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2018-03-01,"F00-F09 - Transtornos mentais orgânicos, inclu...",242,27,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,2018-04-01,"F00-F09 - Transtornos mentais orgânicos, inclu...",223,28,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,2022-04-01,"F00-F09 - Transtornos mentais orgânicos, inclu...",294,76,1,0,0,21,0,0,1,0,0,0,0,0,0,0,0
4,2024-01-01,"F00-F09 - Transtornos mentais orgânicos, inclu...",451,97,1,0,0,42,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1291,2017-08-01,__Atendimentos na APS (exceto saúde mental),122593,20,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1292,2023-02-01,__Atendimentos na APS (exceto saúde mental),180244,86,1,0,0,31,1,0,0,0,0,0,0,0,0,0,0
1293,2017-07-01,__Atendimentos na APS (exceto saúde mental),114118,19,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1294,2022-10-01,__Atendimentos na APS (exceto saúde mental),224278,82,1,0,0,27,0,0,0,0,0,0,0,0,1,0,0


# Get Period Date Map and Dummies Period Indexes

In [100]:

PERIOD_DATE_MAP = pipe.named_steps['period_index_adder'].period_date_mapping
PERIOD_DATE_MAP

{Timestamp('2016-01-01 00:00:00'): 1,
 Timestamp('2016-02-01 00:00:00'): 2,
 Timestamp('2016-03-01 00:00:00'): 3,
 Timestamp('2016-04-01 00:00:00'): 4,
 Timestamp('2016-05-01 00:00:00'): 5,
 Timestamp('2016-06-01 00:00:00'): 6,
 Timestamp('2016-07-01 00:00:00'): 7,
 Timestamp('2016-08-01 00:00:00'): 8,
 Timestamp('2016-09-01 00:00:00'): 9,
 Timestamp('2016-10-01 00:00:00'): 10,
 Timestamp('2016-11-01 00:00:00'): 11,
 Timestamp('2016-12-01 00:00:00'): 12,
 Timestamp('2017-01-01 00:00:00'): 13,
 Timestamp('2017-02-01 00:00:00'): 14,
 Timestamp('2017-03-01 00:00:00'): 15,
 Timestamp('2017-04-01 00:00:00'): 16,
 Timestamp('2017-05-01 00:00:00'): 17,
 Timestamp('2017-06-01 00:00:00'): 18,
 Timestamp('2017-07-01 00:00:00'): 19,
 Timestamp('2017-08-01 00:00:00'): 20,
 Timestamp('2017-09-01 00:00:00'): 21,
 Timestamp('2017-10-01 00:00:00'): 22,
 Timestamp('2017-11-01 00:00:00'): 23,
 Timestamp('2017-12-01 00:00:00'): 24,
 Timestamp('2018-01-01 00:00:00'): 25,
 Timestamp('2018-02-01 00:00:00'):

In [101]:
def find_idx_from_date(target_date, args, period_date_map=PERIOD_DATE_MAP):
    """
    Finds the 'periodo' (index) corresponding to a given datetime in the PERIOD_DATE_MAP.

    Args:
        target_date (str or datetime): The target date to search for (e.g., '2020-01-01').
        period_date_map (dict): The PERIOD_DATE_MAP containing period-to-date mappings.

    Returns:
        int: The 'periodo' (index) corresponding to the target date, or None if not found.
    """
    # Ensure the target_date is a datetime object and normalize to quarter-end
    if isinstance(target_date, str):
        target_date = pd.to_datetime(target_date)
    if args['FrequencyDateGroup'] == 'Q' or args['FrequencyDateGroup'] == 'QE':
        target_date = target_date + pd.offsets.QuarterEnd(0)

    # Iterate through the PERIOD_DATE_MAP to find the matching period
    for date, period in period_date_map.items():
        if isinstance(date, str):
            date = pd.to_datetime(date)
        if args['FrequencyDateGroup'] == 'Q' or args['FrequencyDateGroup'] == 'QE':
            date = date + pd.offsets.QuarterEnd(0) # Normalize to quarter-end
        if date == target_date:
            return period - 1

    return None


def find_date_from_idx(target_idx, args, period_date_map=PERIOD_DATE_MAP):
    """
    Finds the date corresponding to a given 'periodo' (index) in the PERIOD_DATE_MAP.

    Args:
        target_idx (int): The target index to search for (e.g., 1, 2, 3).
        period_date_map (dict): The PERIOD_DATE_MAP containing period-to-date mappings.

    Returns:
        datetime or None: The date corresponding to the target index, or None if not found.
    """
    # Iterate through the PERIOD_DATE_MAP to find the matching index
    for date, period in period_date_map.items():
        if period == target_idx:
            date = pd.to_datetime(date) if isinstance(date, str) else date
            date = date + pd.offsets.QuarterEnd(0) if args['FrequencyDateGroup'] == 'Q' or args['FrequencyDateGroup'] == 'QE' else date
            return date
    return None

In [102]:
DummyIndexes = {}

# Iterate over DUMMY_ARGS to calculate and store indexes
for dummy_name, config in DUMMY_ARGS.items():
    start_date = config.get("start_date")
    end_date = config.get("end_date")
    
    print(dummy_name, config)
    # Calculate indexes using find_idx_from_date
    start_idx = find_idx_from_date(start_date, PREPROCESSOR_ARGS, PERIOD_DATE_MAP) if start_date else None
    end_idx = find_idx_from_date(end_date, PREPROCESSOR_ARGS, PERIOD_DATE_MAP) if end_date else None
    
    # Store the indexes in the table
    DummyIndexes[dummy_name] = {
        "start_index": start_idx,
        "end_index": end_idx
    }
DummyIndexes



Step_PosPandemia {'start_date': '2020-08-01', 'end_date': '2024-12-01', 'exclude_ranges': [('2021-04-01', '2021-05-01')], 'plot': False, 'step': True}
Choque_Inicial {'start_date': '2020-01-01', 'end_date': '2020-07-01'}
Lockdown_2021 {'start_date': '2021-04-01', 'end_date': '2021-05-01'}
Trend_PosPandemia {'start_date': '2020-08-01', 'end_date': '2024-12-01', 'exclude_ranges': [('2021-04-01', '2021-05-01')], 'plot': True, 'add_time_trend': True, 'keep_period_index': False, 'step': False}


{'Step_PosPandemia': {'start_index': 55, 'end_index': 107},
 'Choque_Inicial': {'start_index': 48, 'end_index': 54},
 'Lockdown_2021': {'start_index': 63, 'end_index': 64},
 'Trend_PosPandemia': {'start_index': 55, 'end_index': 107}}

# EXPLORATION 


In [103]:
main_regrdf = RegressionDF
main_regrdf[VARIABLE_COLUMN_NAME]
main_regrdf

Unnamed: 0,date,variavel,valores,periodo,Step_PosPandemia,Choque_Inicial,Lockdown_2021,Trend_PosPandemia,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,2016-01-01,"F00-F09 - Transtornos mentais orgânicos, inclu...",109,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2018-03-01,"F00-F09 - Transtornos mentais orgânicos, inclu...",242,27,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,2018-04-01,"F00-F09 - Transtornos mentais orgânicos, inclu...",223,28,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,2022-04-01,"F00-F09 - Transtornos mentais orgânicos, inclu...",294,76,1,0,0,21,0,0,1,0,0,0,0,0,0,0,0
4,2024-01-01,"F00-F09 - Transtornos mentais orgânicos, inclu...",451,97,1,0,0,42,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1291,2017-08-01,__Atendimentos na APS (exceto saúde mental),122593,20,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1292,2023-02-01,__Atendimentos na APS (exceto saúde mental),180244,86,1,0,0,31,1,0,0,0,0,0,0,0,0,0,0
1293,2017-07-01,__Atendimentos na APS (exceto saúde mental),114118,19,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1294,2022-10-01,__Atendimentos na APS (exceto saúde mental),224278,82,1,0,0,27,0,0,0,0,0,0,0,0,1,0,0


In [104]:
from statsmodels.api import OLS
from statsmodels.tools.tools import add_constant
import pandas as pd
from scipy.stats import f

def chow_test(data, split_point, dependent_var, independent_vars):
    """
    Realiza o teste de Chow para verificar mudanças estruturais em um modelo de regressão.

    Args:
        data (pd.DataFrame): DataFrame contendo os dados.
        split_point (int): Ponto de divisão para o teste de Chow.
        dependent_var (str): Nome da variável dependente.
        independent_vars (list): Lista de nomes das variáveis independentes.

    Returns:
        dict: Resultados do teste de Chow contendo os valores F e p.
    """
    # Dividir os dados em dois subconjuntos
    data1 = data[data['periodo'] <= split_point]
    data2 = data[data['periodo'] > split_point]

    # Ajustar os modelos para cada subconjunto
    X1 = add_constant(data1[independent_vars])
    y1 = data1[dependent_var]
    model1 = OLS(y1, X1).fit()

    X2 = add_constant(data2[independent_vars])
    y2 = data2[dependent_var]
    model2 = OLS(y2, X2).fit()

    # Ajustar o modelo para o conjunto completo
    X_full = add_constant(data[independent_vars])
    y_full = data[dependent_var]
    model_full = OLS(y_full, X_full).fit()

    # Calcular os graus de liberdade
    k = len(independent_vars) + 1  # Número de parâmetros (incluindo o intercepto)
    n1 = len(data1)
    n2 = len(data2)
    n = n1 + n2

    # Calcular a soma dos quadrados dos resíduos
    RSS1 = model1.ssr
    RSS2 = model2.ssr
    RSS_full = model_full.ssr

    # Calcular o valor F
    F = ((RSS_full - (RSS1 + RSS2)) / k) / ((RSS1 + RSS2) / (n - 2 * k))

    # Calcular o p-valor
    p_value = 1 - f.cdf(F, k, n - 2 * k)

    return {"F-statistic": F, "p-value": p_value}


chow_results = {}
split_point = find_idx_from_date(POS_PANDEMIA_START_DATE, PREPROCESSOR_ARGS)  # Exemplo: ponto de divisão para o teste de Chow
dependent_var = DEPENDENT_VAR
independent_vars = [INDEPENDENT_VAR] #+ [key for key in DUMMY_ARGS.keys()]
for variable in main_regrdf[VARIABLE_COLUMN_NAME].unique():
    variable_data = main_regrdf[main_regrdf[VARIABLE_COLUMN_NAME] == variable]
    result = chow_test(variable_data, split_point, dependent_var, independent_vars)
    chow_results[variable] = result
# Converter os resultados em um DataFrame para visualização
chow_results_df = pd.DataFrame(chow_results).T
chow_results_df.columns = ["F-statistic", "p-value"]
chow_results_df['p-value'] = chow_results_df['p-value'].round(8)
print(chow_results_df)

                                                    F-statistic   p-value
F00-F09 - Transtornos mentais orgânicos, inclus...    51.122786  0.000000
F20-F29 - Esquizofrenia, transtornos esquizotíp...   111.216436  0.000000
F30-F39 - Transtornos do humor [afetivos]             69.039246  0.000000
F40-F48 - Transtornos neuróticos, transtornos r...    84.810738  0.000000
F50-F59 - Síndromes comportamentais associadas ...     8.291017  0.000456
F60-F69 - Transtornos da personalidade e do com...     9.679091  0.000140
F70-F79 - Retardo mental                              70.336844  0.000000
F80-F89 - Transtornos do desenvolvimento psicol...   128.272485  0.000000
F90-F98 - Transtornos do comportamento e transt...    60.021728  0.000000
F99-F99 - Transtorno mental não especificado          71.515596  0.000000
_Grand Total                                          89.300706  0.000000
__Atendimentos na APS (exceto saúde mental)           15.201854  0.000002


In [105]:
### TODO: make the knots per variable
### TODO: remove the bumps/step_dummies before testing

pwmodel = pwlf.PiecewiseLinFit(main_regrdf[INDEPENDENT_VAR].values,main_regrdf[DEPENDENT_VAR].values)
knots = pwmodel.fit(2)
print(knots)
for knot in knots:
    print(find_date_from_idx(np.round(knot), PREPROCESSOR_ARGS, PERIOD_DATE_MAP))

[  1.          51.48309284 108.        ]
2016-01-01 00:00:00
2020-03-01 00:00:00
2024-12-01 00:00:00


In [106]:
import pandas as pd
import numpy as np
from statsmodels.stats.diagnostic import het_breuschpagan, acorr_ljungbox
from statsmodels.stats.stattools import durbin_watson
from scipy.stats import shapiro, normaltest, jarque_bera
import statsmodels.api as sm

def analyze_time_series(df: pd.DataFrame, variable_col: str, date_col: str, value_col: str):
    """
    Analyzes a melted DataFrame for heteroscedasticity, distribution, and autocorrelation.

    Parameters:
        df (pd.DataFrame): Melted DataFrame with columns for variable, date, and value.
        variable_col (str): Column name for the variable identifier.
        date_col (str): Column name for the date.
        value_col (str): Column name for the value.

    Returns:
        dict: A dictionary with analysis results for each variable.
    """
    results = {}

    for variable in df[variable_col].unique():
        # Filter data for the current variable
        variable_data = df[df[variable_col] == variable].sort_values(by=date_col)
        y = variable_data[value_col].values
        x = np.arange(len(y))  # Time index as independent variable

        # Add constant for regression
        X = sm.add_constant(x)

        # Fit OLS model
        model = sm.OLS(y, X).fit()

        # Heteroscedasticity tests
        bp_test = het_breuschpagan(model.resid, X)
        bp_pvalue = bp_test[1]

        # Normality tests
        shapiro_test = shapiro(model.resid)
        shapiro_pvalue = shapiro_test.pvalue

        jb_test = jarque_bera(model.resid)
        jb_pvalue = jb_test[1]

        dagostino_test = normaltest(model.resid)
        dagostino_pvalue = dagostino_test.pvalue

        # Autocorrelation tests
        dw_stat = durbin_watson(model.resid)
        ljungbox_test = acorr_ljungbox(model.resid, lags=[10], return_df=True)
        ljungbox_pvalue = ljungbox_test['lb_pvalue'].iloc[0]

        # Store results
        results[variable] = {
            "Heteroscedasticity (Breusch-Pagan)": bp_pvalue,
            "Normality (Shapiro-Wilk)": shapiro_pvalue,
            "Normality (Jarque-Bera)": jb_pvalue,
            "Normality (D'Agostino)": dagostino_pvalue,
            "Autocorrelation (Durbin-Watson)": dw_stat,
            "Autocorrelation (Ljung-Box)": ljungbox_pvalue,
        }

    return results

explorationresults = analyze_time_series(main_regrdf, VARIABLE_COLUMN_NAME, DATE_NAME, DEPENDENT_VAR)
explorationresultsDF = pd.DataFrame(explorationresults).T
explorationresultsDF

Unnamed: 0,Heteroscedasticity (Breusch-Pagan),Normality (Shapiro-Wilk),Normality (Jarque-Bera),Normality (D'Agostino),Autocorrelation (Durbin-Watson),Autocorrelation (Ljung-Box)
"F00-F09 - Transtornos mentais orgânicos, inclusive os sintomáticos",0.147427,0.032837,0.003714996,0.01745659,0.758334,2.1322499999999998e-36
"F20-F29 - Esquizofrenia, transtornos esquizotípicos e transtornos delirantes",0.011537,0.638054,0.8718033,0.7248578,0.354672,3.238254e-92
F30-F39 - Transtornos do humor [afetivos],0.320375,0.9373,0.716458,0.5634862,0.422068,1.791495e-64
"F40-F48 - Transtornos neuróticos, transtornos relacionados com o ""stress"" e transtornos somatoformes",0.048686,0.361423,0.2291813,0.1889178,0.525509,1.600449e-72
F50-F59 - Síndromes comportamentais associadas a disfunções fisiológicas e a fatores físicos,0.932917,0.069847,0.004147643,0.006234204,1.184699,1.049508e-05
F60-F69 - Transtornos da personalidade e do comportamento do adulto,0.035,0.003424,0.01939544,0.01624442,1.045593,1.779468e-07
F70-F79 - Retardo mental,0.014254,0.552957,0.6662058,0.6693367,0.493265,2.5695560000000003e-69
F80-F89 - Transtornos do desenvolvimento psicológico,1.7e-05,0.224135,0.3713773,0.3017818,0.427556,1.76029e-86
F90-F98 - Transtornos do comportamento e transtornos emocionais que aparecem habitualmente durante a infância ou a adolescência,0.000477,0.207121,0.07153962,0.07390851,0.506893,1.210529e-45
F99-F99 - Transtorno mental não especificado,0.005847,2e-06,1.513149e-06,2.313217e-05,0.41791,2.363009e-62


# Define Processors

In [107]:
class RegressionProcessor:
    def __init__(
        self,
        data,
        formula,
        model=None,
        find_best_rho=False,
        find_best_rho_args={
            'start': -1.0,
            "stop": 1.0,
            'step': 0.05,
            'criterio': 'aic',
        },
        rho=None,
        cov_type=None,
        cov_kwargs=None,
        iterative_fit=False,  # New argument to enable iterative fitting
        max_iterations=100,  # Maximum iterations for iterative fitting
    ):
        self.data = data
        self.formula = formula
        self.model = model
        self.rho = rho
        self.cov_type = cov_type
        self.cov_kwargs = cov_kwargs
        self.find_best_rho = find_best_rho
        self.find_best_rho_args = find_best_rho_args
        self.iterative_fit = iterative_fit
        self.max_iterations = max_iterations
        self.results = None

    def fit(self, rho=None):
        # Fit the regression model using the specified formula and data
        if self.find_best_rho:
            best_rho = 1.0
            best_criterion_value = np.inf
            model = None
            results = None
            for rho in np.arange(
                self.find_best_rho_args['start'],
                self.find_best_rho_args['stop'],
                self.find_best_rho_args['step']
            ):
                test_model = self.model(self.formula, data=self.data, rho=rho)
                if self.iterative_fit and hasattr(test_model, "iterative_fit"):
                    test_results = test_model.iterative_fit(
                        maxiter=self.max_iterations,
                        cov_type=self.cov_type,
                        cov_kwds=self.cov_kwargs,
                    )
                else:
                    test_results = test_model.fit(
                        cov_type=self.cov_type,
                        cov_kwds=self.cov_kwargs,
                    )
                criterion_value = getattr(test_results, self.find_best_rho_args['criterio'])
                if criterion_value < best_criterion_value:
                    best_criterion_value = criterion_value
                    best_rho = rho
                    model = test_model
                    results = test_results
            self.rho = best_rho
            self.results = results
        else:
            if self.rho is not None:
                model = self.model(self.formula, data=self.data, rho=rho)
            else:
                model = self.model(self.formula, data=self.data)

            # Use iterative fitting if enabled and supported
            if self.iterative_fit and hasattr(model, "iterative_fit"):
                self.results = model.iterative_fit(
                    maxiter=self.max_iterations,
                    cov_type=self.cov_type,
                    cov_kwds=self.cov_kwargs,
                )
            else:
                self.results = model.fit(
                    cov_type=self.cov_type,
                    cov_kwds=self.cov_kwargs,
                )

In [108]:
class VariableProcessor:
    def __init__(
        self,
        df,       
        processor_args, 
        processor,

    ):
        self.df = df
        self.results = {}
        self.processor_args = processor_args
        self.processor = processor
        
    def process_variables(self):
        # Process each variable in the DataFrame
        for variable in self.df[VARIABLE_COLUMN_NAME].unique():
            variable_data = self.df[self.df[VARIABLE_COLUMN_NAME] == variable]
            processor = self.processor(data=variable_data, **self.processor_args)
            processor.fit()
            self.results[variable] = processor.results

        return self.results

# Instantiate Processor and Process/Export

In [109]:
processor_args = PROCESSOR_ARGS
variable_processor = VariableProcessor(
    df = RegressionDF,
    processor_args = processor_args,
    processor = RegressionProcessor
)

variable_processor.process_variables()
results = variable_processor.results
for variable in results:
    print(f"Results for {variable}:")
    result = results[variable]
    print(result.summary())
    
    #print("AIC", result.aic,)  # Akaike Information Criterion
    #print("BIC", result.bic,)  # Bayesian Information Criterion

    print("\n")

Results for F00-F09 - Transtornos mentais orgânicos, inclusive os sintomáticos:
                           GLSAR Regression Results                           
Dep. Variable:                valores   R-squared:                       0.901
Model:                          GLSAR   Adj. R-squared:                  0.883
Method:                 Least Squares   F-statistic:                     114.7
Date:                seg, 21 abr 2025   Prob (F-statistic):           1.34e-52
Time:                        18:37:03   Log-Likelihood:                -539.60
No. Observations:                 107   AIC:                             1113.
Df Residuals:                      90   BIC:                             1159.
Df Model:                          16                                         
Covariance Type:                  HAC                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------

# Teste Lags

In [None]:
#teste_lag
from collections import defaultdict
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd
import statsmodels.stats.api as sms

lags_to_test = [1, 2, 3, 4, 5, 6, 8, 10]
estatisticas_por_lag = defaultdict(dict)

# Adicionar uma variável para armazenar a informação de ACF
acf_values = defaultdict(dict)

for lag in lags_to_test:
    processor_args["cov_kwargs"] = {"maxlags": lag}
    
    variable_processor = VariableProcessor(
        df=RegressionDF,
        processor_args=processor_args,
        processor=RegressionProcessor
    )
    
    variable_processor.process_variables()
    results = variable_processor.results
    
    for var, result in results.items():
        # extrair estatísticas desejadas
        summary_frame = result.summary2().tables[1]  # tabelinha de coeficientes
        if "Trend_PosPandemia" in summary_frame.index:
            trend_stats = summary_frame.loc["Trend_PosPandemia"]
            estatisticas_por_lag[lag][var] = {
                "coef": trend_stats["Coef."],
                "std_err": trend_stats["Std.Err."],
                "p_value": trend_stats["P>|z|"] if "P>|z|" in trend_stats else trend_stats["P>|t|"],
            }
        else:
            estatisticas_por_lag[lag][var] = {"coef": None, "std_err": None, "p_value": None}
        
        # Outras estatísticas: Durbin-Watson, AIC, BIC, R², F-Estatístico, p-valor do modelo
        estatisticas_por_lag[lag][var]["durbin_watson"] = sm.stats.stattools.durbin_watson(result.resid)
        estatisticas_por_lag[lag][var]["aic"] = result.aic
        estatisticas_por_lag[lag][var]["bic"] = result.bic
        estatisticas_por_lag[lag][var]["r_squared"] = result.rsquared
        estatisticas_por_lag[lag][var]["f_statistic"] = result.fvalue
        estatisticas_por_lag[lag][var]["p_value_model"] = result.f_pvalue
        
        # ACF (Autocorrelation Function) dos resíduos
        acf_result = sm.tsa.acf(result.resid, nlags=lag, fft=True)
        acf_values[lag][var] = acf_result
        
        # Exibir os valores de ACF para verificação
        print(f"Valores da ACF para {var} no lag {lag}: {acf_result}")
        
        # Ajuste: Registra os lags com autocorrelação positiva maior que 0.1 ou qualquer critério ajustado
        acf_significant_lags = [i for i, acf_val in enumerate(acf_result) if abs(acf_val) > 0.1]  # Limite mais relaxado
        estatisticas_por_lag[lag][var]["acf_significant_lags"] = acf_significant_lags

# Converte o dicionário em uma lista de registros
registros = []
for lag, variaveis in estatisticas_por_lag.items():
    for var, stats in variaveis.items():
        linha = {"lag": lag, "variavel": var}
        linha.update(stats)
        registros.append(linha)

# Cria o DataFrame
df_resultados_lags = pd.DataFrame(registros)

# Ordena para facilitar a visualização
df_resultados_lags.sort_values(by=["variavel", "lag"], inplace=True)
#df_resultados_lags.to_csv("df_resultados_lags.csv", sep=';', decimal=',', index=False, encoding='utf-8-sig')
# Exibe os resultados



# Define Hover Graph Func
