In [1]:
import pyodbc
import pandas as pd
import datetime as dt
import os
import pickle

In [3]:
# Location of the pickled DataFrame that load_dataframe() reads.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
file_path = 'E:\\My Drive\\Colab Notebooks\\cb_forecasting\\df_pickle.pkl'

# Load the DataFrame from the pickle file, translating failures into descriptive errors
def load_dataframe(file_path):
    """Load a pickled pandas object from ``file_path``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the pickle file to read.

    Returns
    -------
    object
        Whatever ``pd.read_pickle`` returns for the file.

    Raises
    ------
    FileNotFoundError
        If ``file_path`` does not exist.
    pickle.UnpicklingError
        If the file cannot be unpickled.
    ValueError
        If the pickle protocol is unsupported, or loading failed with any
        other ``ValueError`` (the original message is then included).
    Exception
        For any other failure; the message includes the original error.

    Every re-raised error is chained to the original one (``raise ... from``)
    so the underlying cause stays visible in the traceback.
    """
    # NOTE(security): unpickling can execute arbitrary code; only load trusted files.
    try:
        with open(file_path, 'rb') as file:
            return pd.read_pickle(file)
    except FileNotFoundError as err:
        raise FileNotFoundError(f"The file at '{file_path}' does not exist. Please check the file path and try again.") from err
    except pickle.UnpicklingError as err:
        raise pickle.UnpicklingError(f"The file at '{file_path}' could not be unpickled. The file may be corrupted or incompatible.") from err
    except ValueError as err:
        # Only blame the pickle protocol when the underlying error actually says so.
        if 'unsupported pickle protocol' in str(err):
            raise ValueError(f"The file at '{file_path}' uses an unsupported pickle protocol. "
                             "Ensure that your Python version supports this protocol, or recreate the file with a lower protocol version.") from err
        raise ValueError(f"The file at '{file_path}' could not be loaded: {err}") from err
    except Exception as err:
        raise Exception(f"An unexpected error occurred while loading the pickle file: {err}") from err

# Read the previously saved DataFrame into memory
df_pickled = load_dataframe(file_path=file_path)

ValueError: The file at 'E:\My Drive\Colab Notebooks\cb_forecasting\df_pickle.pkl' uses an unsupported pickle protocol. Ensure that your Python version supports this protocol, or recreate the file with a lower protocol version.

In [None]:
# Ensure the 'ds' column holds datetimes
df_pickled['ds'] = pd.to_datetime(df_pickled['ds'])

# Cut-off: two weeks before the most recent date present in the data
latest_date = df_pickled['ds'].max()
two_weeks_ago = latest_date - pd.Timedelta(weeks=2)

# Keep only the rows on or after the cut-off
df_last_two_weeks = df_pickled.loc[df_pickled['ds'].ge(two_weeks_ago)]

# Number of records per distinct 'ds' value
daily_counts = (
    df_last_two_weeks
    .groupby('ds')
    .size()
    .reset_index(name='record_count')
)

# Show the per-date counts
print(daily_counts)

In [None]:
# Average number of records per 'ds' value in the two-week window
daily_counts['record_count'].mean()

In [None]:
# Read the clock once so year and month come from the same instant
# (two separate now() calls could straddle a month/year boundary).
now = dt.datetime.now()
current_year, current_month = now.year, now.month

# Previous calendar month; January rolls back to December of the prior year
if current_month == 1:
    previous_year, previous_month = current_year - 1, 12
else:
    previous_year, previous_month = current_year, current_month - 1

# Format periods as 'yyyymm'
current_period = f"{current_year}{current_month:02d}"
previous_period = f"{previous_year}{previous_month:02d}"

# Drop the current and previous periods from the pickled data
# (the SQL query re-fetches everything from previous_period onward).
df_pickled = df_pickled[~df_pickled['period'].isin([current_period, previous_period])]

# Show the two periods that were removed
current_period, previous_period

In [None]:
# T-SQL for the sales aggregation that feeds the forecasting data set.
# - One positional parameter (`?`): the lower bound for sd.PERIOD.
# - Weekend transaction dates are moved back to the preceding Friday.
#   NOTE(review): DATEPART(WEEKDAY, ...) returning 1 for Sunday and 7 for Saturday
#   depends on the session's DATEFIRST setting — confirm the server uses DATEFIRST 7.
# - REVENUE_AMOUNT is summed per period, adjusted date, publishing group, SSR row
#   description, channel and front/back-list flag.
# Plain (non-f) string on purpose: the query contains no Python placeholders, so a
# literal brace added to the SQL later cannot be mistaken for a format field.
query_saldet = '''
SELECT
    sd.period
    ,CASE
        WHEN DATEPART(WEEKDAY, sd.TRX_DATE) = 1 THEN DATEADD(DAY, -2, sd.TRX_DATE) -- Adjusting Sunday to Friday
        WHEN DATEPART(WEEKDAY, sd.TRX_DATE) = 7 THEN DATEADD(DAY, -1, sd.TRX_DATE) -- Adjusting Saturday to Friday
        ELSE sd.TRX_DATE
    END AS [ds]
    ,CASE
        WHEN LEFT(i.PUBLISHING_GROUP,3) = 'BAR' THEN 'BAR'
        WHEN i.PUBLISHER_CODE = 'Princeton' THEN 'CPA'
        ELSE i.PUBLISHING_GROUP
    END pgrp
    ,ssr_row.Description ssr
    ,CASE
        WHEN ssr_row.SSRRowID IN('32','146') then 'Consignment'
        WHEN ssr_row.SSRRowID IN('6') then 'Amazon'
        ELSE chan.Description
    END channel
    ,CASE
        WHEN [dbo].[fnFrontBackListCode](i.AMORTIZATION_DATE,sd.TRX_DATE) IN('A','R') THEN 'F'
        ELSE 'B'
    END flbl
    ,SUM(CASE 
            WHEN i.PUBLISHER_CODE = 'Princeton' AND YEAR(sd.TRX_DATE) > 2022 THEN 0 
            ELSE sd.REVENUE_AMOUNT 
        END) AS [y]
FROM
    ebs.Sales sd
    INNER JOIN ssr.SalesSSRRow stie on stie.CUSTOMER_TRX_LINE_ID = sd.CUSTOMER_TRX_LINE_ID
    INNER JOIN ssr.SSRRow ssr_row on ssr_row.SSRRowID= stie.SSRRowID
    INNER JOIN ssr.SubChannel sub on sub.SubChannelID = ssr_row.SubChannelID
    INNER JOIN ssr.Channel chan on chan.ChannelID = sub.ChannelID
    INNER JOIN ebs.Item i ON i.ITEM_ID = sd.ITEM_ID
WHERE
    sd.PERIOD >= ?
    AND sd.INVOICE_LINE_TYPE = 'SALE'
    AND cbq2.dbo.fnSaleTypeCode(SD.AR_TRX_TYPE_ID) = 'N'
    AND i.PRICE_AMOUNT <> 0
    AND i.PUBLISHER_CODE IN('Chronicle','Princeton')
    AND i.PRODUCT_TYPE IN ('BK', 'FT')
GROUP BY
    sd.period
    ,CASE 
        WHEN DATEPART(WEEKDAY, sd.TRX_DATE) = 1 THEN DATEADD(DAY, -2, sd.TRX_DATE) -- Adjusting Sunday to Friday
        WHEN DATEPART(WEEKDAY, sd.TRX_DATE) = 7 THEN DATEADD(DAY, -1, sd.TRX_DATE) -- Adjusting Saturday to Friday
        ELSE sd.TRX_DATE
    END
    ,CASE
        WHEN LEFT(i.PUBLISHING_GROUP,3) = 'BAR' THEN 'BAR'
        WHEN i.PUBLISHER_CODE = 'Princeton' THEN 'CPA'
        ELSE i.PUBLISHING_GROUP
    END
    ,ssr_row.Description
    ,CASE
        WHEN ssr_row.SSRRowID IN('32','146') then 'Consignment'
        WHEN ssr_row.SSRRowID IN('6') then 'Amazon'
        ELSE chan.Description
    END
    ,CASE
        WHEN [dbo].[fnFrontBackListCode](i.AMORTIZATION_DATE,sd.TRX_DATE) IN('A','R') THEN 'F'
        ELSE 'B'
    END
'''

In [None]:
# Runs the SQL query for every period on or after the parameter 'period'
def query_data(period = previous_period):
    """Run ``query_saldet`` against the CBQ2 database and return the rows as a DataFrame.

    Parameters
    ----------
    period : str
        Value bound to the query's ``sd.PERIOD >= ?`` placeholder, in the same
        'yyyymm' format as ``previous_period``. Defaults to the value
        ``previous_period`` had when this function was defined.

    Returns
    -------
    pandas.DataFrame
        Query result with 'ds' parsed as dates, 'y' as float64 and
        'period', 'pgrp', 'channel', 'ssr', 'flbl' as categoricals.
    """
    # SQL python connection to our server
    conn = pyodbc.connect('Driver={SQL Server};'
                          'Server=sql-2-db;'
                          'Database=CBQ2;')
    try:
        return pd.read_sql_query(query_saldet
                                 ,conn
                                 ,dtype={'period':'category'
                                         ,'pgrp':'category'
                                         ,'channel':'category'
                                         ,'ssr':'category'
                                         ,'flbl':'category'
                                         ,'y':'float64'
                                         }
                                 ,parse_dates=['ds']
                                 # Bind the caller's period (previously the global was always used)
                                 ,params=[period]
                                 )
    finally:
        # Always release the database connection, even when the query fails
        conn.close()

In [None]:
# Query with the default period: fetches every row whose period is on or after
# previous_period, i.e. the previous and current month.
df_additional = query_data()

In [None]:
# Row count of the current pickled dataset, followed by its last five rows
df_pickled_rows = len(df_pickled)
print(f'{df_pickled_rows: ,.0f}')

df_pickled.tail(5)

In [None]:
# Row count and preview of the freshly queried rows
df_additional_rows = len(df_additional)
print(f'{df_additional_rows: ,.0f}')
df_additional

In [None]:
# Stack the pickled rows and the freshly queried rows under a new 0..n-1 index
df_combo = pd.concat(
    objs=[df_pickled, df_additional],
    ignore_index=True,
)

In [None]:
# Cast the 'period' and 'ssr' columns of the combined frame to categorical dtype
for column in ('period', 'ssr'):
    df_combo[column] = df_combo[column].astype('category')

In [None]:
# Display the combined DataFrame
df_combo

In [None]:
# Print the column dtypes, non-null counts and memory usage of the combined frame
df_combo.info()

In [None]:
def check_combination(combo=None, pickled=None, additional=None):
    """Report whether the combined frame has exactly the rows of its two inputs.

    Parameters
    ----------
    combo, pickled, additional : pandas.DataFrame, optional
        The combined frame and the two frames it was built from. Each defaults
        to the notebook-level ``df_combo``, ``df_pickled`` and ``df_additional``
        respectively, so the zero-argument call keeps working.

    Prints a success message and the latest 'ds' date when
    ``len(combo) == len(pickled) + len(additional)``; otherwise prints a
    warning. Returns None.
    """
    # Fall back to the notebook globals only for arguments that were not supplied
    combo = df_combo if combo is None else combo
    pickled = df_pickled if pickled is None else pickled
    additional = df_additional if additional is None else additional

    if len(combo) != len(pickled) + len(additional):
        print('Check that the additional periods were correctly added')
        return

    print('Concatenation Worked!')
    max_date = combo['ds'].max()
    if pd.isna(max_date):
        # No usable dates (e.g. empty frame): NaT cannot be formatted with strftime
        print('Last Date: n/a')
        return
    print(f"Last Date: {max_date.strftime('%Y-%m-%d')}")

In [None]:
# Compare the row counts of the three frames and print the latest 'ds' date on success
check_combination()

In [None]:
# Print dtypes, non-null counts and memory usage of the combined frame once more
df_combo.info()

In [None]:
# Path to the folder where you want to save the pickle file
folder_path = 'E:\\My Drive\\Colab Notebooks\\cb_forecasting\\'

# Require an actual directory: os.path.exists() would also accept a regular file here
if not os.path.isdir(folder_path):
    print(f"The folder '{folder_path}' does not exist. Please check the path.")
else:
    # Filename for the pickle file
    filename = 'df_pickle.pkl'

    # Full path for saving the file
    full_path = os.path.join(folder_path, filename)

    # Write to a temporary sibling file first and swap it in afterwards, so a
    # failed write cannot leave a truncated pickle in place of the existing one.
    temp_path = full_path + '.tmp'
    df_combo.to_pickle(temp_path)
    os.replace(temp_path, full_path)