In [117]:
from sklearn.pipeline import Pipeline 
from sklearn.base import RegressorMixin

import pandas as pd
import numpy as np
from typing import Dict, List

from prophet import Prophet

from vacances_scolaires_france import SchoolHolidayDates

import warnings
warnings.simplefilter("ignore")

In [45]:
class DataframeFunctionTransformer():
    """
    Generic class to create custom estimator/transformer
    """
    def __init__(self, func, **kwargs):
        self.func = func
        self.parameters = kwargs

    def transform(self, input_df):
        return self.func(input_df, **self.parameters)

    def fit(self, X, y=None, **fit_params):
        return self

In [15]:
def load_gz_csv_to_df(filepath: str) -> pd.DataFrame:
    try:
        return pd.read_csv(
            filepath, 
            compression='gzip', 
            header=0, sep=',', 
            error_bad_lines=False
        )
    except FileNotFoundError:
        exit(1)

bu_df = load_gz_csv_to_df('data/bu_feat.csv.gz')
train_df = load_gz_csv_to_df('data/train.csv.gz')
test_df = load_gz_csv_to_df('data/train.csv.gz')

In [125]:
def merge_two_df(df1: pd.DataFrame, df2: pd.DataFrame, on: str) -> pd.DataFrame:
    return df1.merge(df2, on=on)

def remove_features(df: pd.DataFrame, features: List[str]) -> pd.DataFrame:
    return df.drop(columns=features, axis=1)

def cast_series_to_datetime(df: pd.DataFrame, column_name: str, date_format: str = "%Y-%m-%d") -> pd.DataFrame:
    df[column_name] = pd.to_datetime(df[column_name], format=date_format)
    return df

def create_year_and_week_column(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    df["year"] = df[column_name].dt.year
    df["week_number"] = df[column_name].dt.week
    return df

def sum_turnover_by_week_bu_dpt(
    df: pd.DataFrame, 
    features: List[str] = ["but_num_business_unit", "dpt_num_department", "year", "week_number"], 
    column_to_sum: str = "turnover"
) -> pd.DataFrame:
    
    return (
        df
        .groupby(features, as_index=False)
        .agg({column_to_sum: "sum"})
    )

def create_id_bu_dpt(df: pd.DataFrame) -> pd.DataFrame:
    df["but_bu_dpt_id"] = (
        df["but_num_business_unit"].astype(str) 
        + "-" 
        + df["dpt_num_department"].astype(str)
    )
    return df

def create_id_year_week_number(df: pd.DataFrame) -> pd.DataFrame:
    df["ds"] = (
        df["year"].astype(str) 
        + "-" 
        + df["week_number"].astype(str)
    )
    return df

def create_date_from_year_and_week_number(df: pd.DataFrame) -> pd.DataFrame:
    df["ds"] = (
        pd.to_datetime(
            df["ds"] + "-1", 
            format="%Y-%W-%w"
        )
    )
    return df

def rename_taget_column(df: pd.DataFrame, target_col_to_rename: str) -> pd.DataFrame:
    return df.rename(columns={target_col_to_rename: "y"})

def create_store_dpt_df(df: pd.DataFrame) -> List[pd.DataFrame]:
    return [df[df["but_bu_dpt_id"] == i] for i in df["but_bu_dpt_id"].unique()]

def select_columns_to_fit(dfs: List[pd.DataFrame], columns_to_fit: List[str] = ["ds", "y", "but_bu_dpt_id"]) -> List[pd.DataFrame]:
    return [df[columns_to_fit] for df in dfs]


def remove_n_last_values(dfs: List[pd.DataFrame], n_value_to_remove: int = 8) -> List[pd.DataFrame]:
    return [df.drop(df.tail(n_value_to_remove).index) for df in dfs]


def create_df_for_modeling(df: pd.DataFrame) -> List[pd.DataFrame]:
    df = sum_turnover_by_week_bu_dpt(df)
    df = create_id_bu_dpt(df)
    df = create_id_year_week_number(df)
    df = create_date_from_year_and_week_number(df)
    df = rename_taget_column(df, target_col_to_rename="turnover")
    
    df_list = create_store_dpt_df(df)
    df_list = remove_n_last_values(df_list)  
    df_list = select_columns_to_fit(df_list)
    
    return df_list

In [150]:
class ProphetEstimator(BaseEstimator):
    def __init__(self, holidays_df):
        super().__init__()
        self.holidays_df = holidays_df
        self.model = Prophet(
            holidays=holidays_df,
            holidays_prior_scale=1
        )
        self.models = {}

    def fit(self, dfs, Y=None):
        models = {}
        for df in dfs:
            print(df.columns)
            df = df.reset_index(drop=True)
            model = Prophet(
                holidays=holidays_df,
                holidays_prior_scale=1
            )
            model.fit(df[["ds", "y"]])
            key = df.loc[0]["but_bu_dpt_id"]
            self.models[key] = model
        return self

    def predict(self, X = None):
        predictions = {}
        for key in self.models.keys():
            future_df = self.models.get(key).make_future_dataframe(periods=8, freq='W')
            predictions[key] = self.models.get(key).predict(future_df)
        return predictions

In [153]:
def create_holidays_df(start_year: int, end_year: int) -> pd.DataFrame:
    years = np.arange(start_year, end_year +1)
    d = SchoolHolidayDates()
    holidays_list = []
    for holiday_year in years:
        holidays_list += list(d.holidays_for_year(holiday_year).keys())
    df = pd.DataFrame(holidays_list, columns=["ds"])
    df["holiday_name"] = "fr_holiday"
    df.rename(columns={"holiday_name": "holiday"}, inplace=True)
    return df
holidays_df = create_holidays_df(2012, 2017)

In [154]:
processing_pipeline = Pipeline(steps=[
    ('create_df', DataframeFunctionTransformer(merge_two_df, df2=bu_df, on="but_num_business_unit")),
    ('remove_correlated_features', DataframeFunctionTransformer(remove_features, features=["but_postcode", "zod_idr_zone_dgr"])),
    ('cast_str_to_datetime', DataframeFunctionTransformer(cast_series_to_datetime, column_name="day_id")),
    ('create_year_and_week_number', DataframeFunctionTransformer(create_year_and_week_column, column_name="day_id")),
    ('create_df_for_modeling', DataframeFunctionTransformer(create_df_for_modeling)),
    ('model', ProphetEstimator(holidays_df))
])

In [171]:
trainf_df_two_bu_dpt = train_df[((train_df.but_num_business_unit == 64)|(train_df.but_num_business_unit == 119))&(train_df.dpt_num_department == 127)]

In [172]:
models = processing_pipeline.fit(trainf_df_two_bu_dpt)

INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.


Index(['ds', 'y', 'but_bu_dpt_id'], dtype='object')


INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.


Initial log joint probability = -4.75784
Index(['ds', 'y', 'but_bu_dpt_id'], dtype='object')
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      99       469.425     0.0236322       105.341           1           1      122   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     142       470.851    0.00139628       72.3066   1.452e-05       0.001      212  LS failed, Hessian reset 
     191       470.935   4.34553e-05       81.6026   6.441e-07       0.001      323  LS failed, Hessian reset 
     199       470.938   7.50992e-06       56.6305      0.2067           1      333   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     219       470.938   8.82067e-06       59.4994    1.13e-07       0.001      395  LS failed, Hessian reset 
     248       470.939   1.59277e-08       57.7983      0.3611           1      437   
Optimization terminated normally: 
  

In [173]:
predictions = models.predict(X=trainf_df_two_bu_dpt)

In [178]:
predictions.get("64-127").tail(8)

Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,fr_holiday,...,holidays,holidays_lower,holidays_upper,yearly,yearly_lower,yearly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
240,2017-08-06,927.635201,1398.84634,2085.702961,927.635201,927.635201,797.106182,797.106182,797.106182,-80.217818,...,-80.217818,-80.217818,-80.217818,877.324,877.324,877.324,0.0,0.0,0.0,1724.741383
241,2017-08-13,913.236509,1157.028248,1855.228214,913.236509,913.236509,595.110199,595.110199,595.110199,-80.217818,...,-80.217818,-80.217818,-80.217818,675.328017,675.328017,675.328017,0.0,0.0,0.0,1508.346708
242,2017-08-20,898.837817,788.660558,1509.985165,898.597485,899.126036,258.785874,258.785874,258.785874,-80.217818,...,-80.217818,-80.217818,-80.217818,339.003692,339.003692,339.003692,0.0,0.0,0.0,1157.623691
243,2017-08-27,884.439124,388.997389,1098.451319,883.169164,885.709944,-123.649726,-123.649726,-123.649726,-80.217818,...,-80.217818,-80.217818,-80.217818,-43.431909,-43.431909,-43.431909,0.0,0.0,0.0,760.789398
244,2017-09-03,870.040432,129.40503,805.988392,867.543474,872.311477,-396.9999,-396.9999,-396.9999,-80.217818,...,-80.217818,-80.217818,-80.217818,-316.782082,-316.782082,-316.782082,0.0,0.0,0.0,473.040533
245,2017-09-10,855.64174,95.139077,824.120547,851.421809,859.362859,-403.455253,-403.455253,-403.455253,0.0,...,0.0,0.0,0.0,-403.455253,-403.455253,-403.455253,0.0,0.0,0.0,452.186487
246,2017-09-17,841.243048,161.347943,823.660715,835.626748,846.906148,-370.24318,-370.24318,-370.24318,0.0,...,0.0,0.0,0.0,-370.24318,-370.24318,-370.24318,0.0,0.0,0.0,470.999868
247,2017-09-24,826.844356,112.416179,827.577038,819.209291,834.443603,-341.882187,-341.882187,-341.882187,0.0,...,0.0,0.0,0.0,-341.882187,-341.882187,-341.882187,0.0,0.0,0.0,484.962169


In [165]:
test_df_two_bu_dpt = test_df[(test_df.but_num_business_unit == 64)&(test_df.dpt_num_department == 127)]

In [179]:
test_df_two_bu_dpt.reset_index(drop=True).head(8).sort_values(by="day_id")

Unnamed: 0,day_id,but_num_business_unit,dpt_num_department,turnover
7,2017-08-12,64,127,1858.334691
6,2017-08-19,64,127,1088.346132
5,2017-08-26,64,127,756.882482
4,2017-09-02,64,127,836.789133
3,2017-09-09,64,127,814.82336
2,2017-09-16,64,127,915.977585
1,2017-09-23,64,127,666.932164
0,2017-09-30,64,127,580.308443
