# IMPORTING LIBRARIES

In [None]:
#!pip install --upgrade pip
!pip install pytorch_forecasting
#0.9.2

import os
import time
import torch
import datetime
import numpy as np
import pandas as pd
from pickle import dump
from pickle import load
import dask.dataframe as dd
import pytorch_lightning as pl
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from category_encoders import MEstimateEncoder
from sklearn.model_selection import train_test_split
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import QuantileLoss
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.feature_selection import mutual_info_regression
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters    

# make_mi_scores
Computes the mutual information between every variable in the X dataset and our target y, and returns the scores sorted from highest to lowest.

In [None]:
def make_mi_scores(X, y):
    """Rank the columns of ``X`` by their mutual information with ``y``.

    Returns a Series named "MI Scores", indexed by ``X.columns`` and ordered
    from the highest score to the lowest.
    """
    raw_scores = mutual_info_regression(X, y)
    ranked = pd.Series(raw_scores, index=X.columns, name="MI Scores")
    return ranked.sort_values(ascending=False)


# You could use this function...
to visualize how much of the variance each principal component explains.

In [None]:
def plot_variance(pca, width=8, dpi=100):
    """Plot the explained variance of a fitted PCA, per component and cumulative.

    Parameters
    ----------
    pca : fitted estimator exposing ``n_components_`` and ``explained_variance_ratio_``.
    width : figure width, forwarded to ``fig.set(figwidth=...)``.
    dpi : figure resolution, forwarded to ``fig.set(dpi=...)``.

    Returns
    -------
    The two axes created by ``plt.subplots(1, 2)``: a bar chart of the
    per-component ratio and a line chart of the cumulative ratio.
    """
    fig, axs = plt.subplots(1, 2)
    n = pca.n_components_
    grid = np.arange(1, n + 1)

    # Explained variance per component
    evr = pca.explained_variance_ratio_
    axs[0].bar(grid, evr)
    axs[0].set(
        xlabel="Component", title="% Explained Variance", ylim=(0.0, 1.0)
    )

    # Cumulative variance; a leading 0 is prepended so the curve starts at the origin
    cv = np.cumsum(evr)
    axs[1].plot(np.r_[0, grid], np.r_[0, cv], "o-")
    axs[1].set(
        xlabel="Component", title="% Cumulative Variance", ylim=(0.0, 1.0)
    )

    # Honour the caller's sizing arguments (they used to be ignored in favour of 8 / 100)
    fig.set(figwidth=width, dpi=dpi)
    return axs

# Reading the data from parquet files and first processing
Adds the number of time_id values per investment_id,
to be used later as a model weight or as an encoder variable in further processing.

In [None]:
def process_data_1(case, input_dir="/kaggle/input", max_files=20, min_rows=500):
    """Load the raw training data and add a per-investment row count.

    Parameters
    ----------
    case : ``"a"`` concatenates the per-investment parquet files (paths containing
        "investment_ids"); any other value loads the first file whose path
        contains "train_low_mem".
    input_dir : directory that is walked recursively to discover the files.
    max_files : case "a" only - at most this many candidate files are read.
    min_rows : case "a" only - files with fewer rows than this are skipped.

    Returns
    -------
    The loaded DataFrame with an extra ``time_id_counts_by_investment`` column
    holding, for every row, the number of ``time_id`` values of its
    ``investment_id``.

    Raises
    ------
    FileNotFoundError : no "train_low_mem" file exists under ``input_dir``.
    """
    files = [
        os.path.join(dirname, file)
        for dirname, _, filenames in os.walk(input_dir)
        for file in filenames
    ]

    if case == "a":
        names = [file for file in files if "investment_ids" in file][:max_files]
        # Read each candidate exactly once; the generator lets undersized frames
        # be dropped immediately instead of reading every file a second time.
        frames = (pd.read_parquet(file) for file in names)
        df = pd.concat([frame for frame in frames if frame.shape[0] >= min_rows])
    else:
        matches = [file for file in files if "train_low_mem" in file]
        if not matches:
            raise FileNotFoundError(
                f"no file containing 'train_low_mem' found under {input_dir}"
            )
        df = pd.read_parquet(matches[0])

    print(df.shape)

    # transform() keeps the row order of df, so assign positionally. An index-based
    # join would multiply rows whenever the concatenated files repeat index labels.
    counts = df.groupby("investment_id").time_id.transform("count")
    df["time_id_counts_by_investment"] = counts.to_numpy()

    return df
    

In [None]:

def process_data_2(case="b"):
    """Load the training data and append deterministic Fourier terms per investment.

    Parameters
    ----------
    case : forwarded to ``process_data_1`` (the call used to omit this required
        argument and therefore raised ``TypeError``).

    Returns
    -------
    A DataFrame with one block of rows per ``investment_id`` (in order of first
    appearance), the columns produced by ``DeterministicProcess.in_sample()``
    merged in on the index, ``investment_id`` / ``row_id`` converted to strings
    and a fresh 1-based integer index.
    """
    train = process_data_1(case)

    print(train)

    # The Fourier term does not depend on the investment, so build it once.
    # NOTE(review): the index handed to DeterministicProcess is whatever index
    # process_data_1 returns - confirm it is a date-like index that a calendar
    # term with freq "11m" can work with.
    fourier = CalendarFourier(freq="11m", order=5)

    extra = []

    # sort=False keeps the groups in order of first appearance, matching the
    # previous loop over ``investment_id.unique()`` without rescanning the frame.
    for _, df_1 in train.groupby("investment_id", sort=False):

        dp = DeterministicProcess(
            index=df_1.index, additional_terms=[fourier]
        )

        df_det = dp.in_sample()

        extra.append(df_1.merge(df_det, left_index=True, right_index=True))

    train = pd.concat(extra)

    train["investment_id"] = train["investment_id"].astype(str)

    train["row_id"] = train["row_id"].astype(str)

    train.index = np.arange(1, len(train) + 1)

    return train

In [None]:
def process_3(case="b", random_state=None):
    """Fit the preprocessing pipeline (MI ranking, scaling, PCA, KMeans) on the raw data.

    Parameters
    ----------
    case : forwarded to ``process_data_1``. For ``"a"`` the mutual-information
        ranking uses every row; otherwise it uses rows 200000-399999 only.
    random_state : forwarded to ``PCA`` and ``KMeans``; ``None`` keeps the
        previous unseeded behaviour.

    Returns
    -------
    ``(df, mi_scores, pca, scaler, kmean)`` where ``df`` holds the target/id
    columns, the PCA components (``col_1``..), the ``Cluster`` label, the
    distance to every centroid (``Centroid_0``..) and string versions of
    ``investment_id`` and ``time_id_counts_by_investment``; the other items are
    the MI score Series and the fitted PCA, scaler and KMeans objects.
    """
    df = process_data_1(case)

    # Rank features on the whole frame for case "a", on a fixed slice otherwise.
    q = df if case == "a" else df.iloc[int(2e+5):int(4e+5)]

    non_features = ["row_id", "target", "investment_id", "time_id", "time_id_counts_by_investment"]
    feature_cols = [col for col in q.columns if col not in non_features]
    mi_scores = make_mi_scores(q[feature_cols], q.target)

    # The 50 highest-scoring features feed the scaler and the PCA.
    c_extra = mi_scores.head(50).index.to_list()
    c_extra_2 = ["target", "investment_id", "time_id", "time_id_counts_by_investment"]

    scaler = MinMaxScaler((0, 1))
    X = scaler.fit_transform(df[c_extra])

    pca = PCA(n_components=10, random_state=random_state)
    X = pca.fit_transform(X)

    print(pca.explained_variance_ratio_)
    plot_variance(pca)

    pca_c = [f"col_{a+1}" for a in range(X.shape[1])]
    train = pd.DataFrame(columns=pca_c, data=X, index=df.index)

    kmean = KMeans(n_clusters=15, n_init=10, algorithm="elkan", random_state=random_state)

    # All pieces share df.index, so a column-wise concat lines them up row by row
    # without the index lookup that a join performs.
    df = pd.concat([df[c_extra_2], train], axis=1)

    df["Cluster"] = kmean.fit_predict(train)

    # Distance of every row to each cluster centre
    X_cd = kmean.transform(train)
    centroid_c = [f"Centroid_{i}" for i in range(X_cd.shape[1])]
    X_cd = pd.DataFrame(X_cd, columns=centroid_c, index=df.index)

    df = pd.concat([df, X_cd], axis=1)

    df["Cluster"] = df.Cluster.astype("object")
    df["investment_id_str"] = df.investment_id.astype(str)
    df["time_id_counts_by_investment_str"] = df["time_id_counts_by_investment"].astype(str)
    df["time_id"] = df.time_id.astype("int64")

    return df, mi_scores, pca, scaler, kmean

In [None]:
def plot_periodogram(ts, detrend='linear', ax=None):
    """Draw the periodogram of ``ts`` on a log-scaled frequency axis.

    Parameters
    ----------
    ts : 1-D sequence of samples, handed to ``scipy.signal.periodogram``.
    detrend : detrending mode forwarded to ``periodogram``.
    ax : axes to draw on; a new figure and axes are created when ``None``.

    Returns
    -------
    The axes that were drawn on.
    """
    from scipy.signal import periodogram

    # Sampling frequency: number of one-minute steps in a year. Spelled out in days
    # and minutes because pandas >= 2.0 rejects the "Y" unit in pd.Timedelta("1Y");
    # 365.2425 days is the year length that unit stood for.
    fs = pd.Timedelta(days=365.2425) / pd.Timedelta(minutes=1)
    frequencies, spectrum = periodogram(
        ts,
        fs=fs,
        detrend=detrend,
        window="boxcar",
        scaling='spectrum',
    )
    if ax is None:
        _, ax = plt.subplots()
    ax.step(frequencies, spectrum, color="purple")
    ax.set_xscale("log")
    # Tick positions are expressed in cycles per year
    ax.set_xticks([1, 2, 4, 6, 12, 26, 52, 104,8760,525600/5])
    ax.set_xticklabels(
        [
            "Annual (1)",
            "Semiannual (2)",
            "Quarterly (4)",
            "Bimonthly (6)",
            "Monthly (12)",
            "Biweekly (26)",
            "Weekly (52)",
            "Semiweekly (104)",
            "hourly (8760)",
            "every five minutes (105120)"
        ],
        rotation=30,
    )
    ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
    ax.set_ylabel("Variance")
    ax.set_title("Periodogram")
    return ax


In [None]:
def _load_pickle(path):
    """Unpickle the object stored at ``path``, closing the file afterwards."""
    # NOTE(review): pickle.load can execute arbitrary code; only point this at
    # artifacts that were produced by this project.
    with open(path, "rb") as handle:
        return load(handle)


def substitute_process(df=None, utilities_dir="/kaggle/input/utilities-model"):
    """Apply the pre-fitted preprocessing artifacts to a frame of raw features.

    Parameters
    ----------
    df : raw rows to transform. When ``None`` or empty, the training file is
        loaded through ``process_data_1("b")`` instead. A supplied frame is not
        modified.
    utilities_dir : directory holding the pickled artifacts (``mi_scores.pkl``,
        ``pca.pkl``, ``MinMaxScaler.pkl``, ``kmean_2.pkl``,
        ``meestimateencoder.pkl`` and ``dicc_time_id.pkl``).

    Returns
    -------
    A DataFrame with the id columns (plus ``target`` when present), the PCA
    components ``col_1``.., the ``Cluster`` label, the ``Centroid_i`` distances,
    string versions of ``investment_id`` and ``time_id_counts_by_investment``,
    and ``weight_investment``.

    Raises
    ------
    KeyError : an ``investment_id`` is missing from the pickled lookup tables.
    """
    def artifact(name):
        return _load_pickle(f"{utilities_dir}/{name}")

    if df is None or df.shape[0] == 0:

        train = process_data_1("b")

    else:

        dicc_time_id = artifact("dicc_time_id.pkl")

        # assign() returns a new frame, so the caller's DataFrame is left untouched.
        train = df.assign(
            time_id_counts_by_investment=df.investment_id.apply(lambda x: dicc_time_id[x])
        )

    c_extra_2 = ["investment_id", "time_id", "time_id_counts_by_investment"]

    if "target" in train:

        c_extra_2 = ["target"] + c_extra_2

    mi_scores = artifact("mi_scores.pkl")

    pca = artifact("pca.pkl")

    scaler = artifact("MinMaxScaler.pkl")

    kmean = artifact("kmean_2.pkl")

    mee = artifact("meestimateencoder.pkl")

    # Same 50 top-ranked features that the scaler and the PCA were fitted on
    scaler_d = scaler.transform(train[mi_scores.head(50).index])

    pca_d = pca.transform(scaler_d)

    pca_c = [f"col_{a+1}" for a in range(pca_d.shape[1])]

    pca_d = pd.DataFrame(data=pca_d, columns=pca_c, index=train.index)

    pca_d = train[c_extra_2].join(pca_d)

    # NOTE(review): predict() is fed float32 while transform() below gets the
    # original dtype - kept as found; confirm the cast is still required.
    pca_d["Cluster"] = kmean.predict(pca_d[pca_c].astype(np.float32))

    X_cd = kmean.transform(pca_d[pca_c])

    centroid_c = [f"Centroid_{i}" for i in range(X_cd.shape[1])]

    X_cd = pd.DataFrame(data=X_cd, columns=centroid_c, index=train.index)

    pca_d = pca_d.join(X_cd)

    pca_d["investment_id_str"] = pca_d.investment_id.apply(str)

    pca_d["time_id_counts_by_investment_str"] = pca_d["time_id_counts_by_investment"].apply(str)

    # Explicit dict lookup so that an unknown investment_id raises instead of yielding NaN
    pca_d["weight_investment"] = pca_d.investment_id.apply(lambda x: mee[x])

    return pca_d

In [None]:
# Build the modelling frame: called without a frame, substitute_process() loads the
# training file via process_data_1("b") and applies the pickled preprocessing artifacts.
train = substitute_process()

In [None]:
# Disabled cell: the code is wrapped in a string literal, so none of it runs. It is the
# one-off script that fitted the preprocessing objects with process_3() and pickled them.
# NOTE(review): before re-enabling, check that (1) `test` is used here although it is only
# defined by the train/test split cell further down, and (2) the KMeans model is dumped as
# "kmean.pkl" while substitute_process() loads "kmean_2.pkl".
"""train, mi_scores, pca, scaler, kmean = process_3()

dump(mi_scores,open("mi_scores.pkl","wb"))

dump(pca,open("pca.pkl","wb"))

dump(train,open("data_processed.pkl","wb"))

dump(scaler,open("MinMaxScaler.pkl","wb"))

dump(kmean,open("kmean.pkl","wb"))

dicc_time_id = {k:v for k,v in train[["investment_id","time_id_counts_by_investment"]].value_counts().index}

dump(dicc_time_id,open("dicc_time_id.pkl","wb"))

encoder = MEstimateEncoder(cols=["investment_id_str"],m=5)

encoder.fit(test[[col for col in test.columns if col not in ["target","weight_investment"]]],test["target"])

train["weight_investment"] = encoder.transform(train[[col for col in test.columns if col not in ["target","weight_investment"]]], train["target"]).investment_id_str

q = train[["weight_investment","investment_id"]].value_counts().index

q = {k:v for k,v in q}

dump(q,open("meestimateencoder.pkl","wb"))"""

In [None]:
# Hold out 30% of the rows. The split is seeded so that a fresh Restart & Run All yields
# the same partition (pl.seed_everything is only called in a later cell).
# NOTE(review): this is a random row-wise split of time-indexed data - confirm that a
# chronological split on time_id is not what is wanted here.
train, test = train_test_split(train, test_size=0.3, random_state=42)

In [None]:
# Real-valued model inputs: every column whose name contains "f_" or "Cen".
# NOTE(review): the frame returned by substitute_process() carries no "f_" columns, so
# only the Centroid_* columns match and the PCA col_N columns are left out - confirm.
columns = [name for name in train.columns if any(tag in name for tag in ("f_", "Cen"))]

# Columns whose name contains "cos" or "sin".
cosines = [name for name in train.columns if any(tag in name for tag in ("cos", "sin"))]

In [None]:
# Cast the columns used below as the time index and as a categorical:
# time_id becomes an integer, Cluster becomes a string.
train["time_id"] = train["time_id"].astype("int")

train["Cluster"] = train["Cluster"].astype("str")

In [None]:
# Training dataset for the Temporal Fusion Transformer, built from the `train` split.
# NOTE(review): "Cluster" is part of group_ids although it is predicted row by row
# upstream, so one investment may be spread over several groups - confirm this is intended.
training = TimeSeriesDataSet(
    train,
    time_idx = "time_id",  # integer time index (cast in the previous cell)
    target="target",
    group_ids = ["investment_id_str","Cluster"],  # columns that identify one series
    min_encoder_length = 0,
    max_encoder_length = 20,
    min_prediction_length= 1,
    # lagged targets are currently switched off
    #lags= {"target":[a for a in range(1,5)]},
    max_prediction_length = 50,
    static_categoricals = ["investment_id_str","Cluster"],
    static_reals = ["weight_investment"],
    weight="time_id_counts_by_investment",  # per-row weight: rows per investment_id
    time_varying_known_reals = ["time_id"]+columns,  # `columns` is built two cells above
    time_varying_unknown_reals = ["target"],
    # custom target normalisation is currently switched off (library default is used)
    #target_normalizer= GroupNormalizer(groups=["Cluster","investment_id_str"],transformation="softplus"),
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
    allow_missing_timesteps=True
    )

In [None]:
# Validation dataset reusing the configuration of `training`.
# NOTE(review): it is built from `train` again, not from the held-out `test` split -
# confirm that validating on the training rows is intended.
validation = TimeSeriesDataSet.from_dataset(training,train,
                                            predict=True,
                                            stop_randomization=True
                                           )

batch_size = 1500  # the original note suggested 32 to 128; this run uses 1500
# The validation loader uses a 10x larger batch than the training loader.
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size * 10, num_workers=0)

In [None]:
# Seed the random number generators before the trainer and the network are created.
pl.seed_everything(42)
trainer = pl.Trainer(
    # accelerator options are switched off, so the default device is used
    #gpus = 1,
    #tpu_cores=8,
    max_epochs=1,
    sync_batchnorm=True,
    #auto_scale_batch_size="binsearch",
    #limit_train_batches=500,
    # clipping gradients is a hyperparameter and important to prevent divergence
    # of the gradient for recurrent neural networks
    gradient_clip_val=0.1,
)


# Network hyperparameters. A later cell loads saved weights into this model, so the
# architecture settings here have to stay in step with that checkpoint.
tft = TemporalFusionTransformer.from_dataset(
    training,
    # not meaningful for finding the learning rate but otherwise very important
    #n_trials=100,
    learning_rate=0.000213756,
    #batch_size = 256,
    lstm_layers=1,
    hidden_size=32,  # most important hyperparameter apart from learning rate
    # number of attention heads. The guidance kept from the original note is "up to 4
    # for large datasets". NOTE(review): 5 is used here and hidden_size=32 is not a
    # multiple of 5 - confirm.
    attention_head_size=5,
    dropout=0.1,  # between 0.1 and 0.3 are good values
    hidden_continuous_size=16,  # set to <= hidden_size
    output_size=9,  # one output per quantile listed in the loss below
    # NOTE(review): the lowest quantile is 0.001 while the highest is 0.99 - confirm that
    # 0.001 (rather than 0.01) is intended.
    loss=QuantileLoss([0.001,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99]),
    # reduce learning rate if no improvement in validation loss after x epochs
    #reduce_on_plateau_patience=4,
)
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")



In [None]:
# Show a summary of the network built above.
# NOTE(review): presumably only available in the older pytorch-lightning releases this
# notebook targets - confirm against the installed version.
tft.summarize()

In [None]:
# Restore previously trained weights, mapping the tensors onto the CPU.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
tft.load_state_dict(torch.load("/kaggle/input/utilities-model/preliminar_model.pth",map_location=torch.device("cpu")))

In [None]:
# Disabled cell (string literal, not executed): learning-rate search with the trainer's
# tuner over the range 0.001 to 1, followed by a plot of the suggestion.
"""res = trainer.tuner.lr_find(
    tft,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
    max_lr=1,
    min_lr=0.001,
)

print(f"suggested learning rate: {res.suggestion()}")
fig = res.plot(show=True, suggest=True)
fig.show()"""

In [None]:
# Disabled cell (string literal, not executed): garbage collection plus a CUDA cache flush.
"""import gc
gc.collect(generation=0)
torch.cuda.empty_cache()"""

In [None]:
# Disabled cell (string literal, not executed): fits the model with the trainer and saves
# the weights as "preliminar_model.pth" - the file name that the load_state_dict cell reads
# from the utilities dataset. The tf.io.gfile line swaps in TensorBoard's file-system stub.
"""import tensorflow as tf
import tensorboard as tb

tf.io.gfile = tb.compat.tensorflow_stub.io.gfile

trainer.fit(
        tft,
        train_dataloaders=train_dataloader,
        val_dataloaders=val_dataloader,
    )

torch.save(tft.state_dict(),"preliminar_model.pth")"""

In [None]:
# Disabled cell (string literal, not executed): builds a dataset and dataloader from `test`
# with the same settings as `training`.
# NOTE(review): before re-enabling, check that (1) the dataset is constructed from scratch
# on `test` instead of being derived from `training` with from_dataset, and (2) the loader
# is created with train=True although it serves test data.
"""
testing = TimeSeriesDataSet(
    test,
    time_idx = "time_id",
    target="target",
    group_ids = ["investment_id_str","Cluster"],
    min_encoder_length = 0,
    max_encoder_length = 20,
    min_prediction_length= 1,
    #lags= {"target":[a for a in range(1,5)]},
    max_prediction_length = 50,
    static_categoricals = ["investment_id_str","Cluster"],
    static_reals = ["weight_investment"],
    weight="time_id_counts_by_investment",
    time_varying_known_reals = ["time_id"]+columns,
    time_varying_unknown_reals = ["target"],
    #target_normalizer= GroupNormalizer(groups=["Cluster","investment_id_str"],transformation="softplus"),
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
    allow_missing_timesteps=True
    )
testing = TimeSeriesDataSet.from_dataset(testing,test,
                                            predict=True,
                                            stop_randomization=True
                                           )

test_dataloader = testing.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
"""

In [None]:
# Disabled cell (string literal, not executed): dataset parameters of `testing` with every
# key containing "target" removed.
"""params = {k:v for k,v in testing.get_parameters().items() if not "target" in k}"""

In [None]:
# Disabled cell (string literal, not executed): lists the contents of the input directory.
"""!dir /kaggle/input"""

In [None]:
# Disabled cell (string literal, not executed): reads the example test parquet file.
"""outside_data = pd.read_parquet("/kaggle/input/ubiquant-parquet/example_test.parquet")"""

In [None]:
# Disabled cell (string literal, not executed): runs the example test rows through
# substitute_process(), i.e. through the pickled preprocessing artifacts.
"""outside_data = substitute_process(df=outside_data)"""

In [None]:
# Disabled cell (string literal, not executed): adds a constant placeholder target of 0 and
# casts Cluster to str and time_id to int, mirroring the casts applied to `train`.
"""outside_data["target"] = 0
outside_data["Cluster"] = outside_data.Cluster.apply(str)
outside_data["time_id"] = outside_data.time_id.apply(int)"""

In [None]:
# Disabled cell (string literal, not executed): dataset and dataloader for `outside_data`,
# with max_prediction_length reduced to 3.
# NOTE(review): before re-enabling, check that the final line builds the loader from
# `testing` rather than from `testing_2`, and that train=True is wanted for inference data.
"""testing_2 = TimeSeriesDataSet(
    outside_data,
    time_idx = "time_id",
    target="target",
    group_ids = ["investment_id_str","Cluster"],
    min_encoder_length = 0,
    max_encoder_length = 20,
    min_prediction_length= 1,
    #lags= {"target":[a for a in range(1,5)]},
    max_prediction_length = 3,
    static_categoricals = ["investment_id_str","Cluster"],
    static_reals = ["weight_investment"],
    weight="time_id_counts_by_investment",
    time_varying_known_reals = ["time_id"]+columns,
    time_varying_unknown_reals = ["target"],
    #target_normalizer= GroupNormalizer(groups=["Cluster","investment_id_str"],transformation="softplus"),
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
    allow_missing_timesteps=True
    )
testing_2 = TimeSeriesDataSet.from_dataset(testing_2,outside_data,
                                            predict=True,
                                            stop_randomization=True
                                           )

test_2_dataloader = testing.to_dataloader(train=True, batch_size=1, num_workers=0)
"""