# Let's go!
## Imports and Set Up
___

In [None]:
import warnings
from pathlib import Path
from copy import deepcopy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.calibration import CalibrationDisplay
from sklearn.compose import (
    make_column_transformer
)
from sklearn.linear_model import (
    LogisticRegression
)
from sklearn.metrics import (
    get_scorer_names, roc_auc_score, roc_curve, RocCurveDisplay
)
from sklearn.model_selection import (
    LeaveOneGroupOut, cross_val_score,
    # TunedThresholdClassifierCV
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (
    MinMaxScaler,
    RobustScaler,
    FunctionTransformer,
    PolynomialFeatures
)

from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf

import catboost as cb
import optuna
from optuna.samplers import TPESampler

In [None]:
# Notebook-wide configuration: silence FutureWarnings, raise pandas' display
# limits and select the seaborn theme.
warnings.filterwarnings(action="ignore", category=FutureWarning)

pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

sns.set_style("ticks")

# Competition files are resolved relative to the directory two levels above
# the current working directory.
INPUT_PATH = Path.cwd().parents[1] / "kaggle/input/playground-series-s5e3"
TRAIN_PATH, TEST_PATH, SUB_PATH = (
    INPUT_PATH / file_name
    for file_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Load the three competition tables: training rows, test rows and the sample
# submission (the latter is kept under the name `y_test`).
X_data, X_test, y_test = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH, SUB_PATH)
)

In [None]:
# Target column first, then the predictors: everything in the training table
# except the row id and the target itself.
y_train = X_data["rainfall"].copy()
X_train = X_data.drop(columns=["id", "rainfall"]).copy()

# The test table loses its id column in place.
X_test.drop(columns="id", inplace=True)

In [None]:
# Fill missing wind-direction values in the test set with pandas' default
# (linear) interpolation between neighbouring rows.
# NOTE(review): wind direction is an angle; linear interpolation ignores the
# 0/360 wrap-around - confirm this is acceptable for the affected row(s).
X_test["winddirection"] = X_test["winddirection"].interpolate()

## Markdown
___
**Observations** 
*  The `day` sequence in the training data is out of order - it looks like a plain data error; rectifying it breaks the continuity of the features
*  The test set has one missing `winddirection` value

**Assumptions**
* 

**To Do**
* [ ] **EDA**
    * [ ] adversarial validation
    * [X] visualise rainy days with heatmaps
* [ ] **FE**
    * [ ] target encoding
    * [ ] residual binning
    * [ ] meta features
    * [ ] feature importance with SHAP
    * [ ] clustering?
    * [X] delta features
    * [X] rolling features
    * [ ] seasonal decompose
* [ ] **Modelling**
    * [ ] stratified cv?
    * [ ] yearly groups for cv?
    * [ ] nested cv?
    * [ ] submit vote-predictions with final models from cross validation
    * [ ] Stacking
    * [ ] Calibration/Classification Reporting with skore


In [None]:
# Synthetic calendar for the training rows, which are treated as consecutive
# days in file order: row i belongs to pseudo-year i // 365 and gets
# day-of-year i % 365 + 1.
# Both arrays are derived from the actual row count instead of a hard-coded
# number of years, so they always have one entry per row and stay consistent
# with each other.
DAYS_PER_YEAR = 365
row_number = np.arange(X_data.shape[0])

dummy_years = row_number // DAYS_PER_YEAR
# X_train["dummy_year"] = dummy_years

# sorted_idx = X_train.sort_values(["dummy_year", "day"]).index
# X_train.sort_values(["dummy_year", "day"], ignore_index=True, inplace=True)
# X_train.drop(columns="dummy_year", inplace=True)
# y_train = y_train[sorted_idx].reset_index(drop=True)

# Overwrite the original "day" column with a clean 1..365 cycle in both the
# raw table and the feature table.
day_of_year = row_number % DAYS_PER_YEAR + 1
X_data["day"] = day_of_year
X_train["day"] = day_of_year.copy()

## EDA
___
### dtype, nunique, notnulls

In [None]:
# Per-column summary of the training features: dtype, number of distinct
# values and number of non-null entries, most varied columns first.
summary_rows = {
    "dtype": X_train.dtypes,
    "nunique": X_train.nunique(),
    "not_null": X_train.notnull().sum(axis=0),
}
info_df = (
    pd.DataFrame(list(summary_rows.values()), index=list(summary_rows))
    .transpose()
    .sort_values("nunique", ascending=False)
)
info_df

In [None]:
# Display the test feature table for a visual check.
X_test

In [None]:
# One line plot per remaining column (row order on the x-axis); the id and
# day columns are left out.
_ = (
    X_data
    .drop(columns=["id", "day"])
    .plot(subplots=True, figsize=(16, 9))
)

In [None]:
# Box plot per feature on a 2x5 grid of independent axes; id, day and the
# target are excluded.
_ = (
    X_data
    .drop(columns=["id", "day", "rainfall"])
    .plot(kind="box", subplots=True, layout=(2, 5), sharex=False, figsize=(16, 9))
)

### Target

In [None]:
# Class balance of the target; missing values, if any, are counted as their own group.
y_train.value_counts(dropna=False)

In [None]:
# Number of rows per distinct "day" value ...
day_count = X_data["day"].value_counts()
# ... and how many day values share each of those row counts.
day_count.value_counts()

In [None]:
# day_count[day_count == 5].index.sort_values()
# day_count[day_count == 7].index.sort_values()

In [None]:
# Grid of the target with one row per pseudo-year and one column per day
# (max over duplicates), then flipped so that days form the sorted index.
rain_by_year = (
    X_data[["day", "rainfall"]]
    .assign(dummy_year=dummy_years)
    .pivot_table(index="dummy_year", columns="day", values="rainfall", aggfunc="max")
)
heatmap_data = rain_by_year.T.sort_index()

# Inclusive (first day, last day) bounds that split the 365 days into four
# roughly equal parts.
quarters = {
    "Q1": (1, 91),
    "Q2": (92, 182),
    "Q3": (183, 273),
    "Q4": (274, 365)
}

# One heatmap row per quarter, stacked vertically.
fig, axes = plt.subplots(nrows=4, ncols=1, figsize=(16, 9), tight_layout=True)
two_tone = sns.color_palette(["gold", "dimgrey"])

for ax, (first_day, last_day) in zip(axes, quarters.values()):
    # The index is sorted, so an inclusive label slice selects the quarter.
    quarter_data = heatmap_data.loc[first_day:last_day]

    # Years on the y-axis, days of the quarter on the x-axis.
    sns.heatmap(
        quarter_data.T,
        ax=ax,
        cmap=two_tone,
        cbar=False,
        lw=0.1,
        linecolor="k",
    )

    ax.set_ylabel("Year")
    ax.set_xlabel("")

# Only the bottom panel carries an x-axis label.
axes[-1].set_xlabel("Day of Year")

In [None]:
# Number of distinct day values inside each pseudo-year.
(
    X_data
    .assign(dummy_year=dummy_years)
    .groupby("dummy_year")["day"]
    .nunique()
)

## FE
___

In [None]:
# def drop_day(df:pd.DataFrame):
#     return df.drop(columns="day")

# drop_day_ft = FunctionTransformer(
#     drop_day, feature_names_out="one-to-one"
# )

# drop_day_ft.transform(X_train)

In [None]:
def sin_cos(df: pd.DataFrame):
    """Encode the ``day`` column as a point on a 365-step circle.

    Returns a 2-column array: sine of the day angle, then cosine.
    """
    angle = 2 * np.pi * df["day"] / 365
    return np.column_stack([np.sin(angle), np.cos(angle)])

def _day_feature_names(transformer, names_in):
    """Output names for `sin_cos`; independent of the input names."""
    return ["day_sin", "day_cos"]


# Transformer wrapper around `sin_cos` for use inside a column transformer.
sin_cos_ft = FunctionTransformer(sin_cos, feature_names_out=_day_feature_names)

# sin_cos_ft.transform(X_train).shape

In [None]:
def wind_sin_cos(df: pd.DataFrame):
    """Split the wind into two speed-scaled components.

    ``winddirection`` is read as degrees on a 360-step circle; its sine and
    cosine are each multiplied by ``windspeed``. Returns a 2-column array
    (sine component, cosine component).
    """
    theta = 2 * np.pi * df["winddirection"] / 360
    speed = df["windspeed"]
    return np.column_stack([np.sin(theta) * speed, np.cos(theta) * speed])

def _wind_feature_names(transformer, names_in):
    """Output names for `wind_sin_cos`; independent of the input names."""
    return ["wind_sin", "wind_cos"]


# Transformer wrapper around `wind_sin_cos` for use inside a column transformer.
wind_sin_cos_ft = FunctionTransformer(
    wind_sin_cos, feature_names_out=_wind_feature_names
)

# wind_sin_cos_ft.fit_transform(X_train)
# sns.relplot(
#     (np.cos(2 * np.pi * X_train["winddirection"] / 360) * X_train["windspeed"]),
#     kind="line", aspect=3,height=3)

In [None]:
def roll_mean(df: pd.DataFrame, window=7):
    """Trailing rolling mean of every column over ``window`` rows.

    The first ``window - 1`` rows have no complete window; they are filled
    backwards with the first available mean.
    """
    trailing_mean = df.rolling(window=window).mean()
    return trailing_mean.bfill()

def _roll_mean7_names(transformer, names_in):
    """Append the ``_roll_mean7`` suffix to every input column name."""
    return [col + "_roll_mean7" for col in names_in]


# 7-row rolling-mean transformer built on `roll_mean`.
roll_mean7_ft = FunctionTransformer(
    roll_mean,
    kw_args={"window": 7},
    feature_names_out=_roll_mean7_names,
)

# roll_mean7_ft.transform(X_train).head(10)

In [None]:
# Row offsets used for the shifted features: 0, -1, ..., -6.
LAG_PERIODS = range(0, -7, -1)


def lag_feat(df: pd.DataFrame, periods=LAG_PERIODS):
    """Return shifted copies of ``df``, one group of columns per offset.

    Parameters
    ----------
    df : pd.DataFrame
        Columns to shift.
    periods : int or iterable of int
        Row offsets passed to ``DataFrame.shift``. For an iterable, the
        frame is shifted once per offset and the results are concatenated
        column-wise, offset by offset.

    Returns
    -------
    pd.DataFrame
        For an iterable ``periods``: columns named ``<column>_lag<offset>``,
        matching the names declared by ``lag_feat_names_out``. For a scalar
        ``periods``: the shifted frame with its original column names.
        Gaps opened by the shift are forward-filled.

    Notes
    -----
    A negative offset moves values up, so a row receives the value of a
    *later* row.
    NOTE(review): with the default offsets these are therefore look-ahead
    features despite the "lag" naming - confirm this is intended.
    """
    if not pd.api.types.is_list_like(periods):
        return df.shift(periods).ffill()

    # Shift and label each offset explicitly. This keeps the column names in
    # sync with the declared feature names (newer scikit-learn releases reject
    # a FunctionTransformer whose output columns disagree with
    # get_feature_names_out) and does not rely on DataFrame.shift accepting
    # an iterable.
    shifted = [
        df.shift(offset).add_suffix(f"_lag{offset}") for offset in periods
    ]
    return pd.concat(shifted, axis=1).ffill()

def lag_feat_names_out(names_in, periods=None):
    """Build the output feature names for the shifted features.

    Parameters
    ----------
    names_in : iterable of str
        Input column names.
    periods : iterable of int, optional
        Row offsets the names are generated for. Defaults to the
        module-level ``LAG_PERIODS``; pass the same offsets that were given
        to ``lag_feat`` when they differ from that default.

    Returns
    -------
    list of str
        ``<column>_lag<offset>`` names, grouped by offset and, within each
        offset, in input column order.
    """
    if periods is None:
        periods = LAG_PERIODS
    return [f"{col}_lag{lag}" for lag in periods for col in names_in]
    
def _lag_feature_names(transformer, names_in):
    """Delegate to `lag_feat_names_out`; the transformer argument is unused."""
    return lag_feat_names_out(names_in)


# Transformer wrapper around `lag_feat`, using its default offsets.
lag_feat_ft = FunctionTransformer(lag_feat, feature_names_out=_lag_feature_names)

# lag_feat_ft.transform(X_train)

In [None]:
def delta_temp(df: pd.DataFrame, window=7, periods=LAG_PERIODS):
    """Rolling-mean and shifted features of the daily temperature range.

    The range is ``maxtemp - mintemp``. Two kinds of features are derived
    from it and concatenated column-wise:

    * its trailing rolling mean over ``window`` rows, back-filled where the
      window is incomplete;
    * one shifted copy per offset in ``periods`` (an int or an iterable of
      ints), forward-filled where the shift opens a gap.

    The returned columns are named by ``delta_temp_names_out`` so that the
    frame and the declared feature names always agree.

    NOTE(review): negative offsets take values from later rows, so with the
    default offsets the shifted columns look ahead - confirm this is
    intended.
    """
    offsets = list(periods) if pd.api.types.is_list_like(periods) else [periods]

    delta = df["maxtemp"] - df["mintemp"]
    delta_roll_mean = delta.rolling(window).mean().bfill()
    delta_lag = pd.concat(
        [delta.shift(offset) for offset in offsets], axis=1
    ).ffill()

    features = pd.concat([delta_roll_mean, delta_lag], axis=1)
    features.columns = delta_temp_names_out(window, offsets)
    return features


def delta_temp_names_out(window=7, periods=None):
    """Feature names produced by ``delta_temp``.

    Parameters
    ----------
    window : int
        Rolling window size; it is embedded in the rolling-mean name.
    periods : iterable of int, optional
        Row offsets of the shifted columns. Defaults to the module-level
        ``LAG_PERIODS``.

    Returns
    -------
    list of str
        ``temp_delta_roll_mean<window>`` followed by one
        ``temp_delta_lag<offset>`` per offset.
    """
    if periods is None:
        periods = LAG_PERIODS
    names_out = [f"temp_delta_roll_mean{window}"]
    # Same "temp_delta" prefix as the rolling-mean column.
    names_out += [f"temp_delta_lag{lag}" for lag in periods]
    return names_out

def _delta_temp_feature_names(transformer, names_in):
    """Delegate to `delta_temp_names_out`; both arguments are unused."""
    return delta_temp_names_out()


# Transformer wrapper around `delta_temp`, using its default window and offsets.
delta_temp_ft = FunctionTransformer(
    delta_temp, feature_names_out=_delta_temp_feature_names
)

# delta_temp_ft.fit_transform(X_train)

In [None]:
# Column selections shared by the feature transformers below.
all_columns = X_train.columns
non_day_columns = X_train.drop(columns="day").columns

# Feature-engineering stage: every entry pairs a transformer with the columns
# it receives; the outputs are concatenated side by side.
pre_proc_t = make_column_transformer(
    # (drop_day_ft, X_train.columns),
    (sin_cos_ft, ["day"]),
    (wind_sin_cos_ft, all_columns),
    (roll_mean7_ft, non_day_columns),
    (lag_feat_ft, non_day_columns),
    (delta_temp_ft, all_columns),
    # (make_pipeline(delta_temp_ft, roll_mean7_ft), ["mintemp", "maxtemp"]),
    # (make_pipeline(delta_temp_ft, lag_feat_ft), ["mintemp", "maxtemp"]),
)

pre_proc_t.fit(X_train, y_train)

In [None]:
# Run the fitted preprocessing on both tables and label the result with the
# generated feature names. `t_` ends up holding the transformed test set;
# the training-set result is only computed, not kept.
feature_names = pre_proc_t.get_feature_names_out()

t_ = pd.DataFrame(pre_proc_t.transform(X_train), columns=feature_names)

t_ = pd.DataFrame(pre_proc_t.transform(X_test), columns=feature_names)

In [None]:
# Seasonal decomposition of the target series with a period of 365 rows.
decomposition = seasonal_decompose(y_train, period=365)

In [None]:
# Trend and seasonal components next to the day value, plus their sum with
# missing component values counted as zero.
decompose_df = (
    pd.DataFrame(
        {
            "day": X_train["day"],
            "trend": decomposition.trend,
            "seasonal": decomposition.seasonal,
        }
    )
    .assign(
        reconstruct=lambda parts: parts["trend"].fillna(0) + parts["seasonal"].fillna(0)
    )
)

In [None]:
# Mean of the reconstructed signal over all rows that share the same day value.
ave_reconstruct = decompose_df.groupby("day")["reconstruct"].mean()

## Modelling
___

In [None]:
# Elastic-net logistic regression with fixed regularisation settings.
elastic_net_clf = LogisticRegression(
    penalty="elasticnet",
    solver="saga",
    C=0.12850774510045,
    l1_ratio=0.9560651431484171,
    max_iter=1000,
    random_state=1717,
)

# Full model: feature engineering -> scaling to [-1, 1] -> classifier.
model_pl = make_pipeline(
    pre_proc_t,
    # RobustScaler(),
    MinMaxScaler(feature_range=(-1, 1)),
    elastic_net_clf,
)

In [None]:
# Fit on the first five 365-row blocks; the remaining rows are left out.
fit_rows = slice(None, 365 * 5)
model_pl.fit(X_train[fit_rows], y_train[fit_rows])

In [None]:
# Classifier coefficients labelled with their feature names; the
# "<transformer>__" prefix added by the column transformer is stripped.
short_names = [
    full_name.split("__")[-1]
    for full_name in pre_proc_t.get_feature_names_out()
]
weights = pd.Series(model_pl[-1].coef_.ravel(), index=short_names)
weights.plot(kind="bar", figsize=(20, 3))

In [None]:
# ROC AUC per held-out pseudo-year: each fold trains on all other years.
cross_val_score(
    model_pl,
    X_train,
    y_train,
    groups=dummy_years,
    cv=LeaveOneGroupOut(),
    scoring="roc_auc",
)

In [None]:
# def objective(trial):
#     C = trial.suggest_float("C", 1e-4, 1e4, log=True)
#     l1_ratio = trial.suggest_float("l1_ratio",0.1, 1.0, log=False)
#     model_pl.set_params(
#         logisticregression__C=C,
#         logisticregression__l1_ratio=l1_ratio,
        
#     )
#     return cross_val_score(
#         model_pl, X_train, y_train,
#         groups=dummy_years, scoring="roc_auc",
#         cv=LeaveOneGroupOut(),
#         n_jobs=-1,
#     ).mean()

# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=50)
# study.best_params

In [None]:
# ROC curve on the rows after the first five 365-row blocks.
holdout_rows = slice(365 * 5, None)
RocCurveDisplay.from_estimator(
    model_pl, X_train[holdout_rows], y_train[holdout_rows]
)

In [None]:
# Calibration curve on the rows after the first five 365-row blocks,
# using 30 quantile-based bins.
holdout_rows = slice(365 * 5, None)
CalibrationDisplay.from_estimator(
    model_pl,
    X_train[holdout_rows],
    y_train[holdout_rows],
    strategy="quantile",
    n_bins=30,
)

## Submission
___

In [None]:
# Refit on all training rows, then write the predicted probability of the
# second class (column 1 of predict_proba) as the submission's "rainfall".
model_pl.fit(X_train, y_train)
rain_probability = model_pl.predict_proba(X_test)[:, 1]
y_test["rainfall"] = rain_probability
y_test.to_csv("submission.csv", index=False)

In [None]:
# Preview the first rows of the submission table.
y_test.head()
