# Let's go!
## Imports and Set Up
___

In [None]:
import warnings
from pathlib import Path
from copy import deepcopy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import set_config
from sklearn.base import (
    BaseEstimator, TransformerMixin
)
from sklearn.calibration import CalibrationDisplay, CalibratedClassifierCV
from sklearn.compose import (
    make_column_transformer
)
from sklearn.decomposition import PCA
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import (
    LogisticRegression, SGDClassifier
)
from sklearn.manifold import TSNE
from sklearn.metrics import (
    get_scorer_names, roc_auc_score, roc_curve, RocCurveDisplay
)
from sklearn.model_selection import (
    LeaveOneGroupOut, cross_val_score, KFold
    # TunedThresholdClassifierCV
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (
    MinMaxScaler,
    RobustScaler,
    FunctionTransformer,
    PolynomialFeatures
)

from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf

import catboost as cb
import optuna
from optuna.samplers import TPESampler

In [None]:
# Hide FutureWarning messages so deprecation notices do not clutter cell output.
warnings.filterwarnings("ignore", category=FutureWarning)

# Show up to 500 rows / columns before pandas truncates a displayed frame.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)

sns.set_style("ticks")

# Ask scikit-learn transformers to return DataFrames rather than NumPy arrays.
set_config(transform_output="pandas")

# The input folder is resolved from the grandparent of the current working
# directory (presumably /kaggle/working on Kaggle -- confirm when run elsewhere).
INPUT_PATH = Path.cwd().parents[1] / 'kaggle/input/playground-series-s5e3'
TRAIN_PATH = INPUT_PATH / "train.csv"
TEST_PATH = INPUT_PATH / "test.csv"
SUB_PATH = INPUT_PATH / "sample_submission.csv"

# NOTE(review): OPTIMIZE is not read anywhere in the visible code; presumably it
# was meant to gate the commented-out Optuna studies -- confirm.
OPTIMIZE = False

In [None]:
# Load the competition files. `y_test` holds the sample-submission frame; its
# "rainfall" column is overwritten with predictions in the submission cell.
X_data = pd.read_csv(TRAIN_PATH)
X_test = pd.read_csv(TEST_PATH)
y_test = pd.read_csv(SUB_PATH)

In [None]:
# Separate features from the target. `X_data` keeps "id" and "rainfall" because
# the EDA cells below still read them.
X_train = X_data.drop(columns=["id", "rainfall"]).copy()
y_train = X_data["rainfall"].copy()
# The test frame is modified in place: only the "id" column is removed.
X_test.drop(columns="id", inplace=True)

In [None]:
# Fill missing wind-direction values in the test set from the neighbouring rows.
# NOTE(review): interpolate() treats the angle as an ordinary number, so a gap
# that straddles the 0/360 wrap-around would be filled incorrectly -- confirm
# this does not apply to the missing value.
X_test["winddirection"] = X_test["winddirection"].interpolate()

## Markdown
___
**Observations** 
*  Rectifying the messed-up day sequence in the training data breaks the continuity of the features
*  Test set has one missing winddir value

**Assumptions**
* day sequence discontinuity in years 2 and 3 is an error 

**To Do**
* [ ] **EDA**
    * [ ] adversarial validation
    * [X] visualise rainy days with heatmaps
* [ ] **FE**
    * [ ] target encoding
    * [ ] residual binning
    * [ ] meta features
    * [ ] feature importance with SHAP
    * [ ] clustering?
    * [X] delta features
    * [X] rolling features
    * [ ] seasonal decompose
* [ ] **Modelling**
    * [X] yearly groups for cv?
    * [ ] nested cv?
    * [ ] submit vote-predictions with final models from cross validation
    * [ ] Stacking
    * [ ] Calibration
    * [ ] Tuning threshold
    * [ ] Classification Reporting with skore


In [None]:
# The data carries no calendar year, so a synthetic year label is derived purely
# from row position: every consecutive run of 365 rows is one "year".
n_years = X_data.shape[0] // 365
dummy_years = np.repeat(np.arange(n_years), 365)
# X_train["dummy_year"] = dummy_years

# sorted_idx = X_train.sort_values(["dummy_year", "day"]).index
# X_train.sort_values(["dummy_year", "day"], ignore_index=True, inplace=True)
# X_train.drop(columns="dummy_year", inplace=True)
# y_train = y_train[sorted_idx].reset_index(drop=True)

# Overwrite "day" with a clean 1..365 cycle per synthetic year. The repeat count
# comes from the data (it used to be a hard-coded 6) so that it always agrees
# with `dummy_years`.
for frame in (X_data, X_train):
    frame["day"] = np.tile(np.arange(1, 366), n_years)

## EDA
___
### dtype, nunique, notnulls

In [None]:
# Per-column overview of the training features: dtype, number of distinct values
# and number of non-null entries, ordered from most to least distinct values.
summary_rows = [
    X_train.dtypes,
    X_train.nunique(),
    X_train.notnull().sum(axis=0),
]
info_df = pd.DataFrame(summary_rows, index=["dtype", "nunique", "not_null"]).T
info_df = info_df.sort_values("nunique", ascending=False)
info_df

In [None]:
# One line plot per column (target included) against row order.
_ = X_data.drop(columns=["id", "day"]).plot(subplots=True, figsize=(16,9))

In [None]:
# One box plot per feature on a 2x5 grid; x-axes are not shared between panels.
_ = X_data.drop(columns=["id", "day", "rainfall"]).plot(
    subplots=True, figsize=(16,9), layout=(2,5), kind="box", sharex=False
)

### Target

In [None]:
# Class balance of the target, counting missing values as their own category.
y_train.value_counts(dropna=False)

In [None]:
# How often each "day" value occurs, then how many days share each occurrence
# count (a quick check that every day appears equally often).
day_count = X_data["day"].value_counts()
day_count.value_counts()

In [None]:
# day_count[day_count == 5].index.sort_values()
# day_count[day_count == 7].index.sort_values()

In [None]:
# Pivot the data and transpose so that days are the index
# Arrange the rain indicator as a (day x synthetic year) grid. "max" collapses
# any duplicate (year, day) pair into a single value.
rain_grid = (
    X_data[["day", "rainfall"]]
    .assign(dummy_year=dummy_years)
    .pivot_table(columns="day", index="dummy_year", values="rainfall", aggfunc='max')
    .T  # rows become days, columns become years
)
heatmap_data = rain_grid.sort_index()

# Inclusive day ranges that split the 365-day year into four roughly equal parts.
quarters = {"Q1": (1, 91), "Q2": (92, 182), "Q3": (183, 273), "Q4": (274, 365)}

# One stacked panel per quarter so that individual days stay legible.
fig, axes = plt.subplots(4, 1, figsize=(16, 9), tight_layout=True)

for panel, (first_day, last_day) in zip(axes, quarters.values()):
    in_quarter = (heatmap_data.index >= first_day) & (heatmap_data.index <= last_day)
    sns.heatmap(
        heatmap_data.loc[in_quarter].T,
        ax=panel,
        cmap=sns.color_palette(["gold", "dimgrey"]),
        cbar=False,
        lw=0.1,
        linecolor="k",
    )
    panel.set_ylabel("Year")
    panel.set_xlabel("")

# Only the bottom panel carries the x-axis label.
axes[-1].set_xlabel("Day of Year")

In [None]:
# Number of distinct "day" values inside each synthetic year.
(
    X_data[["day", "rainfall"]]
    .assign(dummy_year=dummy_years)
    .groupby("dummy_year")["day"].nunique()
)

In [None]:
# Put the target on a synthetic daily calendar (the start date is arbitrary) so
# that it can be resampled and decomposed as a time series.
ts = y_train.copy()
ts.index = pd.date_range(start="2000-01-01", periods=len(y_train), freq="D")
# Weekly mean of the 0/1 target = share of rainy days per week. "W" is the
# documented weekly offset alias; the lower-case spelling used before relied on
# lenient alias parsing.
decomposition = seasonal_decompose(ts.resample("W").mean(), model="additive")
fig = decomposition.plot()
fig.set_size_inches(16, 9)

In [None]:
# Autocorrelation of the weekly rain share for lags of 1..59 weeks. "W" is the
# documented weekly offset alias (previously spelled in lower case).
_ = plot_acf(ts.resample("W").mean(), lags=range(1,60))

## FE
___

In [None]:
# def drop_day(df:pd.DataFrame):
#     return df.drop(columns="day")

# drop_day_ft = FunctionTransformer(
#     drop_day, feature_names_out="one-to-one"
# )

# drop_day_ft.transform(X_train)

In [None]:
def sin_cos(df:pd.DataFrame):
    """Encode ``df["day"]`` as a point on the unit circle with a 365-day period.

    Returns a 2-column NumPy array: sine in column 0, cosine in column 1.
    """
    angle = 2 * np.pi * df["day"] / 365
    return np.column_stack([np.sin(angle), np.cos(angle)])

# scikit-learn wrapper around sin_cos; the lambda supplies the names of the two
# output columns.
sin_cos_ft = FunctionTransformer(
    sin_cos, feature_names_out=lambda self, names_in: ["day_sin", "day_cos"]
)

# sin_cos_ft.transform(X_train).shape

In [None]:
def wind_sin_cos(df:pd.DataFrame):
    """Split the wind into two components scaled by ``windspeed``.

    ``winddirection`` is read as an angle in degrees (full circle = 360).
    Returns a 2-column NumPy array: ``sin(direction) * speed`` in column 0 and
    ``cos(direction) * speed`` in column 1.
    """
    theta = 2 * np.pi * df["winddirection"] / 360
    speed = df["windspeed"]
    return np.column_stack([np.sin(theta) * speed, np.cos(theta) * speed])

# scikit-learn wrapper around wind_sin_cos; the lambda supplies the names of the
# two output columns.
wind_sin_cos_ft = FunctionTransformer(
    wind_sin_cos, feature_names_out=lambda self, names_in: ["wind_sin", "wind_cos"]
)

# wind_sin_cos_ft.fit_transform(X_train)
# sns.relplot(
#     (np.cos(2 * np.pi * X_train["winddirection"] / 360) * X_train["windspeed"]),
#     kind="line", aspect=3,height=3)

In [None]:
def roll_mean(df:pd.DataFrame, window=7):
    """Return the trailing ``window``-row mean of every column.

    The first ``window - 1`` rows have no complete window; they are back-filled
    with the first available mean.
    """
    averaged = df.rolling(window).mean()
    return averaged.bfill()

# 7-row rolling-mean transformer; each output column is named
# "<input column>_roll_mean7".
roll_mean7_ft = FunctionTransformer(
    roll_mean, kw_args={"window": 7},
    feature_names_out=lambda self, names_in: [c + "_roll_mean7" for c in names_in]
)

# roll_mean7_ft.transform(X_train).head(10)

In [None]:
# Shift offsets 0, -1, ..., -6. A negative offset moves values upwards, so row t
# receives the value found |offset| rows further down the frame.
LAG_PERIODS = range(-6, 1)[::-1]


def lag_feat(df:pd.DataFrame, periods=LAG_PERIODS):
    """Return shifted copies of every column of ``df``, side by side.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are shifted.
    periods : int or iterable of int
        Shift offset(s). For an iterable, one block of columns is produced per
        offset, in the given order, named ``"<column>_lag<offset>"``. These are
        the names ``lag_feat_names_out`` reports. A single integer keeps the
        original column names.

    Returns
    -------
    pd.DataFrame
        Shifted columns; gaps opened by the shift are forward-filled where a
        previous value exists.
    """
    if isinstance(periods, (int, np.integer)):
        return df.shift(periods).ffill()

    # One shift per offset (instead of a single list-like shift) keeps the
    # column names under our control and does not need list-like `shift` support.
    shifted = [df.shift(period).add_suffix(f"_lag{period}") for period in periods]
    return pd.concat(shifted, axis=1).ffill()

def lag_feat_names_out(names_in):
    """Return one ``"<name>_lag<offset>"`` label per input name and offset.

    Labels are grouped by offset (in ``LAG_PERIODS`` order), then by input name.
    """
    return [col + f"_lag{lag}" for lag in LAG_PERIODS for col in names_in]
    
# scikit-learn wrapper around lag_feat; output names come from lag_feat_names_out.
lag_feat_ft = FunctionTransformer(
    lag_feat, 
    feature_names_out=lambda self, names_in: lag_feat_names_out(names_in)
)

# lag_feat_ft.transform(X_train)

In [None]:
def delta_temp(df:pd.DataFrame, window=7, periods=LAG_PERIODS):
    """Build rolling-mean and shifted features of ``maxtemp - mintemp``.

    Returns a frame whose first column is the ``window``-row rolling mean of the
    temperature spread (back-filled at the start), followed by the spread shifted
    by ``periods`` (forward-filled where the shift opens gaps).
    """
    spread = (df["maxtemp"] - df["mintemp"]).to_frame("delta")
    smoothed = spread.rolling(window).mean().bfill()
    shifted = spread.shift(periods).ffill()
    return pd.concat([smoothed, shifted], axis=1)

def delta_temp_names_out():
    """Return the output column names of ``delta_temp``.

    The rolling-mean column comes first, then one shifted column per entry of
    ``LAG_PERIODS``. All names share the ``temp_delta`` prefix (the shifted
    columns were previously misspelled ``tem_delta``).
    """
    names_out = ["temp_delta_roll_mean7"]
    names_out += [f"temp_delta_lag{lag}" for lag in LAG_PERIODS]
    return names_out

# scikit-learn wrapper around delta_temp; output names come from
# delta_temp_names_out and do not depend on the input column names.
delta_temp_ft = FunctionTransformer(
    delta_temp, feature_names_out=lambda self, names_in: delta_temp_names_out()
)

# delta_temp_ft.fit_transform(X_train)

In [None]:
class SeasonalReconstructor(BaseEstimator, TransformerMixin):
    """Encode each ``day`` value by its average trend-plus-seasonal target level.

    ``fit`` decomposes the target with a period of 365 observations, adds the
    trend and seasonal components and averages that sum per ``day`` value.
    ``transform`` looks the learned average up for every row of ``X``.
    """

    def fit(self, X, y):
        """Learn the per-day average of ``trend + seasonal``.

        Parameters
        ----------
        X : pd.DataFrame
            Must contain a ``"day"`` column; its index selects the rows of ``y``.
        y : pd.Series
            Target series containing every index label of ``X``.

        Returns
        -------
        self
        """
        ts = y.loc[X.index]
        decomposition = seasonal_decompose(ts, period=365)
        decompose_df = (
            pd
            .DataFrame(
                {
                    "day": X["day"],
                    "trend": decomposition.trend,
                    "seasonal": decomposition.seasonal,
                }
            )
            # Rows for which the decomposition produced no value are excluded
            # from the averages.
            .dropna()
            .assign(reconstruct = lambda df: df["trend"] + df["seasonal"])
        )

        ave = decompose_df.groupby("day")["reconstruct"].mean()
        # med = decompose_df.groupby("day")["reconstruct"].median()

        # Lookup table indexed by day, with a single "ave_reconstruct" column.
        self.stat_df_ = ave.to_frame("ave_reconstruct")
        return self

    def transform(self, X):
        """Return a one-column frame with the learned average for each row's day.

        Days that were not seen during ``fit`` yield NaN (left join).
        """
        return (
            X
            .merge(self.stat_df_, left_on="day", right_index=True, how="left")
            .iloc[:,-1:]
        )

    def get_feature_names_out(self, names_in=None):
        """Return the single output column name.

        ``names_in`` is ignored. It defaults to ``None`` so the method can also
        be called without arguments, as scikit-learn's own
        ``get_feature_names_out(input_features=None)`` allows.
        """
        return ["ave_reconstruct"]

# sr = SeasonalReconstructor().fit(X_train[:365*5], y_train[:365*5])
# sr.transform(X_train[365 * 5:])

In [None]:
# def decompose(df:pd.DataFrame, ts=y_train):
#     ts = y_train.loc[df.index]
#     decomposition = seasonal_decompose(ts, period=365)
#     decompose_df = (
#         pd
#         .DataFrame(
#             {
#                 "day": df["day"],
#                 "trend": decomposition.trend,
#                 "seasonal": decomposition.seasonal,
#             }
#         )
#         .fillna(0)
#         .assign(reconstruct = lambda df: df["trend"] + df["seasonal"])
#     )

#     ave = decompose_df.groupby("day")["reconstruct"].mean()
#     # med = decompose_df.groupby("day")["reconstruct"].median()

#     stat_df=pd.DataFrame({"ave_reconstruct":ave}, index=ave.index)
#     return (
#         df
#         .merge(stat_df, left_on="day", right_index=True, how="left")
#         .iloc[:,-1:]
#     )

# decompose_ft = FunctionTransformer(
#     decompose, feature_names_out=lambda self, names_in: ["ave_reconstruct"]
# )

# decompose_ft.fit_transform(X_train[:365])

In [None]:
# Column sets handed to the feature builders: every column, and every column
# except "day" (the day counter itself is not smoothed or shifted).
all_cols = X_train.columns
non_day_cols = X_train.drop(columns="day").columns

# Feature-engineering stage. Only the engineered columns listed here reach the
# model, because make_column_transformer drops unlisted columns by default.
pre_proc_t = make_column_transformer(
    # (drop_day_ft, X_train.columns),
    (sin_cos_ft, ["day"]),
    (wind_sin_cos_ft, all_cols),
    (roll_mean7_ft, non_day_cols),
    (lag_feat_ft, non_day_cols),
    (delta_temp_ft, all_cols),
    # (SeasonalReconstructor(), X_train.columns),
    # (make_pipeline(delta_temp_ft, roll_mean7_ft), ["mintemp", "maxtemp"]),
    # (make_pipeline(delta_temp_ft, lag_feat_ft), ["mintemp", "maxtemp"]),
)

pre_proc_t.fit(X_train, y_train)

In [None]:
# Smoke test: check that the fitted transformer runs on both the training and
# the test frame. `t_` is overwritten, so only the test-set result remains.
t_ = pd.DataFrame(
    pre_proc_t.transform(X_train),
    columns=pre_proc_t.get_feature_names_out()
)

t_ = pd.DataFrame(
    pre_proc_t.transform(X_test),
    columns=pre_proc_t.get_feature_names_out()
)

## Modelling
___

In [None]:
# Full model: feature engineering -> scaling to [-1, 1] -> elastic-net logistic
# regression. The trailing comments record which scaler each (C, l1_ratio) pair
# was used with.
model_pl = make_pipeline(
    pre_proc_t,
    # RobustScaler(),
    MinMaxScaler((-1,1)),
    LogisticRegression(
        solver="saga", penalty="elasticnet", max_iter=1000, random_state=1717,
        C=0.12850774510045, l1_ratio=0.9560651431484171  # MinMaxScaler((-1,1))
        # C=0.04109620348258887, l1_ratio=0.6090629020683745  # RobustScaler()
        # **{'C': 0.019421863690341875, 'l1_ratio': 0.15600154583485648}
    )
)

In [None]:
# Run only the first two pipeline steps (feature engineering + scaling).
# NOTE(review): a Pipeline slice is a shallow copy, so this fits the very same
# transformer objects that `model_pl` holds. It also fits them on ALL training
# rows, including the rows later used as the validation year -- confirm that
# this is acceptable.
X_train_pp = model_pl[:2].fit_transform(X_train)
X_test_pp = model_pl[:2].transform(X_test)

In [None]:
# pca = PCA()
# tsne = TSNE(perplexity=50, n_components=2, n_jobs=-1)
# X_train_pp_red = pca.fit_transform(X_train)
# X_train_pp_red = tsne.fit_transform(X_train)
# np.cumsum(pca.explained_variance_ratio_)
# sns.scatterplot(x=X_train_pp_red[:,0], y=X_train_pp_red[:,1], hue=y_train)

In [None]:
# Fit the whole pipeline on the first 5 * 365 rows (five synthetic years); the
# last 365 rows are left out.
model_pl.fit(X_train[:365*5], y_train[:365*5])

In [None]:
# Iterations the solver actually ran; compare with max_iter=1000 to judge
# whether it converged.
model_pl[-1].n_iter_

In [None]:
# Coefficients of the fitted logistic regression, labelled with the feature name
# minus its "<transformer>__" prefix, shown as a bar chart.
short_names = [
    full_name.split("__")[-1] for full_name in pre_proc_t.get_feature_names_out()
]
weights = pd.Series(model_pl[-1].coef_.ravel())
weights.index = pd.Series(short_names)
weights.plot(kind="bar", figsize=(20,3))

In [None]:
# Features with a non-zero coefficient, largest magnitude first.
weights[weights.abs() > 0].sort_values(key=np.abs, ascending=False)

In [None]:
# Sequential feature selection on the preprocessed features, scored by ROC AUC;
# selection stops once the score improves by less than `tol`.
# NOTE(review): cv=6 is an integer split, not the per-year LeaveOneGroupOut used
# in the cross-validation cells -- confirm this is intended.
sfs = SequentialFeatureSelector(LogisticRegression(), cv=6, scoring="roc_auc", n_features_to_select="auto", tol=1e-6)
sfs.fit(X_train_pp, y_train)

In [None]:
# Mean ROC AUC of a default LogisticRegression on the selected features, leaving
# one synthetic year out per fold.
cvs = cross_val_score(LogisticRegression(), X_train_pp.loc[:, sfs.get_support()], y_train, groups=dummy_years, scoring="roc_auc", cv=LeaveOneGroupOut())
cvs.mean()

In [None]:
# Same leave-one-year-out evaluation for the full pipeline on the raw features;
# here preprocessing and scaling are refit inside every fold.
cvs = cross_val_score(model_pl, X_train, y_train, groups=dummy_years, scoring="roc_auc", cv=LeaveOneGroupOut())
cvs.mean()

In [None]:
# Number of features kept by the selector.
sfs.get_support().sum()

In [None]:
# def objective(trial):
#     model = LogisticRegression(
#          solver="saga", penalty="elasticnet", max_iter=1000, random_state=1717,
#     )
#     C = trial.suggest_float("C", 1e-4, 1e4, log=True)
#     l1_ratio = trial.suggest_float("l1_ratio",0.1, 1.0, log=False)
#     model.set_params(C=C, l1_ratio=l1_ratio)
#     return cross_val_score(
#         model, X_train_pp.loc[:, sfs.get_support()], y_train,
#         groups=dummy_years, scoring="roc_auc",
#         cv=LeaveOneGroupOut(),
#         n_jobs=-1,
#     ).mean()
    
# sampler=TPESampler(n_startup_trials=30)
# study = optuna.create_study(direction="maximize", sampler=sampler)
# study.optimize(objective, n_trials=100)
# study.best_params

In [None]:
# def objective(trial):
#     C = trial.suggest_float("C", 1e-4, 1e4, log=True)
#     l1_ratio = trial.suggest_float("l1_ratio",0.1, 1.0, log=False)
#     model_pl.set_params(
#         logisticregression__C=C,
#         logisticregression__l1_ratio=l1_ratio,
        
#     )
#     return cross_val_score(
#         model_pl, X_train, y_train,
#         groups=dummy_years, scoring="roc_auc",
#         cv=LeaveOneGroupOut(),
#         n_jobs=-1,
#     ).mean()
    
# sampler=TPESampler(n_startup_trials=10)
# study = optuna.create_study(direction="maximize", sampler=sampler)
# study.optimize(objective, n_trials=30)
# study.best_params

In [None]:
# def cat_objective(trial):
#     params = {
#         "catboostclassifier__iterations": trial.suggest_int("iterations", 10, 1000),
#         "catboostclassifier__depth": trial.suggest_int("depth", 1, 8),
#         "catboostclassifier__learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 1.0),
#         "catboostclassifier__random_strength": trial.suggest_loguniform("random_strength", 1e-9, 10.0),
#         "catboostclassifier__bagging_temperature": trial.suggest_uniform("bagging_temperature", 0.0, 1.0),
#         "catboostclassifier__border_count": trial.suggest_int("border_count", 1, 255),
#         "catboostclassifier__l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 2, 30),
#     }
#     cat_model = make_pipeline(
#         pre_proc_t,
#         cb.CatBoostClassifier(
#             random_seed=1717,
#             verbose=False,
#             # task_type="GPU"
#         )
#     )
#     cat_model.set_params(**params)
#     return cross_val_score(
#         cat_model, X_train, y_train,
#         groups=dummy_years, scoring="roc_auc",
#         cv=LeaveOneGroupOut(),
#         n_jobs=-1,
#     ).mean()

# sampler=TPESampler(n_startup_trials=10)
# study = optuna.create_study(direction="maximize", sampler=sampler)
# study.optimize(cat_objective, n_trials=30)
# study.best_params

In [None]:
# optuna.visualization.plot_parallel_coordinate(study)
# optuna.visualization.plot_optimization_history(study)

In [None]:
# model_pl = make_pipeline(
#     pre_proc_t,
#     cb.CatBoostClassifier(
#         random_seed=1717,
#         verbose=False,
#         iterations= 136,
#         depth=6,
#         learning_rate=0.015382614140324135,
#         random_strength=0.17218209541516394,
#         bagging_temperature=0.5774026984560452,
#         border_count=76,
#         l2_leaf_reg=26
#         # task_type="GPU"
#     )
# )
# model_pl.fit(X_train[:365*5], y_train[:365*5])

In [None]:
# Hold out the last synthetic year: the first 5 * 365 rows train the model and
# the remaining rows validate it. Only the selected features are used.
split_at = 365*5
X_selected = X_train_pp.loc[:, sfs.get_support()]

X_train_, X_val_ = X_selected[:split_at], X_selected[split_at:]
y_train_, y_val_ = y_train[:split_at], y_train[split_at:]

# Elastic-net logistic regression with fixed hyper-parameters.
simple_model = LogisticRegression(
    solver="saga",
    penalty="elasticnet",
    max_iter=1000,
    random_state=1717,
    C=2.175016987873072,
    l1_ratio=0.39144004426151213,
)
simple_model.fit(X_train_, y_train_)

In [None]:
# Isotonic calibration of the simple model, cross-fitted with 5 unshuffled folds
# on the training years.
est_remedy = CalibratedClassifierCV(
    simple_model, cv=KFold(5), method="isotonic"
).fit(X_train_, y_train_)

# 2x2 comparison on the held-out year: left column = uncalibrated model, right
# column = calibrated model; top row = ROC curves, bottom row = calibration curves.
fig, axes = plt.subplots(2,2, sharex=True, sharey=True, figsize=(10,6), tight_layout=True)

RocCurveDisplay.from_estimator(
   simple_model, X_val_, y_val_, ax=axes[0, 0]
)
RocCurveDisplay.from_estimator(
   est_remedy, X_val_, y_val_, ax=axes[0, 1]
)

# Quantile binning puts an equal number of samples into each of the 25 bins.
CalibrationDisplay.from_estimator(
    simple_model, X_val_, y_val_,
    n_bins=25, strategy="quantile", ax=axes[1, 0]
)
CalibrationDisplay.from_estimator(
    est_remedy, X_val_, y_val_,
    n_bins=25, strategy="quantile", ax=axes[1, 1]
)

In [None]:
# est_remedy = CalibratedClassifierCV(
#     model_pl, cv=KFold(5), method="isotonic"
# ).fit(X_train[:365*5], y_train[:365*5])

# fig, axes = plt.subplots(2,2, sharex=True, sharey=True, figsize=(10,6), tight_layout=True)

# RocCurveDisplay.from_estimator(
#    model_pl, X_train[365*5:], y_train[365*5:], ax=axes[0, 0]
# )
# RocCurveDisplay.from_estimator(
#    est_remedy, X_train[365*5:], y_train[365*5:], ax=axes[0, 1]
# )

# CalibrationDisplay.from_estimator(
#     model_pl, X_train[365*5:], y_train[365*5:],
#     n_bins=25, strategy="quantile", ax=axes[1, 0]
# )
# CalibrationDisplay.from_estimator(
#     est_remedy, X_train[365*5:], y_train[365*5:],
#     n_bins=25, strategy="quantile", ax=axes[1, 1]
# )

## Submission
___

In [None]:
# est_remedy = CalibratedClassifierCV(
#     model_pl, cv=KFold(6), method="isotonic"
# ).fit(X_train, y_train)

In [None]:
# Refit the simple model on all training rows (selected features only) before
# predicting the test set.
simple_model.fit(X_train_pp.loc[:, sfs.get_support()], y_train)

In [None]:
# model_pl.fit(X_train, y_train)
# y_test["rainfall"] = model_pl.predict_proba(X_test)[:,1]
# Write the predicted probability of class 1 (second predict_proba column) into
# the sample-submission frame and save it without the index.
y_test["rainfall"] = simple_model.predict_proba(X_test_pp.loc[:, sfs.get_support()])[:,1]
y_test.to_csv('submission.csv', index=False)

In [None]:
# Quick look at the first rows of the submission.
y_test.head()
