In [1]:
import pandas as pd

from __scripts__.data import (
    clean_data,
    check_corr,
    ColumnTransformer,
    do_pca,
    DataFrameTransformer,
    GroupAutoSplitter,
)
from __scripts__.plot import plot_feature_label_corr
from __scripts__.model import opt_base_model
from __scripts__.typ import DataType
from __scripts__.cross_val import CombinatorialPurgedKFold
import numpy as np
from sklearn.impute import SimpleImputer

In [2]:
def _season_from_month(month):
    """Return the season label for a month number.

    Months 3-5 are 'Spring', 6-8 'Summer', 9-11 'Autumn'; every other value
    (12, 1, 2, and also NaN, for which all comparisons are False) is 'Winter'.
    """
    if 3 <= month <= 5:
        return 'Spring'
    if 6 <= month <= 8:
        return 'Summer'
    if 9 <= month <= 11:
        return 'Autumn'
    return 'Winter'


def f_engg(df: pd.DataFrame):
    """Derive weather features from a raw frame and return them as a NEW frame.

    The input frame is left untouched (the work is done on a copy), so the
    function can be called repeatedly on the same frame, e.g. when a notebook
    cell is re-run.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns 'day', 'maxtemp', 'mintemp', 'temparature',
        'dewpoint', 'winddirection', 'windspeed', 'humidity', 'cloud',
        'sunshine' and 'pressure'. Any other columns are passed through.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` without 'day', with the engineered columns appended in
        a fixed order (temporal, temperature, wind, interaction, rolling, lag,
        season, then per-column shift/diff features).

    Raises
    ------
    KeyError
        If one of the required columns is missing.

    Notes
    -----
    * Rolling (window 7), lag, shift and diff features are computed in row
      order and leave NaNs in the leading rows.
    * 'cloud_sunshine' and 'sunshine_cloud_interaction' hold identical values,
      as do 'humidity_lag_1' and 'humidity_shift1'. Both are kept so that the
      output schema stays the same for downstream consumers.
    """
    # Work on a copy so the caller's frame is never modified.
    out = df.copy()

    # 'day' is parsed as a day-of-year number. No year is supplied, so the
    # weekday-derived features reflect the parser's default year rather than
    # real calendar weekdays. Unparseable values become NaT.
    day_ts = pd.to_datetime(out['day'].astype(str), format="%j", errors='coerce')

    # Temporal features
    out['month'] = day_ts.dt.month
    out['day_of_week'] = day_ts.dt.dayofweek
    out['is_weekend'] = out['day_of_week'].isin([5, 6]).astype(int)

    # Temperature features
    out['temp_range'] = out['maxtemp'] - out['mintemp']
    out['avg_temp'] = (out['maxtemp'] + out['mintemp']) / 2
    out['temp_deviation'] = out['temparature'] - out['avg_temp']

    # Dew point depression
    out['dew_point_depression'] = out['temparature'] - out['dewpoint']

    # Wind direction is circular, so encode it as sine/cosine of the angle.
    wind_dir_rad = np.deg2rad(out['winddirection'])
    out['wind_dir_sin'] = np.sin(wind_dir_rad)
    out['wind_dir_cos'] = np.cos(wind_dir_rad)

    # Wind chill factor (simplified version); the speed power term is shared
    # by two terms of the formula, so compute it once.
    wind_pow = out['windspeed'] ** 0.16
    out['wind_chill'] = (
        13.12
        + 0.6215 * out['temparature']
        - 11.37 * wind_pow
        + 0.3965 * out['temparature'] * wind_pow
    )

    # Interaction features
    out['humidity_temp'] = out['humidity'] * out['temparature']
    out['cloud_sunshine'] = out['cloud'] * out['sunshine']

    # Rolling statistical features (first 6 rows are NaN for a 7-row window)
    out['rolling_temp_mean'] = out['avg_temp'].rolling(window=7).mean()
    out['rolling_wind_mean'] = out['windspeed'].rolling(window=7).mean()
    out['rolling_humidity_mean'] = out['humidity'].rolling(window=7).mean()

    # Lag features (previous row)
    out['temp_lag_1'] = out['avg_temp'].shift(1)
    out['humidity_lag_1'] = out['humidity'].shift(1)
    out['windspeed_lag_1'] = out['windspeed'].shift(1)

    # Pressure / wind-speed interactions with the average temperature
    out['pressure_temp_interaction'] = out['pressure'] * out['avg_temp']
    out['windspeed_temp_interaction'] = out['windspeed'] * out['avg_temp']

    # Sunshine-Cloud interaction (same values as 'cloud_sunshine', see Notes)
    out['sunshine_cloud_interaction'] = out['sunshine'] * out['cloud']

    # Season label derived from the month
    out['season'] = out['month'].apply(_season_from_month)

    # Previous-row value and row-to-row change for selected raw columns
    for col in ['pressure', 'maxtemp', 'temparature', 'humidity']:
        for gap in [1]:
            out[f"{col}_shift{gap}"] = out[col].shift(gap)
            out[f"{col}_diff{gap}"] = out[col].diff(gap)

    # The raw 'day' column is not part of the returned feature set.
    return out.drop(columns=['day'])


# Read the training split and push it straight through feature engineering.
df = f_engg(pd.read_csv("playground_series_s5e3/train.csv"))

# Project-level cleaning. Rows with NaNs are kept (the rolling/lag features
# leave NaNs in the leading rows), so an imputer is needed further down.
df = clean_data(
    df,
    drop_missing_rows=False,
    drop_uninformative=True,
    add_new_features=False,
    print_data=True,
)

# Per-column data types inferred from the cleaned frame.
dtype_dict = DataType.infer_df_dtype(df)

# plot_feature_label_corr(df, y="rainfall")

[INFO] data.clean_data NOTE: NaN values may be present in data. Either drop missing values or specify an imputer for the Column Transformer.
[INFO] types.DataType.infer_df_dtype NOTE: Column 'season' (CATEGORICAL_NOMINAL_STRING) has unknown ordinality. If the column is ordinal, set it by `dtype_dict.set_ordinal('season')`


=#=#=#=#=#=#=#=#= SUMMARY =#=#=#=#=#=#=#=#=
=== SIZE ===
42 -> 41 columns
2190 -> 2190 rows

=== INCOMPLETENESS ===
14 -> 14 columns containing NaNs: {'rolling_temp_mean': 6, 'rolling_wind_mean': 6, 'rolling_humidity_mean': 6, 'temp_lag_1': 1, 'humidity_lag_1': 1, 'windspeed_lag_1': 1, 'pressure_shift1': 1, 'pressure_diff1': 1, 'maxtemp_shift1': 1, 'maxtemp_diff1': 1, 'temparature_shift1': 1, 'temparature_diff1': 1, 'humidity_shift1': 1, 'humidity_diff1': 1}
6 -> 6 rows containing NaNs
29 -> 29 total NaNs

=== DATA TYPES ===
                        Column                   Data Type                                              Values    Notes
                            id                  IDENTIFIER  (2190 unique values) 0, 1, 2, 3, 4, 5, 6, 7, 8,...  DROPPED
0                     pressure                     NUMERIC        min: 999.0, max: 1034.6, x̄: 1013.6, σ: 5.66         
1                      maxtemp                     NUMERIC            min: 10.4, max: 36.0, x̄: 26.37, σ: 5.6

In [3]:
# Build the feature/label transformers for the "rainfall" label; `task` is
# whatever create_transformers reports for this frame.
Trans_X, Trans_Y, task = ColumnTransformer.create_transformers(
    df, labels="rainfall", time_series=True
)

# The engineered frame still contains NaNs, so an imputer is handed to the
# feature transformer when it is fitted.
X_scaled = Trans_X.fit_transform(
    df,
    simple_imputer=SimpleImputer(),
)
Y_scaled = Trans_Y.fit_transform(df)

# do_pca(X_scaled, Y_scaled, Trans_X, Trans_Y, task=task)

# NOTE(review): the recorded output of this cell ends in
# "ValueError: No valid specification of the columns ..." raised during the
# model search, so `best_mdl` is not assigned on a clean run. The cause is
# inside project code not shown here -- TODO investigate before relying on it.
best_mdl = opt_base_model(X_scaled, Y_scaled, task=task)

KNeighborsClassifier: evaluating...:   0%|          | 0/14 [00:00<?, ?it/s]


Got splits: <generator object CombinatorialPurgedKFold.split at 0x7fdd5280f6f0>


ValueError: No valid specification of the columns. Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed

In [6]:
# NOTE(review): this cell needs `Trans_X` and `best_mdl` from the model cell;
# the execution counter jumps from 3 to 6 here, so verify the notebook still
# runs top-to-bottom on a fresh kernel.

# Apply the same feature engineering to the test split.
test_df = f_engg(pd.read_csv("playground_series_s5e3/test.csv"))

# Transform with the already-fitted feature transformer, predict, and flatten
# the predictions into a 1-D integer array.
X_test_scaled = Trans_X.transform(test_df)
Y_test_pred = best_mdl.predict(X_test_scaled).astype(int).ravel()

# Attach the predictions and write only the id/prediction pair.
test_df["rainfall"] = Y_test_pred
test_df.to_csv(
    "playground_series_s5e3/answer.csv",
    columns=["id", "rainfall"],
    index=False,
)
