In [1]:
%pip install pytorch_tabnet
%pip install lightgbm
%pip install catboost

/home/lvt/.local/share/virtualenvs/ml-TQx6uCZG/bin/python: No module named pip
Note: you may need to restart the kernel to use updated packages.
/home/lvt/.local/share/virtualenvs/ml-TQx6uCZG/bin/python: No module named pip
Note: you may need to restart the kernel to use updated packages.
/home/lvt/.local/share/virtualenvs/ml-TQx6uCZG/bin/python: No module named pip
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import numpy as np
import pandas as pd
import csv
import torch
import torch.nn as nn
import sklearn

from torch import optim
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset, TensorDataset
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import warnings
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
warnings.filterwarnings('ignore')

print(pd.__version__)
print(np.__version__)
print(csv.__version__)
print(torch.__version__)
print(sklearn.__version__)

2.1.4
1.26.4
1.0
2.5.1+cu124
1.4.2


## Dataset splitting

In [3]:
train_csv = "./X_train_clean.csv"
train_temp_csv = "./train_temp.csv"
val_csv = "./val.csv"
test_csv  = "./X_test_clean.csv"

In [4]:
def split_csv_by_fraction(
        input_file,
        train_temp_output,
        val_output,
        train_output,
    ):
    chunks = pd.read_csv(input_file, chunksize=50_000, float_precision='round_trip')
    df = pd.concat(
        # Extraire 20 % (aléatoire) des données
        (chunk.sample(frac=0.2) for chunk in chunks),
        ignore_index=True,
    )
    df['piezo_measurement_date'] = pd.to_datetime(df['piezo_measurement_date'])
    month = df['piezo_measurement_date'].dt.month
    year = df['piezo_measurement_date'].dt.year

    # été 2020 + mai, octobre 2020-2023 : données d'entraînement pour la validation
    train_temp_df = df[
        ((year == 2020) & (month >= 6) & (month <= 9))
        | ((month == 5) | (month == 10))
    ].sample(frac=1, random_state=42)

    # été 2021 + avril, novembre 2020-2023 : données de validation pour les tests initiaux
    val_df = df[
        ((year == 2021) & (month >= 6) & (month <= 9))
        | ((month == 4) | (month == 11))
    ].sample(frac=1, random_state=42)

    # étés 2020-2021 + mai, octobre 2020-2023 : données d'entraînement pour la soumission
    train_df = df[
        (((year == 2020) | (year == 2021)) & (month >= 6) & (month <= 9))
        | ((month == 5) | (month == 10))
    ].sample(frac=1, random_state=42)

    print(
        'Train set for validation:', train_temp_df.shape,
        'Validation set:', val_df.shape,
        'Train set for submission:', train_df.shape,
    )
    train_temp_df.to_csv(train_temp_output, index=False)
    val_df.to_csv(val_output, index=False)
    train_df.to_csv(train_output, index=False)
    print(f"Saved {train_temp_output}, {val_output} and {train_output}")

In [5]:
split_csv_by_fraction(train_csv, train_temp_csv, val_csv, "train.csv")

Train set for validation: (753470, 64) Validation set: (765993, 64) Train set for submission: (1037140, 64)
Saved ./train_temp.csv, ./val.csv and train.csv


In [6]:
class_mapping = {
    "Very Low": 0,
    "Low": 1,
    "Average": 2,
    "High": 3,
    "Very High": 4
}

inverse_class_mapping = {v: k for k, v in class_mapping.items()}

In [7]:


df = pd.read_csv(train_csv, nrows=5*10**5)

class_counts = df['piezo_groundwater_level_category'].value_counts()
print(class_counts)

# Calculate weights
total = class_counts.sum()
class_weights = class_counts / total

# Print the weights
print()
print(class_weights)
print()
print(1/class_weights)

piezo_groundwater_level_category
Very High    113553
High         113122
Average      107547
Low           96607
Very Low      69171
Name: count, dtype: int64

piezo_groundwater_level_category
Very High    0.227106
High         0.226244
Average      0.215094
Low          0.193214
Very Low     0.138342
Name: count, dtype: float64

piezo_groundwater_level_category
Very High    4.403230
High         4.420007
Average      4.649130
Low          5.175608
Very Low     7.228463
Name: count, dtype: float64


In [8]:
class TabularDataset(Dataset):
    def __init__(self, csv_file, label_column=None):
        self.data_frame = pd.read_csv(csv_file)
        self.label_column = label_column

        self.data_frame = self.data_frame.drop(columns=['meteo_radiation_IR'], errors='ignore')

        if self.label_column:
            self.data_frame[self.label_column] = self.data_frame[self.label_column].astype(str)
            self.labels = self.data_frame[self.label_column].map(class_mapping)
            self.labels = self.labels.astype(int).values
            self.features = self.data_frame.drop(columns=[self.label_column])
        else:
            self.features = self.data_frame
            self.labels = None

        self.encoders = {}
        for col in self.features.columns:
            if self.features[col].dtype == 'object' or not pd.api.types.is_numeric_dtype(self.features[col]):
                encoder = LabelEncoder()
                self.features[col] = encoder.fit_transform(self.features[col].astype(str))
                self.encoders[col] = encoder

        self.features = self.features.apply(pd.to_numeric, errors='coerce')
        self.features = self.features.fillna(self.features.mean())
        self.features = self.features.values

    def __len__(self):
        return len(self.features)

    def bruh(self):
        print("Unique raw values in label column:", self.data_frame[self.label_column].unique())
        print("Label dtype:", self.labels.dtype if self.labels is not None else "No labels")
        print("First 10 labels:", self.labels[:10] if self.labels is not None else "No labels")

        nan_counts = self.data_frame.isna().sum()
        nan_columns = nan_counts[nan_counts > 0]  # Filter columns with NaN values
        if len(nan_columns) > 0:
            print("Columns with NaN values:")
            print(nan_columns)
        else:
            print("No NaN values in features.")

        print("NaN values in features:", np.any(np.isnan(self.features)))
        if self.labels is not None:
            print("NaN values in labels:", np.any(np.isnan(self.labels)))

    def __getitem__(self, idx):
        features = torch.tensor(self.features[idx], dtype=torch.float32)
        if self.labels is not None:
            label = torch.tensor(self.labels[idx], dtype=torch.long)
            return features, label
        return features


In [9]:
def vector_to_class(x):
  y = torch.argmax(x,axis=1)
  return y

def prediction_accuracy(predict,labels):
  accuracy = (predict == labels).sum()/(labels.shape[0])
  return accuracy

In [10]:
class_weights_tensor = torch.tensor([3.866946, 3.999680, 4.492605, 5.817200, 10.321409])

class WeightedCrossEntropyLoss(nn.Module):
    def __init__(self, class_weights):
        super(WeightedCrossEntropyLoss, self).__init__()
        self.class_weights = class_weights

    def forward(self, y_pred, y_true):
        return nn.CrossEntropyLoss(weight=self.class_weights)(y_pred, y_true)

In [11]:
train_dataset = TabularDataset(csv_file=train_temp_csv, label_column='piezo_groundwater_level_category')
val_dataset = TabularDataset(csv_file=val_csv, label_column='piezo_groundwater_level_category')

train_features = train_dataset.features
val_features = val_dataset.features
train_labels = train_dataset.labels
val_labels = val_dataset.labels

In [12]:
"""
tabnet_model = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params=dict(step_size=50, gamma=0.9),
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type="entmax"
)

# Replace loss function with the custom weighted loss
tabnet_model.loss_fn = WeightedCrossEntropyLoss(class_weights=class_weights_tensor)

# Train TabNet
tabnet_model.fit(
    X_train=train_features, y_train=train_labels,
    eval_set=[(val_features, val_labels)],
    max_epochs=15, patience=20, batch_size=1024, virtual_batch_size=128
)
"""

'\ntabnet_model = TabNetClassifier(\n    optimizer_fn=torch.optim.Adam,\n    optimizer_params=dict(lr=2e-2),\n    scheduler_params=dict(step_size=50, gamma=0.9),\n    scheduler_fn=torch.optim.lr_scheduler.StepLR,\n    mask_type="entmax"\n)\n\n# Replace loss function with the custom weighted loss\ntabnet_model.loss_fn = WeightedCrossEntropyLoss(class_weights=class_weights_tensor)\n\n# Train TabNet\ntabnet_model.fit(\n    X_train=train_features, y_train=train_labels,\n    eval_set=[(val_features, val_labels)],\n    max_epochs=15, patience=20, batch_size=1024, virtual_batch_size=128\n)\n'

In [13]:
xgb_model = xgb.XGBClassifier(tree_method="gpu_hist", reg_alpha=0.5)

xgb_model.fit(train_dataset.features, train_dataset.labels)

val_preds = xgb_model.predict(val_features)
val_accuracy = accuracy_score(val_labels, val_preds)
print(f"Validation Accuracy: {val_accuracy}")

Validation Accuracy: 0.5586800401570249


In [14]:
train_data = lgb.Dataset(train_features, label=train_labels)
val_data = lgb.Dataset(val_features, label=val_labels, reference=train_data)

# Set hyperparameters
params = {
    "objective": "multiclass",
    "num_class": len(set(train_labels)),  # Number of unique classes
    "metric": "multi_logloss",
    "boosting_type": "gbdt",
    "learning_rate": 0.01,
    "max_depth": 100,
    "num_leaves": 64,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "lambda_l1": 1,
    "lambda_l2": 1,
}

# Train LightGBM
lgb_model = lgb.train(
    params=params,
    train_set=train_data,
    valid_sets=[train_data, val_data],
    num_boost_round=250
)

# Validate predictions
val_predictions = lgb_model.predict(val_features)
val_predictions = val_predictions.argmax(axis=1)  # Convert probabilities to class indices
val_accuracy = accuracy_score(val_labels, val_predictions)
print(f"LightGBM Validation Accuracy: {val_accuracy}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.091290 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11368
[LightGBM] [Info] Number of data points in the train set: 753470, number of used features: 61
[LightGBM] [Info] Start training from score -1.397299
[LightGBM] [Info] Start training from score -1.367918
[LightGBM] [Info] Start training from score -1.520861
[LightGBM] [Info] Start training from score -1.771027
[LightGBM] [Info] Start training from score -2.212582
LightGBM Validation Accuracy: 0.4804365052944348


In [15]:
catboost_model = CatBoostClassifier(
    iterations=50,
    learning_rate=0.01,
    depth=10,
    loss_function="MultiClass",
    eval_metric="Accuracy",
    model_size_reg=1,
    random_seed=42,
    use_best_model=True,
    verbose=100
)

# Train CatBoost
catboost_model.fit(
    train_features, train_labels,
    eval_set=(val_features, val_labels),
    early_stopping_rounds=50
)

# Validate predictions
val_predictions = catboost_model.predict(val_features)
val_accuracy = accuracy_score(val_labels, val_predictions)
print(f"CatBoost Validation Accuracy: {val_accuracy}")

0:	learn: 0.3918683	test: 0.3341010	best: 0.3341010 (0)	total: 3.14s	remaining: 2m 33s
49:	learn: 0.4900261	test: 0.3892607	best: 0.3895023 (46)	total: 2m 8s	remaining: 0us

bestTest = 0.3895022539
bestIteration = 46

Shrink model to first 47 iterations.
CatBoost Validation Accuracy: 0.3895022539370464


In [16]:
tree = DecisionTreeClassifier()
tree.fit(train_features, train_labels)

# Calculate probabilities
val_predictions = tree.predict(val_features)
val_accuracy = accuracy_score(val_labels, val_predictions)
print(f"DecisionTree Validation Accuracy: {val_accuracy}")

DecisionTree Validation Accuracy: 0.5965041455992418


In [17]:
# Step 1: Predict Probabilities for Validation Set
# tabnet_probs = tabnet_model.predict_proba(val_features)
xgb_probs = xgb_model.predict_proba(val_features)
lgb_probs = lgb_model.predict(val_features)  # LightGBM probabilities
#catboost_probs = catboost_model.predict_proba(val_features)
tree_probs = tree.predict_proba(val_features)

# Step 2: Combine Probabilities by Multiplying
combined_probs = xgb_probs * lgb_probs * tree_probs #* catboost_probs
combined_probs = combined_probs / combined_probs.sum(axis=1, keepdims=True)  # Normalize to ensure valid probabilities

# Step 3: Make Final Predictions
final_predictions = np.argmax(combined_probs, axis=1)

# Step 4: Evaluate Combined Model
val_accuracy = accuracy_score(val_labels, final_predictions)
print(f"Combined Model Validation Accuracy: {val_accuracy}")

# Step 5: Predict Probabilities for Test Set
test_dataset = TabularDataset(csv_file=test_csv) # Load the test data

# tabnet_test_probs = tabnet_model.predict_proba(test_features)
xgb_test_probs = xgb_model.predict_proba(test_dataset.features)
lgb_test_probs = lgb_model.predict(test_dataset.features)  # LightGBM probabilities
#catboost_test_probs = catboost_model.predict_proba(test_dataset.features)
tree_test_probs = tree.predict_proba(test_dataset.features)

# Combine probabilities for the test set
test_combined_probs = xgb_test_probs * lgb_test_probs * tree_test_probs #* catboost_test_probs
test_combined_probs = test_combined_probs / test_combined_probs.sum(axis=1, keepdims=True)

# Step 6: Make Final Predictions for Test Set
test_final_predictions = np.argmax(test_combined_probs, axis=1)

# Decode predictions if needed (map numeric classes to original labels)
decoded_predictions = [inverse_class_mapping[pred] for pred in test_final_predictions]

# Step 7: Save Results to CSV
with open('test_results.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['row_index', 'piezo_groundwater_level_category'])  # Write header
    for id, pred_class in zip(test_dataset.data_frame['row_index'], decoded_predictions):
        writer.writerow([id, pred_class])  # Write each ID and predicted class

print("Test results saved to 'test_results.csv'.")

Combined Model Validation Accuracy: 0.5965041455992418
Test results saved to 'test_results.csv'.
