In [1]:
import os
import numpy as np
import pandas as pd
import csv
import torch
import torch.nn as nn
import sklearn
import xgboost as xgb

from torch import optim
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices emitted by the libraries used below.
warnings.filterwarnings('ignore')

# Print the versions of the main libraries so the run can be reproduced.
print(pd.__version__)
print(np.__version__)
print(csv.__version__)
print(torch.__version__)
print(sklearn.__version__)

2.1.4
1.26.4
1.0
2.5.1+cu124
1.4.2


In [2]:
def load_by_chunks(path: str, chunk_size: int = 50_000, **kwargs) -> pd.DataFrame:
    """Read a CSV file piece by piece and return it as a single DataFrame.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    chunk_size : int
        Number of rows parsed per chunk.
    **kwargs
        Extra keyword arguments forwarded to ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        All chunks concatenated, with a fresh 0..n-1 index.
    """
    reader = pd.read_csv(
        path,
        chunksize=chunk_size,
        float_precision='round_trip',
        **kwargs,
    )
    pieces = list(reader)
    return pd.concat(pieces, ignore_index=True)

In [3]:
# Category name -> integer class id; ids follow the order the names are listed in.
class_mapping = dict(
    zip(
        ("Very Low", "Low", "Average", "High", "Very High"),
        range(5),
    )
)

# Integer class id -> category name (reverse lookup used when decoding predictions).
inverse_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))

In [4]:
train_csv = "X_train_clean.csv"
test_csv = "X_test_clean.csv"

# Only the first 50 000 rows are read, so the figures below describe that
# leading sample of the training file, not the file as a whole.
df = pd.read_csv(train_csv, nrows=50_000)

# Number of rows per groundwater-level category.
class_counts = df["piezo_groundwater_level_category"].value_counts()

# Relative frequency of each category within the sample.
total = class_counts.sum()
class_weights = class_counts.div(total)

# Show counts, relative frequencies and inverse frequencies, separated by blank lines.
print(class_counts, class_weights, 1 / class_weights, sep="\n\n")

piezo_groundwater_level_category
High         13306
Average      11623
Very High    11548
Low           8246
Very Low      5277
Name: count, dtype: int64

piezo_groundwater_level_category
High         0.26612
Average      0.23246
Very High    0.23096
Low          0.16492
Very Low     0.10554
Name: count, dtype: float64

piezo_groundwater_level_category
High         3.757703
Average      4.301815
Very High    4.329754
Low          6.063546
Very Low     9.475081
Name: count, dtype: float64


In [5]:
def drop_columns(df: pd.DataFrame, threshold=0.2, inplace=False) -> pd.DataFrame | None:
    """
    Drop columns with a fraction of more than `threshold` missing values.
    """
    threshold = int((1.0 - threshold) * len(df))
    return df.dropna(axis=1, thresh=threshold, inplace=inplace)

In [6]:
class TabularDataset(Dataset):
    """Dataset that loads a CSV file and exposes it as numeric feature rows.

    The file is read with ``load_by_chunks``; the ``meteo_radiation_IR`` column
    is removed when present. Non-numeric feature columns are label-encoded
    (one ``LabelEncoder`` per column, fitted on this file only) and missing
    values are replaced by the column mean computed on this file.

    Attributes set by the constructor:
        data_frame:   the loaded frame (label column cast to ``str`` when given).
        label_column: name of the label column, or None.
        labels:       integer class ids from ``class_mapping``, or None.
        encoders:     dict column name -> fitted ``LabelEncoder``.
        features:     2-D NumPy array of feature values.
    """

    def __init__(self, csv_file, label_column=None):
        """
        Parameters
        ----------
        csv_file : str
            Path of the CSV file to load.
        label_column : str | None
            Name of the target column; when falsy the dataset is unlabeled.

        Raises
        ------
        ValueError
            If the label column contains a value absent from ``class_mapping``.
        """
        self.data_frame = load_by_chunks(csv_file)
        self.label_column = label_column

        self.data_frame = self.data_frame.drop(columns=['meteo_radiation_IR'], errors='ignore')

        if self.label_column:
            self.data_frame[self.label_column] = self.data_frame[self.label_column].astype(str)
            mapped = self.data_frame[self.label_column].map(class_mapping)
            # Labels missing from the mapping come back as NaN; report them by
            # name instead of failing inside the integer cast.
            unknown = self.data_frame.loc[mapped.isna(), self.label_column].unique()
            if len(unknown) > 0:
                raise ValueError(
                    f"Unknown label(s) in column {self.label_column!r}: {sorted(unknown)}"
                )
            self.labels = mapped.astype(int).values
            self.features = self.data_frame.drop(columns=[self.label_column])
        else:
            # Work on a copy: the encoding loop below assigns columns, and
            # `data_frame` must keep the raw values read from the file.
            self.features = self.data_frame.copy()
            self.labels = None

        self.encoders = {}
        for col in self.features.columns:
            # Object/string/datetime columns are all "not numeric".
            if not pd.api.types.is_numeric_dtype(self.features[col]):
                encoder = LabelEncoder()
                self.features[col] = encoder.fit_transform(self.features[col].astype(str))
                self.encoders[col] = encoder

        self.features = self.features.apply(pd.to_numeric, errors='coerce')
        self.features = self.features.fillna(self.features.mean())
        self.features = self.features.values

    def __len__(self):
        """Return the number of rows."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return ``(features, label)`` as tensors, or only ``features`` when unlabeled."""
        features = torch.tensor(self.features[idx], dtype=torch.float32)
        if self.labels is not None:
            label = torch.tensor(self.labels[idx], dtype=torch.long)
            return features, label
        return features


In [7]:
def vector_to_class(x):
    """Return, for each row of tensor `x`, the index of its largest entry (argmax over dimension 1)."""
    return torch.argmax(x, dim=1)

def prediction_accuracy(predict, labels):
    """Return the fraction of positions where `predict` equals `labels`.

    The denominator is ``labels.shape[0]``.
    """
    n_samples = labels.shape[0]
    n_correct = (predict == labels).sum()
    return n_correct / n_samples

## Séparation en données d'entraînement et en données de validation

De 30% du jeu d'entraînement (`X_train_Hi5.csv`), on extrait trois jeux de données :
- `train_temp.csv`, `val.csv` : données d'entraînement et de validation pour les tests initiaux
- `train.csv` : données d'entraînement pour la soumission finale

In [85]:
def split_csv_by_fraction(
        input_file,
        train_temp_output,
        val_output,
        train_output,
    ):
    """Sample 30% of `input_file` and write three date-based subsets to CSV.

    The input is read in chunks of 50 000 rows; from each chunk the 30% side of
    a seeded 70/30 ``train_test_split`` is kept. Rows are then selected on the
    month/year of ``piezo_measurement_date`` and shuffled with a fixed seed:

    * `train_temp_output`: June-September 2020, plus May and October of any year.
    * `val_output`: June-September 2021, plus April and November of any year.
    * `train_output`: June-September 2020 and 2021, plus May and October of any year.

    Returns None; the shapes of the three subsets and the output paths are printed.
    """
    reader = pd.read_csv(input_file, chunksize=50_000, float_precision='round_trip')
    # Keep the held-out 30% of every chunk.
    kept_parts = [
        train_test_split(part, train_size=0.7, random_state=42)[1]
        for part in reader
    ]
    sampled = pd.concat(kept_parts, ignore_index=True)
    sampled['piezo_measurement_date'] = pd.to_datetime(sampled['piezo_measurement_date'])
    dates = sampled['piezo_measurement_date'].dt
    month, year = dates.month, dates.year

    # June to September, both bounds included.
    summer = month.between(6, 9)

    # Summer 2020 + May/October: training data used for validation runs.
    train_temp_mask = ((year == 2020) & summer) | month.isin([5, 10])
    # Summer 2021 + April/November: validation data for the initial tests.
    val_mask = ((year == 2021) & summer) | month.isin([4, 11])
    # Summers 2020-2021 + May/October: training data for the submission.
    train_mask = (year.isin([2020, 2021]) & summer) | month.isin([5, 10])

    # Shuffle each subset reproducibly.
    train_temp_df = sampled[train_temp_mask].sample(frac=1, random_state=42)
    val_df = sampled[val_mask].sample(frac=1, random_state=42)
    train_df = sampled[train_mask].sample(frac=1, random_state=42)

    print(
        'Train set for validation:', train_temp_df.shape,
        'Validation set:', val_df.shape,
        'Train set for submission:', train_df.shape,
    )
    for subset, destination in (
        (train_temp_df, train_temp_output),
        (val_df, val_output),
        (train_df, train_output),
    ):
        subset.to_csv(destination, index=False)
    print(f"Saved {train_temp_output}, {val_output} and {train_output}")

In [86]:
# Build the three split files from the CSV currently referenced by `train_csv`;
# the outputs are written to the notebook's working directory.
split_csv_by_fraction(train_csv, "train_temp.csv", "val.csv", "train.csv")

Train set for validation: (255294, 136) Validation set: (251175, 136) Train set for submission: (348042, 136)
Saved train_temp.csv, val.csv and train.csv


## Score de validation pour les tests initiaux

In [87]:
train_csv_temp = "./train_temp.csv"
val_csv = "./val.csv"
train_dataset = TabularDataset(csv_file=train_csv_temp, label_column='piezo_groundwater_level_category')
val_dataset = TabularDataset(csv_file=val_csv, label_column='piezo_groundwater_level_category')

# Batched loaders over the same data; the XGBoost model below reads the
# feature/label arrays directly and does not consume these loaders.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# TODO: cross-validation max_depth, alpha, learning_rate (0, 1), max_delta_step (0, 10)
#objective="multi:softprob"
# XGBoost >= 2.0 selects the GPU through `device`; the former
# tree_method="gpu_hist" is deprecated in favour of "hist" + device="cuda".
xgb_model = xgb.XGBClassifier(device="cuda", tree_method="hist", reg_alpha=0.5)

xgb_model.fit(train_dataset.features, train_dataset.labels)

# Validate the model (arguments in the order the helper declares: predictions, labels).
val_preds = xgb_model.predict(val_dataset.features)
val_accuracy = prediction_accuracy(val_preds, val_dataset.labels)
print(f'Validation accuracy: {val_accuracy:.3f}')

Validation accuracy: 0.472


## Entraînement pour la soumission finale

In [88]:
# In the end the full training split is not used, because it lowers the model's accuracy.
# A dedicated name is used for the path so that `train_csv` (the raw input of the
# splitting cell) is not overwritten; re-running the split stays safe.
submission_train_csv = "./train_temp.csv"
train_dataset = TabularDataset(csv_file=submission_train_csv, label_column='piezo_groundwater_level_category')

# Batched loader over the same data; not consumed by the XGBoost model below.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# TODO: cross-validation max_depth, alpha, learning_rate (0, 1), max_delta_step (0, 10)
#objective="multi:softprob"
# XGBoost >= 2.0 selects the GPU through `device`; the former
# tree_method="gpu_hist" is deprecated in favour of "hist" + device="cuda".
xgb_model = xgb.XGBClassifier(device="cuda", tree_method="hist", reg_alpha=0.5)

xgb_model.fit(train_dataset.features, train_dataset.labels)

# Score on the validation set built by the previous section (`val_dataset`).
# The value is not meaningful in itself; it only checks that everything runs.
val_preds = xgb_model.predict(val_dataset.features)
val_accuracy = prediction_accuracy(val_preds, val_dataset.labels)
print(f'Validation accuracy (sanity check): {val_accuracy:.3f}')

Validation accuracy (sanity check): 0.534


In [89]:
def save_predictions(model, outfile='test_results.csv', test_file='./X_test_Hi5.csv'):
    """Predict the category of every test row and write the result as CSV.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict(features)`` and returning integer
        class ids that are keys of ``inverse_class_mapping``.
    outfile : str
        Destination CSV path. The file has the header
        ``row_index,piezo_groundwater_level_category`` and one line per test row.
    test_file : str
        CSV file holding the test rows; it must contain a ``row_index`` column.
    """
    test_dataset = TabularDataset(csv_file=test_file)
    test_preds = model.predict(test_dataset.features)

    # Turn integer class ids back into category names.
    decoded_preds = [inverse_class_mapping[pred] for pred in test_preds]
    row_ids = test_dataset.data_frame['row_index']

    # newline='' lets the csv module control line endings itself.
    with open(outfile, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['row_index', 'piezo_groundwater_level_category'])
        writer.writerows(zip(row_ids, decoded_preds))

In [90]:
# Write the predictions of the most recently trained `xgb_model` to the default output file.
save_predictions(xgb_model)

In [91]:
# Persist the trained model to 'xgb_model.json' in the working directory.
xgb_model.save_model('xgb_model.json')