In [1]:
# Stage 1: Preprocess CSV -> Feather/NumPy/JSON with logging
import os, sys, time, json, logging, importlib, subprocess, gc
from typing import Dict
import numpy as np
import pandas as pd

# --- Logging setup ---
# Every INFO-or-higher record goes to two places: a log file opened in
# write mode (truncated on open) and the process's stdout.
log_handlers = [
    logging.FileHandler('run_preprocess.log', mode='w'),
    logging.StreamHandler(sys.stdout),
]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=log_handlers,
)
# Stored in this process's environment, so child processes inherit it.
# NOTE(review): the interpreter reads PYTHONUNBUFFERED at startup, so this
# presumably does not change buffering of the already-running kernel - confirm.
os.environ['PYTHONUNBUFFERED'] = '1'

def ensure_package(pkg: str):
    """Import ``pkg``, installing it with pip first when it is missing.

    Args:
        pkg: Name used both as the importable module name and as the
            requirement passed to ``pip install``.

    Returns:
        The imported module object.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command exits
            with a non-zero status.
        ImportError: If the module still cannot be imported after installing.
    """
    try:
        return importlib.import_module(pkg)
    except ImportError:
        logging.info(f'Installing {pkg}...')
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg])
        # The import system caches directory contents; a package installed
        # while the interpreter is running may stay invisible until those
        # caches are dropped.
        importlib.invalidate_caches()
        return importlib.import_module(pkg)

# Feather requires pyarrow
ensure_package('pyarrow')

t0 = time.time()  # start of the wall-clock timer reported at the end of the run
logging.info('Reading small head to infer columns...')
# Only the column names are needed here, so read just the first five rows.
head = pd.read_csv('train.csv', nrows=5)
cols = head.columns.tolist()
# Explicit raise instead of `assert`: assertions are removed when Python runs
# with -O, which would let a file without the target column slip through.
if 'Cover_Type' not in cols:
    raise ValueError('Cover_Type not found in train.csv')
has_id = 'Id' in cols

# --- Build dtype map ---
# Read-time dtype for every column found in the train header.
_ONE_HOT_PREFIXES = ('Wilderness_Area_', 'Soil_Type_')
_HILLSHADE_COLS = ('Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm')


def _read_dtype(col: str) -> str:
    """Return the dtype string used when reading column ``col`` from CSV."""
    if col == 'Cover_Type':
        return 'int8'
    if col.startswith(_ONE_HOT_PREFIXES):
        return 'uint8'
    if col in _HILLSHADE_COLS:
        return 'uint16'  # safe; will downcast later if needed
    # Id, the named terrain/distance columns and anything unrecognised are
    # all read as int32 and downcast after the read.
    return 'int32'


dtypes: Dict[str, str] = {col: _read_dtype(col) for col in cols}

# Train is read with the complete dtype map.
logging.info('Reading full train with dtypes...')
train = pd.read_csv('train.csv', dtype=dtypes)
logging.info(f'train shape: {train.shape}')

# Test is read with the same map minus the Cover_Type entry
# (presumably test.csv carries no target column - verify against the data).
logging.info('Reading full test with dtypes...')
test_dtypes = dict(dtypes)
test_dtypes.pop('Cover_Type', None)
test = pd.read_csv('test.csv', dtype=test_dtypes)
logging.info(f'test shape: {test.shape}')

# Downcast numerics to reduce memory
def downcast_df(df: pd.DataFrame) -> pd.DataFrame:
    """Shrink each integer/float column of ``df`` to a smaller numeric dtype.

    Integer columns are passed to ``pd.to_numeric`` with
    ``downcast='integer'`` and float columns with ``downcast='float'``;
    every other column is left alone. The frame is modified in place and the
    same object is returned.
    """
    is_int = pd.api.types.is_integer_dtype
    is_float = pd.api.types.is_float_dtype
    for name in df.columns:
        column = df[name]
        if is_int(column):
            target = 'integer'
        elif is_float(column):
            target = 'float'
        else:
            continue  # non-numeric column: nothing to downcast
        df[name] = pd.to_numeric(column, downcast=target)
    return df

train = downcast_df(train)
test = downcast_df(test)
logging.info('Downcast complete')

# --- Target and base features ---
# Labels are shifted down by one (0..6 if Cover_Type runs 1..7).
y = (train['Cover_Type'].values.astype(np.int8) - 1)  # 0..6
# Base features are every train column except the target and the row Id.
feature_cols = [c for c in train.columns if c not in ('Cover_Type', 'Id')]
X = train[feature_cols].copy()
# Select the test columns through feature_cols rather than test's own column
# list: this guarantees X_test has exactly X's columns in X's order, and a
# column missing from test.csv fails here with a KeyError instead of
# producing a silently misaligned feature matrix.
X_test = test[feature_cols].copy()
# Fall back to positional ids when test.csv has no Id column.
test_ids = test['Id'].values if 'Id' in test.columns else np.arange(len(test), dtype=np.int64)

# --- Feature Engineering (exactly as in main notebook) ---
logging.info('Feature engineering...')
# 1) Distribution shift feature on Elevation (threshold from test)
# The cut-off is the median Elevation of the test rows; train and test rows
# are both flagged against that single value.
elev_threshold = X_test['Elevation'].median()
for frame in (X, X_test):
    above_threshold = frame['Elevation'] > elev_threshold
    frame['is_high_elevation'] = above_threshold.astype(np.int8)

# 2) Hydrology features
hydro_h = 'Horizontal_Distance_To_Hydrology'
hydro_v = 'Vertical_Distance_To_Hydrology'
if {hydro_h, hydro_v}.issubset(X.columns):
    for frame in (X, X_test):
        # All arithmetic is done in float32.
        horiz = frame[hydro_h].astype(np.float32)
        vert = frame[hydro_v].astype(np.float32)
        elev = frame['Elevation'].astype(np.float32)
        # Straight-line distance to hydrology from its two components.
        frame['Hydrology_Euclid'] = np.sqrt(horiz ** 2 + vert ** 2).astype(np.float32)
        # Elevation minus the vertical distance to hydrology.
        frame['Elev_minus_VertHydro'] = (elev - vert).astype(np.float32)

# 3) Hillshade features
hill_cols = [name for name in ('Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm')
             if name in X.columns]
# Row-wise summaries are only built when all three hillshade readings exist.
if len(hill_cols) == 3:
    for frame in (X, X_test):
        shade = frame[hill_cols]
        shade_min = shade.min(axis=1).astype(np.float32)
        shade_max = shade.max(axis=1).astype(np.float32)
        frame['Hillshade_Mean'] = shade.mean(axis=1).astype(np.float32)
        frame['Hillshade_Min'] = shade_min
        frame['Hillshade_Max'] = shade_max
        frame['Hillshade_Range'] = (shade_max - shade_min).astype(np.float32)

# 4) Distance interactions
dist_cols = ['Horizontal_Distance_To_Fire_Points','Horizontal_Distance_To_Roadways','Horizontal_Distance_To_Hydrology']
if set(dist_cols).issubset(X.columns):
    hf, rr, hh = dist_cols
    for frame in (X, X_test):
        # Widen to int32 BEFORE any arithmetic. The source columns were
        # downcast to the smallest integer dtype that fits each column, and
        # fixed-width integer arithmetic wraps silently on overflow, so
        # casting only the finished result to int32 would be too late.
        fire = frame[hf].astype(np.int32)
        road = frame[rr].astype(np.int32)
        hydro = frame[hh].astype(np.int32)

        # Pairwise absolute differences between the three distances.
        frame['DistDiff_Fire_Road'] = (fire - road).abs()
        frame['DistDiff_Fire_Hydro'] = (fire - hydro).abs()
        frame['DistDiff_Road_Hydro'] = (road - hydro).abs()

        # Mean is accumulated in float32; sum stays in int32.
        frame['DistMean_FRH'] = (
            (fire.astype(np.float32) + road.astype(np.float32) + hydro.astype(np.float32)) / 3.0
        ).astype(np.float32)
        frame['DistSum_FRH'] = fire + road + hydro

        # Row-wise extremes need no arithmetic, only the final int32 cast.
        frame['DistMin_FRH'] = frame[[hf, rr, hh]].min(axis=1).astype(np.int32)
        frame['DistMax_FRH'] = frame[[hf, rr, hh]].max(axis=1).astype(np.int32)

# 5) One-hot counts
# Both column groups are collected before any count column is added, so the
# new 'Soil_Type_Count' column is never part of its own sum.
soil_cols = [name for name in X.columns if name.startswith('Soil_Type_')]
wild_cols = [name for name in X.columns if name.startswith('Wilderness_Area_')]
for count_name, group in (('Soil_Type_Count', soil_cols),
                          ('Wilderness_Area_Count', wild_cols)):
    if not group:
        continue  # no columns with this prefix: skip the count feature
    for frame in (X, X_test):
        frame[count_name] = frame[group].sum(axis=1).astype(np.int16)

# 6) Aspect sin/cos
if 'Aspect' in X.columns:
    for frame in (X, X_test):
        # Aspect is converted from degrees to radians once, in float32,
        # and reused for both projections.
        aspect_rad = np.deg2rad(frame['Aspect'].astype(np.float32))
        frame['Aspect_sin'] = np.sin(aspect_rad).astype(np.float32)
        frame['Aspect_cos'] = np.cos(aspect_rad).astype(np.float32)

# Final column list of the engineered train matrix.
features = list(X.columns)
logging.info(f'Final feature count: {len(features)}')

# Save outputs
logging.info('Saving processed datasets to Feather/NumPy...')
# Feature matrices go to Feather; the index is reset to a default range first.
for frame, feather_path in ((X, 'X.feather'), (X_test, 'X_test.feather')):
    frame.reset_index(drop=True).to_feather(feather_path)
# Target vector and test ids go to .npy files.
for npy_path, array in (('y.npy', y), ('test_ids.npy', test_ids)):
    np.save(npy_path, array)
# Feature names and the elevation cut-off are stored as JSON.
with open('features.json', 'w') as features_file:
    json.dump(features, features_file)
meta = {'elev_threshold': float(elev_threshold)}
with open('preprocess_meta.json', 'w') as meta_file:
    json.dump(meta, meta_file)

logging.info(f'Done. Elapsed: {time.time()-t0:.1f}s')
gc.collect()

2025-09-08 06:19:50,094 [INFO] Reading small head to infer columns...


2025-09-08 06:19:50,102 [INFO] Reading full train with dtypes...
