# In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Read the whole dataset from disk into one DataFrame.
data = pd.read_csv('covtype.csv')

# Separate target from predictors: the 'label' column is the target,
# every remaining column is a feature.
y = data['label']
X = data.drop(columns='label')

# Columns treated as numeric; these are the ones routed to the scaler.
numeric_features = [
    "elevation", "aspect", "slope",
    "horizontal_dist_to_hydrology", "vertical_dist_to_hydrology",
    "horizontal_dist_to_roadways",
    "hillshade_9am", "hillshade_noon", "hillshade_3pm",
    "horizontal_dist_to_fire_points",
]

# Columns listed as already one-hot encoded: wilderness_area_1..4 followed
# by soil_type_1..40, in that order. Generated rather than spelled out so the
# numbering cannot drift through a typo.
onehot_encoded_features = [
    *(f"wilderness_area_{i}" for i in range(1, 5)),
    *(f"soil_type_{i}" for i in range(1, 41)),
]

# Numeric columns go through a one-step pipeline that standardizes them.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Route each column group to its transformer: the numeric columns are scaled,
# while the columns listed as one-hot encoded are passed through unchanged.
# NOTE: columns named in neither list are dropped, which is
# ColumnTransformer's default `remainder` behaviour.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('onehot', 'passthrough', onehot_encoded_features),
])

# One stratified shuffle split (20% held out for testing, fixed seed) so the
# class proportions of y are kept in both partitions.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1 means the generator yields exactly one (train, test) index pair,
# so take it directly instead of looping.
train_index, test_index = next(split.split(X, y))
X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_test, y_test = X.iloc[test_index], y.iloc[test_index]

# Fit the preprocessor on the training rows only, then apply that same fitted
# transform to the test rows.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Wrap the transformed arrays in DataFrames again, labelled with the output
# column names reported by the fitted preprocessor.
feature_names = preprocessor.get_feature_names_out()
X_train_transformed_df = pd.DataFrame(X_train_transformed, columns=feature_names)
X_test_transformed_df = pd.DataFrame(X_test_transformed, columns=feature_names)

# Re-attach the target. The new feature frames carry a fresh 0..n-1 index, so
# the target's index is reset to match; otherwise concat would align on the
# original row labels.
train_df = pd.concat(
    [X_train_transformed_df, y_train.reset_index(drop=True)],
    axis="columns",
)
test_df = pd.concat(
    [X_test_transformed_df, y_test.reset_index(drop=True)],
    axis="columns",
)

# Write both partitions to CSV without the index column.
train_df.to_csv('train_preprocessed.csv', index=False)
test_df.to_csv('test_preprocessed.csv', index=False)