In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# ---------------------------------------------------------------------------
# Preprocessing script: read the two CSV inputs, pool them, re-split them
# with stratification, standard-scale the listed numeric columns (scaler
# fitted on the training split only) and write the two resulting tables out.
# ---------------------------------------------------------------------------

# Read both input files and stack them into a single frame; ignore_index
# gives the pooled frame a fresh 0..n-1 index.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
combined_data = pd.concat([train_data, test_data], ignore_index=True)

# Separate the predictors from the 'label' target column.
y = combined_data['label']
X = combined_data.drop(columns='label')

# Columns that are routed through the scaler.
numeric_features = [
    "time_1",
    "time_2",
    "rad_flow",
    "fpv_close",
    "fpv_open",
    "high",
    "bypass",
    "bpv_close",
    "bpv_open",
]

# Single-step pipeline wrapping the scaler for the numeric columns.
numeric_transformer = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
    ]
)

# Every column of X that is not listed above is forwarded unchanged.
passthrough_columns = X.columns.difference(numeric_features)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('pass', 'passthrough', passthrough_columns),
    ]
)

# 80/20 split of the pooled data, stratified on the target and seeded so the
# partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the preprocessor on the training split only, then apply the fitted
# transform to the held-out split.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Wrap the transformed arrays in DataFrames again, labelled with the names
# the fitted preprocessor reports.
# NOTE(review): with ColumnTransformer's default settings these names are
# presumably prefixed with the transformer name (e.g. "num__...") - confirm
# that downstream consumers of the CSVs expect that.
output_columns = preprocessor.get_feature_names_out()
X_train_transformed_df = pd.DataFrame(X_train_transformed, columns=output_columns)
X_test_transformed_df = pd.DataFrame(X_test_transformed, columns=output_columns)

# Re-attach the target. The transformed frames carry a fresh default index
# while the target series still carry their pre-split index, so the target
# index is dropped first to make the column-wise concat line up by position.
train_target = y_train.reset_index(drop=True)
test_target = y_test.reset_index(drop=True)
train_df = pd.concat([X_train_transformed_df, train_target], axis='columns')
test_df = pd.concat([X_test_transformed_df, test_target], axis='columns')

# Persist both tables without writing the index column.
for frame, destination in (
    (train_df, 'shuttle_train.csv'),
    (test_df, 'shuttle_test.csv'),
):
    frame.to_csv(destination, index=False)
