In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import os

In [2]:
# Step 1: Extract
def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Announces the path on stdout, then parses the file with
    ``pd.read_csv`` using pandas' default options.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    print(f"Loading data from: {file_path}")
    raw_frame = pd.read_csv(file_path)
    return raw_frame

In [6]:
#  Step 2: Transform
def preprocess_data(df):
    """Split ``df`` into features and target, then fit-transform the features.

    The last column of ``df`` is treated as the target; all other columns are
    features. Numeric features are mean-imputed and standardized; categorical
    features (``object`` or ``category`` dtype) are imputed with the most
    frequent value and one-hot encoded.

    Args:
        df: Input DataFrame whose final column holds the target.

    Returns:
        A ``(X_transformed, y, preprocessor)`` tuple: the output of
        ``preprocessor.fit_transform`` on the features, the target column,
        and the fitted ``ColumnTransformer``.

    Note:
        The transformer is fitted on every row of ``df``, so imputation and
        scaling statistics come from the full dataset. Keep that in mind if
        the result is later split into train and test sets.
    """
    print("Starting preprocessing...")

    # Separate features and target (the last column is assumed to be the target)
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    # Select by dtype *family* rather than by exact width: matching only
    # int64/float64/object would leave int32, float32 or category columns out
    # of both transformers, and ColumnTransformer drops unlisted columns.
    # Timedeltas are excluded explicitly so they never reach the scaler.
    num_cols = X.select_dtypes(include='number', exclude='timedelta').columns
    cat_cols = X.select_dtypes(include=['object', 'category']).columns

    # Make any remaining data loss visible instead of silent.
    handled = set(num_cols) | set(cat_cols)
    skipped = [col for col in X.columns if col not in handled]
    if skipped:
        print(f"Warning: dropping columns with unsupported dtypes: {skipped}")

    # Pipelines for transformation
    numeric_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # Categories unseen during fit are ignored at transform time.
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine them into a ColumnTransformer
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_pipeline, num_cols),
        ('cat', categorical_pipeline, cat_cols)
    ])

    # Apply the transformations
    X_transformed = preprocessor.fit_transform(X)

    return X_transformed, y, preprocessor


In [7]:
# Step 3: Load
def save_transformed_data(X, y, output_dir='output'):
    """Write the transformed features and the labels to CSV files.

    Creates ``output_dir`` if it does not exist, then writes
    ``features.csv`` and ``labels.csv`` into it, both without the index
    column.

    Args:
        X: Feature matrix. If it exposes a ``toarray`` method, that method
            is called to obtain the data wrapped in the DataFrame.
        y: Labels; wrapped in a DataFrame before being written.
        output_dir: Destination directory (default ``'output'``).
    """
    print("Saving transformed data...")
    os.makedirs(output_dir, exist_ok=True)

    # Matrices exposing toarray() are converted through it first.
    if hasattr(X, 'toarray'):
        feature_values = X.toarray()
    else:
        feature_values = X

    # Build both frames before writing anything, then write features first.
    frames_by_filename = {
        'features.csv': pd.DataFrame(feature_values),
        'labels.csv': pd.DataFrame(y),
    }
    for filename, frame in frames_by_filename.items():
        frame.to_csv(os.path.join(output_dir, filename), index=False)

    print(f"Data saved in '{output_dir}' directory.")

In [8]:
# Main Pipeline Function
def run_etl_pipeline(input_csv, output_dir='output'):
    """Run the full extract -> transform -> load sequence for one CSV file.

    Loads ``input_csv`` with ``load_data``, preprocesses it with
    ``preprocess_data``, and writes the results with
    ``save_transformed_data``.

    Args:
        input_csv: Path of the CSV file to process.
        output_dir: Directory handed to ``save_transformed_data``. Defaults
            to ``'output'``, the directory used before this parameter
            existed.
    """
    df = load_data(input_csv)
    # The fitted preprocessor is not needed here, so it is discarded.
    X, y, _ = preprocess_data(df)
    save_transformed_data(X, y, output_dir=output_dir)

In [10]:
#  Run the Pipeline
if __name__ == "__main__":
    # The default is the original Colab upload location. Set the
    # ETL_INPUT_CSV environment variable to run against another file
    # without editing this cell.
    run_etl_pipeline(os.environ.get("ETL_INPUT_CSV", "/content/train.csv"))


Loading data from: /content/train.csv
Starting preprocessing...
Saving transformed data...
Data saved in 'output' directory.
