In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

In [2]:
# Fetch the Iris dataset bundled with scikit-learn.
data = load_iris()
# Keep the feature matrix and the integer class labels under separate names.
X = data.data
y = data.target

In [3]:
# Wrap the feature matrix in a DataFrame (columns named after the features)
# and attach the integer target as a 'species' column, for demonstration purposes.
df = pd.DataFrame(X, columns=data.feature_names).assign(species=y)

In [4]:
# Preview the first five rows of the freshly built frame.
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [5]:
# Artificially introduce missing values and categorical data for demonstration.
# Row 0, column 2 (the third feature column) is blanked out to simulate a missing value.
df.iloc[0, 2] = np.nan  # introduce a missing value
# Build the string labels from the original integer target `y` rather than from
# df['species'] itself: mapping the column onto itself is not idempotent, because
# on a second run the column already holds strings, none of which match the
# integer keys, so every entry would turn into NaN.
df['species'] = pd.Series(y, index=df.index).map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})

In [6]:
# Preview the first five rows again to confirm the injected NaN and the string labels.
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [7]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
# Features are every column except 'species'; 'species' is the target.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='species'),
    df['species'],
    test_size=0.2,
    random_state=42,
)

In [8]:
# Define the numerical feature columns: every column except the 'species' target.
# The target is removed by name instead of slicing off the last position, so the
# selection stays correct even if the column order of `df` changes.
numerical_features = df.columns.drop('species')

In [13]:
# Preprocessing pipeline for the numerical features: impute first, then scale.
numerical_transformer = Pipeline(
    steps=[
        # Replace missing entries with the column mean.
        ('imputer', SimpleImputer(strategy='mean')),
        # Standardize the imputed features.
        ('scaler', StandardScaler()),
    ]
)

In [14]:
# Fit the imputer and scaler on the training features and transform them in one step.
X_train_preprocessed = numerical_transformer.fit_transform(X=X_train)

In [15]:
# Wrap the transformed training data in a DataFrame again, for demonstration.
# Only numerical features were processed, so their names serve directly as the column labels.
# NOTE(review): no index is passed here, so the frame presumably gets a fresh
# positional index rather than X_train's original row labels - confirm before
# aligning it with y_train.
X_train_preprocessed = pd.DataFrame(
    data=X_train_preprocessed,
    columns=numerical_features,
)

In [16]:
# Show the first five rows of the preprocessed training data as plain text.
print(X_train_preprocessed.head(n=5))

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0          -1.473937          1.203658          -1.585627         -1.312603
1          -0.133071          2.992376          -1.296934         -1.045633
2           1.085898          0.085709           0.377484          0.289218
3          -1.230143          0.756479          -1.239196         -1.312603
4          -1.717731          0.309299          -1.412412         -1.312603
