**Extract (Data Ingestion)**

In [42]:
import pandas as pd

def load_dataset(data):
  """Read a tabular file into a DataFrame, choosing the reader by extension.

  Args:
    data: Path of the file to read. The text after the last '.' is
      lower-cased and used to pick the reader: 'csv' -> pd.read_csv,
      'xls'/'xlsx' -> pd.read_excel, 'json' -> pd.read_json.
      Path-like objects are accepted; they are converted with str()
      only to inspect the extension.

  Returns:
    The DataFrame produced by the selected pandas reader.

  Raises:
    ValueError: If the extension is not one of csv, xls, xlsx or json.
      The message names the rejected extension.
  """
  # Compare case-insensitively so "DATA.CSV" and "data.csv" behave the same.
  file_extension = str(data).split('.')[-1].lower()

  if file_extension == 'csv':
    df = pd.read_csv(data)
  elif file_extension in ['xls', 'xlsx']:
    df = pd.read_excel(data)
  elif file_extension == 'json':
    df = pd.read_json(data)
  else:
    # Report which extension was rejected, matching save_data's message.
    raise ValueError(f"Unsupported file extension: {file_extension}")

  return df



**Transform (Data Preprocessing & Feature Engineering)**

In [43]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA


def preprocess_data(df):
    """Impute, scale and one-hot encode a DataFrame.

    Steps, in order:
      1. int64/float64 columns: missing values are replaced by the column
         mean (SimpleImputer), then the columns are standardized
         (StandardScaler).
      2. object-dtype columns are one-hot encoded with pd.get_dummies.

    Columns of any other dtype are passed through unchanged. Missing values
    in object columns are not imputed, so the printed "Missing values after
    imputation" count can be non-zero for such data.

    Args:
        df: Input DataFrame. It is not modified; all work happens on a copy.

    Returns:
        A new DataFrame with the transformed numerical columns and the
        object columns replaced by their dummy columns.
    """
    # Work on a copy: the column assignments below would otherwise
    # overwrite the caller's DataFrame as a side effect.
    df = df.copy()

    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = df.select_dtypes(include=['object']).columns
    # scikit-learn estimators reject input with zero features, so the
    # numeric steps are skipped when there is nothing to transform.
    has_numerical = len(numerical_cols) > 0

    # Initial Data Validation
    print(f"Initial data shape: {df.shape}")
    print(f"Numerical columns: {numerical_cols}")
    print(f"Categorical columns: {categorical_cols}")

    # Handle missing values
    if has_numerical:
        imputer = SimpleImputer(strategy='mean')
        df[numerical_cols] = imputer.fit_transform(df[numerical_cols])

    # Validate after imputation
    print("Missing values after imputation:", df.isnull().sum().sum())

    # Scale features of numerical columns
    if has_numerical:
        scaler = StandardScaler()
        df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

    # Encode categorical features
    df = pd.get_dummies(df, columns=categorical_cols)

    # Validate after preprocessing
    print(f"Data shape after preprocessing: {df.shape}")
    return df

def feature_engineering(df, target_column=None):
    """Append squared features and, when possible, two PCA components.

    Every input feature column gets a squared copy named "<col>^2". If there
    are at least two feature columns and at least two rows, a 2-component
    PCA is fitted on the original (un-squared) features and its projections
    are added as columns 'PCA1' and 'PCA2'.

    Args:
        df: DataFrame whose feature columns support ``** 2``.
        target_column: Optional name of a column to exclude from the
            features. It is kept in the returned frame but is neither
            squared nor fed to PCA.

    Returns:
        A new DataFrame: the columns of ``df``, followed by the squared
        features and, when PCA ran, 'PCA1' and 'PCA2'.

    Raises:
        KeyError: If ``target_column`` is given but is not a column of ``df``.
    """
    # Separate input features from the (optional) target column
    if target_column:
        X = df.drop(columns=[target_column])
    else:
        X = df

    # Adding polynomial features
    poly_features = X ** 2
    poly_features.columns = [f"{col}^2" for col in X.columns]
    df = pd.concat([df, poly_features], axis=1)

    # Validate after adding polynomial features
    print(f"Number of features after polynomial expansion: {df.shape[1]}")

    # Dimensionality Reduction using PCA.
    # PCA(n_components=2) needs at least 2 features AND at least 2 samples;
    # checking only the feature count makes very small frames raise.
    n_samples, n_features = X.shape
    if n_features < 2:
        print("Not enough features for PCA.")
    elif n_samples < 2:
        print("Not enough samples for PCA.")
    else:
        pca = PCA(n_components=2)
        pca_components = pca.fit_transform(X)
        df['PCA1'] = pca_components[:, 0]
        df['PCA2'] = pca_components[:, 1]
        print(f"Explained variance ratio by PCA: {pca.explained_variance_ratio_}")

    # Validate after feature engineering
    print(f"Data shape after feature engineering: {df.shape}")
    return df


**Load Transformed Data**

In [44]:
import pandas as pd

def save_data(df, output):
    """Write a DataFrame to disk in the format implied by the file extension.

    Args:
        df: DataFrame to persist.
        output: Destination path. The text after the last '.' (lower-cased)
            selects the writer: csv, xls/xlsx (Excel) or json (records).

    Raises:
        ValueError: If the extension is not csv, xls, xlsx or json.
    """
    suffix = output.rsplit('.', 1)[-1].lower()

    # One zero-argument writer per supported extension.
    writers = {
        'csv': lambda: df.to_csv(output, index=False),
        'xls': lambda: df.to_excel(output, index=False),
        'xlsx': lambda: df.to_excel(output, index=False),
        'json': lambda: df.to_json(output, orient='records'),
    }

    write = writers.get(suffix)
    if write is None:
        raise ValueError(f"Unsupported file extension: {suffix}")

    write()
    print(f"Data saved successfully to {output}")



**Orchestrate ETL Pipeline**

In [45]:
def etl_pipeline(input_data, output_data):
  """Run the full extract -> transform -> load sequence.

  Args:
    input_data: Path handed to load_dataset.
    output_data: Path handed to save_data.
  """
  raw_frame = load_dataset(input_data)
  # Preprocessing runs first; feature engineering is called without a
  # target column, so every preprocessed column is treated as a feature.
  engineered_frame = feature_engineering(preprocess_data(raw_frame))
  save_data(engineered_frame, output_data)
  print("ETL pipeline completed successfully")

**Usage**

In [46]:

# Source file read by the pipeline and destination file it writes.
# NOTE(review): both are hard-coded absolute paths under /content
# (presumably a Colab session); adjust when running elsewhere.
input_data = "/content/iris_dataset.csv"
output_data = "/content/test_dataset.csv"

# Run load -> preprocess -> feature engineering -> save on the paths above.
etl_pipeline(input_data, output_data)


Initial data shape: (150, 5)
Numerical columns: Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')
Categorical columns: Index(['target'], dtype='object')
Missing values after imputation: 0
Data shape after preprocessing: (150, 7)
Number of features after polynomial expansion: 14
Explained variance ratio by PCA: [0.69096693 0.21334576]
Data shape after feature engineering: (150, 16)
Data saved successfully to /content/test_dataset.csv
ETL pipeline completed successfully


**Input Dataset**

In [47]:
import pandas as pd
import numpy as np

# Re-read the raw input file (same path as input_data in the usage cell)
# and preview its first rows. This rebinds the notebook-global `df`.
df = pd.read_csv("/content/iris_dataset.csv")
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [48]:
# Count missing values per column of the input frame loaded in the previous cell.
df.isnull().sum()

Unnamed: 0,0
sepal length (cm),0
sepal width (cm),0
petal length (cm),0
petal width (cm),0
target,0


**Output Dataset**

In [49]:
import pandas as pd
import numpy as np

# Read back the file written by the pipeline and preview its first rows.
# This rebinds the notebook-global `df` to the transformed data.
df = pd.read_csv("/content/test_dataset.csv")
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target_Iris-setosa,target_Iris-versicolor,target_Iris-virginica,sepal length (cm)^2,sepal width (cm)^2,petal length (cm)^2,petal width (cm)^2,target_Iris-setosa^2,target_Iris-versicolor^2,target_Iris-virginica^2,PCA1,PCA2
0,-0.900681,1.032057,-1.341272,-1.312977,True,False,False,0.811227,1.065142,1.799012,1.723908,1,0,0,-2.398363,-0.539962
1,-1.143017,-0.124958,-1.341272,-1.312977,True,False,False,1.306488,0.015614,1.799012,1.723908,1,0,0,-2.220796,0.558427
2,-1.385353,0.337848,-1.398138,-1.312977,True,False,False,1.919202,0.114141,1.95479,1.723908,1,0,0,-2.489738,0.241342
3,-1.506521,0.106445,-1.284407,-1.312977,True,False,False,2.269604,0.011331,1.649701,1.723908,1,0,0,-2.427045,0.483157
4,-1.021849,1.26346,-1.341272,-1.312977,True,False,False,1.044175,1.596332,1.799012,1.723908,1,0,0,-2.51712,-0.699422


In [50]:
# Count missing values per column of the transformed frame loaded in the previous cell.
df.isnull().sum()

Unnamed: 0,0
sepal length (cm),0
sepal width (cm),0
petal length (cm),0
petal width (cm),0
target_Iris-setosa,0
target_Iris-versicolor,0
target_Iris-virginica,0
sepal length (cm)^2,0
sepal width (cm)^2,0
petal length (cm)^2,0
