# ETL Data Pipeline

## Step 1: Import Libraries

## Step 2: Extract Data

## Step 3: Transform Data (Imputation + Scaling)

## Step 4: Load Data (Save to CSV)


In [1]:
# Import required libraries
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


In [2]:
# Build a tiny in-memory sample table, one list per column. The None entries
# stand in for missing measurements; pandas stores them as NaN in the numeric
# columns, which is why Age and Salary print as floats below.
names = ['Alice', 'Bob', 'Charlie', 'David']
ages = [25, None, 35, 40]
salaries = [50000, 60000, None, 80000]

data = {'Name': names, 'Age': ages, 'Salary': salaries}
df = pd.DataFrame(data)

# Show the raw frame before any cleaning is applied.
print("Original Data:", df, sep="\n")


Original Data:
      Name   Age   Salary
0    Alice  25.0  50000.0
1      Bob   NaN  60000.0
2  Charlie  35.0      NaN
3    David  40.0  80000.0


In [3]:
# Numeric columns that get cleaned; 'Name' is left untouched.
numeric_cols = ['Age', 'Salary']

# Impute: every NaN is replaced by the mean of the observed values in its column.
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(df[numeric_cols])
df[numeric_cols] = imputed_values

# Standardize: StandardScaler rescales each column to zero mean and unit
# variance (z-scores). Entries that were mean-imputed therefore come out as 0.
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Note: df is overwritten in place, so it no longer holds the raw values.
print("\nTransformed Data:", df, sep="\n")


Transformed Data:
      Name       Age    Salary
0    Alice -1.543033 -1.234427
1      Bob  0.000000 -0.308607
2  Charlie  0.308607  0.000000
3    David  1.234427  1.543033


In [4]:
# Load step: write the transformed frame to a CSV file, using a path relative
# to the current working directory. index=False keeps the pandas row index out
# of the file, so only Name, Age and Salary are written.
df.to_csv(path_or_buf="processed_data.csv", index=False)

# Confirm where the output went.
print("\nData saved as 'processed_data.csv'")



Data saved as 'processed_data.csv'
