<a href="https://colab.research.google.com/github/sandadi-pranavi/pipeline-1/blob/main/pipeline_1_ipynbd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import os


In [7]:
# Replace with your actual CSV path
data_path = "/content/raw_data.csv"
df = pd.read_csv(data_path)

# View first few rows
print(df.head())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [11]:
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = df.select_dtypes(include=['object']).columns.tolist()
print(numerical_features)
print(categorical_features)

['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


In [12]:
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),        # Handle missing values
    ('scaler', StandardScaler())                         # Normalize data
])
print(numerical_pipeline)

Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler())])


In [13]:
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing values
    ('encoder', OneHotEncoder(handle_unknown='ignore'))    # One-hot encode categories
])
print(categorical_pipeline )

Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore'))])


In [14]:
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
])
print(preprocessor)

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 ('scaler', StandardScaler())]),
                                 ['PassengerId', 'Survived', 'Pclass', 'Age',
                                  'SibSp', 'Parch', 'Fare']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('encoder',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['Name', 'Sex', 'Ticket', 'Cabin',
                                  'Embarked'])])


In [22]:
# Apply transformations and get preprocessed data
processed_data = preprocessor.fit_transform(df)
print(processed_data)
# If you want to convert back to DataFrame:
processed_df = pd.DataFrame(processed_data.toarray() if hasattr(processed_data, "toarray") else processed_data)
print(processed_df)


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 10514 stored elements and shape (891, 1731)>
  Coords	Values
  (0, 0)	-1.730107956920867
  (0, 1)	-0.7892723355548015
  (0, 2)	0.8273772438659699
  (0, 3)	-0.5924805998028931
  (0, 4)	0.4327933656785018
  (0, 5)	-0.4736736092984604
  (0, 6)	-0.5024451714361923
  (0, 115)	1.0
  (0, 899)	1.0
  (0, 1423)	1.0
  (0, 1628)	1.0
  (0, 1730)	1.0
  (1, 0)	-1.7262200738716067
  (1, 1)	1.266989801811655
  (1, 2)	-1.5661069258891576
  (1, 3)	0.6387890120425208
  (1, 4)	0.4327933656785018
  (1, 5)	-0.4736736092984604
  (1, 6)	0.7868452935884461
  (1, 197)	1.0
  (1, 898)	1.0
  (1, 1496)	1.0
  (1, 1662)	1.0
  (1, 1728)	1.0
  (2, 0)	-1.7223321908223463
  :	:
  (888, 1730)	1.0
  (889, 0)	1.7262200738716067
  (889, 1)	1.266989801811655
  (889, 2)	-1.5661069258891576
  (889, 3)	-0.2846631968415396
  (889, 4)	-0.47454519624983954
  (889, 5)	-0.4736736092984604
  (889, 6)	-0.04438103794142432
  (889, 88)	1.0
  (889, 899)	1.0
  (889, 908)	1.0
  (8

In [23]:
# Create output folder if not exists
os.makedirs("output", exist_ok=True)

# Save processed data to CSV
processed_df.to_csv("output/processed_data.csv", index=False)
print("ETL pipeline completed and data saved!")


ETL pipeline completed and data saved!
