<a href="https://colab.research.google.com/github/ramachandran-ds/Data_Pipeline_Developement-/blob/main/Data_Pipeline_Development.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Import Required Libraries
import seaborn as sns                                             # The "as" keyword binds the library to a short alias (sns) used in later cells.
import pandas as pd
from sklearn.pipeline import Pipeline                             # We use sklearn(Sci-kit learn) library for Data Preprocessing and feature engineering
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split              # Imports train_test_split, used to split the data into training and test sets.

In [None]:
# Load seaborn's Titanic example dataset into a DataFrame.
# NOTE: sns.load_dataset pulls the file from seaborn's online example-data
# repository (and caches it locally), so the first run needs internet access.
df = sns.load_dataset('titanic')

# .shape is a (rows, columns) tuple; unpack it so both counts are named.
n_rows, n_cols = df.shape
print("Initial DataFrame shape:", (n_rows, n_cols))

# Keep the preview as the cell's last expression so the notebook renders it
# as a table: the first 5 rows (use .tail() to look at the last 5 instead).
df.head(n=5)

Initial DataFrame shape: (891, 15)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [None]:
# 'survived' is the prediction target; every other column starts out as a feature.
target_col = 'survived'

# A row without a label cannot be used for supervised training, so discard it first.
df = df.dropna(subset=[target_col])

y = df[target_col]                   # target vector
X = df.drop(columns=[target_col])    # feature frame: everything except the target

In [None]:
# Remove columns that should not be fed to the model:
#   - 'alive'       : in the preview it is a yes/no text copy of 'survived',
#                     i.e. the target itself -> keeping it would leak the label.
#   - 'embark_town' : in the preview it is the spelled-out form of 'embarked'.
#   - 'deck'        : has missing values in the preview — presumably mostly
#                     empty; TODO confirm with df['deck'].isna().mean().
# errors='ignore' skips names that are already absent, so re-running this cell is safe.
drop_cols = ['deck', 'embark_town', 'alive']
X = X.drop(columns=drop_cols, errors='ignore')

# NOTE(review): 'class' looks like a text version of 'pclass' (First/1, Third/3
# in the preview); both are kept here — consider dropping one.

# Build the column lists once, after the drop, so they only name columns that
# still exist in X.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
# Boolean columns are grouped with the numeric ones, so they are handled as
# numeric features by whatever consumes numeric_cols.
numeric_cols = X.select_dtypes(include=['number', 'bool']).columns.tolist()

In [None]:
# Numeric branch: fill missing values with the column median, then standardize
# each column with StandardScaler.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the most frequent category (the
# mode), then one-hot encode into 0/1 indicator columns.
# handle_unknown='ignore' makes a category that was not seen during fit encode
# as all zeros instead of raising an error at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [None]:
# Preprocessing: route each column group through its own transformer and stack
# the results side by side (numeric outputs first, then the one-hot columns).
# Columns not listed in either group are dropped (ColumnTransformer's default
# remainder='drop').
#
# sparse_threshold=0 forces a dense NumPy array as output. With the default
# (0.3) the result switches to a SciPy sparse matrix whenever the one-hot part
# makes the stacked output sparse enough, and code that wraps the result in a
# pandas DataFrame with column names would then fail.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
    sparse_threshold=0,
)

# Hold out 20% of the rows for testing and train on the other 80%.
# random_state pins the shuffle so the split is reproducible.
# NOTE(review): the split is not stratified on y — consider stratify=y if the
# class balance of the two sets matters.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Full pipeline: currently a single preprocessing stage; wrapping it in a
# Pipeline leaves room to append further steps later.
etl_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor)
])

# Learn imputation values, scaling statistics and category sets from the
# training rows only, then apply those same fitted transforms to the test rows
# so no information from the test set leaks into preprocessing.
X_train_processed = etl_pipeline.fit_transform(X_train)
X_test_processed = etl_pipeline.transform(X_test)
print("Transformed train shape:", X_train_processed.shape)
print("Transformed test shape:", X_test_processed.shape)

# Rebuild column names for the transformed matrix. The order must mirror the
# ColumnTransformer: numeric columns first, then the one-hot columns generated
# by the fitted encoder (named "<column>_<category>").
fitted_encoder = etl_pipeline.named_steps['preprocessing'].named_transformers_['cat']['encoder']
cat_features = fitted_encoder.get_feature_names_out(categorical_cols)
all_features = numeric_cols + list(cat_features)

# Keep X_train's original row labels. Without index=..., the frame would get a
# fresh 0..n-1 index and stop lining up with y_train, which still carries the
# shuffled original labels — any index-based concat/join would misalign rows.
X_train_df = pd.DataFrame(X_train_processed, columns=all_features, index=X_train.index)
X_train_df.head()

Transformed train shape: (712, 18)
Transformed test shape: (179, 18)


Unnamed: 0,pclass,age,sibsp,parch,fare,adult_male,alone,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,class_First,class_Second,class_Third,who_child,who_man,who_woman
0,-1.614136,1.253641,-0.470722,-0.479342,-0.078684,0.805076,0.812203,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1,-0.400551,-0.477284,-0.470722,-0.479342,-0.377145,0.805076,0.812203,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.813034,0.215086,-0.470722,-0.479342,-0.474867,0.805076,0.812203,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.813034,-0.246494,0.379923,-0.479342,-0.47623,0.805076,-1.231219,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.813034,-1.785093,2.93186,2.048742,-0.025249,-1.242118,-1.231219,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
