<a href="https://colab.research.google.com/github/sk4676-oss/MLDA/blob/main/T1_Data_Pre_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Read the Titanic CSV into `data` and print a quick overview of it.
# The path points at the Colab upload location; change it if the file
# lives somewhere else.
try:
    data = pd.read_csv('/content/Titanic-Dataset.csv', encoding="latin1")
except FileNotFoundError:
    # Tell the user what went wrong, then propagate the original error so
    # nothing below runs without a dataset.
    print("Error: Titanic-Dataset.csv not found. Please upload the dataset or update the file path.")
    raise

# Overview: leading rows, column summary, and per-column missing-value counts.
print(data.head(n=5))
print("\nDataset Info:")
data.info()
print("\nMissing values:")
print(data.isna().sum())


# 1. Data Cleaning and Preprocessing for Titanic Data
# Candidate feature columns, grouped by how they will be preprocessed.
# Adjust these lists if your copy of the dataset uses different column names.
num_features = ['Age', 'Fare', 'SibSp', 'Parch']
cat_features = ['Pclass', 'Sex', 'Embarked']  # Pclass is handled as a category here

# Keep only the candidates that actually exist in the loaded data,
# preserving the order in which they were listed above.
num_features_present, cat_features_present = (
    [name for name in candidates if name in data.columns]
    for candidates in (num_features, cat_features)
)


# Per-type preprocessing pipelines.
# Numerical columns: fill missing values with the column mean, then standardize.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode into a dense array; categories not seen during fitting are
# ignored instead of raising.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Assemble the (name, pipeline, columns) entries for the ColumnTransformer,
# skipping any feature group that has no columns present in the data.
transformers = [
    (label, pipeline, columns)
    for label, pipeline, columns in (
        ("num", num_transformer, num_features_present),
        ("cat", cat_transformer, cat_features_present),
    )
    if columns
]

# Build the feature preprocessor, split the data, and preprocess the splits.
# The raw features are split BEFORE the preprocessor is fitted, and the
# preprocessor is fitted on the training rows only, so the test rows do not
# influence what the imputers, scaler and encoder learn (no data leakage).
if not transformers:
    print("Error: No valid features found in the dataset for preprocessing.")
    # Handle case where no valid features are found
    # You might want to exit or handle this error differently
else:
    preprocessor = ColumnTransformer(
        transformers=transformers,
        remainder='drop'  # Drop columns not specified in transformers
    )
    # Return pandas DataFrames from transform so .head()/.empty work below.
    preprocessor.set_output(transform="pandas")

    # Separate the features (X) from the target (y) so the target is never
    # passed through the feature preprocessing.
    target_column = 'Survived'  # Assuming 'Survived' is the target variable
    if target_column in data.columns:
        X = data.drop(target_column, axis=1).copy()  # Drop target and use .copy()

        # The target is kept as-is (no scaling/encoding), only converted to a
        # numpy array.
        y = data[target_column].values

        # 3. Data Splitting
        # Ensure X and y are not empty before splitting and fitting
        if not X.empty and y.size > 0:
            # Split the RAW features first; stratify keeps the class balance
            # of the classification target in both splits.
            X_train_raw, X_test_raw, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=y
            )

            # Fit on the training rows only, then reuse the fitted
            # preprocessor on the held-out rows.
            X_train = preprocessor.fit_transform(X_train_raw)
            X_test = preprocessor.transform(X_test_raw)

            # Full preprocessed feature table (same training-fitted
            # preprocessor), kept for inspection and for later cells that
            # reference this name.
            data_preprocessed_X = preprocessor.transform(X)
            print("\nData preprocessed (Features - X):")
            print(data_preprocessed_X.head())

            print("\nTarget variable (y):")
            print(y[:5])  # Print first 5 values of the target

            # Display the first few rows of the processed training data
            print("\nX_train head:")
            print(X_train.head())
            print("\ny_train head:")
            print(y_train[:5])  # Print first 5 values of y_train
        else:
             print("Error: Preprocessed data (X or y) is empty. Cannot perform data splitting.")

    else:
        print(f"Error: Target column '{target_column}' not found in the dataset. Cannot preprocess output or split data.")

# 2. Feature Engineering - Price_per_sqft is skipped because it is not relevant for Titanic data.
# Add feature engineering steps relevant to the Titanic data here if needed (e.g., creating 'Title' from 'Name').


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  

D