In [1]:
# Import necessary libraries
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

In [27]:
# Load the training and test data.
# All three CSVs are read from the same directory; change DATA_DIR in this one
# place to run the notebook against a different location or machine.
DATA_DIR = Path('/Users/apple/Documents/Projects/titanic-survival-prediction/data_1')

train_data = pd.read_csv(DATA_DIR / 'train.csv')
test_data = pd.read_csv(DATA_DIR / 'test.csv')
# Companion frame for the test set; a later cell joins it to test_data on 'PassengerId'.
test_y_data = pd.read_csv(DATA_DIR / 'gender_submission.csv')

In [24]:
# Display the column labels of the test frame as loaded from test.csv.
test_data.columns


Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [25]:
# Side-by-side view of the test features and the frame loaded from
# gender_submission.csv. `keys=` becomes the outer level of a column MultiIndex,
# so the result is bound to its own name rather than overwriting `test_data`:
# the cells below select columns such as 'PassengerId', 'Age' and 'Fare' by
# their flat labels and would fail on a fresh top-to-bottom run otherwise.
test_features_and_labels = pd.concat(
    [test_data, test_y_data],
    axis=1,
    keys=['PassengerId', 'PassengerId_1'],
)

In [28]:
# Outer-join the test features with the gender_submission frame on the shared
# 'PassengerId' column. Note: `result` is not referenced by any later cell in view.
result = pd.merge(
    test_data,
    test_y_data,
    how='outer',
    on='PassengerId',
)

In [29]:
# Preview the first five rows of the test features.
test_data.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [3]:
# Data Preprocessing

# Split the training frame into the target column and everything else;
# the test frame is used as-is for the feature matrix.
y_train = train_data['Survived']
X_train = train_data.drop(columns='Survived')
X_test = test_data


In [4]:
# Column groups and the per-group preprocessing applied to each.
numeric_features = ['Age', 'Fare']
categorical_features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']

# Numeric columns: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value only;
# no encoding happens in this pipeline.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
])


In [5]:
# Route each column group through its own transformer. The output columns
# follow the order listed here: the 'num' group first, then the 'cat' group.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])


In [6]:
# Wrap the column transformer in a single-step pipeline.
preprocessor_pipe = Pipeline([('preprocessor', preprocessor)])


In [7]:
# Learn the imputation and scaling statistics from the training data only.
preprocessor_pipe.fit(X_train)

# Apply the fitted preprocessing to both splits.
X_train_preprocessed = preprocessor_pipe.transform(X_train)
X_test_preprocessed = preprocessor_pipe.transform(X_test)

In [8]:
# Optional: convert the preprocessed arrays back to DataFrames.
# Column labels follow the ColumnTransformer order: numeric group, then categorical.
preprocessed_columns = numeric_features + categorical_features

# The transformed arrays combine scaled numbers with string categories, so they
# are expected to arrive as object-dtype arrays, which would leave every column
# (including Age and Fare) as dtype `object`. infer_objects() restores a numeric
# dtype wherever a column holds only numbers and leaves the string columns alone.
X_train_preprocessed_df = pd.DataFrame(
    X_train_preprocessed, columns=preprocessed_columns
).infer_objects()
X_test_preprocessed_df = pd.DataFrame(
    X_test_preprocessed, columns=preprocessed_columns
).infer_objects()

In [9]:
## Apply label encoding to the string-valued columns.
# One encoder per column, fitted on the training frame and reused on the test
# frame so both share the same integer codes. The fitted encoders are kept in
# `label_encoders` so a column's mapping can still be looked up or inverted later
# (previously the 'Sex' encoder was discarded when `le` was rebound).
label_encoders = {}
for column in ['Sex', 'Embarked']:
    le = LabelEncoder()
    X_train_preprocessed_df[column] = le.fit_transform(X_train_preprocessed_df[column])
    X_test_preprocessed_df[column] = le.transform(X_test_preprocessed_df[column])
    label_encoders[column] = le
# After the loop `le` is the 'Embarked' encoder, as it was before this change.

In [15]:
# Now you can use X_train_preprocessed and X_test_preprocessed in your machine learning models.

# Save the preprocessed data if needed.
# Both files go to one directory; change OUTPUT_DIR in this one place to write
# elsewhere. The directory is created first so the writes do not fail when it
# does not exist yet.
OUTPUT_DIR = Path('/Users/apple/Documents/Projects/titanic-survival-prediction/data')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

X_train_preprocessed_df.to_csv(OUTPUT_DIR / 'train_preprocessed.csv', index=False)
X_test_preprocessed_df.to_csv(OUTPUT_DIR / 'test_preprocessed.csv', index=False)

# Verify the preprocessed data
X_train_preprocessed_df.head()

Unnamed: 0,Age,Fare,Pclass,Sex,SibSp,Parch,Embarked
0,-0.592481,-0.502445,3,1,1,0,2
1,0.638789,0.786845,1,0,1,0,0
2,-0.284663,-0.488854,3,0,0,0,2
3,0.407926,0.42073,1,0,1,0,2
4,0.407926,-0.486337,3,1,0,0,2


In [13]:
# Preview the first rows of the preprocessed training frame.
# NOTE(review): the saved output of this cell shows a `Survived` column that no
# visible cell adds to X_train_preprocessed_df — presumably left over from a
# cell that is no longer in the notebook; confirm before relying on it.
X_train_preprocessed_df.head()

Unnamed: 0,Age,Fare,Pclass,Sex,SibSp,Parch,Embarked,Survived
0,-0.592481,-0.502445,3,1,1,0,2,0
1,0.638789,0.786845,1,0,1,0,0,1
2,-0.284663,-0.488854,3,0,0,0,2,1
3,0.407926,0.42073,1,0,1,0,2,1
4,0.407926,-0.486337,3,1,0,0,2,0
