In [14]:
import pandas as pd

# Load the datasets
train_file_path = 'train.csv'
test_file_path = 'test.csv'
gender_submission_file_path = 'gender_submission.csv'

train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)
gender_submission_df = pd.read_csv(gender_submission_file_path)

# Display the first few rows of the datasets to understand their structure
train_df.head(), test_df.head(), gender_submission_df.head()


(   PassengerId  Survived  Pclass  \
 0            1         0       3   
 1            2         1       1   
 2            3         1       3   
 3            4         1       1   
 4            5         0       3   
 
                                                 Name     Sex   Age  SibSp  \
 0                            Braund, Mr. Owen Harris    male  22.0      1   
 1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
 2                             Heikkinen, Miss. Laina  female  26.0      0   
 3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
 4                           Allen, Mr. William Henry    male  35.0      0   
 
    Parch            Ticket     Fare Cabin Embarked  
 0      0         A/5 21171   7.2500   NaN        S  
 1      0          PC 17599  71.2833   C85        C  
 2      0  STON/O2. 3101282   7.9250   NaN        S  
 3      0            113803  53.1000  C123        S  
 4      0            373450   8.0500

In [15]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

# Define preprocessing steps
numeric_features = ['Age', 'Fare', 'SibSp', 'Parch']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_features = ['Pclass', 'Sex', 'Embarked']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler())
])

# Combine preprocessors in a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create the model pipeline with preprocessing and Logistic Regression
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

# Train the pipeline on the training data
model_pipeline.fit(X_train_split, y_train_split)

# Evaluate the model
y_val_pred_clean = model_pipeline.predict(X_val)
val_accuracy_clean = accuracy_score(y_val, y_val_pred_clean)

# Predict on the test set
test_predictions_clean = model_pipeline.predict(X_test)

# Prepare the submission file
submission_df_clean = test_df[['PassengerId']].copy()
submission_df_clean['Survived'] = test_predictions_clean

# Save the predictions to a CSV file
output_file_path_clean = 'titanic_predictions_clean_logistic.csv'
submission_df_clean.to_csv(output_file_path_clean, index=False)

val_accuracy_clean, output_file_path_clean


(0.8044692737430168, 'titanic_predictions_clean_logistic.csv')

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

# Define preprocessing steps
numeric_features = ['Age', 'Fare', 'SibSp', 'Parch']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_features = ['Pclass', 'Sex', 'Embarked']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler())
])

In [None]:
# Combine preprocessors in a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [None]:

# Create the model pipeline with preprocessing and Logistic Regression
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])


In [None]:

# Train the pipeline on the training data
model_pipeline.fit(X_train_split, y_train_split)

In [None]:

# Evaluate the model
y_val_pred_clean = model_pipeline.predict(X_val)
val_accuracy_clean = accuracy_score(y_val, y_val_pred_clean)

In [None]:
# Predict on the test set
test_predictions_clean = model_pipeline.predict(X_test)

In [None]:
# Prepare the submission file
submission_df_clean = test_df[['PassengerId']].copy()
submission_df_clean['Survived'] = test_predictions_clean


In [None]:
# Save the predictions to a CSV file
output_file_path_clean = 'titanic_predictions_clean_logistic.csv'
submission_df_clean.to_csv(output_file_path_clean, index=False)

val_accuracy_clean, output_file_path_clean