<a href="https://colab.research.google.com/github/rockfiller/titanic_analysis/blob/main/%E9%90%B5%E9%81%94%E5%B0%BC%E8%99%9F%E5%AD%98%E6%B4%BB%E9%A0%90%E6%B8%AC_(3).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import numpy as np
import pandas as pd

def preprocess_dataset(df):
    """
    Clean the Titanic passenger table and derive model-ready features.

    The input frame is not modified; all work happens on the copy returned
    by ``DataFrame.drop``.

    Parameters:
        df (DataFrame): Raw dataset. Must contain the columns PassengerId,
            Name, Ticket, Cabin, Pclass, Age, Embarked, Sex, SibSp and Parch.

    Returns:
        DataFrame: Copy of ``df`` without the identifier/free-text columns,
        with 'Age' and 'Embarked' imputed, 'Sex' and 'Embarked' one-hot
        encoded, and the derived columns 'Fsize' and 'Kid' appended.

    Raises:
        KeyError: If any of the required columns is missing.
    """
    # Drop identifier and free-text columns that are not used as features.
    df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

    # Impute 'Age' with the mean age of the passenger's Pclass group.
    # The overall mean is captured *before* imputing so that it reflects
    # only observed ages.
    overall_age_mean = df['Age'].mean()
    class_age_mean = df.groupby('Pclass')['Age'].transform('mean')
    df['Age'] = df['Age'].fillna(class_age_mean)
    # A class whose ages are all missing has a NaN group mean; fall back to
    # the overall mean so no NaN is left behind for the estimator.
    if pd.notna(overall_age_mean):
        df['Age'] = df['Age'].fillna(overall_age_mean)

    # Impute 'Embarked' with its most frequent value. mode() is empty when
    # the column has no observed values, in which case there is nothing to
    # impute with and the column is left as is.
    embarked_mode = df['Embarked'].mode()
    if not embarked_mode.empty:
        df['Embarked'] = df['Embarked'].fillna(embarked_mode.iloc[0])

    # One-hot encode the categorical 'Sex' and 'Embarked' columns.
    df = pd.get_dummies(df, columns=['Sex', 'Embarked'])

    # Fsize: family size = siblings/spouses + parents/children + the passenger.
    df['Fsize'] = df['SibSp'] + df['Parch'] + 1

    # Kid: 1 when the (imputed) age is below 12, else 0.
    df['Kid'] = (df['Age'] < 12).astype(int)
    return df

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Fetch the Titanic training CSV and run it through the preprocessing step.
df = pd.read_csv('https://raw.githubusercontent.com/dsindy/kaggle-titanic/master/data/train.csv')
df_train = preprocess_dataset(df)

# 'Survived' is the target; every remaining column is used as a feature.
columns_y = ['Survived']
columns_X = df_train.columns.drop(columns_y)

train_X = df_train[columns_X]
train_y = df_train[columns_y]

# Logistic-regression baseline scored by accuracy with 5-fold cross-validation.
log = LogisticRegression(random_state=0, max_iter=3000)
scores = cross_val_score(
    log,
    train_X,
    train_y.to_numpy().ravel(),  # flatten the single-column frame to 1-D labels
    cv=5,
    scoring='accuracy',
)
print(f'scores: {scores}')
print(f'average: {scores.mean()}')

scores: [0.79329609 0.80898876 0.79775281 0.7752809  0.83707865]
average: 0.8024794425961961
