In [1]:
import pandas as pd
import numpy as np
import sklearn

# Record the library versions this notebook was executed with, one per line.
print(pd.__version__, np.__version__, sklearn.__version__, sep="\n")



1.3.4
1.26.4
0.24.2


In [2]:
def load_data(path):
    """Read a CSV into a DataFrame with pandas' default parsing options.

    Parameters
    ----------
    path : str, path-like or file-like
        Anything ``pd.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    return pd.read_csv(path)

# Load the raw training and test tables (paths are relative to the working directory).
train_data, test_data = load_data('train.csv'), load_data('test.csv')

In [3]:
def fill_age(row, age_median):
    """Return the row's age, falling back to its group median when missing.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``"Age"``, ``"Sex"`` and ``"Pclass"`` entries.
    age_median : mapping or pandas.Series
        Looked up with the ``(Sex, Pclass)`` pair of the row.

    Returns
    -------
    The original ``row["Age"]`` when it is not null, otherwise
    ``age_median[(row["Sex"], row["Pclass"])]``.
    """
    age = row["Age"]
    if not pd.isnull(age):
        return age

    # Missing age: use the value stored for this passenger's (Sex, Pclass) group.
    group_key = (row["Sex"], row["Pclass"])
    return age_median[group_key]

In [4]:
def compute_missing_params(train):
    """Compute the fill values used to impute missing data.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with ``Sex``, ``Pclass``, ``Age``, ``Embarked`` and ``Fare``
        columns; every statistic is derived from this frame alone.

    Returns
    -------
    dict
        ``"age_median"``: Series of median ``Age`` per ``(Sex, Pclass)`` group;
        ``"Embarked"``: first entry of the ``Embarked`` mode;
        ``"Fare"``: median ``Fare``.
    """
    median_age_by_group = train.groupby(["Sex", "Pclass"])["Age"].median()
    most_common_port = train["Embarked"].mode()[0]
    median_fare = train["Fare"].median()

    return {
        "age_median": median_age_by_group,
        "Embarked": most_common_port,
        "Fare": median_fare,
    }

In [5]:
def preprocess_data(train, test, params):
    """Impute and feature-engineer the train and test frames together.

    The frames are stacked (train rows first, index reset), transformed as a
    single table so both halves end up with the same columns, and split back
    at ``len(train)``. The input frames are not modified.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Raw frames. The stacked table must contain ``Sex``, ``Pclass``,
        ``Age``, ``Embarked``, ``Fare``, ``SibSp``, ``Parch``, ``Name``,
        ``Ticket`` and ``Cabin``.
    params : dict
        Fill values: ``"age_median"`` (indexable by a ``(Sex, Pclass)``
        tuple), ``"Embarked"`` and ``"Fare"``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_clean, test_clean)``. Each is an independent copy and keeps
        its row labels from the stacked table, so the test half is labelled
        from ``len(train)`` onward.

    Raises
    ------
    KeyError
        If a row with a missing ``Age`` has a ``(Sex, Pclass)`` pair that is
        absent from ``params["age_median"]``, or if a required column is
        missing.
    """
    full = pd.concat([train, test], ignore_index=True)

    full["sex_bin"] = full["Sex"].map({"male": 0, "female": 1})

    # Impute Age only where it is missing. The lookup key is the same
    # (Sex, Pclass) pair a row-wise apply would use, but no per-row Series is
    # built for passengers whose age is already known.
    age_median = params["age_median"]
    age_missing = full["Age"].isna()
    if age_missing.any():
        group_keys = zip(
            full.loc[age_missing, "Sex"], full.loc[age_missing, "Pclass"]
        )
        full.loc[age_missing, "Age"] = [age_median[key] for key in group_keys]

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection: the chained in-place form is deprecated and does not
    # update the frame when pandas Copy-on-Write is active.
    full["Embarked"] = full["Embarked"].fillna(params["Embarked"])
    full["Fare"] = full["Fare"].fillna(params["Fare"])

    embarked_dummies = pd.get_dummies(full["Embarked"], prefix="Embarked")
    full = pd.concat([full, embarked_dummies], axis=1)
    full["FamilySize"] = full["SibSp"] + full["Parch"] + 1
    full["IsAlone"] = (full["FamilySize"] == 1).astype(int)
    full["IsChild"] = (full["Age"] < 16).astype(int)

    full = full.drop(
        columns=["Name", "Sex", "Ticket", "Cabin", "SibSp", "Parch", "Embarked"]
    )

    # Copy each half so callers can modify one without touching the other
    # (or triggering SettingWithCopy warnings on a slice of ``full``).
    n_train = len(train)
    train_clean = full.iloc[:n_train].copy()
    test_clean = full.iloc[n_train:].copy()

    return train_clean, test_clean

In [6]:
def get_features(df):
    """Select the model input columns from a preprocessed frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column listed below; a missing one raises
        ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        The ten feature columns, in the fixed order defined here.
    """
    passenger_cols = ["Pclass", "Age", "Fare", "sex_bin"]
    port_cols = ["Embarked_C", "Embarked_Q", "Embarked_S"]
    family_cols = ["FamilySize", "IsAlone", "IsChild"]

    selected = passenger_cols + port_cols + family_cols
    return df[selected]

In [7]:
from sklearn.linear_model import LogisticRegression as LR

def train_lr(X, y, C=1.0):
    """Fit a logistic regression classifier and return it.

    Parameters
    ----------
    X, y
        Training features and labels, passed straight to ``fit``.
    C : float, default 1.0
        Forwarded to the estimator as its ``C`` argument.

    Returns
    -------
    The fitted ``LR`` instance (built with ``max_iter=500``).
    """
    settings = {"C": C, "max_iter": 500}
    classifier = LR(**settings)
    classifier.fit(X, y)
    return classifier

In [8]:
from sklearn.ensemble import RandomForestClassifier as RF

def train_rf(X, y, **kwargs):
    """Fit a random forest classifier and return it.

    Parameters
    ----------
    X, y
        Training features and labels, passed straight to ``fit``.
    **kwargs
        Forwarded to the ``RF`` constructor. ``random_state`` defaults to 42
        for reproducible runs and may be overridden by the caller.

    Returns
    -------
    The fitted ``RF`` instance.
    """
    # Merge the default seed with the caller's options so an explicit
    # ``random_state`` wins instead of raising "multiple values for keyword
    # argument 'random_state'".
    settings = {"random_state": 42, **kwargs}
    model = RF(**settings)
    model.fit(X, y)
    return model

In [9]:
# Fill values are computed from the training split only.
params = compute_missing_params(train_data)

# Clean both splits together; keep the test ids before narrowing to features.
train_clean, test_clean = preprocess_data(train_data, test_data, params)
passenger_ids = test_clean["PassengerId"]
X_train, X_test = get_features(train_clean), get_features(test_clean)
y_train = train_data["Survived"].astype(int)

# Report both feature-matrix shapes and the count of nulls left in X_train.
print(X_train.shape, X_test.shape, X_train.isnull().sum().sum(), sep="\n")

(891, 10)
(418, 10)
0


In [10]:
# Random forest for this iteration: 300 trees, at least 5 samples per leaf.
rf_model = train_rf(
    X_train,
    y_train,
    min_samples_leaf=5,
    n_estimators=300,
)

In [11]:
def make_submission(model, X_test, passenger_ids, filename):
    """Predict on ``X_test`` and write the predictions to a CSV file.

    Parameters
    ----------
    model
        Object whose ``predict(X_test)`` returns an array supporting
        ``.astype(int)``.
    X_test
        Feature matrix handed to ``model.predict``.
    passenger_ids
        Identifiers written to the ``PassengerId`` column, one per prediction.
    filename
        Destination passed to ``DataFrame.to_csv``.

    Returns
    -------
    None
        The only effect is the file written with columns ``PassengerId`` and
        ``Survived`` (no index column).
    """
    predicted = model.predict(X_test).astype(int)
    columns = {"PassengerId": passenger_ids, "Survived": predicted}
    pd.DataFrame(columns).to_csv(filename, index=False)

In [12]:
# Write the random-forest predictions for this iteration to disk.
make_submission(
    model=rf_model,
    X_test=X_test,
    passenger_ids=passenger_ids,
    filename="./submission_04022026_iter4.csv",
)