In [99]:
"""
Given the input data, the goal is to train the model to predict whether a passenger survived (1) or not (0) in the Titanic dataset.

Todo 
1. Load the data

2. Combine train and test data for unified preprocessing

3. Preprocess the combined data (missing values, encoding, drop columns, etc.)

4. Split the combined data back into train and test sets

5. Separate features (X) and target (y) from train set

6. Split train set into training and validation sets

7. Train and evaluate the model

8. Predict on test set and prepare submission


"""

'\nGiven the input data, the goal is to train the model to predict whether a passenger survived (1) or not (0) in the Titanic dataset.\n\nTodo \n1. Load the data\n\n2. Combine train and test data for unified preprocessing\n\n3. Preprocess the combined data (missing values, encoding, drop columns, etc.)\n\n4. Split the combined data back into train and test sets\n\n5. Separate features (X) and target (y) from train set\n\n6. Split train set into training and validation sets\n\n7. Train and evaluate the model\n\n8. Predict on test set and prepare submission\n\n\n'

In [100]:
!pip install xgboost lightgbm seaborn



In [101]:
import numpy as np
import pandas as pd

In [102]:
from sklearn.ensemble import GradientBoostingClassifier

In [103]:
import matplotlib.pyplot as plt
import seaborn as sns

In [104]:
#data preprocessing and model evaluation libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [105]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [106]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [107]:
train = pd.read_csv('../../12_data/titanic/train.csv')
test = pd.read_csv('../../12_data/titanic/test.csv')

In [108]:
print(train.shape)

(891, 12)


In [109]:
print(test.shape)

(418, 11)


In [110]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [129]:
train['TrainFlag'] = 1
test['TrainFlag'] = 0

In [130]:
# Add missing target column to test set for concatenation
test['Survived'] = None

In [131]:
# Combine train and test data for consistent preprocessing
df = pd.concat([train, test], sort=False)

In [114]:
display(df[:3])

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,TrainFlag
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1


In [132]:
# Fill missing values
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Fare'] = df['Fare'].fillna(df['Fare'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

In [133]:
# Map categorical to numeric
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

In [134]:
# Drop unused columns (strings that models can't handle)
df = df.drop(['Cabin', 'Name', 'Ticket', 'PassengerId'], axis=1)

In [135]:
print(df['Age'].isnull().sum())

0


In [136]:
print(df['Fare'].isnull().sum())

0


In [137]:
print(df['Embarked'].isnull().sum())

0


In [138]:
# Check missing values after filling
print(df.isnull().sum())

Survived     418
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
TrainFlag      0
dtype: int64


In [122]:
"""
# Note: After merging train and test datasets,
# the 'Survived' column shows 0 missing values in train data,
# but 418 missing values in test data,
# because the test set does not have target labels.
"""

"\n# Note: After merging train and test datasets,\n# the 'Survived' column shows 0 missing values in train data,\n# but 418 missing values in test data,\n# because the test set does not have target labels.\n"

In [139]:
# Split back into train and test
train_df = df[df['TrainFlag'] == 1].drop(['TrainFlag'], axis=1)
test_df = df[df['TrainFlag'] == 0].drop(['TrainFlag', 'Survived'], axis=1)

In [140]:
train_df = df[df['TrainFlag'] == 1]
print(train_df['Survived'].isnull().sum())

0


In [141]:
test_df = df[df['TrainFlag'] == 0]
print(test_df['Survived'].isnull().sum())

418


In [155]:
# Prepare inputs and targets
X = train_df.drop('Survived', axis=1)
y = train_df['Survived'].astype(int)

In [156]:
#Data Validation

In [157]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [158]:
#Gradient Boosting Classifier 

In [159]:
df = df.drop(['Cabin', 'Name', 'Ticket', 'PassengerId'], axis=1, errors='ignore')

In [160]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


In [161]:
print(y_train.isnull().sum())
print(y_train.unique())
print(y_train.dtype)

0
[0 1]
int64


In [162]:
#Accuracy

In [163]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")


Validation Accuracy: 0.8156


In [172]:
#Hyperparameter tuning 
"""
# Hyperparameter tuning is the process of searching for the best set of parameters 
# that control the learning process of a machine learning model.
# These parameters (called hyperparameters) are set by humans before training begins,
# such as the number of trees in a forest, tree depth, or minimum samples to split a node.
# By trying different combinations systematically (e.g., with GridSearchCV),
# we can find the configuration that yields the best model performance.

"""

'\n# Hyperparameter tuning is the process of searching for the best set of parameters \n# that control the learning process of a machine learning model.\n# These parameters (called hyperparameters) are set by humans before training begins,\n# such as the number of trees in a forest, tree depth, or minimum samples to split a node.\n# By trying different combinations systematically (e.g., with GridSearchCV),\n# we can find the configuration that yields the best model performance.\n\n'

In [173]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [174]:
param_grid = {
    'n_estimators': [50, 100, 200],        # Number of trees in the forest
    'max_depth': [None, 5, 10, 20],       # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],      # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]          # Minimum number of samples required to be at a leaf node
}


In [178]:
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
                           cv=5, n_jobs=-1, verbose=1, scoring='accuracy')
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [179]:
print("Best params:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)


Best params: {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Best score: 0.8300305328474344


In [169]:
y_pred = best_model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred))


Validation Accuracy: 0.8100558659217877


In [180]:
#Conclusion

In [181]:
"""
The best_score_ of 83% from GridSearchCV
is the average score obtained during cross-validation,
where the training data is split into several parts,
and the model is evaluated on parts it hasn't seen within that training data.
However, since this evaluation is still done within the training dataset,
it can be somewhat optimistic compared to completely new, unseen data.

On the other hand,

the Validation Accuracy of 81%
is the score measured on a separate validation set,
which the model has never seen during training,
so it is considered a more realistic indicator of the model’s true performance.

"""

"\nThe best_score_ of 83% from GridSearchCV\nis the average score obtained during cross-validation,\nwhere the training data is split into several parts,\nand the model is evaluated on parts it hasn't seen within that training data.\nHowever, since this evaluation is still done within the training dataset,\nit can be somewhat optimistic compared to completely new, unseen data.\n\nOn the other hand,\n\nthe Validation Accuracy of 81%\nis the score measured on a separate validation set,\nwhich the model has never seen during training,\nso it is considered a more realistic indicator of the model’s true performance.\n\n"

In [183]:
# improve model's functionaity =RandomizedSearchCV

In [198]:
"""
RandomizedSearchCV : randomly samples combinations from these distributions
->evaluates only a subset of all possible combinations for efficiency.
to find the hyperparameter combination that yields the best model performance.
"""

'\nRandomizedSearchCV : randomly samples combinations from these distributions\n->evaluates only a subset of all possible combinations for efficiency.\nto find the hyperparameter combination that yields the best model performance.\n'

In [199]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint

In [200]:
# Hyperparameter ranges for RandomizedSearchCV
param_dist = {
    'n_estimators': randint(50, 200),    # number of trees
    'max_depth': [None, 5, 10, 20],      # tree max depth
    'min_samples_split': randint(2, 11), # min samples to split
    'min_samples_leaf': randint(1, 5)    # min samples at leaf
}


In [201]:
rf = RandomForestClassifier(random_state=42)

In [202]:
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=20,               # 시도할 조합 수
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='accuracy'
)

In [203]:
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [204]:
print("Best params:", random_search.best_params_)
print("Best score:", random_search.best_score_)

Best params: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 133}
Best score: 0.8271840835221116


In [205]:
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_val)
from sklearn.metrics import accuracy_score
print("Validation Accuracy:", accuracy_score(y_val, y_pred))


Validation Accuracy: 0.8156424581005587


In [196]:
if 'Survived' in test_df.columns:
    test_df = test_df.drop('Survived', axis=1)

In [195]:
test_preds = best_model.predict(test_df)
submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': test_preds})
submission.to_csv('submission.csv', index=False)