In [1]:
import numpy as np
import pandas as pd

# Read the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

Step 1: Exploratory data analysis

In [2]:
# Preview the first three rows of the training frame (rendered as the cell output).
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


Step 2: Data preprocessing

In [3]:
# Impute missing values.
#
# The numeric fill values are computed on the training set only and then
# applied to both frames, so test rows are preprocessed exactly like the
# rows the model is trained on and no test-set statistics enter the pipeline.
age_median = train['Age'].median()
fare_median = train['Fare'].median()

# Both frames receive the same four fills; fillna leaves a column that has
# no gaps unchanged. Filling 'Embarked' on the test frame as well keeps NaN
# out of the label-encoding step that follows in the next cell.
for frame in (train, test):
    frame['Age'] = frame['Age'].fillna(age_median)
    frame['Fare'] = frame['Fare'].fillna(fare_median)
    frame['Cabin'] = frame['Cabin'].fillna('Missing')
    frame['Embarked'] = frame['Embarked'].fillna('Missing')

In [4]:
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

target = 'Survived'

# Model inputs: every training column except the ones listed here
# (the list includes the target itself).
non_feature_cols = ['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin']
features = train.columns.drop(non_feature_cols)

cat_cols = ['Sex', 'Embarked']

# One LabelEncoder per categorical column, fitted on the training values.
# The fitted encoders stay available in le_dict, keyed by column name.
le_dict = {name: LabelEncoder().fit(train[name]) for name in cat_cols}

# Replace the categories with their integer codes in both frames, always
# using the encoder that was fitted on the training column.
for col, le in le_dict.items():
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

# Final design matrices and label vector.
X_train, y_train = train[features], train[target]
X_test = test[features]

Step 3: Define and train model

In [7]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# Search space for the randomized hyper-parameter search.
#   scipy.stats.randint(low, high)  -> integers in [low, high)
#   scipy.stats.uniform(loc, scale) -> floats in [loc, loc + scale]
param_dist = dict(
    n_estimators=randint(100, 400),       # 100 .. 399
    max_depth=randint(3, 8),              # 3 .. 7
    learning_rate=uniform(0.01, 0.2),     # 0.01 .. 0.21
    subsample=uniform(0.7, 0.3),          # 0.7 .. 1.0
    colsample_bytree=uniform(0.7, 0.3),   # 0.7 .. 1.0
    gamma=uniform(0, 2),                  # 0 .. 2
    reg_alpha=uniform(0, 1),              # 0 .. 1
    reg_lambda=uniform(1, 5),             # 1 .. 6
)

model = XGBClassifier(eval_metric='logloss')

# 500 sampled candidates x 5 cross-validation folds = 2500 fits, ranked by
# accuracy. random_state pins the sampled candidates; n_jobs=-1 runs the
# fits in parallel.
search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=500,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

# fit() returns the search object itself, so the best estimator can be
# read straight off the fitted search.
best_model = search.fit(X_train, y_train).best_estimator_
print("Beste Parameter:", search.best_params_)
print("CV-Score:", search.best_score_)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
Beste Parameter: {'colsample_bytree': np.float64(0.819700811161039), 'gamma': np.float64(0.10952327764406267), 'learning_rate': np.float64(0.07703944832918018), 'max_depth': 6, 'n_estimators': 216, 'reg_alpha': np.float64(0.603721531730139), 'reg_lambda': np.float64(1.5713365002784387), 'subsample': np.float64(0.8243513752610883)}
CV-Score: 0.8428661100998054


Step 4: Submission

In [8]:
# Predict on the test features and write one PassengerId/Survived row per
# test passenger to submission.csv (without the DataFrame index).
preds = best_model.predict(X_test)
submission = test[['PassengerId']].assign(Survived=preds)
submission.to_csv('submission.csv', index=False)