In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# 1. Load Data

In [2]:
# Read the labelled training split and the unlabelled test split from CSV
# files in the current working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# 2. Feature Engineering

In [6]:
# Extract the honorific from each passenger name: the run of letters that
# follows a space and is terminated by a period. Rows whose name does not
# match the pattern get NaN (expand=False returns a Series).
TITLE_PATTERN = r' ([A-Za-z]+)\.'
for frame in (train, test):
    frame['Title'] = frame['Name'].str.extract(TITLE_PATTERN, expand=False)

In [7]:
# Collapse the raw honorifics into five groups: Mr, Miss, Mrs, Master, Rare.
title_map = {
    "Mr": "Mr", "Miss": "Miss", "Mrs": "Mrs", "Master": "Master",
    "Dr": "Rare", "Rev": "Rare", "Col": "Rare", "Major": "Rare", "Mlle": "Miss",
    "Countess": "Rare", "Ms": "Miss", "Lady": "Rare", "Jonkheer": "Rare",
    "Don": "Rare", "Dona": "Rare", "Mme": "Mrs", "Capt": "Rare", "Sir": "Rare"
}

# Series.map(dict) yields NaN for every value that is not a key of the dict.
# Without a fallback, an honorific missing from title_map (or a name with no
# honorific at all) would become NaN and break the label-encoding step, and
# re-running this cell would wipe out every "Rare" row because "Rare" is not
# itself a key. Falling back to "Rare" keeps the column complete and makes the
# cell safe to re-run.
train['Title'] = train['Title'].map(title_map).fillna("Rare")
test['Title'] = test['Title'].map(title_map).fillna("Rare")

In [8]:
# Add household-size features to both frames.
for df in (train, test):
    relatives_aboard = df['SibSp'] + df['Parch']
    # The + 1 presumably counts the passenger themselves.
    df['FamilySize'] = relatives_aboard + 1
    # 1 when FamilySize is exactly 1, otherwise 0.
    df['IsAlone'] = df['FamilySize'].eq(1).astype(int)

In [10]:
# Fill missing values using statistics computed on the training set only.
# The test frame is imputed with the *training* median/mode so that test rows
# are preprocessed exactly like the rows the model was fitted on, and so that
# a test row's features do not depend on which other rows are in the test file.
# The statistics are captured before any filling, so re-running the cell gives
# the same result.
age_median = train['Age'].median()
embarked_mode = train['Embarked'].mode()[0]
fare_median = train['Fare'].median()

train['Age'] = train['Age'].fillna(age_median)
test['Age'] = test['Age'].fillna(age_median)

train['Embarked'] = train['Embarked'].fillna(embarked_mode)
test['Embarked'] = test['Embarked'].fillna(embarked_mode)

# Only the test frame's Fare is filled here; train['Fare'] is left as loaded.
test['Fare'] = test['Fare'].fillna(fare_median)

In [12]:
# Replace the string categories with integer codes. A single encoder object
# is re-fitted on the training values of each column in turn and then applied
# to both frames, so after the loop `le` holds only the last column's classes.
le = LabelEncoder()
categorical_cols = ('Sex', 'Embarked', 'Title')
for col in categorical_cols:
    le.fit(train[col])
    train[col] = le.transform(train[col])
    # The encoder was fitted on train only, so test is encoded with the
    # training classes.
    test[col] = le.transform(test[col])

# 3. Select Features

In [15]:
# Columns fed to the model, in the order they appear in the design matrices.
features = [
    'Pclass', 'Sex', 'Age', 'Fare',
    'Embarked', 'FamilySize', 'IsAlone', 'Title',
]

# Target vector and design matrices for the labelled and unlabelled frames.
y = train.loc[:, 'Survived']
X = train.loc[:, features]
X_test = test.loc[:, features]

# 4. Train/Validation Split

In [16]:
# Hold out 20% of the labelled rows for validation, with a fixed seed so the
# split is reproducible. stratify=y keeps the class proportions of `Survived`
# the same in both parts, which makes the hold-out accuracy comparable to the
# cross-validation score computed later (cv=5 on a classifier uses stratified
# folds by default).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Model (XGBoost)

In [1]:
# NOTE(review): the recorded output of this cell is a NameError for
# XGBClassifier and its execution count is 1, so it looks like it was last run
# on a fresh kernel before the import cell. Re-run the notebook from the top.

# Hyperparameters for the boosted-tree classifier; random_state fixes the seed.
xgb_params = dict(
    n_estimators=700,
    max_depth=4,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# Accuracy on the held-out validation rows.
y_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", val_accuracy)

# Mean accuracy over 5 folds of the full labelled data (X, y).
fold_scores = cross_val_score(model, X, y, cv=5)
cv_score = fold_scores.mean()
print("Cross-validation Accuracy:", cv_score)

NameError: name 'XGBClassifier' is not defined

# 6. Predict on Test

In [18]:
# Score the unlabelled rows with the fitted model.
predictions = model.predict(X_test)

# Two-column frame (PassengerId, Survived), written to disk without the index.
submission = test[["PassengerId"]].assign(Survived=predictions)
submission.to_csv("submission.csv", index=False)
print("✅ Submission file created: submission.csv")

✅ Submission file created: submission.csv
