In [2]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Read the training and test tables; both paths are relative to the working directory
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Data preprocessing
def preprocess_data(df):
    # Fill missing values
    df['Age'].fillna(df['Age'].median(), inplace=True)
    df['Fare'].fillna(df['Fare'].median(), inplace=True)
    df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

    return df

# Clean both tables with the same preprocessing routine.
train_data, test_data = (preprocess_data(frame) for frame in (train_data, test_data))

# Columns fed to the model, and the label column to predict.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X, y = train_data[features], train_data['Survived']

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train an XGBoost classifier with its default hyperparameters on the training split.
model = XGBClassifier()
model.fit(X_train, y_train)

# Score the model on the held-out split and report the result.
predictions = model.predict(X_valid)
accuracy = accuracy_score(y_valid, predictions)
print(f"Validation Accuracy: {accuracy:.4f}")

# Make predictions on the test set
X_test = test_data[features]
test_predictions = model.predict(X_test)

# Prepare submission file for Kaggle competition.
# Each prediction has to be paired with the id of the passenger it belongs to;
# a file holding only 'Survived' cannot be matched back to the test rows.
# NOTE(review): assumes test.csv carries a 'PassengerId' column, as in the
# Kaggle Titanic data — confirm.
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': test_predictions,
})
submission.to_csv('submission.csv', index=False)

Validation Accuracy: 0.8881
