# Titanic Survival Prediction with XGBoost

In [141]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [142]:
# Load the training split; the path is relative to the notebook's working directory.
train = pd.read_csv('train.csv')

In [143]:
# Load the test split; the path is relative to the notebook's working directory.
test = pd.read_csv('test.csv')

In [144]:
# Preview the first rows of the raw training data.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


| Variable  | Definition                      | Key                           |
|-----------|--------------------------------|------------------------------|
| survival  | Survival                       | 0 = No, 1 = Yes              |
| pclass    | Ticket class                   | 1 = 1st, 2 = 2nd, 3 = 3rd    |
| sex       | Sex                             |                              |
| Age       | Age in years                    |                              |
| sibsp     | # of siblings/spouses aboard    |                              |
| parch     | # of parents/children aboard    |                              |
| ticket    | Ticket number                   |                              |
| fare      | Passenger fare                  |                              |
| cabin     | Cabin number                    |                              |
| embarked  | Port of Embarkation             | C = Cherbourg, Q = Queenstown, S = Southampton |


In [145]:
# (rows, columns) of the raw training frame.
train.shape

(891, 12)

In [146]:
# Count missing values per column in the raw test split.
test.isna().sum()

Unnamed: 0,0
PassengerId,0
Pclass,0
Name,0
Sex,0
Age,86
SibSp,0
Parch,0
Ticket,0
Fare,1
Cabin,327


## Preprocess Data

In [147]:
def preprocess_train(df):
    """Clean the training frame and one-hot encode its categorical columns.

    Works on a copy, so the caller's frame is left untouched.

    Steps:
      1. Missing ``Age`` values are replaced with the column mean and missing
         ``Embarked`` values with the most frequent value.
      2. ``Cabin``, ``Ticket`` and ``Name`` are discarded.
      3. ``Sex`` and ``Embarked`` are one-hot encoded with a freshly fitted
         ``OneHotEncoder`` (first level of each column dropped).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw training data; must contain the columns named above.

    Returns
    -------
    tuple
        ``(frame, encoder)``: the processed frame (index reset to 0..n-1)
        and the fitted encoder, so it can be reused on the test split.
    """
    categorical = ['Sex', 'Embarked']

    cleaned = df.copy()
    cleaned['Age'] = cleaned['Age'].fillna(cleaned['Age'].mean())
    cleaned['Embarked'] = cleaned['Embarked'].fillna(cleaned['Embarked'].mode()[0])
    cleaned = cleaned.drop(columns=['Cabin', 'Ticket', 'Name'])

    # Fit the encoder here; the caller keeps it to encode the test split.
    ohe = OneHotEncoder(drop='first', sparse_output=False)
    dummy_values = ohe.fit_transform(cleaned[categorical])
    encoded = pd.DataFrame(dummy_values,
                           columns=ohe.get_feature_names_out(categorical))

    # Reset the index so it matches the default index of `encoded` on concat.
    remaining = cleaned.drop(columns=categorical).reset_index(drop=True)
    return pd.concat([remaining, encoded], axis=1), ohe

In [148]:
def preprocess_test(df, ohe, fill_values=None):
    """Clean the test frame and encode it with an already-fitted encoder.

    Works on a copy, so the caller's frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw test data containing ``Age``, ``Fare``, ``Embarked``, ``Sex``,
        ``Cabin``, ``Ticket`` and ``Name``.
    ohe : object
        Encoder previously fitted on the training data for ``Sex`` and
        ``Embarked``; only ``transform`` and ``get_feature_names_out`` are
        called on it.
    fill_values : mapping, optional
        Replacement values for missing entries, keyed by column name
        (``'Age'``, ``'Fare'``, ``'Embarked'``). Supply statistics computed on
        the training split to impute both splits identically. Columns absent
        from the mapping fall back to a statistic of ``df`` itself (mean for
        ``Age``/``Fare``, most frequent value for ``Embarked``), which is what
        happens when the argument is omitted.

    Returns
    -------
    pandas.DataFrame
        Processed frame with a fresh 0..n-1 index: the remaining original
        columns followed by the encoded columns.
    """
    categorical = ['Sex', 'Embarked']
    overrides = {} if fill_values is None else dict(fill_values)

    # Per-column fallback statistic, evaluated only when no override is given.
    fallbacks = {
        'Age': lambda col: col.mean(),
        'Fare': lambda col: col.mean(),
        'Embarked': lambda col: col.mode()[0],
    }

    df = df.copy()
    for name, fallback in fallbacks.items():
        fill = overrides[name] if name in overrides else fallback(df[name])
        df[name] = df[name].fillna(fill)

    df = df.drop(columns=['Cabin', 'Ticket', 'Name'])

    # Reuse the encoder fitted on the training data so both splits share columns.
    encoded = pd.DataFrame(ohe.transform(df[categorical]),
                           columns=ohe.get_feature_names_out(categorical))

    # Reset the index so it matches the default index of `encoded` on concat.
    df = df.drop(columns=categorical).reset_index(drop=True)
    return pd.concat([df, encoded], axis=1)


In [149]:
# Preprocess both splits; the encoder fitted on train is reused for test.
# NOTE(review): this rebinds `train` and `test` to the processed frames, so
# re-running the cell without reloading the CSVs fails with a KeyError
# (the columns the functions expect have already been dropped).
train, ohe = preprocess_train(train)
test = preprocess_test(test, ohe)

In [150]:
# Safety net: discard any row that still holds a NaN after preprocessing.
# NOTE(review): rows dropped from `test` here would also be missing from the
# submission built later - confirm that is acceptable.
train, test = train.dropna(), test.dropna()

In [151]:
# Confirm that no missing values remain in either split.
print("Train null values:\n", train.isna().sum())
print("Test null values:\n", test.isna().sum())

Train null values:
 PassengerId    0
Survived       0
Pclass         0
Age            0
SibSp          0
Parch          0
Fare           0
Sex_male       0
Embarked_Q     0
Embarked_S     0
dtype: int64
Test null values:
 PassengerId    0
Pclass         0
Age            0
SibSp          0
Parch          0
Fare           0
Sex_male       0
Embarked_Q     0
Embarked_S     0
dtype: int64


## Label Splitting

In [152]:
# Keep the PassengerId column of the preprocessed test frame; it becomes the
# id column of the submission file.
test_ids = test['PassengerId']

In [153]:
# Feature matrix: every processed training column except the target.
# NOTE(review): PassengerId is still among the features; it is an identifier,
# so consider excluding it from both the training and the test features.
Xtrain = train.drop(columns='Survived')

In [154]:
# Target: selected with a list, so this is a one-column DataFrame, not a Series.
Ytrain = train[['Survived']]  # Only target column

In [155]:
# Preview the preprocessed test frame.
test.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,892,3,34.5,0,0,7.8292,1.0,1.0,0.0
1,893,3,47.0,1,0,7.0,0.0,0.0,1.0
2,894,2,62.0,0,0,9.6875,1.0,1.0,0.0
3,895,3,27.0,0,0,8.6625,1.0,0.0,1.0
4,896,3,22.0,1,1,12.2875,0.0,0.0,1.0


In [165]:
# Gradient-boosted tree classifier with a fixed seed and log-loss as the
# evaluation metric. `use_label_encoder` is intentionally not passed: the
# installed XGBoost reports it as an unused parameter when fitting.
model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')

In [166]:
# Sanity check: count missing values in the target before fitting.
Ytrain.isna().sum()


Unnamed: 0,0
Survived,0


In [169]:
# Train the classifier on the full training features and target.
model.fit(Xtrain, Ytrain)

Parameters: { "use_label_encoder" } are not used.



In [170]:
# X_test is just another name for the preprocessed test frame (no copy is made).
X_test = test

In [171]:
# Preview the training features to compare their columns with the test frame.
Xtrain.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,1,3,22.0,1,0,7.25,1.0,0.0,1.0
1,2,1,38.0,1,0,71.2833,0.0,0.0,0.0
2,3,3,26.0,0,0,7.925,0.0,0.0,1.0
3,4,1,35.0,1,0,53.1,0.0,0.0,1.0
4,5,3,35.0,0,0,8.05,1.0,0.0,1.0


In [173]:
# Predict a Survived label for every row of the test features.
Ypred = model.predict(X_test)

In [174]:
# Assemble the submission table: one row per test passenger with its prediction.
submission = pd.DataFrame({
    'PassengerId': test_ids,  # ids taken from the preprocessed test frame
    'Survived': Ypred
})
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [176]:
# Write the submission without the DataFrame index column.
submission.to_csv('submission2.csv', index=False)
