In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

In [2]:
# Load the training set, then print its first rows followed by summary statistics.
train_data = pd.read_csv("train_data.csv")
print(train_data.head(), train_data.describe(), sep = "\n")

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
  

In [3]:
# Derived feature: absolute distance of each passenger's age from 35,
# stored in a new "Age_diff" column (missing ages stay missing).
train_data["Age_diff"] = (train_data["Age"] - 35).abs()

In [4]:
# Columns used as model inputs; "Survived" is the prediction target.
features = [
    "Pclass",
    "Age_diff",
    "SibSp",
    "Parch",
    "Fare",
    "Sex",
    "Embarked",
]
y_train = train_data["Survived"]
X_train = train_data[features]

In [5]:
# Preprocessing: impute missing values, then one-hot encode the categorical columns.
# Columns are split by dtype: object columns are treated as categorical,
# everything else as numerical.
numerical_cols = X_train.select_dtypes(exclude = "object").columns
categorical_cols = X_train.select_dtypes(include = "object").columns

# Numerical gaps are filled with the column mean.
numerical_transformer = SimpleImputer(strategy = "mean")

# Categorical gaps are filled with the most frequent value before encoding.
# handle_unknown = "ignore" encodes a category that was not seen during fit as
# all zeros instead of raising, so a category missing from one cross-validation
# training fold (or first appearing in the test set) cannot abort
# transform/predict.
categorical_transformer = Pipeline(steps = [
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown = "ignore")),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(transformers = [
    ("num", numerical_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols),
])

In [6]:
# Cross-validated search for the number of boosting rounds (n_estimators).
#
# The target is 0/1 and the classifier predicts class labels, so the mean
# absolute error used as the score equals the misclassification rate
# (lower is better).
#
# Candidates start at 1: a model with n_estimators = 0 has no boosting rounds,
# so it is not a meaningful candidate and would only waste five fits.
# mae_list[i] is the 5-fold mean error for n_candidates[i].
n_candidates = range(1, 100)
mae_list = []
for n in n_candidates:
    temp_model = XGBClassifier(random_state = 0, n_estimators = n)
    temp_pipe = Pipeline(steps = [("preprocessor", preprocessor), ("model", temp_model)])
    # cross_val_score returns the negated error, so flip the sign back.
    maes = -1 * cross_val_score(temp_pipe, X_train, y_train, cv = 5, scoring = "neg_mean_absolute_error")
    mae_list.append(np.mean(maes))

# Pick the candidate with the lowest mean error (the first one wins on ties)
# and display it as the cell output.
n = n_candidates[np.argmin(mae_list)]
n

41

In [7]:
# Final model: same pipeline as in the search, refit on the whole training set
# with the selected number of estimators.
final_model = XGBClassifier(n_estimators = n, random_state = 0)
final_pipe = Pipeline(
    steps = [
        ("preprocessor", preprocessor),
        ("model", final_model),
    ]
)
final_pipe.fit(X_train, y_train)

In [8]:
# Load the test set, add the same "Age_diff" feature as for training,
# and keep an independent copy of the model input columns.
test_data = pd.read_csv("test_data.csv")
test_data["Age_diff"] = (test_data["Age"] - 35).abs()
X_test = test_data[features].copy()

In [9]:
# Predict on the test set and pair every prediction with its passenger id.
test_predictions = final_pipe.predict(X_test)

# Take the ids from the test file itself so each prediction stays attached to
# the row it was computed from, whatever the file's size or ordering.
# If the file has no "PassengerId" column, fall back to consecutive ids
# starting at 892 (the numbering this notebook originally hard-coded).
if "PassengerId" in test_data.columns:
    passenger_ids = test_data["PassengerId"].to_numpy()
else:
    passenger_ids = range(892, 892 + len(test_predictions))

test_predictions_df = pd.DataFrame({"PassengerId": passenger_ids, "Survived": test_predictions})

In [10]:
# Sanity checks on the predictions: number of rows, number of predicted
# survivors, and the resulting predicted survival rate.
l = test_predictions_df.shape[0]
s = sum(test_predictions_df["Survived"].tolist())
p = s / l

for label, value in (("Total number:", l), ("Survived:", s), ("Survival rate:", p)):
    print(label, value)

Total number: 418
Survived: 148
Survival rate: 0.35406698564593303


In [11]:
# Write the predictions to a comma-separated file, leaving out the index column.
test_predictions_df.to_csv("titanic_XGB_predictions.csv", index = False)

Accuracy of model: 77.033%