In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ShuffleSplit, GridSearchCV
from xgboost import XGBClassifier

In [2]:
#reading and combining available data
train_data = pd.read_csv("train_data.csv")
test_data = pd.read_csv("test_data.csv")
all_data = pd.concat([train_data, test_data], sort = True).reset_index(drop = True)

In [3]:
#feature creation
all_data["Family"] = all_data["SibSp"] + all_data["Parch"] + 1
all_data["Farepp"] = all_data["Fare"] / all_data["Family"]
all_data["Letter"] = [str(cabin)[0] for cabin in all_data["Cabin"]]
all_data["Group"] = [all_data.groupby(["Ticket"]).count()["PassengerId"].loc[ticket] for ticket in all_data["Ticket"]]

all_data.drop(["Cabin", "PassengerId", "Name", "Ticket"], axis = 1, inplace = True)
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       1046 non-null   float64
 1   Embarked  1307 non-null   object 
 2   Fare      1308 non-null   float64
 3   Parch     1309 non-null   int64  
 4   Pclass    1309 non-null   int64  
 5   Sex       1309 non-null   object 
 6   SibSp     1309 non-null   int64  
 7   Survived  891 non-null    float64
 8   Family    1309 non-null   int64  
 9   Farepp    1308 non-null   float64
 10  Letter    1309 non-null   object 
 11  Group     1309 non-null   int64  
dtypes: float64(4), int64(5), object(3)
memory usage: 122.8+ KB


In [4]:
#overview of missing data
print(all_data[all_data["Fare"].isna()])
print(all_data[all_data["Embarked"].isna()])
print(all_data[all_data["Age"].isna()])
print(all_data[all_data["Letter"] == "n"])

       Age Embarked  Fare  Parch  Pclass   Sex  SibSp  Survived  Family  \
1043  60.5        S   NaN      0       3  male      0       NaN       1   

      Farepp Letter  Group  
1043     NaN      n      1  
      Age Embarked  Fare  Parch  Pclass     Sex  SibSp  Survived  Family  \
61   38.0      NaN  80.0      0       1  female      0       1.0       1   
829  62.0      NaN  80.0      0       1  female      0       1.0       1   

     Farepp Letter  Group  
61     80.0      B      2  
829    80.0      B      2  
      Age Embarked     Fare  Parch  Pclass     Sex  SibSp  Survived  Family  \
5     NaN        Q   8.4583      0       3    male      0       0.0       1   
17    NaN        S  13.0000      0       2    male      0       1.0       1   
19    NaN        C   7.2250      0       3  female      0       1.0       1   
26    NaN        C   7.2250      0       3    male      0       0.0       1   
28    NaN        Q   7.8792      0       3  female      0       1.0       1   
... 

In [5]:
#imputing missing fares data
def fare(indices):
    for index in indices:
        all_data.loc[index, "Farepp"] = all_data.groupby(["Pclass", "Family"])["Farepp"].median()[all_data.loc[index, "Pclass"]][all_data.loc[index, "Family"]]
        all_data.loc[index, "Fare"] = all_data.loc[index, "Farepp"] * all_data.loc[index, "Family"]
    return all_data
fare_indices = all_data[all_data["Fare"].isna()].index
all_data = fare(fare_indices)

In [6]:
#imputing missing embarked data
def embarked(indices):
    for index in indices:
        all_data.loc[index, "Embarked"] = all_data.groupby(["Survived", "Pclass"])["Embarked"].agg(pd.Series.mode)[all_data.loc[index, "Survived"]][all_data.loc[index, "Pclass"]]
    return all_data
embarked_indices = all_data[all_data["Embarked"].isna()].index
all_data = embarked(embarked_indices)

In [7]:
#imputing missing age data
def age(indices):
    for index in indices:
        all_data.loc[index, "Age"] = all_data.groupby(["Sex", "Pclass"])["Age"].median()[all_data.loc[index, "Sex"]][all_data.loc[index, "Pclass"]]
    return all_data
age_indices = all_data[all_data["Age"].isna()].index
all_data = age(age_indices)

In [8]:
#imputing missing cabin data
def letter(indices):
    for index in indices:
        all_data.loc[index, "Letter"] = all_data.groupby(["Sex", "Pclass"])["Letter"].agg(pd.Series.mode)[all_data.loc[index, "Sex"]][all_data.loc[index, "Pclass"]]
    return all_data
letter_indices = all_data[all_data["Letter"] == "n"].index
all_data = letter(letter_indices)

In [9]:
#encoding object data types
all_data["Sex"] = LabelEncoder().fit_transform(all_data["Sex"])
all_data["Embarked"] = LabelEncoder().fit_transform(all_data["Embarked"])
all_data["Letter"] = LabelEncoder().fit_transform(all_data["Letter"])

In [10]:
#overview of processed data
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       1309 non-null   float64
 1   Embarked  1309 non-null   int32  
 2   Fare      1309 non-null   float64
 3   Parch     1309 non-null   int64  
 4   Pclass    1309 non-null   int64  
 5   Sex       1309 non-null   int32  
 6   SibSp     1309 non-null   int64  
 7   Survived  891 non-null    float64
 8   Family    1309 non-null   int64  
 9   Farepp    1309 non-null   float64
 10  Letter    1309 non-null   int32  
 11  Group     1309 non-null   int64  
dtypes: float64(4), int32(3), int64(5)
memory usage: 107.5 KB


In [11]:
#separating train and test data
train_data = all_data[all_data["Survived"].notna()]
test_data = all_data[all_data["Survived"].isna()]

In [12]:
#separating features and target
X = train_data.drop(["Survived"], axis = 1)
y = train_data["Survived"]
X_test = test_data.drop(["Survived"], axis = 1)

In [13]:
#hyperparameter tuning
model = XGBClassifier(max_depth = 7)
cv = ShuffleSplit(n_splits = 5, test_size = 0.2)
grid = {"n_estimators": [103, 105, 107, 110, 114], "alpha": [0.5, 0.75, 1, 1.2], "lambda": [9, 12, 15, 17], "learning_rate": [0.06, 0.07, 0.08, 0.1]}
grid_search = GridSearchCV(estimator = model, param_grid = grid, cv = cv, n_jobs = -1)
grid_search.fit(X, y)
print(grid_search.best_params_)
print(grid_search.best_score_)

{'alpha': 1.2, 'lambda': 15, 'learning_rate': 0.07, 'n_estimators': 110}
0.8402234636871508


In [14]:
#predictions
final_model = XGBClassifier(n_estimators = grid_search.best_params_["n_estimators"], max_depth = 7, alpha = grid_search.best_params_["alpha"], reg_lambda = grid_search.best_params_["lambda"], learning_rate = grid_search.best_params_["learning_rate"]).fit(X, y)
predictions = final_model.predict(X_test)
predictions_df = pd.DataFrame({"PassengerId": range(892, 1310), "Survived": predictions})

In [15]:
#checks
l = len(predictions_df)
s = sum(predictions_df["Survived"])
p = s/l

print("Total number:", l)
print("Survived:", s)
print("Survival rate:", p)

Total number: 418
Survived: 141
Survival rate: 0.3373205741626794


In [16]:
#uploading to csv
predictions_df.to_csv("titanic_survival_prediction_xgb_classifier_predictions.csv", sep=",", index=False)

Accuracy of model: 77.751%