In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Additional imports
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Load the labelled training split and preview its first rows
training = pd.read_csv("/kaggle/input/titanic/train.csv")
training.head(n = 5)

In [None]:
# Load the unlabelled test split and preview its first rows
test = pd.read_csv("/kaggle/input/titanic/test.csv")
test.head(n = 5)

As we can see above, the test data contains the same columns as the training data, minus the *Survived* column, which is what we are going to predict.

### Exploratory Analysis

In [None]:
# Print the dtype and non-null count of every training column
training.info()

It looks like the *Age*, *Cabin*, and *Embarked* columns have missing values. Let's do a quick check to confirm this. This is a factor we will need to consider before getting to modeling.

In [None]:
# Number of missing values in each training column
training.isnull().sum()

In [None]:
# Descriptive statistics of the training frame
training.describe()

In [None]:
# Keep only the purely numeric measurements for the distribution/correlation plots
training_num = training.loc[:, ["Age", "SibSp", "Parch", "Fare"]]
training_num.head()

In [None]:
# 2x2 grid of histograms, one panel per numeric column
fig, axes = plt.subplots(nrows = 2, ncols = 2, figsize = (10, 10))
axes = axes.flatten()
fig.suptitle("Distributions of Numeric Variables")

for ax, column in zip(axes, training_num.columns):
    ax.hist(training_num[column], alpha = 0.7)
    ax.set(title = column, xlabel = "Values", ylabel = "Frequency")

plt.tight_layout()
plt.show()
    

None of these columns follow a traditional normal distribution. We can get away with this for the first 3 variables; however, normalizing *Fare* may prove useful since it has such a large range of values but is mostly centered around one point. Now that we have an idea of the distributions of the numeric columns, let's take a look at their correlation.

In [None]:
# Pairwise correlation of the numeric columns, annotated to two decimals
numeric_corr = training_num.corr()
sns.heatmap(numeric_corr, annot = True, fmt = ".2f")

No columns are very strongly correlated; however, SibSp and Parch have a bit of a positive correlation, which makes sense as both variables describe immediate family members. It would be no surprise to find out that families travel together.

Let's now explore some of the categorical variables, especially the *Survived* column as that is what we are trying to predict.

In [None]:
sns.set()
# A horizontal bar chart puts the Survived classes on the y-axis and the
# passenger counts on the x-axis, so the axis labels follow that orientation.
training["Survived"].value_counts().plot(kind = "barh", alpha= 0.9)
plt.xlabel("Count of People")
plt.ylabel("Survived")
plt.title("Count of People Who Survived");

From this graph we can see that from almost 900 passengers, about 550 did not survive the shipwreck, while over 300 of them did. Let's check the survival sorted by a few other categorical variables.

In [None]:
# Bar height is the aggregated Survived value per ticket class (no error bars)
ax = sns.barplot(x = "Pclass", y = "Survived", data = training, errorbar = None, alpha = 0.9)
ax.set_title("Survival Rate by Ticket Class");

It seems like people of higher socio-economic status were more likely to survive.

In [None]:
# Bar height is the aggregated Survived value per sex (no error bars)
ax = sns.barplot(x = "Sex", y = "Survived", data = training, errorbar = None, alpha = 0.9)
ax.set_title("Survival Rate by Sex");

It is clear to see that females were far more likely to survive compared to males. A probable cause of this is that women and children are typically given first preference during evacuations and disaster management.

In [None]:
# Bar height is the aggregated Survived value per embarkation port (no error bars)
ax = sns.barplot(x = "Embarked", y = "Survived", data = training, errorbar = None, alpha = 0.9)
ax.set_title("Survival Rate by Port of Embark");

The rate of survival of people that embarked from Cherbourg is moderately higher than passengers of the other two ports, but not a whole lot can be inferred from this visualization.

In [None]:
# Compare the age spread of the two Survived groups
ax = sns.boxplot(x = "Survived", y = "Age", data = training)
ax.set_title("Spread of Age by Survived Category");

The median age for people in both categories is relatively similar. The interquartile range for the passengers that survived seems to sit at a slightly lower age than for those who didn't, and the largest non-outlier ages are also smaller among the passengers that survived. Again, a likely cause for this spread is that younger people are probably fitter and in better condition to survive such an event.

In [None]:
# Show the raw Ticket and Cabin columns to inspect how their values are formatted
training["Ticket"], training["Cabin"]

The *Ticket* and *Cabin* columns seem pretty messy and all over the place right now. If we use these columns as they currently are, it is implausible that they will provide much value when it comes to modeling, so a few considerations need to be made on how best to manage them.

### Feature Engineering & Data Preprocessing

Before we get to changing the data by creating more variables and preprocessing, let's combine the train and test sets to ensure consistency.

In [None]:
# creating survivor column in test
# (np.nan is used because the np.NaN alias was removed in NumPy 2.0)
test["Survived"] = np.nan

# creating new columns for train and test to differentiate data
training["split"] = 1
test["split"] = 0

# combining dataframes; each frame keeps its own row labels, so the combined
# index contains duplicates (training and test both start at 0)
titanic = pd.concat([training, test])
titanic.head()

In the *Ticket* data, there seem to be number-only rows and rows with some characters alongside the numbers. What the characters represent is not clear (they might be names, initials, etc.), but the distinction between numeric-only rows and rows that mix numbers and characters could be beneficial.

In [None]:
# Flag tickets made up of digits only (1) versus tickets containing other characters (0)
is_numeric_ticket = titanic["Ticket"].str.isnumeric()
titanic["Nums_Ticket"] = is_numeric_ticket.astype(int)
titanic["Nums_Ticket"].value_counts()

A good way to sort the *Cabin* data can be by the cabin letter. Since a majority of this column's data is missing, we can also use that as a category within the variable. This will prevent us from having to drop these rows, which would greatly impact our data and the model performance potential.

In [None]:
# First character of the cabin's string form; missing cabins stringify to "nan"
titanic["Cabin_Letter"] = titanic["Cabin"].map(lambda cabin: str(cabin)[0])
titanic["Cabin_Letter"].value_counts() # "n" will stand for the missing data

There are two missing values in the *Embarked* column. Since there is no obvious way to fill these, we will drop these entries. This should not make too much of a difference since we will only be losing 2 out of 891 training rows and none of the test rows.

In [None]:
# Remove the rows whose Embarked value is missing; titanic is rebound to the result
titanic = titanic.dropna(subset = ["Embarked"])

Time to tend to the *Age* data, as it has multiple missing values in both the train and test sets. It is advisable to fill these values as we do not want to lose any more rows from the training data. In the data exploration section, we saw that the distribution for *Age* does not follow a standard normal distribution and is also skewed. As a result, filling the missing values with the median rather than the mean is more suitable.

In [None]:
# Fill missing ages in BOTH splits with the training-set median. The fill is
# applied to titanic's own Age column: assigning the training Series directly
# would align on the duplicated row labels produced by pd.concat and overwrite
# every test passenger's age with that of an unrelated training passenger.
titanic["Age"] = titanic["Age"].fillna(training["Age"].median())

*Pclass* has an integer dtype; however, it should be treated as a category. Initially, we will convert it to a string so that it can be treated along with the other object types.

In [None]:
# Store the ticket class as text so it is handled like the other categorical columns
pclass_as_text = titanic["Pclass"].astype(str)
titanic["Pclass"] = pclass_as_text
titanic.info()

It looks like there is 1 missing value from *Fare* which is from the test set. Again, filling this value with the median would make most sense.

In [None]:
# Replace the missing fare with the median fare of the combined data
fare_median = titanic["Fare"].median()
titanic["Fare"] = titanic["Fare"].fillna(fare_median)

It was mentioned earlier that normalizing *Fare* could be beneficial. Let's try taking the log of all *Fare* values.

In [None]:
# Shift fares by one before taking the log: some fares are 0 and log(0) is undefined
shifted_fare = titanic["Fare"] + 1
titanic["Fare_Normalized"] = np.log(shifted_fare)
titanic["Fare_Normalized"].hist()

This distribution looks slightly better compared to what it was, so we'll keep it like this.

Not all columns will be used to train the model. Variables such as *PassengerId* and *Name* will have no utility during prediction. Additionally, *Ticket*, *Cabin*, and *Fare* are redundant after creating more relevant or appropriate variables.

In [None]:
# Identifier/free-text columns plus the raw columns that now have engineered
# replacements (Ticket -> Nums_Ticket, Cabin -> Cabin_Letter, Fare -> Fare_Normalized).
# Leaving the raw Cabin strings in would make get_dummies create one column per cabin.
dropped_columns = ["PassengerId", "Name", "Ticket", "Cabin", "Fare"]
# New frame without the columns listed in dropped_columns; titanic itself is untouched
titanic_new = titanic.drop(columns = dropped_columns)

In [None]:
# creating dummy variables for categorical columns
# (one indicator column per category value of each categorical column)
titanic_dummies = pd.get_dummies(titanic_new)
titanic_dummies.head()

In [None]:
# scaling numeric data
from sklearn.preprocessing import StandardScaler

numeric_features = ["Age", "SibSp", "Parch", "Fare_Normalized"]
# Plain boolean array so the mask is applied positionally, independent of row labels
is_training_row = (titanic_dummies["split"] == 1).to_numpy()

# Learn the mean/variance from the training rows only, then apply that same
# transform to every row, so test-set statistics do not leak into the features.
scaler = StandardScaler()
scaler.fit(titanic_dummies.loc[is_training_row, numeric_features])

scaled_data = titanic_dummies.copy()
scaled_data[numeric_features] = scaler.transform(titanic_dummies[numeric_features])
scaled_data.head()

### Modeling

In [None]:
# Split the combined frame back into training/test rows, then separate the
# independent variables (features) from the dependent variable (Survived)
train_rows = scaled_data[scaled_data["split"] == 1]
test_rows = scaled_data[scaled_data["split"] == 0]
non_feature_columns = ["Survived", "split"]

X_train = train_rows.drop(columns = non_feature_columns)
X_test = test_rows.drop(columns = non_feature_columns)
y_train = train_rows["Survived"]

In [None]:
# Using a Gradient Boosting model and cross-validation
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
xgb = XGBClassifier(random_state = 66)
# Fit on the full training set as well, so the fitted model can be inspected afterwards
xgb.fit(X_train, y_train)
cv_xgb = cross_val_score(xgb, X_train, y_train, cv = 5)
# Separate statements: "print(a), print(b)" builds a tuple and the cell would
# additionally echo (None, None) as its output.
print(cv_xgb)
print(cv_xgb.mean())

We have received a baseline score of 81.78% on a 5 fold cross validation. Before we try to tune the model to improve performance, let's check out feature importance.

In [None]:
# Label each importance score with its feature name and chart the 15 largest
feature_imp = pd.Series(data = xgb.feature_importances_, index = X_train.columns)
top_features = feature_imp.nlargest(15)
top_features.plot.barh();

It seems like the features Sex and Pclass are the most influential. This falls in line with the previous analysis, as we found that females were a lot more likely to survive than males and that people of a higher ticket class were more likely to survive. Let's try out another model type now.

In [None]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state = 66)
lr.fit(X_train, y_train)
# Accuracy on each of the 5 cross-validation folds, followed by their mean
cv_lr = cross_val_score(lr, X_train, y_train, cv = 5)
# Separate statements: "print(a), print(b)" builds a tuple and the cell would
# additionally echo (None, None) as its output.
print(cv_lr)
print(cv_lr.mean())

Now with a Logistic Regression model, we achieve an accuracy of about 80%. This is slightly less than that of the XGBoost model, however, it is still high enough to work with. Let's move on to tuning model performance.

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Candidate hyperparameter values sampled by the randomized search
xgb_grid = {
    "n_estimators": [25, 50, 100, 250, 500, 1000],
    "eta": [0.1, 0.2, 0.3, 0.4, 0.5],
    "gamma": [0, 0.01, 0.1, 1, 10],
    "max_depth": [2, 5, 10, 15, 20, None],
    # NOTE(review): "gradient_based" sampling is documented by XGBoost as a
    # GPU-only option - confirm it is valid for the device this runs on.
    "sampling_method": ["uniform", "gradient_based"],
    "colsample_bytree": [0.25, 0.5, 0.75, 1],
    # Decimal points, not commas: "1,5" / "0,5" were parsed as two separate
    # list items (1 and 5, 0 and 5) instead of 1.5 and 0.5.
    "lambda": [1, 1.5, 2],
    "alpha": [0, 0.5, 1],
}

# Randomized search: 100 sampled parameter combinations, each scored with 5-fold CV
rs_xgb = RandomizedSearchCV(
    estimator = xgb,
    param_distributions = xgb_grid,
    n_iter = 100,
    cv = 5,
    verbose = True,
    n_jobs = -1,
    random_state = 66,
)
rs_xgb.fit(X_train, y_train)

In [None]:
# Best cross-validated score found by the randomized search and the parameters that produced it
rs_xgb.best_score_, rs_xgb.best_params_

We have improved accuracy by almost 2% after tuning the gradient boosting model. Before we use these parameters to make predictions on our test data, let's try improving the logistic regression.

In [None]:
from sklearn.model_selection import GridSearchCV
# Search space for the logistic regression: both penalties supported by the
# liblinear solver, with 20 log-spaced C values between 1e-4 and 1e4
lr_grid = dict(
    penalty = ["l1", "l2"],
    C = np.logspace(-4, 4, 20),
    solver = ["liblinear"],
    max_iter = [1000],
)
# Exhaustive search over lr_grid, each combination scored with 10-fold CV
gs_lr = GridSearchCV(
    estimator = lr,
    param_grid = lr_grid,
    cv = 10,
    n_jobs = -1,
    verbose = True,
)
gs_lr.fit(X_train, y_train)

In [None]:
# Best cross-validated score found by the grid search and the parameters that produced it
gs_lr.best_score_, gs_lr.best_params_

With hyperparameter tuning, the logistic regression model has only very marginally improved in terms of accuracy, by less than one percent.

### Predictions

In [None]:
# Predict the test rows with each tuned search object and cast the labels to int
# (presumably needed because Survived became a float column once NaN was added
# for the test rows - confirm against the model output dtype)
xgb_predictions = rs_xgb.predict(X_test).astype(int)
lr_predictions = gs_lr.predict(X_test).astype(int)

In [None]:
# Pair every test PassengerId with its XGBoost prediction
xgb_predictions_data = dict(PassengerId = test["PassengerId"], Survived = xgb_predictions)
xgb_submission = pd.DataFrame(xgb_predictions_data)
xgb_submission

In [None]:
# Pair every test PassengerId with its logistic regression prediction
lr_predictions_data = dict(PassengerId = test["PassengerId"], Survived = lr_predictions)
lr_submission = pd.DataFrame(lr_predictions_data)
lr_submission

In [None]:
# creating csv files
# Only the logistic regression predictions are written. Both lines target the
# same submission.csv, so enable the commented line instead (not in addition)
# to export the XGBoost predictions.
#xgb_submission.to_csv("submission.csv", index = False)
lr_submission.to_csv("submission.csv", index = False)