In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [49]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, KFold

In [3]:
train_df = pd.read_csv('dataset/train.csv')
test_df = pd.read_csv('dataset/test.csv')
combine = [train_df, test_df]

In [4]:
# my code

In [5]:
train_df.shape

(891, 12)

In [6]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [8]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB


Data wrangling -> Data preprocessing
- Age: fill na with some logic -> bin?
- Embarked: fill na with mode -> one hot encoding
- Fare: fill na in test set -> bin
- Sex: to 0 1

drop:  name, cabin, ticket, id

In [9]:
# embarked fillna
train_df.fillna({'Embarked': train_df['Embarked'].mode()[0]}, inplace=True)


In [10]:
# Fare fillna (test set)
test_df.fillna({'Fare': test_df['Fare'].mode()[0]}, inplace=True)

In [11]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [12]:
train_df.groupby(['Pclass', 'Embarked'])['Age'].median()

Unnamed: 0_level_0,Unnamed: 1_level_0,Age
Pclass,Embarked,Unnamed: 2_level_1
1,C,36.5
1,Q,38.5
1,S,37.5
2,C,25.0
2,Q,43.5
2,S,30.0
3,C,20.0
3,Q,21.5
3,S,25.0


In [13]:
# Age, fillna
train_df['Age'] = train_df.groupby(['Pclass', 'Embarked'])['Age'].transform(lambda x: x.fillna(x.median()))

In [14]:
group_median = train_df.groupby(['Pclass', 'Embarked'])['Age'].median()

In [15]:
def fill_age(row):
    if pd.isna(row['Age']):
        return group_median.get((row['Pclass'], row['Embarked']), train_df['Age'].median())
    return row['Age']


In [16]:
train_df['Age'] = train_df.apply(fill_age, axis=1)
test_df['Age'] = test_df.apply(fill_age, axis=1)

In [17]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [18]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          418 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         418 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB


In [19]:
# Embarked, preprocess
train_df = pd.get_dummies(train_df, columns=['Embarked'], drop_first=True)
test_df = pd.get_dummies(test_df, columns=['Embarked'], drop_first=True)

In [20]:
# Fare, preprocess
train_fare_binned, bin_edges = pd.qcut(train_df['Fare'], q=4, retbins=True, duplicates='drop')

# Apply to training data
train_df['Fare_bin'] = pd.cut(train_df['Fare'], bins=bin_edges, labels=False, include_lowest=True)

# Apply to test data using same bin edges
test_df['Fare_bin'] = pd.cut(test_df['Fare'], bins=bin_edges, labels=False, include_lowest=True)


In [21]:
train_df = pd.get_dummies(train_df, columns=['Fare_bin'], drop_first=True)
test_df = pd.get_dummies(test_df, columns=['Fare_bin'], drop_first=True)

In [22]:
# Sex preprocess
train_df = pd.get_dummies(train_df, columns=['Sex'], drop_first=True)

In [23]:
test_df = pd.get_dummies(test_df, columns=['Sex'], drop_first=True)


In [24]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_Q,Embarked_S,Fare_bin_1,Fare_bin_2,Fare_bin_3,Sex_male
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,False,True,False,False,False,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,False,False,False,False,True,False
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,False,True,True,False,False,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,False,True,False,False,True,False
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,False,True,True,False,False,True


In [25]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_Q,Embarked_S,Fare_bin_1,Fare_bin_2,Fare_bin_3,Sex_male
0,892,3,"Kelly, Mr. James",34.5,0,0,330911,7.8292,,True,False,False,False,False,True
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",47.0,1,0,363272,7.0,,False,True,False,False,False,False
2,894,2,"Myles, Mr. Thomas Francis",62.0,0,0,240276,9.6875,,True,False,True,False,False,True
3,895,3,"Wirz, Mr. Albert",27.0,0,0,315154,8.6625,,False,True,True,False,False,True
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",22.0,1,1,3101298,12.2875,,False,True,True,False,False,False


In [26]:
#drop unused column

train_df = train_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Fare'], axis=1)
test_df = test_df.drop(['Name', 'Ticket', 'Cabin', 'Fare'], axis=1)
combine = [train_df, test_df]


In [27]:
X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]
X_test  = test_df.drop("PassengerId", axis=1).copy()
X_train.shape, Y_train.shape, X_test.shape

((891, 10), (891,), (418, 10))

In [28]:
X_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Embarked_Q,Embarked_S,Fare_bin_1,Fare_bin_2,Fare_bin_3,Sex_male
0,3,22.0,1,0,False,True,False,False,False,True
1,1,38.0,1,0,False,False,False,False,True,False
2,3,26.0,0,0,False,True,True,False,False,False
3,1,35.0,1,0,False,True,False,False,True,False
4,3,35.0,0,0,False,True,True,False,False,True


In [29]:
Y_train.head()

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0


In [30]:
X_test.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Embarked_Q,Embarked_S,Fare_bin_1,Fare_bin_2,Fare_bin_3,Sex_male
0,3,34.5,0,0,True,False,False,False,False,True
1,3,47.0,1,0,False,True,False,False,False,False
2,2,62.0,0,0,True,False,True,False,False,True
3,3,27.0,0,0,False,True,True,False,False,True
4,3,22.0,1,1,False,True,True,False,False,False


In [31]:
# Logistic Regression

logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
acc_log

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


80.81

In [32]:
coeff_df = pd.DataFrame(train_df.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])
coeff_df.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
8,Fare_bin_3,0.902573
7,Fare_bin_2,0.846546
6,Fare_bin_1,0.339105
4,Embarked_Q,0.098969
1,Age,-0.044981
3,Parch,-0.204341
5,Embarked_S,-0.329609
2,SibSp,-0.427514
0,Pclass,-0.98946
9,Sex_male,-2.578588


Next we model using Support Vector Machines which are supervised learning models with associated learning algorithms that analyze data used for classification and regression analysis. Given a set of training samples, each marked as belonging to one or the other of **two categories**, an SVM training algorithm builds a model that assigns new test samples to one category or the other, making it a non-probabilistic binary linear classifier. Reference [Wikipedia](https://en.wikipedia.org/wiki/Support_vector_machine).

Note that the model generates a confidence score which is higher than Logistics Regression model.

In [33]:
# Support Vector Machines

svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
acc_svc

63.86

In pattern recognition, the k-Nearest Neighbors algorithm (or k-NN for short) is a non-parametric method used for classification and regression. A sample is classified by a majority vote of its neighbors, with the sample being assigned to the class most common among its k nearest neighbors (k is a positive integer, typically small). If k = 1, then the object is simply assigned to the class of that single nearest neighbor. Reference [Wikipedia](https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm).

KNN confidence score is better than Logistics Regression but worse than SVM.

In [34]:
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn

87.32

In machine learning, naive Bayes classifiers are a family of simple probabilistic classifiers based on applying Bayes' theorem with strong (naive) independence assumptions between the features. Naive Bayes classifiers are highly scalable, requiring a number of parameters linear in the number of variables (features) in a learning problem. Reference [Wikipedia](https://en.wikipedia.org/wiki/Naive_Bayes_classifier).

The model generated confidence score is the lowest among the models evaluated so far.

In [35]:
# Gaussian Naive Bayes

gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
acc_gaussian

76.43

The perceptron is an algorithm for supervised learning of binary classifiers (functions that can decide whether an input, represented by a vector of numbers, belongs to some specific class or not). It is a type of linear classifier, i.e. a classification algorithm that makes its predictions based on a linear predictor function combining a set of weights with the feature vector. The algorithm allows for online learning, in that it processes elements in the training set one at a time. Reference [Wikipedia](https://en.wikipedia.org/wiki/Perceptron).

In [36]:
# Perceptron

perceptron = Perceptron()
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
acc_perceptron

80.02

In [37]:
# Linear SVC

linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
acc_linear_svc

80.58

In [38]:
# Stochastic Gradient Descent

sgd = SGDClassifier()
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd

80.92

This model uses a decision tree as a predictive model which maps features (tree branches) to conclusions about the target value (tree leaves). Tree models where the target variable can take a finite set of values are called classification trees; in these tree structures, leaves represent class labels and branches represent conjunctions of features that lead to those class labels. Decision trees where the target variable can take continuous values (typically real numbers) are called regression trees. Reference [Wikipedia](https://en.wikipedia.org/wiki/Decision_tree_learning).

The model confidence score is the highest among models evaluated so far.

In [39]:
# Decision Tree

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree

95.06

The next model Random Forests is one of the most popular. Random forests or random decision forests are an ensemble learning method for classification, regression and other tasks, that operate by constructing a multitude of decision trees (n_estimators=100) at training time and outputting the class that is the mode of the classes (classification) or mean prediction (regression) of the individual trees. Reference [Wikipedia](https://en.wikipedia.org/wiki/Random_forest).

The model confidence score is the highest among models evaluated so far. We decide to use this model's output (Y_pred) for creating our competition submission of results.

In [83]:
# Random Forest

random_forest = RandomForestClassifier(n_estimators=100)

In [55]:
def k_fold_test(model, n_splits):
  # Define the KFold cross-validator
  kf = KFold(n_splits=5, shuffle=True, random_state=42)

  # Evaluate model using cross-validation
  cv_scores = cross_val_score(model, X_train, Y_train, cv=kf, scoring='accuracy')

  print("Cross-validation scores:", cv_scores)
  print("Mean accuracy:", np.mean(cv_scores))
  print("Standard deviation:", np.std(cv_scores))

In [56]:
random_forest2 = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42
)

In [85]:
random_forest3 = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42
)

In [74]:
random_forest4 = RandomForestClassifier(
    n_estimators=400,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42
)

In [75]:
random_forest5 = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features='sqrt',
    random_state=42
)

In [80]:
random_forest6 = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42
)

In [130]:
random_forest7 = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    max_leaf_nodes=25,           # Limit number of terminal nodes
    random_state=42
)

In [58]:
k_fold_test(random_forest, 5)

Cross-validation scores: [0.81005587 0.7752809  0.81460674 0.79775281 0.79775281]
Mean accuracy: 0.7990898248697508
Standard deviation: 0.013648948239481614


In [57]:
k_fold_test(random_forest2, 5)

Cross-validation scores: [0.83240223 0.80898876 0.85955056 0.82022472 0.80898876]
Mean accuracy: 0.8260310087251271
Standard deviation: 0.018861294571069807


In [82]:
k_fold_test(random_forest3, 5)

Cross-validation scores: [0.83240223 0.81460674 0.85955056 0.8258427  0.82022472]
Mean accuracy: 0.830525390747599
Standard deviation: 0.015667915940501712


In [72]:
k_fold_test(random_forest4, 5)

Cross-validation scores: [0.82681564 0.81460674 0.85955056 0.82022472 0.81460674]
Mean accuracy: 0.8271608813006089
Standard deviation: 0.016807937334616482


In [76]:
k_fold_test(random_forest5, 5)

Cross-validation scores: [0.81564246 0.81460674 0.88202247 0.79775281 0.8258427 ]
Mean accuracy: 0.8271734354403364
Standard deviation: 0.028867442427115635


In [81]:
k_fold_test(random_forest6, 5)

Cross-validation scores: [0.82681564 0.80337079 0.85955056 0.82022472 0.81460674]
Mean accuracy: 0.824913690289373
Standard deviation: 0.01895070782602929


In [131]:
k_fold_test(random_forest7, 5)

Cross-validation scores: [0.80446927 0.8258427  0.87078652 0.81460674 0.84831461]
Mean accuracy: 0.832803967108154
Standard deviation: 0.02394037250746816


best model: random_forest7

In [132]:
random_forest7.fit(X_train, Y_train)
Y_pred = random_forest7.predict(X_test)
random_forest7.score(X_train, Y_train)
acc_random_forest = round(random_forest7.score(X_train, Y_train) * 100, 2)
acc_random_forest

86.2

In [136]:
importances = random_forest7.feature_importances_

# Create a DataFrame
feat_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Display top features
print(feat_df)

      Feature  Importance
9    Sex_male    0.476243
0      Pclass    0.173706
1         Age    0.142761
2       SibSp    0.054058
8  Fare_bin_3    0.045964
3       Parch    0.039453
5  Embarked_S    0.026519
7  Fare_bin_2    0.018875
6  Fare_bin_1    0.012937
4  Embarked_Q    0.009484


### Model evaluation

We can now rank our evaluation of all the models to choose the best one for our problem. While both Decision Tree and Random Forest score the same, we choose to use Random Forest as they correct for decision trees' habit of overfitting to their training set.

In [87]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression',
              'Random Forest', 'Naive Bayes', 'Perceptron',
              'Stochastic Gradient Decent', 'Linear SVC',
              'Decision Tree'],
    'Score': [acc_svc, acc_knn, acc_log,
              acc_random_forest, acc_gaussian, acc_perceptron,
              acc_sgd, acc_linear_svc, acc_decision_tree]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
8,Decision Tree,95.06
3,Random Forest,89.56
1,KNN,87.32
6,Stochastic Gradient Decent,80.92
2,Logistic Regression,80.81
7,Linear SVC,80.58
5,Perceptron,80.02
4,Naive Bayes,76.43
0,Support Vector Machines,63.86


In [134]:
submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": Y_pred
    })
submission.to_csv('/content/drive/MyDrive/Kaggle/titanic/submission4.csv', index=False)

Our submission to the competition site Kaggle results in scoring 3,883 of 6,082 competition entries. This result is indicative while the competition is running. This result only accounts for part of the submission dataset. Not bad for our first attempt. Any suggestions to improve our score are most welcome.

## References

This notebook has been created based on great work done solving the Titanic competition and other sources.

- [A journey through Titanic](https://www.kaggle.com/omarelgabry/titanic/a-journey-through-titanic)
- [Getting Started with Pandas: Kaggle's Titanic Competition](https://www.kaggle.com/c/titanic/details/getting-started-with-random-forests)
- [Titanic Best Working Classifier](https://www.kaggle.com/sinakhorami/titanic/titanic-best-working-classifier)