# Define problem

* Achieve high accuracy on the Kaggle Titanic train and test sets (the Titanic has already sunk, so we are only looking for the most accurate prediction)
* Learn basic ML workflow

# Read data
The train and test sets are provided by Kaggle.

In [1]:
import pandas as pd

In [2]:
# Load the labelled training set and preview its first three rows.
train_df = pd.read_csv(filepath_or_buffer='train.csv')
train_df.head(n=3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [3]:
# Load the test set and summarise its columns, dtypes and non-null counts.
test_df = pd.read_csv(filepath_or_buffer='test.csv')
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [4]:
# Count the missing values per column of the test set.
test_df.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [5]:
# Both frames in one list so that in-place cleaning steps can loop over them.
# The list holds references to the current objects: once train_df / test_df are
# rebound to new frames (as the feature-engineering cells do), this list still
# points at the old objects.
dataframes = [train_df, test_df]

# Train-Test split

# Exploratory Data Analysis

# Cleaning

In [6]:
# Remove the Cabin and Ticket columns from both frames in place.
for df in dataframes:
    df.drop(columns=['Cabin', 'Ticket'], inplace=True)
train_df.head(n=3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S


In [7]:
# Imputation values are computed on the training set only and then applied to
# both frames, so the test set is filled with training statistics.
mean_age = train_df['Age'].mean()
mean_fare = train_df['Fare'].mean()

# fillna(..., inplace=True) mutates the frame and returns None, so its result
# must not be assigned back to the loop variable. Both columns are filled in a
# single pass; Embarked is intentionally left untouched here.
for df in dataframes:
    df.fillna(value={'Age': mean_age, 'Fare': mean_fare}, inplace=True)

In [8]:
# Verify the imputation on the training set. Only Age and Fare were filled
# above, so missing values in other columns (e.g. Embarked) remain.
train_df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       2
dtype: int64

In [9]:
# Verify the imputation on the test set.
test_df.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

# Feature Engineering

### One-Hot-Encoding

Embarked

In [10]:
# returns changed dataframe
def onehot(df, col):
    """Return a new frame: ``df`` plus one indicator column per value of ``col``.

    The indicator columns are named ``<col>_<value>`` and are derived from the
    distinct values found in ``df[col]`` itself (i.e. in whichever frame is
    passed in). ``df`` is not modified and the original ``col`` is kept.
    """
    indicator_columns = pd.get_dummies(df[col], prefix=col)
    # Append the indicators to the right of the existing columns.
    widened = pd.concat(objs=[df, indicator_columns], axis='columns')
    return widened

In [11]:
# Add one indicator column per Embarked value to each frame.
train_df, test_df = [onehot(frame, 'Embarked') for frame in (train_df, test_df)]

In [12]:
train_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,0,0,1


Pclass

In [13]:
# Add one indicator column per Pclass value to each frame.
train_df, test_df = [onehot(frame, 'Pclass') for frame in (train_df, test_df)]

In [14]:
test_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
0,892,3,"Kelly, Mr. James",male,34.500000,0,0,7.8292,Q,0,1,0,0,0,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.000000,1,0,7.0000,S,0,0,1,0,0,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.000000,0,0,9.6875,Q,0,1,0,0,1,0
3,895,3,"Wirz, Mr. Albert",male,27.000000,0,0,8.6625,S,0,0,1,0,0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.000000,1,1,12.2875,S,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,29.699118,0,0,8.0500,S,0,0,1,0,0,1
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.000000,0,0,108.9000,C,1,0,0,1,0,0
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.500000,0,0,7.2500,S,0,0,1,0,0,1
416,1308,3,"Ware, Mr. Frederick",male,29.699118,0,0,8.0500,S,0,0,1,0,0,1


### Target Encoding

In [15]:
# returns new column
def target_encoding(df, col, reference_df=None):
    """Encode ``df[col]`` as the mean of ``Survived`` per category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column ``col`` is to be encoded; it is not modified.
    col : str
        Name of the categorical column.
    reference_df : pandas.DataFrame, optional
        Frame with ``col`` and ``Survived`` from which the per-category means
        are computed. Defaults to the notebook-level ``train_df`` so that the
        test set is always encoded with training-set statistics.

    Returns
    -------
    pandas.Series
        Float series aligned with ``df``. Categories that do not occur in the
        reference frame become NaN.
    """
    if reference_df is None:
        reference_df = train_df
    means = reference_df.groupby(col)['Survived'].mean()
    # map() always produces a numeric column; replace() on an object column
    # relies on implicit downcasting and would leave unseen categories as
    # their original strings.
    return df[col].map(means)

In [16]:
# Encode Sex as the mean survival rate of each sex in the training set.
train_df['sex_target_enc'] = target_encoding(train_df, 'Sex')
train_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3,sex_target_enc
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,0,0,1,0,0,1,0.188908
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1,0,0,1,0,0,0.742038
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,0,0,1,0,0,1,0.742038


In [17]:
# Apply the training-set survival rates to the test set (target_encoding
# always computes its means from train_df; test_df has no Survived column).
test_df['sex_target_enc'] = target_encoding(test_df, 'Sex')
test_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3,sex_target_enc
0,892,3,"Kelly, Mr. James",male,34.500000,0,0,7.8292,Q,0,1,0,0,0,1,0.188908
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.000000,1,0,7.0000,S,0,0,1,0,0,1,0.742038
2,894,2,"Myles, Mr. Thomas Francis",male,62.000000,0,0,9.6875,Q,0,1,0,0,1,0,0.188908
3,895,3,"Wirz, Mr. Albert",male,27.000000,0,0,8.6625,S,0,0,1,0,0,1,0.188908
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.000000,1,1,12.2875,S,0,0,1,0,0,1,0.742038
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,29.699118,0,0,8.0500,S,0,0,1,0,0,1,0.188908
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.000000,0,0,108.9000,C,1,0,0,1,0,0,0.742038
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.500000,0,0,7.2500,S,0,0,1,0,0,1,0.188908
416,1308,3,"Ware, Mr. Frederick",male,29.699118,0,0,8.0500,S,0,0,1,0,0,1,0.188908


### Binning

In [18]:
# returns changed dataframe
def binning(df, col, bins):
    """Return ``df`` extended with one-hot columns for equal-width buckets of ``col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``; it is not modified.
    col : str
        Name of the numeric column to discretise.
    bins : int
        Number of equally wide buckets spanning the value range of ``df[col]``.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus indicator columns named ``<col>_<bucket index>``. A bucket
        that contains no rows produces no column.

    Raises
    ------
    ValueError
        If ``df[col]`` contains missing values, which cannot be cast to int.
    """
    # Pass the requested bucket count through to pd.cut so the number of
    # buckets always matches ``bins``; labels=False yields the integer bucket
    # index directly.
    bucket_index = pd.cut(df[col], bins=bins, labels=False).astype(int)
    indicators = pd.get_dummies(bucket_index, prefix=col)
    return pd.concat([df, indicators], axis=1)

Fare

In [19]:
# Split Fare into equal-width buckets and append one indicator column per
# non-empty bucket.
train_df = binning(train_df, 'Fare', 5)

In [20]:
# NOTE(review): binning() derives the bucket edges from the frame it is given,
# so the test buckets are based on test_df's own Fare range rather than on the
# training range.
test_df = binning(test_df, 'Fare', 5)

Age

In [21]:
# Same equal-width bucketing for Age.
train_df = binning(train_df, 'Age', 5)

In [22]:
# NOTE(review): as with Fare, the Age bucket edges come from test_df's own
# value range, not from the training set.
test_df = binning(test_df, 'Age', 5)

In [23]:
train_df.tail(100)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,...,sex_target_enc,Fare_0,Fare_1,Fare_2,Fare_4,Age_0,Age_1,Age_2,Age_3,Age_4
791,792,0,2,"Gaskell, Mr. Alfred",male,16.000000,0,0,26.0000,S,...,0.188908,1,0,0,0,1,0,0,0,0
792,793,0,3,"Sage, Miss. Stella Anna",female,29.699118,8,2,69.5500,S,...,0.742038,1,0,0,0,0,1,0,0,0
793,794,0,1,"Hoyt, Mr. William Fisher",male,29.699118,0,0,30.6958,C,...,0.188908,1,0,0,0,0,1,0,0,0
794,795,0,3,"Dantcheff, Mr. Ristiu",male,25.000000,0,0,7.8958,S,...,0.188908,1,0,0,0,0,1,0,0,0
795,796,0,2,"Otter, Mr. Richard",male,39.000000,0,0,13.0000,S,...,0.188908,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,13.0000,S,...,0.188908,1,0,0,0,0,1,0,0,0
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,30.0000,S,...,0.742038,1,0,0,0,0,1,0,0,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,23.4500,S,...,0.742038,1,0,0,0,0,1,0,0,0
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,30.0000,C,...,0.188908,1,0,0,0,0,1,0,0,0


### Scaling

In [24]:
def scaling(df, col, reference_df=None):
    """Min-max scale ``df[col]`` using the minimum and maximum of a reference frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column ``col`` is scaled; it is not modified.
    col : str
        Name of the numeric column.
    reference_df : pandas.DataFrame, optional
        Frame that supplies the minimum and maximum of ``col``. Defaults to
        the notebook-level ``train_df`` so that every frame is scaled with
        training-set statistics.

    Returns
    -------
    pandas.Series
        Scaled values aligned with ``df``. Values outside the reference range
        fall outside [0, 1].
    """
    if reference_df is None:
        reference_df = train_df
    col_min = reference_df[col].min()
    col_max = reference_df[col].max()
    # Scale the frame that was passed in; only the statistics come from the
    # reference frame.
    return (df[col] - col_min) / (col_max - col_min)

In [25]:
# train_df['Fare_scaled'] = scaling(train_df, 'Fare')
# test_df['Fare_scaled'] = scaling(test_df, 'Fare')

In [26]:
# train_df['Age_scaled'] = scaling(train_df, 'Age')
# test_df['Age_scaled'] = scaling(test_df, 'Age')

In [27]:
test_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Embarked_C,...,sex_target_enc,Fare_0,Fare_1,Fare_2,Fare_4,Age_0,Age_1,Age_2,Age_3,Age_4
0,892,3,"Kelly, Mr. James",male,34.500000,0,0,7.8292,Q,0,...,0.188908,1,0,0,0,0,0,1,0,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.000000,1,0,7.0000,S,0,...,0.742038,1,0,0,0,0,0,0,1,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.000000,0,0,9.6875,Q,0,...,0.188908,1,0,0,0,0,0,0,0,1
3,895,3,"Wirz, Mr. Albert",male,27.000000,0,0,8.6625,S,0,...,0.188908,1,0,0,0,0,1,0,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.000000,1,1,12.2875,S,0,...,0.742038,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,29.699118,0,0,8.0500,S,0,...,0.188908,1,0,0,0,0,1,0,0,0
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.000000,0,0,108.9000,C,1,...,0.742038,0,1,0,0,0,0,1,0,0
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.500000,0,0,7.2500,S,0,...,0.188908,1,0,0,0,0,0,1,0,0
416,1308,3,"Ware, Mr. Frederick",male,29.699118,0,0,8.0500,S,0,...,0.188908,1,0,0,0,0,1,0,0,0


### Interaction Terms

In [28]:
# Interaction feature: product of SibSp and Parch, added to both frames.
for frame in (train_df, test_df):
    frame['SibSp*Parch'] = frame['SibSp'] * frame['Parch'].astype(int)

In [29]:
train_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,...,Fare_0,Fare_1,Fare_2,Fare_4,Age_0,Age_1,Age_2,Age_3,Age_4,SibSp*Parch
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,...,1,0,0,0,0,1,0,0,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,...,1,0,0,0,0,0,1,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,...,1,0,0,0,0,1,0,0,0,0


### Safety check

In [30]:
# The first 10 columns of train_df are the original ones kept after cleaning
# (PassengerId ... Embarked); every column after them was engineered above.
n_original_columns = 10
new_features = train_df.columns.to_numpy()[n_original_columns:]
new_features

array(['Embarked_C', 'Embarked_Q', 'Embarked_S', 'Pclass_1', 'Pclass_2',
       'Pclass_3', 'sex_target_enc', 'Fare_0', 'Fare_1', 'Fare_2',
       'Fare_4', 'Age_0', 'Age_1', 'Age_2', 'Age_3', 'Age_4',
       'SibSp*Parch'], dtype=object)

In [31]:
# Safety check: count missing values in each engineered training feature.
train_df[new_features].isna().sum()

Embarked_C        0
Embarked_Q        0
Embarked_S        0
Pclass_1          0
Pclass_2          0
Pclass_3          0
sex_target_enc    0
Fare_0            0
Fare_1            0
Fare_2            0
Fare_4            0
Age_0             0
Age_1             0
Age_2             0
Age_3             0
Age_4             0
SibSp*Parch       0
dtype: int64

In [32]:
# Same check for the test set. Selecting new_features also fails with a
# KeyError if a feature column derived from train_df is absent from test_df.
test_df[new_features].isna().sum()

Embarked_C        0
Embarked_Q        0
Embarked_S        0
Pclass_1          0
Pclass_2          0
Pclass_3          0
sex_target_enc    0
Fare_0            0
Fare_1            0
Fare_2            0
Fare_4            0
Age_0             0
Age_1             0
Age_2             0
Age_3             0
Age_4             0
SibSp*Parch       0
dtype: int64

# Model

Logistic Regression

In [33]:
# Feature matrix and target for fitting, plus the matching test feature matrix.
X_train = train_df[new_features]
y_train = train_df["Survived"]
# The engineered features are already numeric, so the test matrix is selected
# exactly like the training matrix; no further dummy encoding is needed.
X_test = test_df[new_features]

In [34]:
# Confirm that the feature matrix and the target have the same number of rows.
X_train.shape, y_train.shape

((891, 17), (891,))

In [35]:
from sklearn.linear_model import LogisticRegression

# C is the inverse regularization strength, so a very large C makes the
# penalty negligible. max_iter must be an integer: scikit-learn's parameter
# validation rejects a float such as 1e6.
model = LogisticRegression(C=1e6, max_iter=1_000_000)
model.fit(X_train, y_train)

LogisticRegression(C=1000000.0, max_iter=1000000.0)

In [36]:
# Calculate a metric (accuracy == % of correct predictions).
# NOTE(review): this scores the model on the same data it was fitted on, so it
# is a training accuracy, not a held-out estimate.
model.score(X_train, y_train)

0.8103254769921436

In [37]:
# Predict integer survival labels for the test set with the logistic regression.
predictions = model.predict(X_test).astype(int)

RandomForest

In [38]:
# Random forest with the hyperparameters from the Kaggle example notebook.
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=1,  # fixed seed keeps the forest reproducible
).fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test).astype(int)

In [39]:
# Training-set accuracy of the random forest (scored on the data it was fitted on).
rf_model.score(X_train, y_train)

0.8294051627384961

# Save output
Kaggle expects a .csv file containing the PassengerId of every test-set passenger and the corresponding prediction.

In [40]:
# Build the two-column submission table. It uses `predictions` (the logistic
# regression output); `rf_predictions` is not written by this cell.
output = pd.DataFrame(
    {
        'PassengerId': test_df['PassengerId'],
        'Survived': predictions,
    }
)
output.to_csv(path_or_buf='kaggle_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


Accuracy for test set with LogisticRegression = 0.76076  
Accuracy for test set with RandomForest = 0.78468