In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
# Read the Kaggle test and training splits from the working directory.
test_data = pd.read_csv("test.csv")
train_data = pd.read_csv("train.csv")

# Row counts of each split.
train_data_size, test_data_size = train_data.shape[0], test_data.shape[0]
print(f'Number of samples: Training = {train_data_size} Test = {test_data_size}')
print('Columns:', train_data.columns.tolist())

# Column glossary:
#   sibsp: number of siblings / spouses aboard the Titanic
#   parch: number of parents / children aboard the Titanic
train_data.head()

In [None]:
# `data` is a second name for the same DataFrame object as `train_data`
# (plain assignment, no copy is made).
data = train_data
# Print the frame summary produced by DataFrame.info().
data.info()

In [None]:
# Missing values per column
# Count NaNs directly with isna().sum(). The previous
# `notna().value_counts().loc[True]` lookup raised KeyError for any column
# that has no non-missing value at all.
missing_per_column = train_data.isna().sum()
for col, num_na in missing_per_column[missing_per_column > 0].items():
    print(f'{num_na} missing in {col}')

# Too many missing in Cabin. Some people have multiple cabins (family or friends?)
# TODO: Age: Estimate using name prefix / Done
# TODO: Embark: Two values missing 
# TODO: Name: Extract last name and title
# TODO: Is Father? Is Mother?
# TODO: Clean up title further

In [None]:
# Feature engineering on the training frame.
# NOTE: `data` is an alias of `train_data` (no copy), so every column added
# below also appears on `train_data`.
data = train_data

# Total relatives aboard: siblings/spouses plus parents/children.
data['FamilyTotal'] = data['SibSp'] + data['Parch']

# The parsing below assumes names of the form "<last name>, <title>. <rest>".
# astype(str) already returns a new Series, so no explicit copy is requested.
names = data['Name'].astype(str)

# Last name: everything before the first comma, lower-cased and trimmed.
data['LastName'] = names.str.split(',').str[0].str.lower().str.strip()
print(f"LastName: {data['LastName'].nunique()} unique out of {len(data)}")

# Title: the text between the first comma and the following period,
# lower-cased. French forms are folded into their English equivalents.
title_aliases = {'mlle': 'miss', 'mme': 'mrs'}
data['Title'] = (
    names.str.split(',').str[1]
    .str.strip()
    .str.split('.').str[0]
    .str.lower()
    .replace(title_aliases)
)
print(f"Title: {data['Title'].nunique()} unique out of {len(data)}")
data['Title'].value_counts()

In [None]:
# Build the title -> mean age lookup table from rows where both Age and
# Title are present. The two prints show the row count before and after
# dropping incomplete rows.
age_and_title = data.loc[:, ['Age', 'Title']]
print(age_and_title.shape[0])
d = age_and_title.dropna(how='any')
print(d.shape[0])
mean_age_by_title = d.groupby('Title')[['Age']].mean()
mean_age_by_title.head(5)

In [None]:
def average_age(age, title, age_by_title=None):
    """Return ``age``, or the mean age recorded for ``title`` when ``age`` is missing.

    Parameters
    ----------
    age : float
        The passenger's age; may be missing (NaN / None).
    title : str
        The passenger's title, used as the lookup key.
    age_by_title : pandas.DataFrame, optional
        Table indexed by title with an ``'Age'`` column. When omitted, the
        notebook-level ``mean_age_by_title`` table is used.

    Returns
    -------
    float
        ``age`` unchanged when it is present; otherwise the table's ``'Age'``
        value for ``title``. If ``title`` is not in the table, the missing
        ``age`` is returned as-is rather than raising ``KeyError``.
    """
    if not pd.isna(age):
        return age

    # Resolve the lookup table only when an estimate is actually needed, so a
    # known age never touches the notebook global.
    table = mean_age_by_title if age_by_title is None else age_by_title
    if title not in table.index:
        # No known age for this title: leave the value missing for the caller
        # to handle instead of failing the whole apply().
        return age
    return table.at[title, 'Age']

def _fill_row_age(row):
    """Return the row's Age, or the value average_age() supplies for its Title."""
    return average_age(age=row['Age'], title=row['Title'])


# Fill row by row, then round every age up to the next whole number.
imputed_age = data.apply(_fill_row_age, axis=1)
data['Age_filled'] = imputed_age.map(np.ceil)


In [None]:
# vc = train_data.loc[train_data.Cabin.notna(), 'Survived'].value_counts()
# test_data['Embarked'].notna().value_counts()
# data['Noble'] = 0
# idx = data['Title'] == 'jonkheer'
# data['Noble'].loc[idx] = True
# data['Title'].loc[idx] = 'mr'
# idx = data['Title'] == 'jonkvrouw'
# data['Noble'].loc[idx] = True
# check missing values - age has missing

# check null/nan values
# is it possible to group families?
# what does master, miss, mr, mrs mean?
# do we need to worry whether data is consistent or reliable? For example, does Fare correlate with Pclass?

# sns.pairplot(data=data[['FamilyTotal','Age_filled','Survived']], hue="Survived")

# sns.set_theme(style="ticks")
# f, ax = plt.subplots(figsize=(7, 5))
# sns.despine(f)
# sns.histplot(
#     train_data['Fare'])
# ax.xaxis.set_major_formatter(mpl.ticker.ScalarFormatter())
# ax.set_xticks([500, 1000, 2000, 5000, 10000])

In [None]:
# Baseline classifier
# Random forest on four raw columns, evaluated with 10-fold stratified
# cross-validation. Prints per-fold accuracy, then the mean and standard
# deviation across folds.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

y = train_data["Survived"]
features = ["Pclass", "Sex", "SibSp", "Parch"]
# get_dummies expands string-typed columns into indicator columns and leaves
# numeric columns unchanged.
X = pd.get_dummies(train_data[features])

skf = StratifiedKFold(n_splits=10)
acc_train = []
acc_test  = []
for train_index, test_index in skf.split(X, y):
    # split() yields positional indices, so select with .iloc on both X and y.
    # Plain y[train_index] is a label lookup and only works on a default index.
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=1)
    model.fit(X_train, y_train)
    # Score each split once and reuse the value for printing and aggregation.
    fold_train_acc = model.score(X_train, y_train)
    fold_test_acc = model.score(X_test, y_test)
    print(f' Train: {fold_train_acc:.2f} Test: {fold_test_acc:.2f}')
    acc_train.append(fold_train_acc)
    acc_test.append(fold_test_acc)
print(f'Avg train accuracy: {np.mean(acc_train):.2f} {np.std(acc_train):.3f}')
# The spread reported here is that of the test folds (previously it repeated
# the training-fold std).
print(f'Avg test  accuracy: {np.mean(acc_test):.2f} {np.std(acc_test):.3f}')
# model.fit(X, y)
# print(f'Baseline Accuracy: {model.score(X, y):.2f}')

In [None]:
# Current classifier
# Same random forest and 10-fold stratified cross-validation as the baseline,
# with the engineered Title / FamilyTotal / Age_filled columns and Embarked
# added.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

y = data["Survived"]
features = ["Pclass", "Sex", "SibSp", "Parch", "Title", "FamilyTotal", "Age_filled", "Embarked"]

# The forest cannot be fitted on raw string columns, so encode them first.
# get_dummies turns each string column into indicator columns; a row whose
# value is missing (e.g. in Embarked) gets 0 in every indicator for that
# column, so no NaN is introduced by the encoding.
# NOTE(review): assumes Age_filled has no remaining NaN - confirm after imputation.
X = pd.get_dummies(data[features])

skf = StratifiedKFold(n_splits=10)
acc_train = []
acc_test  = []
for train_index, test_index in skf.split(X, y):
    # split() yields positional indices, so select with .iloc on both X and y.
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=1)
    model.fit(X_train, y_train)
    # Score each split once and reuse the value for printing and aggregation.
    fold_train_acc = model.score(X_train, y_train)
    fold_test_acc = model.score(X_test, y_test)
    print(f' Train: {fold_train_acc:.2f} Test: {fold_test_acc:.2f}')
    acc_train.append(fold_train_acc)
    acc_test.append(fold_test_acc)
print(f'Avg train accuracy: {np.mean(acc_train):.2f} {np.std(acc_train):.3f}')
# The spread reported here is that of the test folds (previously it repeated
# the training-fold std).
print(f'Avg test  accuracy: {np.mean(acc_test):.2f} {np.std(acc_test):.3f}')


In [86]:
# Preview the indicator columns pd.get_dummies generates for Embarked
# (first rows only).
pd.get_dummies(data[['Embarked']]).head()

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [None]:
# Build the submission file.
# `predictions` was never assigned anywhere in the notebook, so this cell
# raised NameError on a fresh kernel. Fit a final model on the full training
# set and predict the test set here.
# The baseline columns are used because they are the only features that exist
# unmodified in `test_data`; the engineered columns are only built for the
# training frame.
# RandomForestClassifier is imported by the classifier cells above.
submission_features = ["Pclass", "Sex", "SibSp", "Parch"]
X_full = pd.get_dummies(train_data[submission_features])
# Align the test columns to the training columns so both frames have the same
# indicator columns in the same order.
X_submit = pd.get_dummies(test_data[submission_features]).reindex(
    columns=X_full.columns, fill_value=0
)

final_model = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=1)
final_model.fit(X_full, train_data["Survived"])
predictions = final_model.predict(X_submit)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")
