In [None]:
# |export
#|default_exp p00_titanic_from_scratch


In [None]:
# |export
import os
import time
import pathlib
import argparse
import torch
from torch import tensor



In [None]:
# |export
# wall-clock start of the run, debug switch and provenance of the generated code
# (each value is assigned exactly once)
start_time=time.time()
debug=True
_code_git_version="6c2f9a414e8bda5836211266611729864e5ea993"
_code_repository="https://github.com/plops/cl-py-generator/tree/master/example/96_colab_fastai/source/"
_code_generation_time="19:07:01 of Sunday, 2022-08-28 (GMT+1)"


In [None]:
# |export
# command line interface of the exported script
parser=argparse.ArgumentParser()
parser.add_argument("-v", "--verbose", help="enable verbose output", action="store_true")
# parse_known_args tolerates arguments that are not declared here (a Jupyter kernel
# is started with extra arguments that would make parse_args() abort with SystemExit)
args, _unknown_args=parser.parse_known_args()


In [None]:
# |export
# download and unpack the competition data unless the target directory already exists
path=pathlib.Path("titanic")
if not path.exists():
    # only needed for the one-time download, therefore imported here
    import zipfile
    import kaggle
    kaggle.api.competition_download_cli(str(path))
    # the context manager closes the archive once everything is extracted
    with zipfile.ZipFile(f"{path}.zip") as archive:
        archive.extractall(path)


In [None]:
# |export
import torch
import numpy as np
import pandas as pd
# use the same output width for numpy, torch and pandas
line_char_width=140
np.set_printoptions(linewidth=line_char_width)
torch.set_printoptions(linewidth=line_char_width, sci_mode=False, edgeitems=7)
# pandas option keys are dotted: "display.width"
pd.set_option("display.width", line_char_width)


In [None]:
# |export
# read the training data from the data directory and display it
df = pd.read_csv(path / "train.csv")
df


In [None]:
# number of missing values in every column
df.isnull().sum()


In [None]:
# |export
# the first row of df.mode() holds one most-frequent value per column;
# the name `modes` is what the fillna call in the following cell expects
modes=df.mode().iloc[0]


In [None]:
# |export
# replace missing entries of each column by the per-column values in `modes` (modifies df in place)
df.fillna(value=modes, inplace=True)


In [None]:
# count the missing values per column again, now after filling
df.isnull().sum()


In [None]:
# summary statistics restricted to the numeric columns
df.describe(include=[np.number])


In [None]:
# histogram of the Fare column
df["Fare"].hist()


In [None]:
# |export
# new column: logarithm of the fare, shifted by one before taking the log
df["LogFare"] = np.log(df["Fare"] + 1)


In [None]:
# the histogram of the logarithm of the fares no longer shows the long tail
df["LogFare"].hist()


In [None]:
# Look at the three values of the passenger class column.
# More details about the dataset: https://www.kaggle.com/competitions/titanic/data
pclasses = df["Pclass"].unique()
pclasses


In [None]:
# summary of the columns that hold non-numeric (object) values
df.describe(include=(object,))


In [None]:
# |export
# Replace non-numeric values with numbers by introducing new (dummy) columns.
# The dummy columns are added to the dataframe df and the 3 original columns are dropped.
# Cabin, Name and Ticket contain too many unique values for this approach to be useful.
df = pd.get_dummies(
    df,
    columns=["Sex", "Pclass", "Embarked"],
)
df.columns


In [None]:
# look at the new dummy columns
# get_dummies creates one column per category; the Kaggle data description lists
# C, Q and S as ports of embarkation, so Embarked_S belongs to the list as well
added_columns=[
    "Sex_male", "Sex_female",
    "Pclass_1", "Pclass_2", "Pclass_3",
    "Embarked_C", "Embarked_Q", "Embarked_S",
]
df[added_columns].head()


In [None]:
# |export
# the dependent variable (Survived column) as a tensor
t_dep = tensor(df["Survived"])


In [None]:
# |export
# independent variables are all continuous variables of interest and the newly created columns
# (the column with the number of siblings/spouses is spelled "SibSp" in the dataset)
indep_columns=["Age", "SibSp", "Parch", "LogFare"] + added_columns
# cast to float first: pandas >= 2.0 creates boolean dummy columns, and mixing them with
# float columns yields an object array that torch cannot convert
t_indep=tensor(df[indep_columns].astype(float).values, dtype=torch.float)
t_indep


In [None]:
# dimensions of the tensor of independent variables
t_indep.size()


In [None]:
# Set up the linear model. First we manually calculate a single step for the loss of every row in the dataset.
# We start with a random coefficient in (-.5,.5) for each column of t_indep.
torch.manual_seed(442)
n_coef=t_indep.shape[1]
# torch.rand draws from [0, 1); shifting by 0.5 centres the values around zero
coefs=torch.rand(n_coef) - 0.50
coefs


In [None]:
# A prediction is formed by multiplying a row with the coefficients and summing up the products.
# No bias (or intercept) term, i.e. a column containing only ones, has to be introduced:
# such a 'one' is already present in each row in either the dummy column Sex_male or Sex_female.
torch.mul(t_indep, coefs)


In [None]:
# We have a potential problem with the first column Age: its values are on average bigger than the values in the other columns.
# In the lecture Jeremy mentions two options to normalize Age; I can think of two more. The methods are:
#   1) divide by the maximum (used by Jeremy in the book)
#   2) subtract the mean and divide by the standard deviation
#   3) subtract the median and divide by the MAD
#   4) find the lower 2nd and the upper 2nd percentile, widen this interval by +/- 10% and use it to normalize the input values
# 1) and 3) differ in how they handle outliers: the maximum is influenced a lot by outliers.
# I would like to know if 3) is better than 1) for typical problems. I think that boils down to how big the training dataset is:
# once it is big enough there may always be enough outliers to ensure that even the maximum is stable.
if ( True ):
    # method 1): max(dim=0) returns a (values, indices) pair
    vals, indices=t_indep.max(dim=0)
    t_indep=t_indep / vals
if ( False ):
    # method 2): mean(dim=0) and std(dim=0) each return a single tensor, not a (values, indices) pair
    means=t_indep.mean(dim=0)
    stds=t_indep.std(dim=0)
    t_indep=(t_indep - means) / stds


In [None]:
# one prediction per row: sum the coefficient-weighted values along each row
preds = torch.mul(t_indep, coefs).sum(dim=1)


In [None]:
# look at the first few predictions (the slice [:10] selects the first ten entries)
preds[:10]
# as the coefficients were random these predictions are no good


In [None]:
# in order to improve the predictions, modify the coefficients with gradient descent
# define the loss as the average absolute error between the predictions and the dependent variable t_dep
loss=torch.abs(preds - t_dep).mean()
loss


In [None]:
# |export
# using what we learned in the previous cells create functions to compute predictions and loss
def calc_preds(coeffs=None, indeps=None):
    """Return one prediction per row of `indeps`.

    Each row is multiplied element-wise with `coeffs` and the products are
    summed along axis 1. The coefficients are taken from the `coeffs`
    argument, so the result follows whatever tensor the caller passes in.
    """
    return (indeps * coeffs).sum(axis=1)
def calc_loss(coeffs=None, indeps=None, deps=None):
    """Return the mean absolute difference between the predictions and `deps`.

    The predictions are computed with calc_preds(coeffs, indeps).
    """
    preds=calc_preds(coeffs=coeffs, indeps=indeps)
    loss=torch.abs(preds - deps).mean()
    return loss


In [None]:
# perform a single 'epoch' of gradient descent manually
# tell pytorch that we want to calculate the gradients for the coefficients. the underscore indicates that the tensor is modified in place
# the tensor was created under the name `coefs`; requires_grad_() returns the tensor it was called on, so `coeffs` (the name used from here on) refers to the same tensor
coeffs=coefs.requires_grad_()
coeffs


In [None]:
# compute the loss, pytorch will perform book keeping to compute gradients later
# the loss is recomputed here because the earlier value was calculated before gradient tracking was enabled, so backward() could not be called on it
loss=calc_loss(coeffs=coeffs, indeps=t_indep, deps=t_dep)
loss


In [None]:
# backpropagate the loss and show the resulting gradient of the coefficients
loss.backward()
coeffs.grad
# note: each backward() call adds its gradients to what is already stored in .grad


In [None]:
# repeating the loss computation and backward() doubles the values in .grad
loss = calc_loss(coeffs=coeffs, indeps=t_indep, deps=t_dep)
loss.backward()
coeffs.grad


In [None]:
# now take a single gradient step; the loss should go down
loss = calc_loss(coeffs=coeffs, indeps=t_indep, deps=t_dep)
loss.backward()
with torch.no_grad():
    # step against the gradient, scaled by 0.10
    coeffs.sub_(coeffs.grad * 0.10)
    # clear the stored gradients
    coeffs.grad.zero_()
    print(calc_loss(coeffs=coeffs, indeps=t_indep, deps=t_dep))
# a.sub_(b) subtracts in place (a = a - b), here the scaled gradient from coeffs; zero_ resets the gradients


In [None]:
# |export
# training requires a validation dataset, which has to be created first
# it is created in the same way as the fastai library does it
import fastai.data.transforms
# the splitter returns the training (trn) and the validation (val) indices
trn, val = fastai.data.transforms.RandomSplitter(seed=42)(df)


In [None]:
# |export
# select the training and the validation rows of the independent and the dependent variables
trn_indep, val_indep = t_indep[trn], t_indep[val]
trn_dep, val_dep = t_dep[trn], t_dep[val]
# number of rows in each of the two sets
len(trn_indep), len(val_indep)
