In [1]:
import pandas as pd
from network import *
from layer import *
import math

In [2]:
# Load the training data (it includes the Survived label) and preview the first rows.
train = pd.read_csv("kaggle_inputs/train.csv")
display(train.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Load the test data (same columns as train, minus Survived) and preview the first rows.
test = pd.read_csv("kaggle_inputs/test.csv")
display(test.head())

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
# Split train into features (train_x) and labels (train_y).
# train_y keeps PassengerId next to Survived so each label stays tied to its passenger.
train_y = train[["PassengerId","Survived"]]
train_x = train.drop("Survived", axis=1)

In [5]:
# Stack the train features on top of the test features (fresh 0..n-1 index) so
# the preprocessing below is applied to both sets at once and they end up with
# the same columns.
df = pd.concat([train_x, test], ignore_index=True)
df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1305,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1306,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [6]:
# Check dtypes and non-null counts to see which columns need imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Pclass       1309 non-null   int64  
 2   Name         1309 non-null   object 
 3   Sex          1309 non-null   object 
 4   Age          1046 non-null   float64
 5   SibSp        1309 non-null   int64  
 6   Parch        1309 non-null   int64  
 7   Ticket       1309 non-null   object 
 8   Fare         1308 non-null   float64
 9   Cabin        295 non-null    object 
 10  Embarked     1307 non-null   object 
dtypes: float64(2), int64(4), object(5)
memory usage: 112.6+ KB


## To do:
- Remove Name
- Fill NaN ages with the mean
- Scale Age to between 0 and 1 (min-max)
- Turn Sex into a binary column
- Embarked: fill NaN with the mode, then get_dummies
- Fare: fill NaN with the mean and scale to between 0 and 1
- Fill NaN Cabin with "not_specified"
- Get dummies for Cabin
- Get dummies for Ticket
- Get dummies for Pclass

In [7]:
# The free-text Name column is not used as a feature, so it is dropped here.
# PassengerId is NOT dropped: later cells still need it to split the combined
# frame back into train and test rows.
def remove_name(df):
    """Return a new DataFrame equal to ``df`` without the ``Name`` column.

    The input frame is left untouched. Raises ``KeyError`` when ``df`` has no
    ``Name`` column.
    """
    return df.drop(columns=["Name"])

In [8]:
def fill_and_squeeze_ages(df):
    """Impute missing ages with the mean age, then min-max scale ``Age`` to [0, 1].

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``Age`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose ``Age`` column has its missing values replaced by
        the column mean and is rescaled so the smallest age maps to 0 and the
        largest to 1. If every age is identical the column becomes all zeros.
    """
    df = df.copy()
    age = df["Age"].fillna(df["Age"].mean())
    age_min = age.min()
    age_range = age.max() - age_min
    if age_range == 0:
        # Every age is the same: scaling would divide by zero, so map them all to 0.
        df["Age"] = (age - age_min).astype(float)
    else:
        # Vectorised min-max scaling (same arithmetic as a per-row lambda).
        df["Age"] = (age - age_min) / age_range
    return df

In [9]:
# Inspect the distinct Sex values before encoding them.
df["Sex"].unique()

array(['male', 'female'], dtype=object)

In [10]:
def binary_sex(df):
    """Encode ``Sex`` as an integer flag: 1 for ``"male"``, 0 for anything else.

    Works on a copy, so the caller's frame keeps its original ``Sex`` values.
    """
    encoded = df.copy()
    is_male = encoded["Sex"].eq("male")
    encoded["Sex"] = is_male.astype("int64")
    return encoded

In [11]:
# Inspect the distinct Embarked values (including any missing ones) before encoding.
df["Embarked"].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [12]:
def embarked_fill_and_dummies(df):
    """Fill missing ``Embarked`` values with the mode and one-hot encode the column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``Embarked`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` where ``Embarked`` is replaced by ``Embarked_<value>``
        indicator columns appended at the end.
    """
    df = df.copy()
    modes = df["Embarked"].mode()
    if not modes.empty:
        # mode() returns a Series (there may be ties). Passing that Series to
        # fillna aligns it on the index, so only row 0 could ever be filled and
        # the real gaps would stay NaN. Fill with the scalar first mode instead.
        df["Embarked"] = df["Embarked"].fillna(modes.iloc[0])
    dummies = pd.get_dummies(df["Embarked"], prefix="Embarked")
    df = df.drop("Embarked", axis=1)
    df = pd.concat([df, dummies], axis=1)
    return df

In [13]:
def fill_squeeze_fare(df):
    """Impute missing fares with the mean fare, then min-max scale ``Fare`` to [0, 1].

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``Fare`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose ``Fare`` column has its missing values replaced by
        the column mean and is rescaled so the lowest fare maps to 0 and the
        highest to 1. If every fare is identical the column becomes all zeros.
    """
    df = df.copy()
    fare = df["Fare"].fillna(df["Fare"].mean())
    fare_min = fare.min()
    fare_range = fare.max() - fare_min
    if fare_range == 0:
        # Every fare is the same: scaling would divide by zero, so map them all to 0.
        df["Fare"] = (fare - fare_min).astype(float)
    else:
        # Vectorised min-max scaling (same arithmetic as a per-row lambda).
        df["Fare"] = (fare - fare_min) / fare_range
    return df

In [14]:
def cabin_fill_and_dummies(df):
    """One-hot encode ``Cabin``, treating missing cabins as ``"not_specified"``.

    Returns a new frame in which ``Cabin`` is replaced by ``Cabin_<value>``
    indicator columns appended after the remaining columns. The input frame is
    not modified.
    """
    cabin_filled = df["Cabin"].fillna("not_specified")
    cabin_indicators = pd.get_dummies(cabin_filled, prefix="Cabin")
    other_columns = df.drop(columns=["Cabin"])
    return pd.concat([other_columns, cabin_indicators], axis=1)

In [15]:
def ticket_dummies(df):
    """One-hot encode the ``Ticket`` column.

    Returns a new frame in which ``Ticket`` is replaced by ``Ticket_<value>``
    indicator columns appended after the remaining columns. The input frame is
    not modified.
    """
    remaining = df.copy()
    # pop() removes Ticket from the working copy and hands back its values.
    ticket_values = remaining.pop("Ticket")
    ticket_indicators = pd.get_dummies(ticket_values, prefix="Ticket")
    return pd.concat([remaining, ticket_indicators], axis=1)

In [16]:
def Pclass_dummies(df):
    """One-hot encode the ``Pclass`` column.

    Returns a new frame in which ``Pclass`` is replaced by ``Pclass_<value>``
    indicator columns appended after the remaining columns. The input frame is
    not modified.
    """
    # Naming the column explicitly encodes only Pclass (even though it is
    # numeric) and leaves every other column as it is.
    return pd.get_dummies(df, columns=["Pclass"], prefix="Pclass")

In [17]:
def prep_df(df):
    """Run every preprocessing step on ``df`` and return the resulting frame.

    The steps are applied in this order: drop Name, impute/scale Age, encode
    Sex, fill/encode Embarked, impute/scale Fare, fill/encode Cabin, encode
    Ticket, encode Pclass. The input frame is not modified.
    """
    pipeline = (
        remove_name,
        fill_and_squeeze_ages,
        binary_sex,
        embarked_fill_and_dummies,
        fill_squeeze_fare,
        cabin_fill_and_dummies,
        ticket_dummies,
        Pclass_dummies,
    )
    prepared = df.copy()
    for step in pipeline:
        prepared = step(prepared)
    return prepared

In [18]:
# Run the full preprocessing pipeline on the combined train+test frame.
# NOTE: this rebinds `df`, so the cell is not idempotent. Re-running it on the
# already-processed frame raises KeyError because the Name column is gone;
# re-run the concat cell first.
df = prep_df(df)
df

Unnamed: 0,PassengerId,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Cabin_A10,...,Ticket_W./C. 14266,Ticket_W./C. 6607,Ticket_W./C. 6608,Ticket_W./C. 6609,Ticket_W.E.P. 5734,Ticket_W/C 14208,Ticket_WE/P 5735,Pclass_1,Pclass_2,Pclass_3
0,1,1,0.273456,1,0,0.014151,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,2,0,0.473882,1,0,0.139136,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,0,0.323563,0,0,0.015469,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,4,0,0.436302,1,0,0.103644,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,5,1,0.436302,0,0,0.015713,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,1,0.372180,0,0,0.015713,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1305,1306,0,0.486409,0,0,0.212559,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1306,1307,1,0.480145,0,0,0.014151,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1307,1308,1,0.372180,0,0,0.015713,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [19]:
# Check the column count and dtypes of the frame after preprocessing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Columns: 1128 entries, PassengerId to Pclass_3
dtypes: float64(2), int64(4), uint8(1122)
memory usage: 1.5 MB


In [20]:
# Split the preprocessed frame back into its train and test rows. Membership is
# decided by the PassengerIds present in train_y rather than by a hard-coded
# cutoff (891), so the split stays correct if the input files change.
is_train_row = df["PassengerId"].isin(train_y["PassengerId"])
train_x = df.loc[is_train_row].reset_index(drop=True)
test = df.loc[~is_train_row].reset_index(drop=True)

In [21]:
# Persist the prepared features and the labels as CSV files (row index omitted).
for csv_path, frame in (
    ("train_x.csv", train_x),
    ("train_y.csv", train_y),
    ("test_x.csv", test),
):
    frame.to_csv(csv_path, index=False)

Unnamed: 0,PassengerId,Survived
0,1,0
1,2,1
2,3,1
3,4,1
4,5,0
...,...,...
886,887,0
887,888,1
888,889,0
889,890,1
