In [51]:
import numpy as np
import pandas as pd
import os

In [52]:
# Short dataset identifier; reused below to build the output directory and file name.
dataset_name = "titanic"

In [53]:
# Where the raw file is read from.
input_dir = "./raw"
inp_fname = "titanic_full_data.csv"

# Where the processed file is written: a per-dataset folder under ../../processed,
# holding a single CSV named after the dataset.
output_dir = f"./../../processed/{dataset_name}/"
outp_fname = os.path.join(output_dir, f"{dataset_name}.csv")

# Read Data

In [54]:
# Column labels, in file order, applied to the raw CSV when it is read with header=None.
col_names = [
    "pclass", "survived", "name", "sex", "age",
    "sibsp", "parch", "ticket", "fare", "cabin",
    "embarked", "boat", "body", "home.dest",
]

In [55]:
# Load the raw file. It is read as header-less (header=None), so the labels
# come from col_names rather than from the first row of the file.
raw_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_path, names=col_names, header=None)
print(data.shape)
data.head()

(1309, 14)


Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,?,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,?,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,?,135,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"


In [56]:
# Inspect column dtypes and non-null counts of the freshly loaded frame
# (missing values are still encoded as '?' at this point, so nothing shows as null).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   pclass     1309 non-null   int64 
 1   survived   1309 non-null   int64 
 2   name       1309 non-null   object
 3   sex        1309 non-null   object
 4   age        1309 non-null   object
 5   sibsp      1309 non-null   int64 
 6   parch      1309 non-null   int64 
 7   ticket     1309 non-null   object
 8   fare       1309 non-null   object
 9   cabin      1309 non-null   object
 10  embarked   1309 non-null   object
 11  boat       1309 non-null   object
 12  body       1309 non-null   object
 13  home.dest  1309 non-null   object
dtypes: int64(4), object(10)
memory usage: 143.3+ KB


In [57]:
# The 'name' field cannot serve as a unique key: two pairs of passengers share
# a name (Kelly, Mr. James and Connolly, Miss. Kate). A dedicated id column is
# therefore created instead.
id_col, target_col = "id", "survived"

# Replace '?' with null

In [58]:
# The raw file marks missing values with '?'; turn those markers into NaN.
# Note: this does not change column dtypes (see the .info() output that follows).
data = data.replace(to_replace='?', value=np.nan)

In [59]:
# Re-check non-null counts now that '?' markers have been replaced with NaN.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   pclass     1309 non-null   int64 
 1   survived   1309 non-null   int64 
 2   name       1309 non-null   object
 3   sex        1309 non-null   object
 4   age        1046 non-null   object
 5   sibsp      1309 non-null   int64 
 6   parch      1309 non-null   int64 
 7   ticket     1309 non-null   object
 8   fare       1308 non-null   object
 9   cabin      295 non-null    object
 10  embarked   1307 non-null   object
 11  boat       486 non-null    object
 12  body       121 non-null    object
 13  home.dest  745 non-null    object
dtypes: int64(4), object(10)
memory usage: 143.3+ KB


# Insert Id Column

In [60]:
# Add a sequential id (0..N-1) as the first column, but only when the frame
# does not already have one, so the cell can be executed more than once.
if id_col not in data.columns:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())

# Ids are stored as strings rather than integers.
data[id_col] = data[id_col].astype(str)

   id  pclass  survived                                             name  \
0   0       1         1                    Allen, Miss. Elisabeth Walton   
1   1       1         1                   Allison, Master. Hudson Trevor   
2   2       1         0                     Allison, Miss. Helen Loraine   
3   3       1         0             Allison, Mr. Hudson Joshua Creighton   
4   4       1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)   

      sex     age  sibsp  parch  ticket      fare    cabin embarked boat body  \
0  female      29      0      0   24160  211.3375       B5        S    2  NaN   
1    male  0.9167      1      2  113781    151.55  C22 C26        S   11  NaN   
2  female       2      1      2  113781    151.55  C22 C26        S  NaN  NaN   
3    male      30      1      2  113781    151.55  C22 C26        S  NaN  135   
4  female      25      1      2  113781    151.55  C22 C26        S  NaN  NaN   

                         home.dest  
0                  

# Drop Unneeded Columns

In [61]:
# Remove the columns that are not kept in the processed dataset.
# The result is assigned to `data` instead of using inplace=True, and
# errors="ignore" lets this cell be re-run after the columns are already gone
# (the id-insert cell is guarded against re-runs in the same spirit).
data = data.drop(
    columns=["name", "boat", "body", "home.dest", "ticket", "cabin"],
    errors="ignore",
)

In [62]:
# Confirm the frame's dimensions after dropping columns, then preview the first rows.
print(data.shape)
data.head()

(1309, 9)


Unnamed: 0,id,pclass,survived,sex,age,sibsp,parch,fare,embarked
0,0,1,1,female,29.0,0,0,211.3375,S
1,1,1,1,male,0.9167,1,2,151.55,S
2,2,1,0,female,2.0,1,2,151.55,S
3,3,1,0,male,30.0,1,2,151.55,S
4,4,1,0,female,25.0,1,2,151.55,S


# Save Main Data File

In [64]:
# Make sure the destination folder exists before writing: to_csv does not
# create missing directories and raises an OSError if output_dir is absent
# (e.g. on the first run for this dataset).
os.makedirs(output_dir, exist_ok=True)

# Write the processed frame without the pandas index; the explicit id column
# already identifies each row.
data.to_csv(outp_fname, index=False)