## Imports and fetching data

In [None]:
import pandas as pd
import numpy as np
import re

In [10]:
# Load the passenger table; the path is relative to the current working directory.
df = pd.read_csv("titanic.csv")

## Exploring data

In [11]:
df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1305,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1306,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [13]:
df["Age"].isnull().sum()


263

## Fixing age as Int

In [14]:
# Count the missing (NaN) entries in the Age column and report the total
ageNa = df["Age"].isna().sum()
print(f"Ammount of Nan Values in Age Series: {ageNa}")

Ammount of Nan Values in Age Series: 263


In [22]:
# Replace missing ages with the sentinel -1, then cast the whole column to integer
df["Age"] = df["Age"].fillna(-1).astype("int")

In [18]:
# Verify the fill: recount the NaN entries left in the Age column
ageNa = df["Age"].isna().sum()
print(f"Ammount of Nan Values in Age Series after fillna: {ageNa}")

Ammount of Nan Values in Age Series after fillna: 0


In [23]:
# Confirming that the typecast worked
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Pclass       1309 non-null   int64  
 2   Name         1309 non-null   object 
 3   Sex          1309 non-null   object 
 4   Age          1309 non-null   int64  
 5   SibSp        1309 non-null   int64  
 6   Parch        1309 non-null   int64  
 7   Ticket       1309 non-null   object 
 8   Fare         1308 non-null   float64
 9   Cabin        295 non-null    object 
 10  Embarked     1307 non-null   object 
dtypes: float64(1), int64(5), object(5)
memory usage: 112.6+ KB


# Feature engineering text data

In [26]:
# Casting to lowercase to make text easier to work with.
# The vectorised .str accessor replaces the per-row lambda and propagates
# missing values as NaN instead of raising a TypeError on them.
df["Name"] = df["Name"].str.lower()

In [28]:
def title_extractor(name: str) -> str:
    """Extract the honorific (e.g. "mr", "miss") from a lowercased name.

    The title is the leftmost run of lowercase letters that is preceded by a
    space and followed by a period, as in "braund, mr. owen harris".

    Args:
        name: Lowercased full name.

    Returns:
        The title without the surrounding space and period, or "Undefined"
        when the name contains no such pattern.
    """
    # Raw string: "\." in a plain literal is an invalid escape sequence.
    # The capture group keeps only the letters, so the result carries no
    # leading space and needs no second substitution pass.
    match = re.search(r" ([a-z]*)\.", name)
    title = match.group(1) if match else ""
    return title or "Undefined"

In [29]:
df["Name"]

0                                 braund, mr. owen harris
1       cumings, mrs. john bradley (florence briggs th...
2                                  heikkinen, miss. laina
3            futrelle, mrs. jacques heath (lily may peel)
4                                allen, mr. william henry
                              ...                        
1304                                   spector, mr. woolf
1305                         oliva y ocana, dona. fermina
1306                         saether, mr. simon sivertsen
1307                                  ware, mr. frederick
1308                             peter, master. michael j
Name: Name, Length: 1309, dtype: object

In [32]:
# Derive the Title column by running the extractor over every name
df["Title"] = df["Name"].apply(title_extractor)

In [33]:
df["Title"].value_counts()

 mr          757
 miss        260
 mrs         197
 master       61
 rev           8
 dr            8
 col           4
 mlle          2
 major         2
 ms            2
 lady          1
 sir           1
 mme           1
 don           1
 capt          1
 countess      1
 jonkheer      1
 dona          1
Name: Title, dtype: int64

## First Name extraction

In [None]:
def first_name_extractor(name: str) -> str:
    """Return the part of a name that precedes its first comma.

    For "braund, mr. owen harris" this yields "braund". Everything up to the
    comma is kept, so entries containing spaces or punctuation, such as
    "oliva y ocana, dona. fermina", are returned whole.

    Args:
        name: Lowercased full name.

    Returns:
        The text before the first comma with surrounding whitespace removed,
        or "Undefined" when there is no comma or nothing precedes it.
    """
    match = re.search(r"^([^,]*),", name)
    # Explicit no-match handling instead of a blanket try/except, so that
    # unrelated errors are not silently converted into "Undefined".
    leading = match.group(1).strip() if match else ""
    return leading or "Undefined"

In [None]:
def last_name_extractor(name: str) -> str:
    """Return the first word that follows the first ". " in a name.

    For "braund, mr. owen harris" this yields "owen".

    Args:
        name: Lowercased full name.

    Returns:
        The run of lowercase letters directly after the first ". ", or
        "Undefined" when there is no ". " or it is not followed by a letter
        (for example when an opening parenthesis comes next).
    """
    match = re.search(r"\. ([a-z]*)", name)
    # Explicit no-match handling instead of a blanket try/except; an empty
    # capture is mapped to the sentinel rather than returned as "".
    word = match.group(1) if match else ""
    return word or "Undefined"

In [None]:
# Build the "First Name" column by applying first_name_extractor to every name
df["First Name"] = df["Name"].apply(first_name_extractor)

In [None]:
# Build the "Last Name" column by applying last_name_extractor to every name
df["Last Name"] = df["Name"].apply(last_name_extractor)

# Making ordinal categories for age (useful for classification later on)

In [None]:
def age_to_ordinal(age: int) -> int:
    if age < 26:

In [None]:
df["Age Category"] = 

In [None]:
# Just looking at the age categories to see the effect
df["Age Category"].value_counts()

In [None]:
# Fancy bonus plot if you want to
df["Age Category"]

In [None]:
df

## Dealing with rest of NaN
Since roughly 80% of `Cabin` is NaN, it doesn't make sense to drop the rows with missing values, so we simply fill the remaining NaN values in the dataframe with "Undefined". However, do not do this for NaN in numerical columns (Series, in pandas terms), as it would change the column's datatype; use a sentinel such as -1 instead, as shown above.

In [None]:
# Filling Fare NaN with the float sentinel -1.0 so the column stays float64
df["Fare"] = df["Fare"].fillna(-1.0)

# Filling NaN in the remaining string columns with "Undefined".
# This is applied to the entire df, so it must run after the numeric
# columns have been filled; otherwise they would receive strings too.
df = df.fillna("Undefined")

In [None]:
df.isna().sum()

In [None]:
df