In [1]:
import pandas as pd
import numpy as np

In [2]:
titanic_df = pd.read_csv("titanic.csv")

In [3]:
titanic_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
titanic_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
# Fill in missing "Age" values
titanic_df["Age"].fillna(titanic_df["Age"].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_df["Age"].fillna(titanic_df["Age"].mean(), inplace=True)


In [6]:
# Delete rows with misisng "Embarked" values
criteria = titanic_df["Embarked"].isnull()
titanic_df = titanic_df[~criteria]

In [7]:
# Dropping irrelevant columns
titanic_df = titanic_df.drop(["Cabin", "Ticket", "PassengerId", "SibSp", "Parch"], axis=1)

In [8]:
titanic_df["Sex"].isnull().sum()

0

In [9]:
from sklearn.preprocessing import OrdinalEncoder

# Encoding "Age" values
encoder = OrdinalEncoder(categories=[['female', 'male']])

titanic_df[["Sex"]] = encoder.fit_transform(titanic_df[["Sex"]])

In [10]:
titanic_df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare,Embarked
0,0,3,"Braund, Mr. Owen Harris",1.0,22.000000,7.2500,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0.0,38.000000,71.2833,C
2,1,3,"Heikkinen, Miss. Laina",0.0,26.000000,7.9250,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0.0,35.000000,53.1000,S
4,0,3,"Allen, Mr. William Henry",1.0,35.000000,8.0500,S
...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",1.0,27.000000,13.0000,S
887,1,1,"Graham, Miss. Margaret Edith",0.0,19.000000,30.0000,S
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",0.0,29.699118,23.4500,S
889,1,1,"Behr, Mr. Karl Howell",1.0,26.000000,30.0000,C


In [11]:
titanic_df["Embarked"].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [12]:
from sklearn.preprocessing import OneHotEncoder

# Encoding "Embarked" values
encoder = OneHotEncoder()

embarked = encoder.fit_transform(titanic_df[["Embarked"]]).toarray()

titanic_df[["C", "S", "Q"]] = embarked


In [13]:
titanic_df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare,Embarked,C,S,Q
0,0,3,"Braund, Mr. Owen Harris",1.0,22.000000,7.2500,S,0.0,0.0,1.0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0.0,38.000000,71.2833,C,1.0,0.0,0.0
2,1,3,"Heikkinen, Miss. Laina",0.0,26.000000,7.9250,S,0.0,0.0,1.0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0.0,35.000000,53.1000,S,0.0,0.0,1.0
4,0,3,"Allen, Mr. William Henry",1.0,35.000000,8.0500,S,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",1.0,27.000000,13.0000,S,0.0,0.0,1.0
887,1,1,"Graham, Miss. Margaret Edith",0.0,19.000000,30.0000,S,0.0,0.0,1.0
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",0.0,29.699118,23.4500,S,0.0,0.0,1.0
889,1,1,"Behr, Mr. Karl Howell",1.0,26.000000,30.0000,C,1.0,0.0,0.0


In [14]:
# Combining the csv files
titanic_df.to_csv("processed.csv", index=False)