## Import libraries

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Import dataset

In [25]:
# Load the raw Titanic passenger table from the CSV in the working directory.
df_titanic = pd.read_csv('titanic.csv')
# Print column dtypes and non-null counts to spot columns with gaps.
df_titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 888 entries, 0 to 887
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  888 non-null    int64  
 1   Survived     888 non-null    int64  
 2   Pclass       888 non-null    float64
 3   Name         888 non-null    object 
 4   Sex          888 non-null    int64  
 5   Age          715 non-null    float64
 6   SibSp        888 non-null    int64  
 7   Parch        887 non-null    float64
 8   Ticket       887 non-null    object 
 9   Fare         886 non-null    float64
 10  Embarked     886 non-null    float64
dtypes: float64(5), int64(4), object(2)
memory usage: 76.4+ KB


## Handling missing values by removing rows

In [26]:
# Per-column count of missing (NaN) entries in the raw data.
df_titanic.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            173
SibSp            0
Parch            1
Ticket           1
Fare             2
Embarked         2
dtype: int64

We can see that the Age, Parch, Ticket, Fare and Embarked columns have missing values.

### Age column

In [27]:
# (rows, columns) of the raw frame before any rows are removed.
df_titanic.shape

(888, 11)

In [28]:
# Work on a copy so the row drops below do not alter df_titanic,
# which is used later as the baseline for the retained-row percentage.
df = df_titanic.copy()

In [29]:
# Number of rows with no Age recorded (bracket access, matching the other column checks).
df['Age'].isna().sum()

173

In [30]:
# Remove every row whose Age is missing.
# Reassigning instead of using inplace=True keeps the step explicit and chainable;
# a list-valued `subset` is also accepted by older pandas releases that reject a
# bare string label.
df = df.dropna(subset=['Age'])
# Confirm no missing Age values remain.
df['Age'].isna().sum()

0

### Parch column

In [31]:
# Rows still lacking a Parch value at this point in the cleaning.
df['Parch'].isna().sum()

1

In [32]:
# Remove every row whose Parch is missing.
# Reassigning instead of using inplace=True keeps the step explicit and chainable;
# a list-valued `subset` is also accepted by older pandas releases that reject a
# bare string label.
df = df.dropna(subset=['Parch'])
# Confirm no missing Parch values remain.
df['Parch'].isna().sum()

0

### Ticket column


In [33]:
# Rows still lacking a Ticket value at this point in the cleaning.
df['Ticket'].isna().sum()

1

In [34]:
# Remove every row whose Ticket is missing.
# Reassigning instead of using inplace=True keeps the step explicit and chainable;
# a list-valued `subset` is also accepted by older pandas releases that reject a
# bare string label.
# NOTE(review): the Ticket column itself is discarded later in this notebook, so
# dropping rows for a missing Ticket may lose data unnecessarily - confirm intent.
df = df.dropna(subset=['Ticket'])
# Confirm no missing Ticket values remain.
df['Ticket'].isna().sum()

0

### Fare column

In [35]:
# Rows still lacking a Fare value at this point in the cleaning.
df['Fare'].isna().sum()

2

In [36]:
# Remove every row whose Fare is missing.
# Reassigning instead of using inplace=True keeps the step explicit and chainable;
# a list-valued `subset` is also accepted by older pandas releases that reject a
# bare string label.
df = df.dropna(subset=['Fare'])
# Confirm no missing Fare values remain.
df['Fare'].isna().sum()

0

### Embarked column

In [37]:
# Rows still lacking an Embarked value at this point in the cleaning.
df['Embarked'].isna().sum()

2

In [38]:
# Remove every row whose Embarked is missing.
# Reassigning instead of using inplace=True keeps the step explicit and chainable;
# a list-valued `subset` is also accepted by older pandas releases that reject a
# bare string label.
df = df.dropna(subset=['Embarked'])
# Confirm no missing Embarked values remain.
df['Embarked'].isna().sum()

0

In [39]:
# Final check: per-column missing counts after all row drops (all should be 0).
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [40]:
# (rows, columns) remaining after the row drops.
df.shape

(709, 11)

In [41]:
# Percentage of the original rows that survived the cleaning.
len(df)/len(df_titanic)*100

79.84234234234235

In [42]:
# Percentage of rows removed by the dropna steps. Derived from the frames
# themselves rather than hard-coded row counts, so the figure stays correct
# if the input data or the cleaning steps change.
mv = (len(df_titanic) - len(df)) / len(df_titanic) * 100

In [43]:
# mv is the share of rows dropped from the original frame (not the share of
# missing cells); it is printed with two decimals.
print(f'missing values: {mv:.2f}%')

missing values: 20.16%


After removing the rows with missing values, 79.84% of the original rows remain in the dataframe.

In [44]:
# Discard the PassengerId, Name and Ticket columns, then preview the first three rows.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket'])
df.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3.0,1,22.0,1,0.0,7.25,1.0
1,1,1.0,2,38.0,1,0.0,71.28,2.0
2,1,3.0,2,26.0,0,0.0,7.92,1.0


In [45]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
df.to_csv('titanic7_FMV_drop.csv', index=False)