## Import libraries

In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Import dataset

In [60]:
# Read the passenger table from disk, then print its schema summary
# (column dtypes and non-null counts).
df_titanic = pd.read_csv(filepath_or_buffer='titanic.csv')
df_titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 888 entries, 0 to 887
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  888 non-null    int64  
 1   Survived     888 non-null    int64  
 2   Pclass       888 non-null    float64
 3   Name         888 non-null    object 
 4   Sex          888 non-null    int64  
 5   Age          715 non-null    float64
 6   SibSp        888 non-null    int64  
 7   Parch        887 non-null    float64
 8   Ticket       887 non-null    object 
 9   Fare         886 non-null    float64
 10  Embarked     886 non-null    float64
dtypes: float64(5), int64(4), object(2)
memory usage: 76.4+ KB


## Handling missing values with random values drawn from a normal distribution (column mean and standard deviation)

In [61]:
# Per-column count of missing entries.
df_titanic.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            173
SibSp            0
Parch            1
Ticket           1
Fare             2
Embarked         2
dtype: int64

### Age column

In [62]:
# Number of missing Age values before imputation.
df_titanic['Age'].isnull().sum()

173

In [63]:
def fill_missing_with_random(df, column_name):
    """Fill the missing values of one numeric column with random normal draws.

    The draws come from a normal distribution whose mean and standard
    deviation are those of the column's non-missing values (pandas' default
    sample standard deviation). All statistics and the missing-value mask are
    taken from ``df[column_name]``, so the function works for any frame and
    any numeric column rather than only for the global ``df_titanic['Age']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is updated in place.
    column_name : str
        Name of the numeric column whose missing values are replaced.

    Returns
    -------
    None

    Notes
    -----
    Values are drawn with ``np.random.normal`` (NumPy's global RNG) and are
    not clipped or rounded, so a draw can fall outside the range of the
    observed data (for example a negative or fractional value).
    """
    missing_mask = df[column_name].isnull()
    num_missing = int(missing_mask.sum())
    if num_missing == 0:
        # Nothing to impute; leave the column untouched.
        return

    mean = df[column_name].mean()
    std = df[column_name].std()
    random_values = np.random.normal(mean, std, num_missing)
    # One draw per missing row, assigned in row order.
    df.loc[missing_mask, column_name] = random_values
    
# Seed NumPy's global RNG so the random imputation is reproducible when the
# notebook is re-run from a fresh kernel.
np.random.seed(42)
fill_missing_with_random(df_titanic, 'Age')

In [64]:
# Count the Age values that are still missing after the fill.
df_titanic['Age'].isnull().sum()

0

### Parch column

In [65]:
# Number of missing Parch values before imputation.
df_titanic['Parch'].isnull().sum()

1

In [66]:
def fill_missing_with_random(df, column_name):
    """Fill the missing values of one numeric column with random normal draws.

    The draws come from a normal distribution whose mean and standard
    deviation are those of the column's non-missing values (pandas' default
    sample standard deviation). All statistics and the missing-value mask are
    taken from ``df[column_name]``, so the function works for any frame and
    any numeric column rather than only for the global ``df_titanic['Parch']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is updated in place.
    column_name : str
        Name of the numeric column whose missing values are replaced.

    Returns
    -------
    None

    Notes
    -----
    Values are drawn with ``np.random.normal`` (NumPy's global RNG) and are
    not clipped or rounded, so a draw can fall outside the range of the
    observed data (for example a negative or fractional value).
    """
    missing_mask = df[column_name].isnull()
    num_missing = int(missing_mask.sum())
    if num_missing == 0:
        # Nothing to impute; leave the column untouched.
        return

    mean = df[column_name].mean()
    std = df[column_name].std()
    random_values = np.random.normal(mean, std, num_missing)
    # One draw per missing row, assigned in row order.
    df.loc[missing_mask, column_name] = random_values
    
# Replace the missing Parch entries in df_titanic in place.
fill_missing_with_random(df=df_titanic, column_name='Parch')

In [67]:
# Count the Parch values that are still missing after the fill.
df_titanic['Parch'].isnull().sum()

0

### Ticket column


In [68]:
# Number of missing Ticket values before imputation.
df_titanic['Ticket'].isnull().sum()

1

In [69]:
# Impute the missing Ticket with the column's most frequent value.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the df_titanic['Ticket'] selection is chained in-place mutation, which
# recent pandas warns about and which does not update the frame under
# Copy-on-Write.
df_titanic['Ticket'] = df_titanic['Ticket'].fillna(df_titanic['Ticket'].mode()[0])
df_titanic['Ticket'].isna().sum()

0

### Fare column

In [70]:
# Number of missing Fare values before imputation.
df_titanic['Fare'].isnull().sum()

2

In [71]:
def fill_missing_with_random(df, column_name):
    """Fill the missing values of one numeric column with random normal draws.

    The draws come from a normal distribution whose mean and standard
    deviation are those of the column's non-missing values (pandas' default
    sample standard deviation). All statistics and the missing-value mask are
    taken from ``df[column_name]``, so the function works for any frame and
    any numeric column rather than only for the global ``df_titanic['Fare']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is updated in place.
    column_name : str
        Name of the numeric column whose missing values are replaced.

    Returns
    -------
    None

    Notes
    -----
    Values are drawn with ``np.random.normal`` (NumPy's global RNG) and are
    not clipped or rounded, so a draw can fall outside the range of the
    observed data (for example a negative or fractional value).
    """
    missing_mask = df[column_name].isnull()
    num_missing = int(missing_mask.sum())
    if num_missing == 0:
        # Nothing to impute; leave the column untouched.
        return

    mean = df[column_name].mean()
    std = df[column_name].std()
    random_values = np.random.normal(mean, std, num_missing)
    # One draw per missing row, assigned in row order.
    df.loc[missing_mask, column_name] = random_values
    
# Replace the missing Fare entries in df_titanic in place.
fill_missing_with_random(df=df_titanic, column_name='Fare')

In [72]:
# Count the Fare values that are still missing after the fill.
df_titanic['Fare'].isnull().sum()

0

### Embarked column

In [73]:
# Number of missing Embarked values before imputation.
df_titanic['Embarked'].isnull().sum()

2

In [74]:
# Impute the missing Embarked entries with the column's most frequent value.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the df_titanic['Embarked'] selection is chained in-place mutation, which
# recent pandas warns about and which does not update the frame under
# Copy-on-Write.
df_titanic['Embarked'] = df_titanic['Embarked'].fillna(df_titanic['Embarked'].mode()[0])
df_titanic['Embarked'].isna().sum()

0

In [75]:
# Remove the PassengerId, Name and Ticket columns, then preview the first
# three rows of what is left.
df_titanic = df_titanic.drop(columns=['PassengerId', 'Name', 'Ticket'])
df_titanic.head(n=3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3.0,1,22.0,1,0.0,7.25,1.0
1,1,1.0,2,38.0,1,0.0,71.28,2.0
2,1,3.0,2,26.0,0,0.0,7.92,1.0


In [76]:
# Write the imputed frame to CSV, leaving the row index out of the file.
df_titanic.to_csv(path_or_buf='titanic4_FMV_randomMeanStd.csv', index=False)