# Imputation

Distributions: Uniform, Normal

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load only the three columns this notebook needs from the CSV file.
df = pd.read_csv(
    "people.csv",
    usecols=["Name", "Sex", "Age"],
)
# Preview the first ten rows.
df.head(n=10)

## Display the rows with missing ages.

In [None]:
# Keep only the rows whose "Age" value is missing (NaN).
df[df["Age"].isna()]

---

### Mean Age

In [None]:
# Mean of all known ages; missing values are skipped (the pandas default).
df["Age"].mean(skipna=True)

---

### Male Mean Age

In [None]:
# Mean age over the rows whose "Sex" is "male".
df.loc[df["Sex"].eq("male"), "Age"].mean()

### Female Mean Age

In [None]:
# Mean age over the rows whose "Sex" is "female".
df.loc[df["Sex"].eq("female"), "Age"].mean()

---

### The number of males with missing ages

In [None]:
# Count the rows that are male AND have no age: summing a boolean mask
# counts its True entries.
int(((df["Sex"] == "male") & df["Age"].isna()).sum())

---

# Impute missing male ages

## Get the mean and standard deviation of "Age" for males

In [None]:
# Select the male ages once, then derive both statistics from that selection.
male_ages = df.loc[df["Sex"] == "male", "Age"]

male_age_mean = male_ages.mean()
male_age_std = male_ages.std()

# Show both values as the cell output.
male_age_mean, male_age_std

## Randomly selected age from a uniform distribution

In [None]:
# Draw one age uniformly from the interval [mean - std, mean + std] of the
# male ages, i.e. within one standard deviation of the mean, and round it to
# a whole number.
# (The original note gives this interval as roughly 16 to 45; the exact
# bounds depend on the data.)
round(
    np.random.uniform(
        low=male_age_mean - male_age_std,
        high=male_age_mean + male_age_std,
    )
)

### Generating 10 randomly selected ages from a uniform distribution within 1 standard deviation of the mean. 

In [None]:
# Print ten independent uniform draws, each within one standard deviation of
# the male mean age (roughly 16 to 45 per the original note; the exact bounds
# depend on the data).

for _ in range(10):
    male_age_impute = round(
        np.random.uniform(
            low=male_age_mean - male_age_std,
            high=male_age_mean + male_age_std,
        )
    )
    print(male_age_impute)

---

## Randomly selected age from a normal distribution

In [None]:
# Draw one age from a normal distribution centred on the male mean age.
# Unlike the uniform draw, any value is possible, but values far from the
# mean are rare. Approximate share of draws by distance from the mean:
# (0-1 std: 68%, 1-2 std: 27%, 2-3 std: 4%, 3+ std: 0.3%)

round(np.random.normal(loc=male_age_mean, scale=male_age_std))


### Generating 20 randomly selected ages from a normal distribution using the males' mean and standard deviation.

In [None]:
# Print twenty independent draws from the normal distribution described by
# the male mean and standard deviation, each rounded to a whole number.
for _ in range(20):
    male_age_impute = round(
        np.random.normal(loc=male_age_mean, scale=male_age_std)
    )
    print(male_age_impute)

---

# Create an "impute_age" function

In [None]:
def impute_age(Age_mean, Age_std, *, min_age=1, max_age=110):
    """Draw one random age from a normal distribution, clamped to a valid range.

    Args:
        Age_mean: Mean of the normal distribution to sample from.
        Age_std: Standard deviation of that distribution (must be >= 0).
        min_age: Smallest age that may be returned (default 1).
        max_age: Largest age that may be returned (default 110).

    Returns:
        The sampled age, rounded to the nearest whole number and limited to
        the inclusive range ``[min_age, max_age]``.

    Raises:
        ValueError: If ``Age_mean`` or ``Age_std`` is NaN, or if ``min_age``
            is greater than ``max_age``.
    """
    # A NaN statistic (e.g. computed from a group with too few known ages)
    # would otherwise surface as an opaque error when rounding the sample.
    if np.isnan(Age_mean) or np.isnan(Age_std):
        raise ValueError("Age_mean and Age_std must not be NaN")
    if min_age > max_age:
        raise ValueError("min_age must not be greater than max_age")

    sampled_age = int(round(np.random.normal(Age_mean, Age_std)))

    # The normal distribution is unbounded, so pull implausible draws back
    # to the nearest allowed age.
    return max(min_age, min(sampled_age, max_age))

## Impute the missing ages for males drawing from a normal distribution

In [None]:
# Optional bookkeeping: the index label of every row that gets imputed, so
# those rows can be displayed afterwards.
imputed_ages = []

# Visit each male row whose age is missing. The selection is computed once,
# before any value is filled in.
for index in df.index[(df["Sex"] == "male") & df["Age"].isnull()]:
    # Each row gets its own independent random draw.
    df.loc[index, "Age"] = impute_age(male_age_mean, male_age_std)

    # Optional: remember which row was filled in.
    imputed_ages.append(index)
    # Optional: show the value that was just written.
    print(df.loc[index, "Age"])

## Display imputed ages

In [None]:
# `imputed_ages` holds index *labels* collected during imputation, so the
# rows must be looked up by label (.loc), not by integer position (.iloc).
# The two only coincide when the frame has the default 0..n-1 index.
df.loc[imputed_ages]

### Male mean age before and after imputation

In [None]:
# Compare the male mean age stored in `male_age_mean` (labelled "before
# imputation" below) with the mean recomputed from the current frame.
print(
    f"Mean age before imputation: {male_age_mean}",
    f"Mean age after imputation: {df.loc[df['Sex']=='male', 'Age'].mean()}",
    sep="\n",
)

---

### The number of males with missing ages after imputation

In [None]:
# Count the male rows that still have no age; once every missing male age
# has been imputed this is 0.

int(((df["Sex"] == "male") & df["Age"].isna()).sum())

---