# Handling Numerical Data

In [None]:
import numpy as np
import pandas as pd

### Diabetes dataset

In [None]:
df = pd.read_csv("diabetes.csv")
df.head()

### Check the size of the dataset

In [None]:
df.shape

### Get basic information about the dataset

In [None]:
# No missing values in dataset

df.info()

---

# Identify bad values

In [None]:
# Note the min values (a minimum of 0 can indicate a bad or missing reading)

df.describe()

### Evaluate "insulin"

In [None]:
# View first 10 rows

df["insulin"].head(10)

---

# Identify potential outliers

## Visualize Outliers
### Boxplot

In [None]:
import matplotlib.pyplot as plt

# One figure with two side-by-side axes, one boxplot per column
fig, (left, right) = plt.subplots(1,2, figsize = (10, 5))

# Left axis: insulin; right axis: number of pregnancies
left.boxplot(df["insulin"], labels = ["insulin"])
# The trailing semicolon keeps the notebook from echoing the call's return value
right.boxplot(df["num_preg"], labels = ["number of pregnancies"]);

## Identify the rows with potential outliers

In [None]:
# Function returns the positional indices (rows) of outliers: values more than 1.5 times the IQR below the 1st quartile or above the 3rd quartile

def indicies_of_outliers(column, iqr_multiplier=1.5):
    """Return the positional indices of IQR-rule outliers in ``column``.

    A value is an outlier when it lies more than ``iqr_multiplier`` times
    the interquartile range below the 1st quartile or above the 3rd quartile.

    Missing values (NaN) are ignored when computing the quartiles and are
    never reported as outliers. Without this, a single NaN would turn both
    bounds into NaN and no outlier would be detected.

    Args:
        column: Numeric values (pandas Series, NumPy array or sequence)
            that can be converted to a float array.
        iqr_multiplier: How many IQRs beyond the quartiles a value must lie
            to be flagged. Defaults to 1.5.

    Returns:
        NumPy integer array of positions (not index labels) of the outliers
        along the first axis; empty when the input has no non-NaN values.
    """
    values = np.asarray(column, dtype=float)
    observed = values[~np.isnan(values)]
    if observed.size == 0:
        # No data to derive quartiles from, so nothing can be an outlier.
        return np.array([], dtype=np.intp)

    q1, q3 = np.percentile(observed, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr * iqr_multiplier)
    upper_bound = q3 + (iqr * iqr_multiplier)
    # Comparisons against NaN are False, so missing entries are never flagged.
    return np.where((values > upper_bound) | (values < lower_bound))[0]

#### Rows with outliers for insulin

In [None]:
# Call the indicies_of_outliers() function, passing it the insulin column (values)

insulin_outliers = indicies_of_outliers(df["insulin"])
insulin_outliers

#### Rows with outliers for num_preg

In [None]:
# Call the indicies_of_outliers() function, passing it the num_preg values

preg_outliers = indicies_of_outliers(df["num_preg"])
preg_outliers

## Inspect the rows with potential outliers

### Insulin

In [None]:
# View the rows with potential outliers

df.iloc[insulin_outliers]  # view all columns

### Number of pregnancies

In [None]:
# View the rows with potential outliers

df.iloc[preg_outliers, [0,1]]  # view only the first (index 0) and second (index 1) columns

### Dropping the outliers

In [None]:
# You could potentially choose to drop all of the rows that are outliers
# (preg_outliers holds row positions, so map them to index labels for drop())

#df = df.drop(df.index[preg_outliers])

---

# Replace bad numerical values

#### The benefit of setting bad numerical values to NaN for imputation

### Evaluate "insulin"

In [None]:
# View first 10 rows

df["insulin"].head(10)

### View descriptive statistics

In [None]:
df["insulin"].describe()

### Set bad "insulin" values to NaN

#### Replace a single value with np.nan (Not a Number)

In [None]:
df["insulin"] = df["insulin"].replace(0, np.nan)

df["insulin"].head(10)

#### Replace a range of values with np.nan (Not a Number)

In [None]:
# Example

#df.loc[df["insulin"] < 30, "insulin"] = np.nan

### View descriptive statistics
#### The result of replacing bad values with NaN

In [None]:
df["insulin"].describe()

---

# Drop missing values
If you're in a hurry or have plenty of data and don't mind losing some data, you can just drop the rows or columns that contain missing values.

### Remove all of the rows that contain a missing value

In [None]:
df = df.dropna()

df.head()

In [None]:
df.shape

<div class="alert alert-block alert-warning">
<b>Alert:</b> Be careful, dropping data is often the least favorable option because you are losing information (from the intact columns) that could benefit machine learning, and you may add some bias depending upon why the data was missing.</div>

---

# Fill in (impute) missing values

### Reset the data

In [None]:
df = pd.read_csv("diabetes.csv")

# set 0 to NaN
df["insulin"] = df["insulin"].replace(0, np.nan)

df["insulin"].head(10)

### View descriptive statistics

In [None]:
df["insulin"].describe()

### Use fillna() to impute the missing values

In [None]:
# Set null (NaN) insulin values to the mean insulin value
# (mean() skips NaN by default, so the mean is taken over the observed values only)

df["insulin"] = df["insulin"].fillna(df["insulin"].mean())
df["insulin"].head(10)

---

# More Refined Imputation

### Reset the data

In [None]:
df = pd.read_csv("diabetes.csv")

# set 0 to NaN
df["insulin"] = df["insulin"].replace(0, np.nan)

df["insulin"].head(10)

## Evaluate age

In [None]:
# Approximately 75% of individuals are under 40 years old

df['age'].describe()

## Obtain mean insulin value by age range

In [None]:
# Mean insulin level for ages 40 and under

df.loc[df["age"] <= 40, "insulin"].mean()

In [None]:
# Mean insulin level over 40

df.loc[df["age"] > 40, "insulin"].mean()

## Impute the bad values

In [None]:
# Impute missing insulin values for samples aged 40 or under,
# using the mean insulin of that same age group

df.loc[(df["age"] <= 40) & (df["insulin"].isnull()), "insulin"] = df.loc[df["age"] <= 40, "insulin"].mean()

In [None]:
# Impute missing insulin values for samples over 40 years old,
# using the mean insulin of that same age group

df.loc[(df["age"] > 40) & (df["insulin"].isnull()), "insulin"] = df.loc[df["age"] > 40, "insulin"].mean()

#### View the imputation

In [None]:
df[["age", "insulin"]].head(10)

---