In [1]:
import pandas as pd
from pandas import Series, DataFrame

s = pd.read_csv('../data/nyc-temps.txt').squeeze()
df = DataFrame({'temp': s, 
                'hour': [0,3,6,9,12,15,18,21] * 91})

df.loc[(df['hour'] == 3) | (df['hour'] == 6), 'temp'] = NaN

NameError: name 'NaN' is not defined

# Beyond 1

By default, the `interpolate` method tries to average the remaining values before and after any `NaN`. However, we can change how it works, by passing `method='nearest'`. Does that change our data substantially?

In [None]:
# No, doesn't seem to change things significantly -- maybe because temperatures
# don't really vary all that much across readings.

df.interpolate(method='nearest').describe()

# Beyond 2

Let's assume that the equipment works fine around the clock, but that it fails to record a reading at -1 degree and below. Are the interpolated values similar to the real (missing) values they replace? Why or why not?

In [None]:
# reset our data
df = DataFrame({'temp': s, 
                'hour': [0,3,6,9,12,15,18,21] * 91})

In [None]:
# Remove values <= -1 to NaN
df.loc[df['temp'] <= -1, 'temp'] = NaN

In [None]:
# Interpolate!
df = df.interpolate()

In [None]:
# Wow, the mean is now 2 and the median is now 1 -- significantly higher
# Not surprising, of course, given that we removed all very low temperatures!
df['temp'].describe()

# Beyond 3

A cheap solution to interpolation is to replace `NaN` values with the column's mean. Do this, and compare the new mean and median. Again, why are (or aren't) these values similar to the original ones?

In [None]:
# reset our data
df = DataFrame({'temp': s, 
                'hour': [0,3,6,9,12,15,18,21] * 91})

# Remove values <= -1 to NaN
df.loc[df['temp'] <= -1, 'temp'] = NaN

df = df.fillna(df.mean())

In [None]:
# Wow, these values are even worse than the interpolated ones!

# Clearly, running .interpolate is a better option than using the mean -- 
# in no small part because it calculated a local mean, rather than
# a global one across all of the data.

df.describe()