In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the raw hourly data file into a data frame (df) using pandas' defaults.
df = pd.read_csv(filepath_or_buffer='./data/NOAA_QCLCD_2011_hourly_13904.txt')

In [4]:
# This is real-world data, so it is quite messy: the file has no header row,
# and pandas has used the first record as the column labels.
df.head(n=5)

Unnamed: 0,13904,20110101,0053,12,OVC045,Unnamed: 6,10.00,.1,.2,.3,...,.18,.19,29.95,.20,AA,.21,.22,.23,29.95.1,.24
0,13904,20110101,153,12,OVC049,,10.0,,,,...,,,30.01,,AA,,,,30.02,
1,13904,20110101,253,12,OVC060,,10.0,,,,...,30.0,,30.01,,AA,,,,30.02,
2,13904,20110101,353,12,OVC065,,10.0,,,,...,,,30.03,,AA,,,,30.04,
3,13904,20110101,453,12,BKN070,,10.0,,,,...,,,30.04,,AA,,,,30.04,
4,13904,20110101,553,12,BKN065,,10.0,,,,...,15.0,,30.06,,AA,,,,30.06,


In [6]:
# Passing the keyword argument `header=None` keeps the first record as data
# and gives the columns default integer labels, which is a more comfortable
# format to work with.
df = pd.read_csv(
    './data/NOAA_QCLCD_2011_hourly_13904.txt',
    header=None,
)
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34,35,36,37,38,39,40,41,42,43
0,13904,20110101,53,12,OVC045,,10.0,,,,...,,,29.95,,AA,,,,29.95,
1,13904,20110101,153,12,OVC049,,10.0,,,,...,,,30.01,,AA,,,,30.02,
2,13904,20110101,253,12,OVC060,,10.0,,,,...,30.0,,30.01,,AA,,,,30.02,
3,13904,20110101,353,12,OVC065,,10.0,,,,...,,,30.03,,AA,,,,30.04,
4,13904,20110101,453,12,BKN070,,10.0,,,,...,,,30.04,,AA,,,,30.04,


In [32]:
# A separate comma-separated file contains the column labels. Read it into a
# single comma-separated string. The `with` block guarantees that the file
# handle is closed as soon as the read is done.

with open('./data/NOAA_QCLCD_2011_hourly_13904_column_labels.txt', newline='') as labels_file:
    # `.strip()` is applied to the whole string: it removes leading/trailing
    # whitespace, including the `\n` newline character the file might end with.
    column_labels = labels_file.read().strip()

column_labels

'Wban,date,Time,StationType,sky_condition,sky_conditionFlag,visibility,visibilityFlag,wx_and_obst_to_vision,wx_and_obst_to_visionFlag,dry_bulb_faren,dry_bulb_farenFlag,dry_bulb_cel,dry_bulb_celFlag,wet_bulb_faren,wet_bulb_farenFlag,wet_bulb_cel,wet_bulb_celFlag,dew_point_faren,dew_point_farenFlag,dew_point_cel,dew_point_celFlag,relative_humidity,relative_humidityFlag,wind_speed,wind_speedFlag,wind_direction,wind_directionFlag,value_for_wind_character,value_for_wind_characterFlag,station_pressure,station_pressureFlag,pressure_tendency,pressure_tendencyFlag,presschange,presschangeFlag,sea_level_pressure,sea_level_pressureFlag,record_type,hourly_precip,hourly_precipFlag,altimeter,altimeterFlag,junk'

In [33]:
# Turn the comma-separated string into a list of column labels.

column_labels = column_labels.split(sep=',')
column_labels

['Wban',
 'date',
 'Time',
 'StationType',
 'sky_condition',
 'sky_conditionFlag',
 'visibility',
 'visibilityFlag',
 'wx_and_obst_to_vision',
 'wx_and_obst_to_visionFlag',
 'dry_bulb_faren',
 'dry_bulb_farenFlag',
 'dry_bulb_cel',
 'dry_bulb_celFlag',
 'wet_bulb_faren',
 'wet_bulb_farenFlag',
 'wet_bulb_cel',
 'wet_bulb_celFlag',
 'dew_point_faren',
 'dew_point_farenFlag',
 'dew_point_cel',
 'dew_point_celFlag',
 'relative_humidity',
 'relative_humidityFlag',
 'wind_speed',
 'wind_speedFlag',
 'wind_direction',
 'wind_directionFlag',
 'value_for_wind_character',
 'value_for_wind_characterFlag',
 'station_pressure',
 'station_pressureFlag',
 'pressure_tendency',
 'pressure_tendencyFlag',
 'presschange',
 'presschangeFlag',
 'sea_level_pressure',
 'sea_level_pressureFlag',
 'record_type',
 'hourly_precip',
 'hourly_precipFlag',
 'altimeter',
 'altimeterFlag',
 'junk']

In [34]:
# Use the `column_labels` list as the column labels of the data frame.

df.columns = column_labels
df.head(n=5)

Unnamed: 0,Wban,date,Time,StationType,sky_condition,sky_conditionFlag,visibility,visibilityFlag,wx_and_obst_to_vision,wx_and_obst_to_visionFlag,...,presschange,presschangeFlag,sea_level_pressure,sea_level_pressureFlag,record_type,hourly_precip,hourly_precipFlag,altimeter,altimeterFlag,junk
0,13904,20110101,53,12,OVC045,,10.0,,,,...,,,29.95,,AA,,,,29.95,
1,13904,20110101,153,12,OVC049,,10.0,,,,...,,,30.01,,AA,,,,30.02,
2,13904,20110101,253,12,OVC060,,10.0,,,,...,30.0,,30.01,,AA,,,,30.02,
3,13904,20110101,353,12,OVC065,,10.0,,,,...,,,30.03,,AA,,,,30.04,
4,13904,20110101,453,12,BKN070,,10.0,,,,...,,,30.04,,AA,,,,30.04,


In [49]:
# Another file provides the labels that should be dropped for this activity.
# Its layout is quite different from the column labels file, so, also for
# illustration purposes, it is read with another approach: line by line.

list_to_drop = []
with open('./data/NOAA_QCLCD_2011_hourly_13904_dropping.txt') as labels_file:
    # `.strip()` is applied to each line of the file to remove the `\n`
    # newline character that each line might have (any other per-line
    # preprocessing would go here as well).
    list_to_drop.extend(raw_line.strip() for raw_line in labels_file)

list_to_drop

['sky_conditionFlag',
 'visibilityFlag',
 'wx_and_obst_to_vision',
 'wx_and_obst_to_visionFlag',
 'dry_bulb_farenFlag',
 'dry_bulb_celFlag',
 'wet_bulb_farenFlag',
 'wet_bulb_celFlag',
 'dew_point_farenFlag',
 'dew_point_celFlag',
 'relative_humidityFlag',
 'wind_speedFlag',
 'wind_directionFlag',
 'value_for_wind_character',
 'value_for_wind_characterFlag',
 'station_pressureFlag',
 'pressure_tendencyFlag',
 'pressure_tendency',
 'presschange',
 'presschangeFlag',
 'sea_level_pressureFlag',
 'hourly_precip',
 'hourly_precipFlag',
 'altimeter',
 'record_type',
 'altimeterFlag',
 'junk']

In [50]:
# Remove the columns named in `list_to_drop` from the data frame.

df = df.drop(columns=list_to_drop)
df.head(n=5)

Unnamed: 0,Wban,date,Time,StationType,sky_condition,visibility,dry_bulb_faren,dry_bulb_cel,wet_bulb_faren,wet_bulb_cel,dew_point_faren,dew_point_cel,relative_humidity,wind_speed,wind_direction,station_pressure,sea_level_pressure
0,13904,20110101,53,12,OVC045,10.0,51,10.6,38,3.1,15,-9.4,24,15,360,29.42,29.95
1,13904,20110101,153,12,OVC049,10.0,51,10.6,37,3.0,14,-10.0,23,10,340,29.49,30.01
2,13904,20110101,253,12,OVC060,10.0,51,10.6,37,2.9,13,-10.6,22,15,10,29.49,30.01
3,13904,20110101,353,12,OVC065,10.0,50,10.0,38,3.1,17,-8.3,27,7,350,29.51,30.03
4,13904,20110101,453,12,BKN070,10.0,50,10.0,37,2.8,15,-9.4,25,11,20,29.51,30.04


In [56]:
# Cleaning and tidying datetime data.

# Clean up the date and Time columns and combine them into a datetime
# collection to be used as the index.

# First, convert the date column to strings.
df['date'] = df['date'].astype(str)

# Pad the Time column with leading zeros to a width of four characters
# (e.g. 53 -> '0053') so that every value matches the `%H%M` format below.
df['Time'] = df['Time'].astype(str).str.zfill(4)

# Concatenate the date and Time strings.
date_times = df['date'] + df['Time']

# Convert the concatenated strings to a pandas datetime series, stating the
# format explicitly. The input is the `date_times` series built just above.
date_times = pd.to_datetime(date_times, format='%Y%m%d%H%M')

# Set the index to be the new date_times pandas series.
df = df.set_index(date_times)

df.head(n=5)

Unnamed: 0,Wban,date,Time,StationType,sky_condition,visibility,dry_bulb_faren,dry_bulb_cel,wet_bulb_faren,wet_bulb_cel,dew_point_faren,dew_point_cel,relative_humidity,wind_speed,wind_direction,station_pressure,sea_level_pressure
2011-01-01 00:53:00,13904,20110101,53,12,OVC045,10.0,51,10.6,38,3.1,15,-9.4,24,15,360,29.42,29.95
2011-01-01 01:53:00,13904,20110101,153,12,OVC049,10.0,51,10.6,37,3.0,14,-10.0,23,10,340,29.49,30.01
2011-01-01 02:53:00,13904,20110101,253,12,OVC060,10.0,51,10.6,37,2.9,13,-10.6,22,15,10,29.49,30.01
2011-01-01 03:53:00,13904,20110101,353,12,OVC065,10.0,50,10.0,38,3.1,17,-8.3,27,7,350,29.51,30.03
2011-01-01 04:53:00,13904,20110101,453,12,BKN070,10.0,50,10.0,37,2.8,15,-9.4,25,11,20,29.51,30.04


In [59]:
# The numeric columns contain missing values labeled as `'M'`. Next, these
# columns are transformed so that they contain only numeric values, with the
# missing data interpreted as `NaN`.

# The pandas function `pd.to_numeric()` is ideal for this purpose: it converts
# a Series of values to a numeric dtype. Furthermore, by specifying the
# keyword argument `errors='coerce'`, strings like `'M'` are forced to be
# interpreted as `NaN`.

# Let's take a look at the `dry_bulb_faren` temperature between 8 AM and 9 AM
# on June 20, 2011.

df['dry_bulb_faren'].loc['2011-JUN-20 08:00:00':'2011-JUN-20 09:00:00']

2011-06-20 08:27:00     M
2011-06-20 08:28:00     M
2011-06-20 08:29:00     M
2011-06-20 08:30:00     M
2011-06-20 08:31:00     M
2011-06-20 08:32:00     M
2011-06-20 08:33:00     M
2011-06-20 08:34:00     M
2011-06-20 08:35:00     M
2011-06-20 08:53:00    83
Name: dry_bulb_faren, dtype: object

In [66]:
# Convert the `dry_bulb_faren` column to numeric values; entries that cannot
# be parsed are coerced to `NaN`.
df = df.assign(
    dry_bulb_faren=pd.to_numeric(df['dry_bulb_faren'], errors='coerce')
)

# The transformed `dry_bulb_faren` temperature between 8 AM and 9 AM on June 20, 2011.
df['dry_bulb_faren'].loc['2011-JUN-20 08:00:00':'2011-JUN-20 09:00:00']

2011-06-20 08:27:00     NaN
2011-06-20 08:28:00     NaN
2011-06-20 08:29:00     NaN
2011-06-20 08:30:00     NaN
2011-06-20 08:31:00     NaN
2011-06-20 08:32:00     NaN
2011-06-20 08:33:00     NaN
2011-06-20 08:34:00     NaN
2011-06-20 08:35:00     NaN
2011-06-20 08:53:00    83.0
Name: dry_bulb_faren, dtype: float64

In [67]:
# The `wind_speed` and `dew_point_faren` columns are converted to numeric
# values in the same way.

for column in ('wind_speed', 'dew_point_faren'):
    df[column] = pd.to_numeric(df[column], errors='coerce')