# Pandas - Air Quality Data
Today we will be working with air quality data.

In [1]:
import pandas as pd

In [5]:
df = pd.read_csv('LaqnData.csv')
df.head(5)

Unnamed: 0,Site,Species,ReadingDateTime,Value,Units,Provisional or Ratified
0,CT3,NO,01/01/2017 00:00,3.5,ug m-3,R
1,CT3,NO,01/01/2017 01:00,3.6,ug m-3,R
2,CT3,NO,01/01/2017 02:00,2.2,ug m-3,R
3,CT3,NO,01/01/2017 03:00,2.1,ug m-3,R
4,CT3,NO,01/01/2017 04:00,3.3,ug m-3,R



# Drop a column

In [6]:
# first we would like to check if all of the columns are necessary.

print(df.Site.unique())
print(df.Species.unique())
print(df.ReadingDateTime.unique())
print(df.Value.unique())
print(df.Units.unique())
print(df['Provisional or Ratified'].unique())

['CT3']
['NO' 'NO2' 'NOX' 'PM10' 'PM2.5']
['01/01/2017 00:00' '01/01/2017 01:00' '01/01/2017 02:00' ...
 '31/12/2017 21:00' '31/12/2017 22:00' '31/12/2017 23:00']
[ 3.5  3.6  2.2 ... -7.  -8.  -6. ]
['ug m-3' 'ug m-3 as NO2' 'ug/m3']
['R']


In [7]:
df.drop(columns=['Site','Provisional or Ratified'])

Unnamed: 0,Species,ReadingDateTime,Value,Units
0,NO,01/01/2017 00:00,3.5,ug m-3
1,NO,01/01/2017 01:00,3.6,ug m-3
2,NO,01/01/2017 02:00,2.2,ug m-3
...,...,...,...,...
43797,PM2.5,31/12/2017 21:00,5.0,ug m-3
43798,PM2.5,31/12/2017 22:00,5.0,ug m-3
43799,PM2.5,31/12/2017 23:00,6.0,ug m-3


In [8]:
df

Unnamed: 0,Site,Species,ReadingDateTime,Value,Units,Provisional or Ratified
0,CT3,NO,01/01/2017 00:00,3.5,ug m-3,R
1,CT3,NO,01/01/2017 01:00,3.6,ug m-3,R
2,CT3,NO,01/01/2017 02:00,2.2,ug m-3,R
...,...,...,...,...,...,...
43797,CT3,PM2.5,31/12/2017 21:00,5.0,ug m-3,R
43798,CT3,PM2.5,31/12/2017 22:00,5.0,ug m-3,R
43799,CT3,PM2.5,31/12/2017 23:00,6.0,ug m-3,R


In [9]:
df2 = df.drop(columns=['Site','Provisional or Ratified'])
df2

Unnamed: 0,Species,ReadingDateTime,Value,Units
0,NO,01/01/2017 00:00,3.5,ug m-3
1,NO,01/01/2017 01:00,3.6,ug m-3
2,NO,01/01/2017 02:00,2.2,ug m-3
...,...,...,...,...
43797,PM2.5,31/12/2017 21:00,5.0,ug m-3
43798,PM2.5,31/12/2017 22:00,5.0,ug m-3
43799,PM2.5,31/12/2017 23:00,6.0,ug m-3


In [10]:
df.drop(columns =['Site','Provisional or Ratified'], inplace=True)
df

Unnamed: 0,Species,ReadingDateTime,Value,Units
0,NO,01/01/2017 00:00,3.5,ug m-3
1,NO,01/01/2017 01:00,3.6,ug m-3
2,NO,01/01/2017 02:00,2.2,ug m-3
...,...,...,...,...
43797,PM2.5,31/12/2017 21:00,5.0,ug m-3
43798,PM2.5,31/12/2017 22:00,5.0,ug m-3
43799,PM2.5,31/12/2017 23:00,6.0,ug m-3


# Working with strings
Pandas provides powerful functions to manipulate strings as needed.

# Split
Let's spilit Data and time into two colomns

In [11]:
df.ReadingDateTime.str.split(' ')

0        [01/01/2017, 00:00]
1        [01/01/2017, 01:00]
2        [01/01/2017, 02:00]
                ...         
43797    [31/12/2017, 21:00]
43798    [31/12/2017, 22:00]
43799    [31/12/2017, 23:00]
Name: ReadingDateTime, Length: 43800, dtype: object

In [12]:
DateTime = df.ReadingDateTime.str.split(' ')
DateTime

0        [01/01/2017, 00:00]
1        [01/01/2017, 01:00]
2        [01/01/2017, 02:00]
                ...         
43797    [31/12/2017, 21:00]
43798    [31/12/2017, 22:00]
43799    [31/12/2017, 23:00]
Name: ReadingDateTime, Length: 43800, dtype: object

In [13]:
type(DateTime)

pandas.core.series.Series

In [14]:
DateTime2 = df.ReadingDateTime.str.split(' ', expand=True)
DateTime2

Unnamed: 0,0,1
0,01/01/2017,00:00
1,01/01/2017,01:00
2,01/01/2017,02:00
...,...,...
43797,31/12/2017,21:00
43798,31/12/2017,22:00
43799,31/12/2017,23:00


In [15]:
type(DateTime2)

pandas.core.frame.DataFrame

In [16]:
DateTime2.columns = ['Date','Time']
DateTime2

Unnamed: 0,Date,Time
0,01/01/2017,00:00
1,01/01/2017,01:00
2,01/01/2017,02:00
...,...,...
43797,31/12/2017,21:00
43798,31/12/2017,22:00
43799,31/12/2017,23:00


In [17]:
BM = DateTime2.Time == '01:00'
DateTime2[BM]

Unnamed: 0,Date,Time
1,01/01/2017,01:00
25,02/01/2017,01:00
49,03/01/2017,01:00
...,...,...
43729,29/12/2017,01:00
43753,30/12/2017,01:00
43777,31/12/2017,01:00


In [18]:
df[BM]

Unnamed: 0,Species,ReadingDateTime,Value,Units
1,NO,01/01/2017 01:00,3.6,ug m-3
25,NO,02/01/2017 01:00,2.3,ug m-3
49,NO,03/01/2017 01:00,37.8,ug m-3
...,...,...,...,...
43729,PM2.5,29/12/2017 01:00,12.0,ug m-3
43753,PM2.5,30/12/2017 01:00,9.0,ug m-3
43777,PM2.5,31/12/2017 01:00,9.0,ug m-3


# Pay attention:

In the past two code sections, we used a dataframe to create a boolean mask and used the boolean mask to filter out the other dataframe. 

Question: How come were we able to do this?

# join multipe dataframes

In [19]:
new_df = df.join(DateTime2).drop(columns=['ReadingDateTime'])
new_df


Unnamed: 0,Species,Value,Units,Date,Time
0,NO,3.5,ug m-3,01/01/2017,00:00
1,NO,3.6,ug m-3,01/01/2017,01:00
2,NO,2.2,ug m-3,01/01/2017,02:00
...,...,...,...,...,...
43797,PM2.5,5.0,ug m-3,31/12/2017,21:00
43798,PM2.5,5.0,ug m-3,31/12/2017,22:00
43799,PM2.5,6.0,ug m-3,31/12/2017,23:00


The basis of joining for this join function is the indices of the two joining data frames. Basically, the function puts together the columns that have the same indices.

# Example

Add three columns to the data frame that show day, month, and year of the data.

In [20]:
Date_df = new_df.Date.str.split('/', expand=True)
Date_df.columns = ['Day','Month','Year']

new_df = new_df.join(Date_df)
new_df

Unnamed: 0,Species,Value,Units,Date,Time,Day,Month,Year
0,NO,3.5,ug m-3,01/01/2017,00:00,01,01,2017
1,NO,3.6,ug m-3,01/01/2017,01:00,01,01,2017
2,NO,2.2,ug m-3,01/01/2017,02:00,01,01,2017
...,...,...,...,...,...,...,...,...
43797,PM2.5,5.0,ug m-3,31/12/2017,21:00,31,12,2017
43798,PM2.5,5.0,ug m-3,31/12/2017,22:00,31,12,2017
43799,PM2.5,6.0,ug m-3,31/12/2017,23:00,31,12,2017


# Replace

In [21]:
new_df.Time.str.replace('00:00','0')

0            0
1        01:00
2        02:00
         ...  
43797    21:00
43798    22:00
43799    23:00
Name: Time, Length: 43800, dtype: object

it is easiest to go about replacing using dictionaries. It is best to think of a dictionary as a set of key: value pairs, with the requirement that the keys are unique (within one dictionary). A pair of braces creates an empty dictionary: {}. Placing a comma-separated list of key:value pairs within the braces adds initial key:value pairs to the dictionary; this is also the way dictionaries are written on output.

In [22]:
ReplaceDict = {'00:00': '0',  '01:00': '1',  '02:00': '2',  '03:00': '3',  '04:00': '4', \
               '05:00': '5',  '06:00': '6',  '07:00': '7',  '08:00': '8',  '09:00': '9', \
               '10:00': '10', '11:00': '11', '12:00': '12', '13:00': '13', '14:00': '14', \
               '15:00': '15', '16:00': '16', '17:00': '17', '18:00': '18', '19:00': '19', \
               '20:00': '20', '21:00': '21', '22:00': '22', '23:00': '23'}
ReplaceDict

{'00:00': '0',
 '01:00': '1',
 '02:00': '2',
 '03:00': '3',
 '04:00': '4',
 '05:00': '5',
 '06:00': '6',
 '07:00': '7',
 '08:00': '8',
 '09:00': '9',
 '10:00': '10',
 '11:00': '11',
 '12:00': '12',
 '13:00': '13',
 '14:00': '14',
 '15:00': '15',
 '16:00': '16',
 '17:00': '17',
 '18:00': '18',
 '19:00': '19',
 '20:00': '20',
 '21:00': '21',
 '22:00': '22',
 '23:00': '23'}

In [23]:
new_df.replace({'Time': ReplaceDict}, inplace = True)
new_df

Unnamed: 0,Species,Value,Units,Date,Time,Day,Month,Year
0,NO,3.5,ug m-3,01/01/2017,0,01,01,2017
1,NO,3.6,ug m-3,01/01/2017,1,01,01,2017
2,NO,2.2,ug m-3,01/01/2017,2,01,01,2017
...,...,...,...,...,...,...,...,...
43797,PM2.5,5.0,ug m-3,31/12/2017,21,31,12,2017
43798,PM2.5,5.0,ug m-3,31/12/2017,22,31,12,2017
43799,PM2.5,6.0,ug m-3,31/12/2017,23,31,12,2017


# List Comprehension

Coding solution for creating collections of items lists, dictionaries that have a codable pattern. They are also known as in-line loops.


### Example
instead of typing out a=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20] use a list comprension to creat it.

In [24]:
a=[x for x in range(1,21)]
a

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

`[FormulaOnThing for thing in things]`

### Example
instead of typing out a=[3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60] use a list comprension to creat it.

In [25]:
a=[2*x for x in range(1,21)]
a

[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40]

### Example
instead of typing out ReplaceDict = {'00:00': '0',  '01:00': '1',  '02:00': '2',  '03:00': '3',  '04:00': '4', '05:00': '5',  '06:00': '6',  '07:00': '7',  '08:00': '8',  '09:00': '9', '10:00': '10', '11:00': '11', '12:00': '12', '13:00': '13', '14:00': '14', '15:00': '15', '16:00': '16', '17:00': '17', '18:00': '18', '19:00': '19', '20:00': '20', '21:00': '21', '22:00': '22', '23:00': '23'}, use a list comprension to creat it.

In [26]:
ReplaceDict = {}
for i in range(24):
    if(i>9):
        ReplaceDict['%s:00' % str(i)] =  str(i)
    else:
        ReplaceDict['0%s:00' % str(i)] = str(i)
ReplaceDict

{'00:00': '0',
 '01:00': '1',
 '02:00': '2',
 '03:00': '3',
 '04:00': '4',
 '05:00': '5',
 '06:00': '6',
 '07:00': '7',
 '08:00': '8',
 '09:00': '9',
 '10:00': '10',
 '11:00': '11',
 '12:00': '12',
 '13:00': '13',
 '14:00': '14',
 '15:00': '15',
 '16:00': '16',
 '17:00': '17',
 '18:00': '18',
 '19:00': '19',
 '20:00': '20',
 '21:00': '21',
 '22:00': '22',
 '23:00': '23'}

In [27]:
ReplaceDict = {'%s:00' % str(i) if i>9 else '0%s:00' % str(i) : '%s' % str(i) for i in range(24)}
ReplaceDict

{'00:00': '0',
 '01:00': '1',
 '02:00': '2',
 '03:00': '3',
 '04:00': '4',
 '05:00': '5',
 '06:00': '6',
 '07:00': '7',
 '08:00': '8',
 '09:00': '9',
 '10:00': '10',
 '11:00': '11',
 '12:00': '12',
 '13:00': '13',
 '14:00': '14',
 '15:00': '15',
 '16:00': '16',
 '17:00': '17',
 '18:00': '18',
 '19:00': '19',
 '20:00': '20',
 '21:00': '21',
 '22:00': '22',
 '23:00': '23'}

In [28]:
new_df.replace({'Time': ReplaceDict}, inplace = True)
new_df

Unnamed: 0,Species,Value,Units,Date,Time,Day,Month,Year
0,NO,3.5,ug m-3,01/01/2017,0,01,01,2017
1,NO,3.6,ug m-3,01/01/2017,1,01,01,2017
2,NO,2.2,ug m-3,01/01/2017,2,01,01,2017
...,...,...,...,...,...,...,...,...
43797,PM2.5,5.0,ug m-3,31/12/2017,21,31,12,2017
43798,PM2.5,5.0,ug m-3,31/12/2017,22,31,12,2017
43799,PM2.5,6.0,ug m-3,31/12/2017,23,31,12,2017


# Reorder Columns

In [29]:
new_df = new_df[['Species', 'Units', 'Date','Year', 'Month','Day','Time','Value']]
new_df

Unnamed: 0,Species,Units,Date,Year,Month,Day,Time,Value
0,NO,ug m-3,01/01/2017,2017,01,01,0,3.5
1,NO,ug m-3,01/01/2017,2017,01,01,1,3.6
2,NO,ug m-3,01/01/2017,2017,01,01,2,2.2
...,...,...,...,...,...,...,...,...
43797,PM2.5,ug m-3,31/12/2017,2017,12,31,21,5.0
43798,PM2.5,ug m-3,31/12/2017,2017,12,31,22,5.0
43799,PM2.5,ug m-3,31/12/2017,2017,12,31,23,6.0


In [30]:
new_df.to_csv('LaqnData_cleaned.csv',index=False)