# Setup

## Import libraries

In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Options for pandas
pd.options.display.max_columns = 50
pd.options.display.max_rows = 30

# Visualizations
import plotly
import plotly.graph_objs as go
import plotly.offline as ply
plotly.offline.init_notebook_mode(connected=True)

import matplotlib as plt

from datetime import date

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
    
%autoreload 2

## Set today's date

In [2]:
today = date.today()

# dd/mm/YY
#d1 = today.strftime("%d/%m/%Y")
#print("d1 =", d1)

# Textual month, day and year	
#d2 = today.strftime("%B %d, %Y")
#print("d2 =", d2)

# mm-dd-y
d3 = today.strftime("%m-%d-%Y")
#print("d3 =", d3)

# Month abbreviation, day and year	
#d4 = today.strftime("%b-%d-%Y")
#print("d4 =", d4)

# Leave as is
filename = (f'{d3} Cleaned & Merged Countries.csv')
filename

'04-07-2020 Cleaned & Merged Countries.csv'


# Data import
We retrieve all the required data for the analysis.

## Import Country csvs

In [3]:
# Load in Johannesburg csv
johannesburg_csv = "Raw datasets/diepkloof,-johannesburg metro, south africa-air-quality.csv"

# Load in Seoul csv
seoul_csv = "Raw datasets/seoul-air-quality.csv"

# Load in Madrid csv
madrid_csv = "Raw datasets/madrid-air-quality.csv"

# Load in Merced csv
merced_csv = "Raw datasets/merced,-méxico, mexico-air-quality.csv"

# Load in Nauen csv
nauen_csv = "Raw datasets/nauen,-germany-air-quality.csv"

# Load in Lalbagh csv
lalbagh_csv = "Raw datasets/lalbagh,-lucknow, india-air-quality.csv"

# Load in LA csv
la_csv = "Raw datasets/los-angeles-north main street-air-quality.csv"

# Load in Sydney csv
sbrisbane_csv = "Raw datasets/south-brisbane, australia-air-quality.csv"

# Load in Vancouver csv
vancouver_csv = "Raw datasets/vancouver-international airport #2, british comlumbia, canada-air-quality.csv"

# Load in Wuhan csv
wuhan_csv = "Raw datasets/wuhan-air-quality.csv"

###### Display Johannesburg

In [4]:
# Read and display the csv with Pandas
johannesburg_pd = pd.read_csv(johannesburg_csv, low_memory=False)

print(f'''There are {len(johannesburg_pd)} rows''')
print(johannesburg_pd.shape)

#johannesburg_pd.head()
#johannesburg_pd.columns()

# Add location column and set their values to equal the location
johannesburg_pd['location'] = ''
order_johannesburg = johannesburg_pd.assign(location='Johannesburg')

order_johannesburg = order_johannesburg[["location", "date"," pm25", " pm10", " o3", " no2", " so2", " co"]]

order_johannesburg

There are 582 rows
(582, 7)


Unnamed: 0,location,date,pm25,pm10,o3,no2,so2,co
0,Johannesburg,2020/4/2,68,10,4,5,4,2
1,Johannesburg,2020/4/3,36,14,4,6,1,3
2,Johannesburg,2020/4/4,38,22,4,8,1,4
3,Johannesburg,2020/4/5,54,25,8,5,1,3
4,Johannesburg,2020/4/6,71,13,8,5,1,3
...,...,...,...,...,...,...,...,...
577,Johannesburg,2018/12/12,,33,16,7,1,7
578,Johannesburg,2018/8/27,,22,11,6,1,2
579,Johannesburg,2019/5/2,,,7,5,3,14
580,Johannesburg,2019/5/9,,,2,12,1,4


###### Display Seoul 

In [5]:
# Read and display the csv with Pandas
seoul_pd = pd.read_csv(seoul_csv, low_memory=False)

print(f'''There are {len(seoul_pd)} rows''')
print(seoul_pd.shape)

#seoul_pd.head()
#seoul_pd.columns()

# Add location column and set their values to equal the location
seoul_pd['location'] = ''
order_seoul = seoul_pd.assign(location='Seoul')

order_seoul = order_seoul[["location", "date"," pm25", " pm10", " o3", " no2", " so2", " co"]]

order_seoul

There are 2227 rows
(2227, 7)


Unnamed: 0,location,date,pm25,pm10,o3,no2,so2,co
0,Seoul,2020/4/2,103,42,31,30,4,4
1,Seoul,2020/4/3,84,58,41,33,5,6
2,Seoul,2020/4/4,113,67,38,17,4,4
3,Seoul,2020/4/5,92,47,43,14,3,4
4,Seoul,2020/4/6,71,57,36,40,4,5
...,...,...,...,...,...,...,...,...
2222,Seoul,2014/3/29,,93,22,50,9,7
2223,Seoul,2014/3/30,,53,44,26,7,5
2224,Seoul,2014/3/31,,38,23,65,9,8
2225,Seoul,2014/4/1,,61,20,77,10,9


###### Display Madrid

In [6]:
# Read and display the csv with Pandas
madrid_pd = pd.read_csv(madrid_csv, low_memory=False)

print(f'''There are {len(madrid_pd)} rows''')
print(madrid_pd.shape)

#madrid_pd.head()
#madrid_pd.columns()

# Add location column and set their values to equal the location
madrid_pd['location'] = ''
order_madrid = madrid_pd.assign(location='Madrid')

order_madrid = order_madrid[["location", "date"," pm25", " pm10", " o3", " no2", " so2", " co"]]

order_madrid

There are 2255 rows
(2255, 7)


Unnamed: 0,location,date,pm25,pm10,o3,no2,so2,co
0,Madrid,2020/4/2,37,10,29,14,2,
1,Madrid,2020/4/3,35,11,36,12,2,
2,Madrid,2020/4/4,39,11,34,10,2,
3,Madrid,2020/4/5,42,14,34,5,2,
4,Madrid,2020/4/6,45,17,26,8,1,
...,...,...,...,...,...,...,...,...
2250,Madrid,2015/1/1,,47,10,65,11,
2251,Madrid,2014/10/12,,11,33,11,1,
2252,Madrid,2014/1/1,,13,19,18,2,
2253,Madrid,2014/1/27,,13,30,21,3,


###### Display Merced

In [7]:
# Read and display the csv with Pandas
merced_pd = pd.read_csv(merced_csv, low_memory=False)

print(f'''There are {len(merced_pd)} rows''')
print(merced_pd.shape)

#merced_pd.head()
#merced_pd.columns()

# Add location column and set their values to equal the location
merced_pd['location'] = ''
order_merced = merced_pd.assign(location='Merced')

order_merced = order_merced[["location", "date"," pm25", " pm10", " o3", " no2", " so2", " co"]]

order_merced

There are 1183 rows
(1183, 7)


Unnamed: 0,location,date,pm25,pm10,o3,no2,so2,co
0,Merced,2020/4/2,82,47,70,,9,9
1,Merced,2020/4/3,82,38,48,,3,10
2,Merced,2020/4/4,71,32,74,21,2,10
3,Merced,2020/4/5,66,30,61,15,1,9
4,Merced,2020/4/6,59,40,23,27,7,12
...,...,...,...,...,...,...,...,...
1178,Merced,2017/3/27,,,59,37,8,11
1179,Merced,2017/3/28,,,49,35,6,11
1180,Merced,2016/12/14,,,48,49,11,11
1181,Merced,2016/12/15,,,26,33,12,10


###### Display Nauen

In [8]:
# Read and display the csv with Pandas
nauen_pd = pd.read_csv(nauen_csv, low_memory=False)

print(f'''There are {len(nauen_pd)} rows''')
print(nauen_pd.shape)

#nauen_pd.head()
#nauen_pd.columns()

# Add location column and set their values to equal the location
nauen_pd['location'] = ''
order_nauen = nauen_pd.assign(location='Nauen')

order_nauen = order_nauen[["location", "date"," pm25", " pm10", " o3", " no2", " so2", " co"]]

order_nauen

There are 1568 rows
(1568, 7)


Unnamed: 0,location,date,pm25,pm10,o3,no2,so2,co
0,Nauen,2020/4/2,22,14,35,3,,
1,Nauen,2020/4/3,41,12,33,2,,
2,Nauen,2020/4/4,28,20,37,3,,
3,Nauen,2020/4/5,28,27,41,5,1,
4,Nauen,2020/4/6,57,39,48,5,2,
...,...,...,...,...,...,...,...,...
1563,Nauen,2015/12/15,,,,10,,
1564,Nauen,2016/1/3,,,5,13,,
1565,Nauen,2015/1/1,,,16,6,,
1566,Nauen,2014/11/10,,,3,9,,


###### Display Lalbagh

In [9]:
# Read and display the csv with Pandas
lalbagh_pd = pd.read_csv(lalbagh_csv, low_memory=False)

print(f'''There are {len(lalbagh_pd)} rows''')
print(lalbagh_pd.shape)

#lalbagh_pd.head()
#lalbagh_pd.columns()

# Add location column and set their values to equal the location
lalbagh_pd['location'] = ''
order_lalbagh = lalbagh_pd.assign(location='Lalbagh')

order_lalbagh = order_lalbagh[["location", "date"," pm25", " pm10", " o3", " no2", " so2", " co"]]

order_lalbagh

There are 1381 rows
(1381, 7)


Unnamed: 0,location,date,pm25,pm10,o3,no2,so2,co
0,Lalbagh,2018/10/2,171,,39,17,1,11
1,Lalbagh,2018/10/3,171,,22,20,2,9
2,Lalbagh,2018/10/4,174,,16,28,2,14
3,Lalbagh,2018/10/5,177,,18,24,2,10
4,Lalbagh,2018/10/6,168,,20,28,4,10
...,...,...,...,...,...,...,...,...
1376,Lalbagh,2016/5/26,,,,14,2,11
1377,Lalbagh,2015/10/9,,,,4,5,
1378,Lalbagh,2015/8/21,,,,2,3,12
1379,Lalbagh,2015/3/23,,,,6,1,10


###### Display LA

In [10]:
# Read and display the csv with Pandas
la_pd = pd.read_csv(la_csv, low_memory=False)

print(f'''There are {len(la_pd)} rows''')
print(la_pd.shape)

la_pd.head()
#la_pd.columns()

# Add location column and set their values to equal the location
la_pd['location'] = ''
order_la = la_pd.assign(location='Los Angeles')

order_la = order_la[["location", "date"," pm25", " pm10", " o3", " no2", " so2", " co"]]

order_la

There are 2278 rows
(2278, 7)


Unnamed: 0,location,date,pm25,pm10,o3,no2,so2,co
0,Los Angeles,2020/4/2,49,25,29,5,,1
1,Los Angeles,2020/4/3,47,28,32,11,,2
2,Los Angeles,2020/4/4,50,,30,7,,2
3,Los Angeles,2020/3/2,38,22,17,15,,5
4,Los Angeles,2020/3/3,31,28,25,22,1,4
...,...,...,...,...,...,...,...,...
2273,Los Angeles,2014/7/20,,,10,1,,3
2274,Los Angeles,2014/7/21,,,6,2,,4
2275,Los Angeles,2014/7/22,,,5,3,,5
2276,Los Angeles,2014/7/23,,,3,4,,6


###### Display South Brisbane

In [11]:
# Read and display the csv with Pandas
sbrisbane_pd = pd.read_csv(sbrisbane_csv, low_memory=False)

print(f'''There are {len(sbrisbane_pd)} rows''')
print(sbrisbane_pd.shape)

sbrisbane_pd.head()
#sbrisbane_pd.columns()

# Add location column and set their values to equal the location
sbrisbane_pd['location'] = ''
order_sbrisbane = sbrisbane_pd.assign(location='South Brisbane')

order_sbrisbane = order_sbrisbane[["location", "date"," pm25", " pm10", " o3", " no2", " so2", " co"]]

order_sbrisbane

There are 2068 rows
(2068, 7)


Unnamed: 0,location,date,pm25,pm10,o3,no2,so2,co
0,South Brisbane,2020/4/2,23,17,,5,,1
1,South Brisbane,2020/4/3,23,10,,4,,1
2,South Brisbane,2020/4/4,17,11,,3,,2
3,South Brisbane,2020/4/5,19,32,,3,,1
4,South Brisbane,2020/4/6,39,21,,5,,2
...,...,...,...,...,...,...,...,...
2063,South Brisbane,2015/1/1,,14,,1,,1
2064,South Brisbane,2015/1/23,,8,,9,,3
2065,South Brisbane,2015/2/9,,7,,4,,1
2066,South Brisbane,2014/6/1,,9,,13,,2


###### Display Vancouver

In [12]:
# Read and display the csv with Pandas
vancouver_pd = pd.read_csv(vancouver_csv, low_memory=False)

print(f'''There are {len(vancouver_pd)} rows''')
print(vancouver_pd.shape)

vancouver_pd.head()
#vancouver_pd.columns()

# Add location column and set their values to equal the location
vancouver_pd['location'] = ''
order_vancouver = vancouver_pd.assign(location='Vancouver')

order_vancouver = order_vancouver[["location", "date"," pm25", " pm10", " o3", " no2", " so2", " co"]]

order_vancouver

There are 784 rows
(784, 7)


Unnamed: 0,location,date,pm25,pm10,o3,no2,so2,co
0,Vancouver,2020/4/2,17,7,13,2,,
1,Vancouver,2020/4/3,22,4,15,1,,
2,Vancouver,2020/4/4,11,4,15,3,,
3,Vancouver,2020/4/5,12,6,14,3,,
4,Vancouver,2020/4/6,17,15,11,9,,
...,...,...,...,...,...,...,...,...
779,Vancouver,2018/3/30,,9,13,3,,
780,Vancouver,2018/3/31,,4,8,8,,
781,Vancouver,2018/4/1,,9,17,2,,
782,Vancouver,2018/10/2,,,13,3,,


###### Display Wuhan

In [13]:
# Read and display the csv with Pandas
wuhan_pd = pd.read_csv(wuhan_csv, low_memory=False)

print(f'''There are {len(wuhan_pd)} rows''')
print(wuhan_pd.shape)

#wuhan_pd.head()
#wuhan_pd.columns()

# Add location column and set their values to equal the location
wuhan_pd['location'] = ''
order_wuhan = wuhan_pd.assign(location='Wuhan')

order_wuhan = order_wuhan[["location", "date"," pm25", " pm10", " o3", " no2", " so2", " co"]]

order_wuhan

There are 2164 rows
(2164, 7)


Unnamed: 0,location,date,pm25,pm10,o3,no2,so2,co
0,Wuhan,2020/4/2,101,46,45,13,3,8
1,Wuhan,2020/4/3,121,62,43,15,4,9
2,Wuhan,2020/4/4,156,60,61,18,5,10
3,Wuhan,2020/4/5,150,64,85,20,5,9
4,Wuhan,2020/4/6,151,58,57,22,6,9
...,...,...,...,...,...,...,...,...
2159,Wuhan,2017/1/2,,119,30,47,9,14
2160,Wuhan,2016/1/4,,182,17,32,12,24
2161,Wuhan,2015/1/1,,109,29,43,31,20
2162,Wuhan,2014/5/4,,87,38,26,10,6


# Merge all csvs

### Data processing

#### Join Files

In [14]:
merge_countries = pd.concat([order_johannesburg,
order_seoul,
order_madrid,
order_merced,
order_nauen,
order_lalbagh,
order_la,
order_sbrisbane,
order_vancouver,
order_wuhan])

merge_countries

merge_countries.fillna(0)

Unnamed: 0,location,date,pm25,pm10,o3,no2,so2,co
0,Johannesburg,2020/4/2,68,10,4,5,4,2
1,Johannesburg,2020/4/3,36,14,4,6,1,3
2,Johannesburg,2020/4/4,38,22,4,8,1,4
3,Johannesburg,2020/4/5,54,25,8,5,1,3
4,Johannesburg,2020/4/6,71,13,8,5,1,3
...,...,...,...,...,...,...,...,...
2159,Wuhan,2017/1/2,,119,30,47,9,14
2160,Wuhan,2016/1/4,,182,17,32,12,24
2161,Wuhan,2015/1/1,,109,29,43,31,20
2162,Wuhan,2014/5/4,,87,38,26,10,6


#### Sort

In [15]:
merge_countries = merge_countries.sort_values(by=['location', 'date'], ascending=True)

merge_countries.reset_index(drop=True)

# OPTIONAL: Replace empty data with a 0
merge_countries.replace(" ", 0, inplace=True)

merge_countries

Unnamed: 0,location,date,pm25,pm10,o3,no2,so2,co
537,Johannesburg,2018/10/1,52,20,17,9,1,5
428,Johannesburg,2018/10/10,71,32,14,11,3,5
429,Johannesburg,2018/10/11,77,32,15,12,4,5
430,Johannesburg,2018/10/12,81,42,15,11,2,5
431,Johannesburg,2018/10/13,91,23,7,8,2,3
...,...,...,...,...,...,...,...,...
2,Wuhan,2020/4/4,156,60,61,18,5,10
3,Wuhan,2020/4/5,150,64,85,20,5,9
4,Wuhan,2020/4/6,151,58,57,22,6,9
5,Wuhan,2020/4/7,115,61,39,24,4,8


# Save file as...

In [16]:
merge_countries.to_csv(filename, index=False)