## Obtain datasets

We use data scraped from the Beijing Municipal Environmental Monitoring Center. The data contains hourly PM2.5/PM10/AQI and SO2/NO2/O3/CO readings from 36 monitoring points in Beijing.

In [28]:
from __future__ import print_function

import io
import os
import sqlite3
import uuid
from datetime import date, timedelta

from six.moves.urllib.request import urlretrieve

First, we'll download the CSV data files for a range of dates to our local machine.

In [34]:
# First day of the range to download (load_csvs starts from this date).
start_date = date(2016, 1, 1)
# Upper bound of the range, evaluated when this cell runs; load_csvs stops
# before reaching this date, so it is not downloaded itself.
end_date = date.today()
# Directory that holds the per-day CSV files, relative to the working directory.
data_folder = 'data/'

# URL patterns for one day's data, using 2013-12-05 as the example date:
#   http://beijingair.sinaapp.com/data/beijing/all/20131205/csv
#   http://beijingair-pm25.stor.sinaapp.com/beijing_all_20131205.csv
# Commented-out URL, not used by any code in this notebook:
#url = 'http://commondatastorage.googleapis.com/books1000/'

def maybe_download(date, parent_folder, force=False):
  """Fetch the CSV for a single day unless a local copy is already there.

  Args:
    date: Day to fetch, as a string that is spliced into both the remote
      URL and the local file name (the caller passes 'YYYYMMDD').
    parent_folder: Directory in which '<date>.csv' is stored.
    force: When true, download even if the local file already exists.

  Returns:
    The path of the local file: the existing one when the download is
    skipped, otherwise the path reported by urlretrieve.
  """
  target = os.path.join(parent_folder, date + '.csv')
  # Guard clause: reuse the local copy unless a re-download was requested.
  if not force and os.path.exists(target):
    return target
  source_url = 'http://beijingair-pm25.stor.sinaapp.com/beijing_all_' + date + '.csv'
  saved_path, _headers = urlretrieve(source_url, target)
  return saved_path

def load_csvs(start_date, end_date):
    """Make sure a CSV exists locally for each day in [start_date, end_date).

    Creates the module-level ``data_folder`` if it is missing, then walks the
    range one day at a time and calls ``maybe_download`` for each day.

    Args:
        start_date: First day to fetch (included).
        end_date: Day at which to stop (excluded).

    Returns:
        List of local file paths, one per day, in chronological order.
    """
    if not os.path.exists(data_folder):
        os.makedirs(data_folder)

    one_day = timedelta(days=1)
    downloaded = []
    current = start_date
    while current < end_date:
        day_stamp = current.strftime("%Y%m%d")
        downloaded.append(maybe_download(day_stamp, data_folder))
        current = current + one_day
    return downloaded

# Download (or reuse) one CSV per day in the configured range and show the
# resulting list of local paths.
csv_files = load_csvs(start_date, end_date)
print (csv_files)

['data/20160101.csv', 'data/20160102.csv', 'data/20160103.csv', 'data/20160104.csv', 'data/20160105.csv', 'data/20160106.csv', 'data/20160107.csv', 'data/20160108.csv', 'data/20160109.csv', 'data/20160110.csv', 'data/20160111.csv', 'data/20160112.csv', 'data/20160113.csv', 'data/20160114.csv', 'data/20160115.csv', 'data/20160116.csv', 'data/20160117.csv', 'data/20160118.csv', 'data/20160119.csv', 'data/20160120.csv', 'data/20160121.csv', 'data/20160122.csv', 'data/20160123.csv', 'data/20160124.csv', 'data/20160125.csv', 'data/20160126.csv', 'data/20160127.csv', 'data/20160128.csv', 'data/20160129.csv', 'data/20160130.csv', 'data/20160131.csv', 'data/20160201.csv', 'data/20160202.csv', 'data/20160203.csv', 'data/20160204.csv', 'data/20160205.csv', 'data/20160206.csv', 'data/20160207.csv', 'data/20160208.csv', 'data/20160209.csv', 'data/20160210.csv', 'data/20160211.csv', 'data/20160212.csv', 'data/20160213.csv', 'data/20160214.csv', 'data/20160215.csv', 'data/20160216.csv', 'data/201602

This site has anti-scraping protection, so the files downloaded above are corrupt. We'll fix this with PhantomJS later; for now, we use a copy of the data obtained from Baidu disk instead.

In [39]:
# Collect every entry of data_folder in sorted order, leaving out the macOS
# '.DS_Store' metadata file, and show the resulting paths.
csv_files = []
for entry in sorted(os.listdir(data_folder)):
    if entry == '.DS_Store':
        continue
    csv_files.append(os.path.join(data_folder, entry))
print(csv_files)

['data/beijing_all_20160101.csv', 'data/beijing_all_20160102.csv', 'data/beijing_all_20160103.csv', 'data/beijing_all_20160104.csv', 'data/beijing_all_20160105.csv', 'data/beijing_all_20160106.csv', 'data/beijing_all_20160107.csv', 'data/beijing_all_20160108.csv', 'data/beijing_all_20160109.csv', 'data/beijing_all_20160110.csv', 'data/beijing_all_20160111.csv', 'data/beijing_all_20160112.csv', 'data/beijing_all_20160113.csv', 'data/beijing_all_20160114.csv', 'data/beijing_all_20160115.csv', 'data/beijing_all_20160116.csv', 'data/beijing_all_20160117.csv', 'data/beijing_all_20160118.csv', 'data/beijing_all_20160119.csv', 'data/beijing_all_20160120.csv', 'data/beijing_all_20160121.csv', 'data/beijing_all_20160122.csv', 'data/beijing_all_20160123.csv', 'data/beijing_all_20160124.csv', 'data/beijing_all_20160125.csv', 'data/beijing_all_20160126.csv', 'data/beijing_all_20160127.csv', 'data/beijing_all_20160128.csv', 'data/beijing_all_20160129.csv', 'data/beijing_all_20160130.csv', 'data/bei

Now let's import the CSV files into a SQLite database.

CREATE TABLE AQI (
  uuid text PRIMARY KEY NOT NULL,
  date char(16),
  hour integer(128),
  type text(128),
  point text(128),
  value integer(128)
);

CREATE TABLE Points (
  uuid text PRIMARY KEY NOT NULL,
  name text(128)
);

In [83]:
# Default SQLite database file used by import_csv_to_sqlite.
db_filename = 'beijing.sqlite'

# Parameterized insert: values are bound by sqlite3 rather than spliced into
# the SQL text, so quotes or other special characters in a CSV cell cannot
# break or alter the statement.
_INSERT_AQI_SQL = (
    "insert into AQI (`uuid`, `date`, `hour`, `type`, `point`, `value`) "
    "values (?, ?, ?, ?, ?, ?);"
)


def _iter_aqi_rows(csv_path):
    """Yield one AQI insert tuple per non-empty station cell in csv_path."""
    # NOTE(review): assumes the CSV files are UTF-8 encoded -- confirm against
    # the real data; a different encoding raises UnicodeDecodeError here.
    with io.open(csv_path, 'r', encoding='utf-8') as f:
        # Strip the line ending from the header too; otherwise the last
        # station name would be stored with a trailing newline.
        header = f.readline().rstrip('\r\n').split(',')
        for line in f:
            row = line.replace('\n', '').replace('\r', '').split(',')
            if len(row) < 3:
                # Blank or truncated line: no date/hour/type to attach.
                continue
            thedate, thehour, thetype = row[0], row[1], row[2]
            # Columns 0-2 are date/hour/type; the remaining columns are one
            # reading per station. zip() stops at the shorter sequence, so
            # cells beyond the header are ignored.
            for point, value in zip(header[3:], row[3:]):
                if len(value) < 1:
                    continue  # empty cell: no reading for this station
                yield (str(uuid.uuid1()), thedate, thehour, thetype, point, value)


def import_csv_to_sqlite(files, db_path=None):
    """Load the per-day CSV files into the AQI table of a SQLite database.

    Each CSV is expected to have a header line followed by data lines whose
    first three comma-separated fields are date, hour and type; every further
    non-empty field becomes one AQI row keyed by the matching header name.
    The AQI table must already exist; this function only inserts. Every row
    gets a fresh uuid1 primary key, so importing the same file twice appends
    duplicate readings.

    Args:
        files: Iterable of CSV file paths to import, processed in order.
        db_path: Database file to write to. Defaults to the module-level
            ``db_filename`` when omitted.

    Returns:
        None. The name of each file is printed as it is processed.

    Raises:
        sqlite3.Error: If an insert fails; the inserts of the file being
            processed are rolled back, earlier files stay committed.
    """
    conn = sqlite3.connect(db_filename if db_path is None else db_path)
    try:
        for fn in files:
            print(fn)
            rows = list(_iter_aqi_rows(fn))
            # One transaction per file instead of one commit per inserted row:
            # `with conn` commits on success and rolls back on error.
            # hour/value are bound as text; the integer-typed columns of the
            # AQI schema convert numeric text on insert (column affinity).
            with conn:
                conn.executemany(_INSERT_AQI_SQL, rows)
    finally:
        conn.close()
    

# Import every file listed in csv_files into the SQLite database.
import_csv_to_sqlite(csv_files)




data/beijing_all_20160101.csv
data/beijing_all_20160102.csv
data/beijing_all_20160103.csv
data/beijing_all_20160104.csv
data/beijing_all_20160105.csv
data/beijing_all_20160106.csv
data/beijing_all_20160107.csv
data/beijing_all_20160108.csv
data/beijing_all_20160109.csv
data/beijing_all_20160110.csv
data/beijing_all_20160111.csv
data/beijing_all_20160112.csv
data/beijing_all_20160113.csv
data/beijing_all_20160114.csv
data/beijing_all_20160115.csv
data/beijing_all_20160116.csv
data/beijing_all_20160117.csv
data/beijing_all_20160118.csv
data/beijing_all_20160119.csv
data/beijing_all_20160120.csv
data/beijing_all_20160121.csv
data/beijing_all_20160122.csv
data/beijing_all_20160123.csv
data/beijing_all_20160124.csv
data/beijing_all_20160125.csv
data/beijing_all_20160126.csv
data/beijing_all_20160127.csv
data/beijing_all_20160128.csv
data/beijing_all_20160129.csv
data/beijing_all_20160130.csv
data/beijing_all_20160131.csv
data/beijing_all_20160201.csv
data/beijing_all_20160202.csv
data/beiji

KeyboardInterrupt: 