# Description

## Import libraries

In [1]:
import pandas as pd
import numpy as np
import glob 
import os

## Functions

In [2]:
def insert_PM_mean(city_data,cal_cols):
    """
    Append a 'PM_Mean' column holding the row-wise mean of the given columns.

    parameter:
        city_data is the air quality data from a city (a DataFrame).
        cal_cols is the list of column names that are averaged for each row.
    return:
        a new DataFrame: a copy of city_data with 'PM_Mean' added as the last
        column. Each mean is stored as a string formatted with three decimals
        (e.g. '143.250'). city_data itself is left unchanged.
    """
    # Work on a copy: a plain assignment would only alias the caller's frame,
    # so the insert below would modify the input too and a second call on the
    # same frame would fail because 'PM_Mean' already exists.
    inserted_city_data = city_data.copy()
    # One vectorised reduction over the selected columns replaces the
    # row-by-row iterrows loop; NaN inputs still give a 'nan' mean.
    row_means = np.mean(city_data[cal_cols].values, axis=1)
    PM_means = ["{:.3f}".format(row_mean) for row_mean in row_means]
    inserted_city_data.insert(len(inserted_city_data.columns),'PM_Mean',PM_means)
    return inserted_city_data

def convert_to_num(cbwd_in_str):
    """
    Encode a cbwd label as an integer code.

    parameter:
        cbwd_in_str is the value of the 'cbwd' column for one row
    return:
        1 for 'NE', 2 for 'SE', 3 for 'NW', 4 for 'SW' and 0 for any other value
    """
    known_labels = ('NE', 'SE', 'NW', 'SW')
    # Codes start at 1 so that 0 stays reserved for every unrecognised label.
    for code, label in enumerate(known_labels, 1):
        if cbwd_in_str == label:
            return code
    return 0

def get_columns(city_data, columns_name = 'PM_', neddel_cols = ['day', 'hour']):
    """
    Collect the PM column names of a city data set and the columns to drop.

    parameter:
        city_data is the air quality data from a city.
        columns_name is the text a column name must contain to count as a PM column
        neddel_cols is the list of extra column names that need to be dropped as well
    return:
        PM_columns, dropped_columns: the matching column names, and neddel_cols
        followed by those matching names
    """
    PM_columns = [name for name in city_data.columns if columns_name in name]
    # The concatenation builds a new list, so the default neddel_cols list is never modified.
    return PM_columns, neddel_cols + PM_columns

## Data preprocessing

In [3]:
# Every input file is named '<City>PM20130101_20151231.csv'; stripping this
# suffix from the file name leaves the city.
file_suffix = 'PM20130101_20151231.csv'
# NOTE(review): the data directory is an absolute path on one machine; adjust it before running elsewhere.
# Sorted so the cities are always loaded in the same order, whatever order glob returns.
source_urls = sorted(glob.glob('E:/UVM2017 fall/CS 295 Machine Learning/Final_Project/*' + file_suffix))
datasets = {}
for url in source_urls:
    # os.path.basename follows the platform's path rules. Splitting on '\\' and
    # taking element 1 only worked when the path held exactly one backslash.
    city_name = os.path.basename(url)[:-len(file_suffix)].lower() # 'GuangzhouPM20130101_20151231.csv' -> 'guangzhou'
    with open(url, 'r') as fin:
        datasets[city_name] = pd.read_csv(fin, index_col = 0)

In [4]:
# Preview the first rows of the Chengdu data set as it was loaded.
datasets['chengdu'].head()

Unnamed: 0_level_0,year,month,day,hour,season,PM_Caotangsi,PM_Shahepu,PM_US Post,DEWP,HUMI,PRES,TEMP,cbwd,Iws,precipitation,Iprec
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,2013,1,1,0,4,121.0,138.0,129.0,-4.0,64.42,1022.0,2.0,cv,1.0,0.0,0.0
2,2013,1,1,1,4,134.0,159.0,135.0,-3.0,80.19,1022.0,0.0,cv,1.0,0.0,0.0
3,2013,1,1,2,4,,,,-3.0,80.19,1022.0,0.0,cv,1.0,0.0,0.0
4,2013,1,1,3,4,203.0,162.0,,-3.0,80.19,1022.0,0.0,SW,1.0,0.0,0.0
5,2013,1,1,4,4,217.0,157.0,,-1.0,100.0,1021.0,-1.0,cv,0.0,0.0,0.0


In [5]:
# Column names at positions 5-7 of the Shanghai data; in the output below these are its three PM_ columns.
list(datasets['shanghai'].columns[5:8])

['PM_Jingan', 'PM_US Post', 'PM_Xuhui']

In [6]:
# Report how many rows were loaded for each city.
for city, city_frame in datasets.items():
    print(city, len(city_frame))

beijing 26280
chengdu 26280
guangzhou 26280
shanghai 25602
shenyang 26280


## Variable definitions
**Some useful column lists**

In [7]:
# The PM columns listed for the Shanghai data set, and the columns to drop for it:
# 'day' and 'hour' followed by those PM columns.
PM_columns = [
    'PM_Jingan',
    'PM_US Post',
    'PM_Xuhui',
]
dropped_columns = ['day', 'hour']
dropped_columns += PM_columns

**Data cleaning**

In [8]:
# Build one cleaned frame per city:
#   1. keep only the rows without any missing value and renumber them from 0,
#   2. add the PM_Mean column computed from that city's PM columns,
#   3. drop 'day', 'hour' and the individual PM columns,
#   4. replace the cbwd labels by their integer codes.
cleaned_datasets = {}
for city in datasets:
    PM_cols, dropped_cols = get_columns(datasets[city])
    cleaned_datasets[city] = (
        datasets[city]
        .dropna(axis = 0, how = 'any')
        .reset_index(drop = True)
        .pipe(insert_PM_mean, PM_cols)
        .drop(dropped_cols, axis = 1)
        .assign(cbwd = lambda frame: frame['cbwd'].apply(convert_to_num))
    )

In [9]:
# Write each cleaned data set to '<city>PM20130101_20151231(cleaned).csv' in the
# current working directory, leaving any file that already exists untouched.
for city in cleaned_datasets:
    cleaned_path = str(city + 'PM20130101_20151231(cleaned).csv')
    if os.path.isfile(cleaned_path):
        continue
    cleaned_datasets[city].to_csv(cleaned_path)

In [10]:
# For each city print its name, the first cleaned rows and how often every cbwd code occurs.
for city, cleaned_frame in cleaned_datasets.items():
    print(city)
    print(cleaned_frame.head())
    print(cleaned_frame['cbwd'].value_counts())

beijing
   year  month  season  DEWP  HUMI    PRES  TEMP  cbwd   Iws  precipitation  \
0  2013      3       1  -6.0  59.0  1019.0   1.0     0  0.89            0.0   
1  2013      3       1  -8.0  38.0  1019.0   5.0     0  1.78            0.0   
2  2013      3       1  -8.0  31.0  1018.0   8.0     0  2.67            0.0   
3  2013      3       1  -9.0  23.0  1017.0  11.0     0  4.45            0.0   
4  2013      3       1  -9.0  22.0  1015.0  12.0     0  5.34            0.0   

   Iprec  PM_Mean  
0    0.0  143.250  
1    0.0  152.750  
2    0.0  153.500  
3    0.0  133.750  
4    0.0  132.000  
2    6872
3    5620
0    4318
1    2252
Name: cbwd, dtype: int64
chengdu
   year  month  season  DEWP   HUMI    PRES  TEMP  cbwd  Iws  precipitation  \
0  2013      1       4  -4.0  64.42  1022.0   2.0     0  1.0            0.0   
1  2013      1       4  -3.0  80.19  1022.0   0.0     0  1.0            0.0   
2  2013      1       4  -9.0  26.99  1018.0   9.0     4  1.0            0.0   
3  2013 