# 使用 sklearn 预测空气污染状况

空气污染状况愈发严重，这种情况需要公众的重视。本项目使用 sklearn 的线性回归模型预测城市空气污染状况。

在这里选择北京市的空气质量数据和全国的天气气象数据。这里选取与空气质量有关的四个参数：每小时的空气温度，露点温度，风向角度，风速。

首先要对离散的数据进行聚合：

导入库：

In [47]:
import os
from types import *
from typing import *
import re
import json

data: Dict[str, Any] = dict()

对北京市的空气质量数据进行收集：

In [48]:
for fn in os.listdir("beijingAir"):
    f = open('beijingAir/' + fn, encoding='utf8')
    #f = open('beijingAir/beijing_all_20140503.csv', encoding='utf8')
    day: Dict[str, List[Tuple[str, float]]] = dict()
    for d in f.readlines()[1:]:
        row = d.split(',')
        if len(row) < 4 or row[2].find('h') != -1:
            continue
        time = row[0] + row[1]
        num = 0.0
        try:
            num = float(row[3])
        except:
            continue
        if time not in day:
            day[time] = list()
        day[time].append((row[2], num))
    
    for time in day:
        d = dict()
        d['time'] = time
        for i in day[time]:
            d[i[0]] = i[1]
        data[time] = d

对全国天气数据进行收集：

In [49]:
for d in os.listdir('global/'):
    for dd in os.listdir('global/' + d):
        for fn in os.listdir('global/' + d + '/' + dd):    
            if fn.find('545110') == -1:
                continue
            f = open('global/' + d + '/' + dd +  '/' + fn)
            rows = f.readlines()
            for r in rows:
                col = re.split('\s+', r)
                if len(col) < 9:
                    continue
                if col[4] == '-9999' or col[5] == '-9999' or col[7] == '-9999' or col[8] == '-9999':
                    continue
                time = col[0] + col[1] + col[2] + col[3]
                if time not in data:
                    continue
                Atemp: float
                Dtemp: float
                windA: float
                windR: float
                try:
                    Atemp = float(col[4])
                    Dtemp = float(col[5])
                    windA = float(col[7])
                    windR = float(col[8])
                except:
                    continue
                data[time]['Atemp'] = Atemp / 10.0
                data[time]['Dtemp'] = Dtemp / 10.0
                data[time]['windA'] = windA / 10.0
                data[time]['windR'] = windR / 10.0

删除残缺的数据：

In [50]:
delLst = []
for k in data:
    if len(data[k]) != 8:
        delLst.append(k)

for i in delLst:
    data.pop(i)


以json格式输出：

In [51]:
data = json.dumps(data)
ff = open('data.json', 'w')
ff.write(data)
ff.close()

接下来进行建模：

导入库：

In [52]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
import numpy as np
from typing import *
import json
import numpy as np
import matplotlib.pyplot as plt

data = dict()
lst = [([],[]), ([],[]), ([], [])]
x_key = ['PM2.5', 'PM10', 'AQI']

打开json文件并读取处理：

In [53]:
with open('data.json') as f:
    data = json.load(f)
    for k in data:
        y = []
        for name in data[k]:
            if name == 'time':
                continue
            try:
                i = x_key.index(name)
            except:
                i = -1
            
            if i != -1:
                lst[i][0].append(data[k][name])
            else:
                y.append(data[k][name])
        for i in range(0, 3):
            lst[i][1].append(y)


转化为numpy数组：

In [54]:
X, y = np.array(lst[0][1]), np.array(lst[0][0])
print(X.shape)
print(y.shape)

(20087, 4)
(20087,)


将 20% 的 数据用于测试，其余用于训练：

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=532)

开始进行模型训练：

In [56]:
linreg = linear_model.LinearRegression()
#训练
model = linreg.fit(X_train, y_train)

生成拟合的公式：

In [57]:
def generate_formula(coef, intercept):
    def f(Atemp: float, Dtemp: float, windA: float, windR: float):
        return intercept + Atemp * coef[0] + Dtemp * coef[1] + windA * coef[2] + windR + coef[3]
    return f

f = generate_formula(linreg.coef_, linreg.intercept_)

测试公式：

In [58]:
print(f(20, 18, 40, 20))

87.70874291738609
