In [1]:
import pandas as pd
from datetime import datetime
from osgeo import gdal
import numpy as np
import subprocess
import glob
from shapely.geometry import Polygon
import matplotlib.pyplot as plt
from dateutil.parser import parse
from tensorflow import keras 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score

In [2]:
train_labels = pd.read_csv("train_labels.csv")
grid_metadata = pd.read_csv("grid_metadata.csv")
satellite_metadata = pd.read_csv("pm25_satellite_metadata.csv")
satellite_metadata['Date'] =  pd.to_datetime(satellite_metadata['time_end'], format='%Y-%m-%d')

In [3]:
test_labels = pd.read_csv("submission_format.csv")

In [8]:
train_labels

Unnamed: 0,datetime,grid_id,value
0,2018-02-01T08:00:00Z,3S31A,11.400000
1,2018-02-01T08:00:00Z,A2FBI,17.000000
2,2018-02-01T08:00:00Z,DJN0F,11.100000
3,2018-02-01T08:00:00Z,E5P9N,22.100000
4,2018-02-01T08:00:00Z,FRITQ,29.800000
...,...,...,...
34307,2020-12-31T18:30:00Z,P8JA5,368.611111
34308,2020-12-31T18:30:00Z,PW0JT,294.425000
34309,2020-12-31T18:30:00Z,VXNN3,224.857143
34310,2020-12-31T18:30:00Z,VYH7U,287.000000


In [81]:
sorted(list(set(test_labels[ test_labels["datetime"].str.contains("2018-01-01")]["datetime"])))

['2018-01-01T08:00:00Z', '2018-01-01T16:00:00Z', '2018-01-01T18:30:00Z']

In [9]:
test_labels

Unnamed: 0,datetime,grid_id,value
0,2017-01-07T16:00:00Z,1X116,0.0
1,2017-01-07T16:00:00Z,9Q6TA,0.0
2,2017-01-07T16:00:00Z,KW43U,0.0
3,2017-01-07T16:00:00Z,VR4WG,0.0
4,2017-01-07T16:00:00Z,XJF9O,0.0
...,...,...,...
13499,2021-08-24T08:00:00Z,QJHW4,0.0
13500,2021-08-24T08:00:00Z,VBLD0,0.0
13501,2021-08-24T08:00:00Z,WT52R,0.0
13502,2021-08-24T08:00:00Z,ZP1FZ,0.0


In [10]:
# Let us do this for one only|

In [16]:
loc = "1X116"
one_train = train_labels[train_labels["grid_id"]==loc]
one_test = test_labels[test_labels["grid_id"]==loc]
one_train.head()

Unnamed: 0,datetime,grid_id,value
12,2018-02-01T16:00:00Z,1X116,12.857143
31,2018-02-02T16:00:00Z,1X116,27.0625
61,2018-02-03T16:00:00Z,1X116,8.125
91,2018-02-04T16:00:00Z,1X116,13.3
122,2018-02-05T16:00:00Z,1X116,22.0


In [18]:
one_test.head()

Unnamed: 0,datetime,grid_id,value
0,2017-01-07T16:00:00Z,1X116,0.0
17,2017-01-08T16:00:00Z,1X116,0.0
34,2017-01-09T16:00:00Z,1X116,0.0
51,2017-01-10T16:00:00Z,1X116,0.0
56,2017-01-11T16:00:00Z,1X116,0.0


In [19]:
col = one_train["datetime"]

In [62]:
t_final = []


for loc in list(set(train_labels["grid_id"])):
    print(loc)
    one_train = train_labels[train_labels["grid_id"]==loc]
    one_test = test_labels[test_labels["grid_id"]==loc]
    col = one_train["datetime"]
    for i in one_test["datetime"]:
        strr_search = i[5:]
#         print(strr_search)
        vals = one_train[col.str.contains(strr_search)]["value"]
#         print(vals)
        mean_pm25 = vals.mean()
        if(mean_pm25!=mean_pm25):
            mean_pm25 = t_final[-1][2]
        t_final.append((i,loc,mean_pm25))

E2AUK
HANW9
HM74A
6EIL6
PG3MI
7F1D1
IUMEZ
QJHW4
90S79
WZNCR
AZJ0Z
A7UCQ
FRITQ
78V83
ZP1FZ
9Q6TA
VBLD0
H96P6
SZLMT
C7PGV
KW43U
S77YN
VXNN3
DJN0F
PW0JT
90BZ1
ZF3ZW
XNLVD
GAC6R
UC74Z
GJLB2
PJNW1
3S31A
D7S1G
NE7BV
YHOPV
VR4WG
CPR0W
KZ9W9
E5P9N
WT52R
D72OT
XJF9O
DHO4M
P8JA5
GVQXS
ZZ8JF
VYH7U
X5DKW
A2FBI
7334C
1X116
1Z2W7
8KNI6


In [66]:
test_cal = pd.DataFrame(t_final, columns =["datetime","grid_id","calc_value"]) 
test_cal.head()

Unnamed: 0,datetime,grid_id,calc_value
0,2017-11-27T18:30:00Z,E2AUK,61.41751
1,2017-11-28T18:30:00Z,E2AUK,71.205725
2,2017-11-29T18:30:00Z,E2AUK,104.593217
3,2017-11-30T18:30:00Z,E2AUK,166.150314
4,2017-12-01T18:30:00Z,E2AUK,156.98563


In [59]:
test_labels

Unnamed: 0,datetime,grid_id,value
0,2017-01-07T16:00:00Z,1X116,0.0
1,2017-01-07T16:00:00Z,9Q6TA,0.0
2,2017-01-07T16:00:00Z,KW43U,0.0
3,2017-01-07T16:00:00Z,VR4WG,0.0
4,2017-01-07T16:00:00Z,XJF9O,0.0
...,...,...,...
13499,2021-08-24T08:00:00Z,QJHW4,0.0
13500,2021-08-24T08:00:00Z,VBLD0,0.0
13501,2021-08-24T08:00:00Z,WT52R,0.0
13502,2021-08-24T08:00:00Z,ZP1FZ,0.0


In [68]:
final = pd.merge(test_labels,test_cal,on = ["datetime","grid_id"]).drop(columns = ["value"])

In [69]:
final.head()

Unnamed: 0,datetime,grid_id,calc_value
0,2017-01-07T16:00:00Z,1X116,12.719368
1,2017-01-07T16:00:00Z,9Q6TA,14.291667
2,2017-01-07T16:00:00Z,KW43U,12.565217
3,2017-01-07T16:00:00Z,VR4WG,15.853261
4,2017-01-07T16:00:00Z,XJF9O,14.264302


In [70]:
final.columns = ["datetime","grid_id","value"]

In [73]:
final.to_csv("dumbSameMonthAverage.csv",index=False)