In [1]:
import os
import tempfile as tmp
import warnings

import sklearn.datasets
import sklearn.model_selection

os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
os.environ['OMP_NUM_THREADS'] = '1'
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['MKL_NUM_THREADS'] = '1'
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
from autoPyTorch.api.tabular_regression import TabularRegressionTask

In [3]:
import pandas as pd


In [4]:
df = pd.read_csv("processed_data_l.csv")

In [5]:
df = df.drop(columns=["Unnamed: 0","datetime","granule_id"])

In [6]:
df["location"]

0        Los Angeles (SoCAB)
1        Los Angeles (SoCAB)
2        Los Angeles (SoCAB)
3        Los Angeles (SoCAB)
4        Los Angeles (SoCAB)
                ...         
39445                  Delhi
39446                  Delhi
39447                  Delhi
39448                  Delhi
39449                  Delhi
Name: location, Length: 39450, dtype: object

In [7]:
df["datetime_dt"] = pd.to_datetime(df["datetime_dt"] )

In [8]:
def clean_df(df):
    df = df.drop(columns=["Unnamed: 0","datetime","granule_id"])
    df["datetime_dt"] = pd.to_datetime(df["datetime_dt"] )
    df["month"] = [i.month for i in df["datetime_dt"]]
    df["month"] = df["month"].astype(str).astype("category")
    one_hot = pd.get_dummies(df["month"])
    df = df.join(one_hot)
    df = df.drop('month',axis = 1)
    df = df.drop('date',axis = 1)
    one_hot = pd.get_dummies(df["location"])
    df = df.join(one_hot)
    df = df.drop('location',axis = 1)
    one_hot = pd.get_dummies(df["grid_id"])
    df = df.join(one_hot)
    df = df.drop('grid_id',axis = 1)
    return df

In [9]:
df["month"] = [i.month for i in df["datetime_dt"]]

In [10]:
df["month"] = df["month"].astype(str).astype("category")

In [11]:
one_hot = pd.get_dummies(df["month"])
one_hot

Unnamed: 0,1,10,11,12,2,3,4,5,6,7,8,9
0,0,0,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
39445,0,0,0,0,0,0,1,0,0,0,0,0
39446,0,0,0,0,0,0,1,0,0,0,0,0
39447,0,0,0,0,0,0,1,0,0,0,0,0
39448,0,0,0,0,0,0,1,0,0,0,0,0


In [12]:
df = df.join(one_hot)

In [13]:
df = df.drop('month',axis = 1)

In [14]:
df = df.drop('date',axis = 1)

In [15]:
one_hot = pd.get_dummies(df["location"])
df = df.join(one_hot)

In [16]:
df = df.drop('location',axis = 1)

In [17]:
one_hot = pd.get_dummies(df["grid_id"])
df = df.join(one_hot)

In [18]:
df = df.drop('grid_id',axis = 1)

In [19]:
df.head()

Unnamed: 0,value,datetime_dt,AOD at 0.47 micron_0,AOD at 0.47 micron_1,AOD at 0.47 micron_2,AOD at 0.47 micron_3,AOD at 0.47 micron_4,AOD at 0.47 micron_5,AOD at 0.47 micron_6,AOD at 0.47 micron_7,...,VYH7U,WT52R,WZNCR,X5DKW,XJF9O,XNLVD,YHOPV,ZF3ZW,ZP1FZ,ZZ8JF
0,11.4,2018-02-01 08:00:00+00:00,-0.563344,0.109907,-0.103034,0.234161,-0.041027,0.229606,-0.581547,-0.097234,...,0,0,0,0,0,0,0,0,0,0
1,17.0,2018-02-01 08:00:00+00:00,-0.563344,0.109907,-0.103034,0.234161,-0.041027,0.229606,-0.581547,-0.097234,...,0,0,0,0,0,0,0,0,0,0
2,11.1,2018-02-01 08:00:00+00:00,-0.563344,0.109907,-0.103034,0.234161,-0.041027,0.229606,-0.581547,-0.097234,...,0,0,0,0,0,0,0,0,0,0
3,22.1,2018-02-01 08:00:00+00:00,-0.563344,0.109907,-0.103034,0.234161,-0.041027,0.229606,-0.581547,-0.097234,...,0,0,0,0,0,0,0,0,0,0
4,29.8,2018-02-01 08:00:00+00:00,-0.563344,0.109907,-0.103034,0.234161,-0.041027,0.229606,-0.581547,-0.097234,...,0,0,0,0,0,0,0,0,0,0


In [20]:
X = df[df.columns[2:]].values

In [21]:
y = df["value"].values

In [22]:
X.shape

(39450, 2117)

In [24]:
X.shape

(39450, 2117)

In [27]:
X_f = X[:100]

In [28]:
y_f = y[:100]

In [29]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X_f,
    y_f,
    random_state=1,
)


In [30]:
api = TabularRegressionTask()

In [31]:
api.search(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test.copy(),
    y_test=y_test.copy(),
    optimize_metric='r2',
    total_walltime_limit=300,
    func_eval_time_limit_secs=500,
    memory_limit=None
)



<autoPyTorch.api.tabular_regression.TabularRegressionTask at 0x7fdd248e3fd0>

In [36]:
for i in api.show_models().split("\n"):
    print(i)

|    | Preprocessing                                  | Estimator                                                    |   Weight |
|---:|:-----------------------------------------------|:-------------------------------------------------------------|---------:|
|  0 | None                                           | LGBMLearner                                                  |     0.72 |
|  1 | SimpleImputer,NoEncoder,MinMaxScaler,KernelPCA | no embedding,ResNetBackbone,FullyConnectedHead,nn.Sequential |     0.28 |


In [37]:
y_pred = api.predict(X_test)

In [39]:
y_pred,y_test

(array([[10.37222445],
        [20.38568234],
        [24.19297266],
        [12.93878651],
        [26.87248802],
        [25.39304876],
        [ 9.42792082],
        [17.13818407],
        [ 6.68472338],
        [24.69935274],
        [25.1283946 ],
        [11.35762119],
        [19.74185371],
        [23.82077503],
        [19.81522942],
        [10.70577145],
        [23.39290857],
        [ 3.32666504],
        [ 9.80878496],
        [13.93182325],
        [18.57106876],
        [13.71081209],
        [15.27664089],
        [19.91833067],
        [26.74576855]]),
 array([11.4       , 16.        , 25.88888889, 22.3       , 10.15      ,
        12.8       , 10.6       , 19.5       , 12.6       , 29.        ,
        30.975     , 11.3       , 18.        , 10.4       , 17.9       ,
         7.05      ,  7.97142857,  4.6       , 18.8       ,  6.2       ,
        11.1       , 10.        , 15.85      ,  6.8       ,  0.4       ]))

In [216]:
from sklearn.preprocessing import MinMaxScaler

In [217]:
# scaler = MinMaxScaler()
# X_train = scaler.fit_transform(X_train)
# X_val = scaler.transform(X_val)
# X_test = scaler.transform(X_test)
# X_train, y_train = np.array(X_train), np.array(y_train)
# X_val, y_val = np.array(X_val), np.array(y_val)
# X_test, y_test = np.array(X_test), np.array(y_test)


In [402]:
train_dataset = RegressionDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).float())
val_dataset = RegressionDataset(torch.from_numpy(X_val).float(), torch.from_numpy(y_val).float())
test_dataset = RegressionDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test).float())

In [403]:
EPOCHS = 150
BATCH_SIZE = 64
LEARNING_RATE = 0.001
NUM_FEATURES = X.shape[1]

In [404]:
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=1)
test_loader = DataLoader(dataset=test_dataset, batch_size=1)

In [405]:
NUM_FEATURES

2117

In [406]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [505]:
model = Regression(NUM_FEATURES)
model.to(device)


Regression(
  (fc1): Linear(in_features=2117, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=True)
  (fc3): Linear(in_features=512, out_features=256, bias=True)
  (fc4): Linear(in_features=256, out_features=128, bias=True)
  (fc5): Linear(in_features=128, out_features=64, bias=True)
  (fc6): Linear(in_features=64, out_features=64, bias=True)
  (fc7): Linear(in_features=64, out_features=64, bias=True)
  (fc8): Linear(in_features=64, out_features=64, bias=True)
  (fc9): Linear(in_features=64, out_features=64, bias=True)
  (fc10): Linear(in_features=64, out_features=32, bias=True)
  (f): Linear(in_features=32, out_features=1, bias=True)
  (activation): LeakyReLU(negative_slope=0.01)
)

In [506]:
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [507]:
loss_stats = {
    'train': [],
    "val": []
}

In [508]:
from tqdm import tqdm

In [None]:
print("Begin training.")
for e in tqdm(range(1, EPOCHS+1)):
    
    # TRAINING
    train_epoch_loss = 0
    model.train()
    for X_train_batch, y_train_batch in train_loader:
        X_train_batch, y_train_batch = X_train_batch.to(device), y_train_batch.to(device)
        optimizer.zero_grad()
        
        y_train_pred = model(X_train_batch)
        
        train_loss = criterion(y_train_pred, y_train_batch.unsqueeze(1))
        
        train_loss.backward()
        optimizer.step()
        
        train_epoch_loss += train_loss.item()
        
        
    # VALIDATION    
    with torch.no_grad():
        
        val_epoch_loss = 0
        
        model.eval()
        for X_val_batch, y_val_batch in val_loader:
            X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)
            
            y_val_pred = model(X_val_batch)
                        
            val_loss = criterion(y_val_pred, y_val_batch.unsqueeze(1))
            
            val_epoch_loss += val_loss.item()
    loss_stats['train'].append(train_epoch_loss/len(train_loader))
    loss_stats['val'].append(val_epoch_loss/len(val_loader))                              
    
    print(f'Epoch {e+0:03}: | Train Loss: {train_epoch_loss/len(train_loader):.5f} | Val Loss: {val_epoch_loss/len(val_loader):.5f}')

Begin training.


  1% 1/150 [00:06<16:46,  6.76s/it]

Epoch 001: | Train Loss: 3773.89483 | Val Loss: 2560.32142


  1% 2/150 [00:13<16:31,  6.70s/it]

Epoch 002: | Train Loss: 2238.93790 | Val Loss: 2094.86254


  2% 3/150 [00:20<16:18,  6.65s/it]

Epoch 003: | Train Loss: 1973.36540 | Val Loss: 1999.34675


  3% 4/150 [00:26<16:09,  6.64s/it]

Epoch 004: | Train Loss: 1968.28319 | Val Loss: 1626.02725


  3% 5/150 [00:33<15:58,  6.61s/it]

Epoch 005: | Train Loss: 1863.00830 | Val Loss: 1634.09209


  4% 6/150 [00:39<15:46,  6.57s/it]

Epoch 006: | Train Loss: 1788.52122 | Val Loss: 1551.14501


  5% 7/150 [00:46<15:41,  6.59s/it]

Epoch 007: | Train Loss: 1794.62616 | Val Loss: 1728.42119


  5% 8/150 [00:52<15:33,  6.57s/it]

Epoch 008: | Train Loss: 1748.11077 | Val Loss: 1595.73587


  6% 9/150 [00:59<15:27,  6.57s/it]

Epoch 009: | Train Loss: 1670.86183 | Val Loss: 1584.61437


  7% 10/150 [01:06<15:20,  6.58s/it]

Epoch 010: | Train Loss: 1652.21149 | Val Loss: 1743.92847


  7% 11/150 [01:12<15:13,  6.57s/it]

Epoch 011: | Train Loss: 1609.69633 | Val Loss: 1710.93439


  8% 12/150 [01:19<15:06,  6.57s/it]

Epoch 012: | Train Loss: 1672.89795 | Val Loss: 1524.94078


  9% 13/150 [01:25<14:57,  6.55s/it]

Epoch 013: | Train Loss: 1551.07877 | Val Loss: 1385.97346


  9% 14/150 [01:32<14:54,  6.57s/it]

Epoch 014: | Train Loss: 1518.68990 | Val Loss: 1288.28449


 10% 15/150 [01:38<14:46,  6.56s/it]

Epoch 015: | Train Loss: 1549.15791 | Val Loss: 1466.25181


 11% 16/150 [01:45<14:38,  6.56s/it]

Epoch 016: | Train Loss: 1446.07377 | Val Loss: 1305.80729


 11% 17/150 [01:51<14:31,  6.55s/it]

Epoch 017: | Train Loss: 1425.65946 | Val Loss: 1253.82654


 12% 18/150 [01:58<14:24,  6.55s/it]

Epoch 018: | Train Loss: 1445.07893 | Val Loss: 1675.29259


 13% 19/150 [02:04<14:16,  6.54s/it]

Epoch 019: | Train Loss: 1379.75004 | Val Loss: 1292.91257


 13% 20/150 [02:11<14:10,  6.54s/it]

Epoch 020: | Train Loss: 1311.39758 | Val Loss: 1271.01933


 14% 21/150 [02:17<14:01,  6.53s/it]

Epoch 021: | Train Loss: 1314.17546 | Val Loss: 1353.11926


 15% 22/150 [02:24<13:55,  6.53s/it]

Epoch 022: | Train Loss: 1332.36760 | Val Loss: 1411.56796


In [229]:
y_pred_list = []
with torch.no_grad():
    model.eval()
    for X_batch, _ in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        y_pred_list.append(y_test_pred.cpu().numpy())
y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [41]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [44]:
mse = mean_squared_error(y_test, y_pred_list)
r_square = r2_score(y_test, y_pred_list)
print("Mean Squared Error :",mse)
print("R^2 :",r_square)

NameError: name 'y_pred_list' is not defined

In [None]:
# Mean Squared Error : 907.6883032426905
# R^2 : 0.8434949212954438

# Mean Squared Error : 604.0072439588212
# R^2 : 0.8958560984908686

In [None]:
train_dataset = RegressionDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).float())

In [248]:
df_testi['7F1D1'] = np.zeros(len(df_testi))
df_testi['WZNCR'] = np.zeros(len(df_testi))

In [250]:
df_testi = df_testi[df.columns]

In [336]:
df_test.drop_duplicates()

Unnamed: 0.1,Unnamed: 0,datetime,grid_id,value,datetime_dt,location,date,granule_id,AOD at 0.47 micron_0,AOD at 0.47 micron_1,...,cosine of Solar Zenith Angle_246,cosine of Solar Zenith Angle_247,cosine of Solar Zenith Angle_248,cosine of Solar Zenith Angle_249,cosine of Solar Zenith Angle_250,cosine of Solar Zenith Angle_251,cosine of Solar Zenith Angle_252,cosine of Solar Zenith Angle_253,cosine of Solar Zenith Angle_254,cosine of Solar Zenith Angle_255
0,0,2017-01-07T16:00:00Z,1X116,8.420161,2017-01-07 16:00:00+00:00,Taipei,2017-01-07,20170107T032000_maiac_tpe_0.hdf,-0.552611,0.114593,...,0.514868,0.139191,0.183938,-1.667788,-0.356834,2.620512,0.295454,-0.233992,0.197363,0.768304
1,1,2017-01-07T16:00:00Z,9Q6TA,9.205665,2017-01-07 16:00:00+00:00,Taipei,2017-01-07,20170107T032000_maiac_tpe_0.hdf,-0.552611,0.114593,...,0.514868,0.139191,0.183938,-1.667788,-0.356834,2.620512,0.295454,-0.233992,0.197363,0.768304
2,2,2017-01-07T16:00:00Z,KW43U,10.190537,2017-01-07 16:00:00+00:00,Taipei,2017-01-07,20170107T032000_maiac_tpe_0.hdf,-0.552611,0.114593,...,0.514868,0.139191,0.183938,-1.667788,-0.356834,2.620512,0.295454,-0.233992,0.197363,0.768304
3,3,2017-01-07T16:00:00Z,VR4WG,13.024861,2017-01-07 16:00:00+00:00,Taipei,2017-01-07,20170107T032000_maiac_tpe_0.hdf,-0.552611,0.114593,...,0.514868,0.139191,0.183938,-1.667788,-0.356834,2.620512,0.295454,-0.233992,0.197363,0.768304
4,4,2017-01-07T16:00:00Z,XJF9O,8.132245,2017-01-07 16:00:00+00:00,Taipei,2017-01-07,20170107T032000_maiac_tpe_0.hdf,-0.552611,0.114593,...,0.514868,0.139191,0.183938,-1.667788,-0.356834,2.620512,0.295454,-0.233992,0.197363,0.768304
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15994,15994,2021-06-05T18:30:00Z,YHOPV,39.464722,2021-06-05 18:30:00+00:00,Delhi,2021-06-05,20210605T071000_maiac_dl_0.hdf,-0.561104,0.114857,...,0.282401,0.094115,0.101234,-0.598972,-0.056394,-0.085390,0.155456,0.061431,0.139627,0.156484
15995,15995,2021-06-05T18:30:00Z,ZF3ZW,53.499786,2021-06-05 18:30:00+00:00,Delhi,2021-06-05,20210605T071000_maiac_dl_0.hdf,-0.561104,0.114857,...,0.282401,0.094115,0.101234,-0.598972,-0.056394,-0.085390,0.155456,0.061431,0.139627,0.156484
15996,15996,2021-06-05T18:30:00Z,GVQXS,42.492672,2021-06-05 18:30:00+00:00,Delhi,2021-06-05,20210605T071000_maiac_dl_0.hdf,-0.561104,0.114857,...,0.282401,0.094115,0.101234,-0.598972,-0.056394,-0.085390,0.155456,0.061431,0.139627,0.156484
15997,15997,2021-06-17T18:30:00Z,A7UCQ,50.507221,2021-06-17 18:30:00+00:00,Delhi,2021-06-17,20210617T060000_maiac_dl_0.hdf,-0.556475,0.114697,...,0.364079,0.141253,0.068522,-1.220975,-0.168101,1.756613,0.158490,0.021657,0.110188,0.200711


In [241]:
df_testi = clean_df(df_test)

In [255]:
X_t.shape

(15999, 2117)

In [254]:
X_t = df_testi[df_testi.columns[2:]].values

In [256]:
y_t = df_testi["value"].values
df_testi

Unnamed: 0,value,datetime_dt,AOD at 0.47 micron_0,AOD at 0.47 micron_1,AOD at 0.47 micron_2,AOD at 0.47 micron_3,AOD at 0.47 micron_4,AOD at 0.47 micron_5,AOD at 0.47 micron_6,AOD at 0.47 micron_7,...,VYH7U,WT52R,WZNCR,X5DKW,XJF9O,XNLVD,YHOPV,ZF3ZW,ZP1FZ,ZZ8JF
0,0.0,2017-01-07 16:00:00+00:00,-0.552611,0.114593,-0.116292,0.256839,-0.049796,0.242716,-0.585104,-0.107607,...,0,0,0.0,0,0,0,0,0,0,0
1,0.0,2017-01-07 16:00:00+00:00,-0.552611,0.114593,-0.116292,0.256839,-0.049796,0.242716,-0.585104,-0.107607,...,0,0,0.0,0,0,0,0,0,0,0
2,0.0,2017-01-07 16:00:00+00:00,-0.552611,0.114593,-0.116292,0.256839,-0.049796,0.242716,-0.585104,-0.107607,...,0,0,0.0,0,0,0,0,0,0,0
3,0.0,2017-01-07 16:00:00+00:00,-0.552611,0.114593,-0.116292,0.256839,-0.049796,0.242716,-0.585104,-0.107607,...,0,0,0.0,0,0,0,0,0,0,0
4,0.0,2017-01-07 16:00:00+00:00,-0.552611,0.114593,-0.116292,0.256839,-0.049796,0.242716,-0.585104,-0.107607,...,0,0,0.0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15994,0.0,2021-06-05 18:30:00+00:00,-0.561104,0.114857,-0.115555,0.257440,-0.049277,0.244174,-0.593226,-0.107686,...,0,0,0.0,0,0,0,1,0,0,0
15995,0.0,2021-06-05 18:30:00+00:00,-0.561104,0.114857,-0.115555,0.257440,-0.049277,0.244174,-0.593226,-0.107686,...,0,0,0.0,0,0,0,0,1,0,0
15996,0.0,2021-06-05 18:30:00+00:00,-0.561104,0.114857,-0.115555,0.257440,-0.049277,0.244174,-0.593226,-0.107686,...,0,0,0.0,0,0,0,0,0,0,0
15997,0.0,2021-06-17 18:30:00+00:00,-0.556475,0.114697,-0.116058,0.256996,-0.049477,0.243355,-0.588765,-0.107504,...,0,0,0.0,0,0,0,0,0,0,0


In [257]:
test_inderence_dataset = RegressionDataset(torch.from_numpy(X_t).float(), torch.from_numpy(y_t).float())

In [263]:
test_inference_loader = DataLoader(dataset=test_inderence_dataset, batch_size=1,shuffle=False)

In [264]:
y_pred_list = []
with torch.no_grad():
    model.eval()
    for X_batch, _ in test_inderence_dataset:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        y_pred_list.append(y_test_pred.cpu().numpy())
y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [267]:
len(y_pred_list)

15999

In [419]:
df_test["value"]=y_pred_list

In [420]:
test_labels = pd.read_csv("submission_format.csv")
# test_labels["datetime"] = pd.to_datetime(test_labels["datetime"])
test_labels

Unnamed: 0,datetime,grid_id,value
0,2017-01-07T16:00:00Z,1X116,0.0
1,2017-01-07T16:00:00Z,9Q6TA,0.0
2,2017-01-07T16:00:00Z,KW43U,0.0
3,2017-01-07T16:00:00Z,VR4WG,0.0
4,2017-01-07T16:00:00Z,XJF9O,0.0
...,...,...,...
13499,2021-08-24T08:00:00Z,QJHW4,0.0
13500,2021-08-24T08:00:00Z,VBLD0,0.0
13501,2021-08-24T08:00:00Z,WT52R,0.0
13502,2021-08-24T08:00:00Z,ZP1FZ,0.0


In [470]:
test_labels["datetime"][0]

'2017-01-07T16:00:00Z'

In [471]:
df_final = df_test#[df_test["location"]=="Delhi"]

In [472]:
filtered =df_final[['grid_id','value','datetime']]

In [473]:
# filtered.columns = ['grid_id','value','datetime']

In [468]:
df_f = test_labels.merge(filtered,on=['grid_id','datetime'],how="left")

KeyError: 'grid_id'

In [469]:
df_f

Unnamed: 0,datetime,grid_id,value_x,value_y
0,2017-01-07T16:00:00Z,1X116,0.0,15.883646
1,2017-01-07T16:00:00Z,9Q6TA,0.0,16.128211
2,2017-01-07T16:00:00Z,KW43U,0.0,16.841695
3,2017-01-07T16:00:00Z,VR4WG,0.0,18.448220
4,2017-01-07T16:00:00Z,XJF9O,0.0,15.212121
...,...,...,...,...
13499,2021-08-24T08:00:00Z,QJHW4,0.0,
13500,2021-08-24T08:00:00Z,VBLD0,0.0,
13501,2021-08-24T08:00:00Z,WT52R,0.0,
13502,2021-08-24T08:00:00Z,ZP1FZ,0.0,


In [427]:
dates = list(test_labels["datetime"])

In [428]:
vals = pd.read_csv("dumbSameMonthAverage.csv")

In [480]:
filtered = filtered.groupby(["datetime","value"]).mean().reset_index()

In [481]:
finals = vals.merge(df_f,on=['grid_id','datetime'],how="left")

In [482]:
finals

Unnamed: 0,datetime,grid_id,value,value_x,value_y
0,2017-01-07T16:00:00Z,1X116,12.719368,0.0,15.883646
1,2017-01-07T16:00:00Z,9Q6TA,14.291667,0.0,16.128211
2,2017-01-07T16:00:00Z,KW43U,12.565217,0.0,16.841695
3,2017-01-07T16:00:00Z,VR4WG,15.853261,0.0,18.448220
4,2017-01-07T16:00:00Z,XJF9O,14.264302,0.0,15.212121
...,...,...,...,...,...
13499,2021-08-24T08:00:00Z,QJHW4,15.800317,0.0,
13500,2021-08-24T08:00:00Z,VBLD0,10.834375,0.0,
13501,2021-08-24T08:00:00Z,WT52R,15.986975,0.0,
13502,2021-08-24T08:00:00Z,ZP1FZ,19.875870,0.0,


In [500]:
resi = []
for i in finals[["value","value_y"]].values:
    if(i[1]!=i[1]):
        resi.append(i[0])
    else:
        resi.append(8/12 *i[0] + 5/12*i[1]  )

In [501]:
max(finals["value"]),max(resi)

(999.99, 728.5081307164241)

In [494]:
len(resi)

13504

In [495]:
finals["value3"] = resi

In [496]:
grdi =finals[["datetime","grid_id","value3"]]
grdi.columns = vals.columns

In [497]:
grdi.describe()

Unnamed: 0,value
count,13504.0
mean,39.332235
std,50.999803
min,0.435152
25%,10.372853
50%,14.240134
75%,51.039168
max,472.922821


In [498]:
grdi.to_csv("hardwork3.csv",index=False)

In [462]:
df_f = df_f.groupby(["datetime","grid_id"]).mean().reset_index()

In [463]:
# final = df_f[df_f["datetime"].isin(dates)]

In [474]:
final.groupby(["datetime","grid_id"]).mean().reset_index()

Unnamed: 0,datetime,grid_id,value_x,value_y
0,2017-01-07T16:00:00Z,1X116,0.0,15.883646
1,2017-01-07T16:00:00Z,9Q6TA,0.0,16.128211
2,2017-01-07T16:00:00Z,KW43U,0.0,16.841695
3,2017-01-07T16:00:00Z,VR4WG,0.0,18.448220
4,2017-01-07T16:00:00Z,XJF9O,0.0,15.212121
...,...,...,...,...
13499,2021-08-24T08:00:00Z,QJHW4,0.0,
13500,2021-08-24T08:00:00Z,VBLD0,0.0,
13501,2021-08-24T08:00:00Z,WT52R,0.0,
13502,2021-08-24T08:00:00Z,ZP1FZ,0.0,


In [465]:
test_labels.groupby(["datetime","grid_id"]).mean().reset_index()

Unnamed: 0,datetime,grid_id,value
0,2017-01-07T16:00:00Z,1X116,0.0
1,2017-01-07T16:00:00Z,9Q6TA,0.0
2,2017-01-07T16:00:00Z,KW43U,0.0
3,2017-01-07T16:00:00Z,VR4WG,0.0
4,2017-01-07T16:00:00Z,XJF9O,0.0
...,...,...,...
13499,2021-08-24T08:00:00Z,QJHW4,0.0
13500,2021-08-24T08:00:00Z,VBLD0,0.0
13501,2021-08-24T08:00:00Z,WT52R,0.0
13502,2021-08-24T08:00:00Z,ZP1FZ,0.0


In [287]:
filtered["datetime_dt"]= pd.to_datetime(filtered["datetime_dt"])

In [289]:
filtered.sort_values("datetime_dt")#.reset_index()["value"].plot()

Unnamed: 0,datetime_dt,value
0,2017-09-11 18:30:00+00:00,27.127584
1,2017-09-11 18:30:00+00:00,30.793737
2,2017-09-11 18:30:00+00:00,31.781155
3,2017-09-12 18:30:00+00:00,47.110882
4,2017-09-13 18:30:00+00:00,27.875738
...,...,...
4639,2021-07-24 18:30:00+00:00,16.941996
4638,2021-07-24 18:30:00+00:00,14.971571
4662,2021-07-24 18:30:00+00:00,32.055069
4649,2021-07-24 18:30:00+00:00,23.991646


In [399]:
# y_pred_list

In [502]:
class Regression(pl.LightningModule):
    
    def __init__(self,NUM_FEATURES):
        super(Regression, self).__init__()
        self.fc1 = nn.Linear(NUM_FEATURES, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 256)
        self.fc4 = nn.Linear(256, 128)
        self.fc5 = nn.Linear(128, 64)
        self.fc6 = nn.Linear(64, 64)
        self.fc7 = nn.Linear(64, 64)
        self.fc8 = nn.Linear(64, 64)
        self.fc9 = nn.Linear(64, 64)
        self.fc10 = nn.Linear(64, 32)
        self.f = nn.Linear(32, 1)
        self.activation = nn.LeakyReLU()
    

    def forward(self, x):
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        x = self.activation(self.fc3(x))
        x = self.activation(self.fc4(x))
        x = self.activation(self.fc5(x))
        x = self.activation(self.fc6(x))
        x = self.activation(self.fc7(x))
        x = self.activation(self.fc8(x))
        x = self.activation(self.fc9(x))
        x = self.activation(self.fc10(x))
        x = self.f(x)
        return x

In [75]:
torch.tensor(X[0]).shape

torch.Size([2117])

In [81]:
n_samples = 5

In [82]:
class SentimentNet(nn.Module):
    def __init__(self, vocab_size, output_size=1, embedding_dim=128, hidden_dim=8, n_layers=8, drop_prob=0.5):
        super(SentimentNet, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
       
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x, hidden):
        batch_size = x.size(0)
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        out = self.dropout(lstm_out)
        out = self.fc(out)
        out = self.sigmoid(out)
        
        out = out.view(batch_size, -1)
        out = out[:,-1]
        return out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device))
        return hidden

In [83]:
import torch.optim as optim
model = LSTM()
criterion = nn.MSELoss()
optimiser = optim.Adam(model.parameters(), lr=0.08)

In [84]:
torch.tensor(X[:5]).shape,y.shape

(torch.Size([5, 2117]), (39450,))

In [85]:
model(torch.tensor(X[:5]),y[:5])

NameError: name 'input_t' is not defined

In [48]:
def training_loop(n_epochs, model, optimiser, loss_fn, 
                  train_input, train_target, test_input, test_target):
    for i in range(n_epochs):
        def closure():
            optimiser.zero_grad()
            out = model(train_input)
            loss = loss_fn(out, train_target)
            loss.backward()
            return loss
        optimiser.step(closure)
        with torch.no_grad():
            future = 1000
            pred = model(test_input, future=future)
            # use all pred samples, but only go to 999
            loss = loss_fn(pred[:, :-future], test_target)
            y = pred.detach().numpy()
        # print the loss
        out = model(train_input)
        loss_print = loss_fn(out, train_target)
        print("Step: {}, Loss: {}".format(i, loss_print))

In [50]:
train_prop = 0.95
train_samples = round(N * train_prop) 
test_samples = N - train_samples

In [51]:
train_input = torch.from_numpy(y[test_samples:, :-1]) # (train_samples, L-1)
train_target = torch.from_numpy(y[test_samples:, 1:]) # (train_samples, L-1)
test_input = torch.from_numpy(y[:test_samples, :-1]) # (train_samples, L-1)
test_target = torch.from_numpy(y[:test_samples, 1:]) # (train_samples, L-1)

In [53]:
N = 100 # number of theoretical series of games
L = 11 # number of games in each series
x = np.empty((N,L), np.float32) # instantiate empty array
x[:] = np.arange(L)
y = (1.6*x + 4).astype(np.float32)

# add some noise
for i in range(len(y)):
    y[i] += np.random.normal(10, 1)

In [55]:
training_loop(n_epochs = 10,
              model = model,
              optimiser = optimiser,
              loss_fn = criterion,
              L = L,
              train_input = train_input,
              train_target = train_target,
              test_input = test_input,
              test_target = test_target)

NameError: name 'n_samples' is not defined

In [8]:
df_values = df[df.columns[8:]].values

In [9]:
df_values.shape

(39450, 2046)

In [10]:
vectors=df_values
target = df_values[0]

In [237]:
df_test = pd.read_csv("processed_data_test.csv")

In [234]:
df_test= df_test.drop(columns=["Unnamed: 0","datetime","granule_id"])

In [235]:
df_test.head()

Unnamed: 0,grid_id,value,datetime_dt,location,date,AOD at 0.47 micron_0,AOD at 0.47 micron_1,AOD at 0.47 micron_2,AOD at 0.47 micron_3,AOD at 0.47 micron_4,...,cosine of Solar Zenith Angle_246,cosine of Solar Zenith Angle_247,cosine of Solar Zenith Angle_248,cosine of Solar Zenith Angle_249,cosine of Solar Zenith Angle_250,cosine of Solar Zenith Angle_251,cosine of Solar Zenith Angle_252,cosine of Solar Zenith Angle_253,cosine of Solar Zenith Angle_254,cosine of Solar Zenith Angle_255
0,1X116,0.0,2017-01-07 16:00:00+00:00,Taipei,2017-01-07,-0.552611,0.114593,-0.116292,0.256839,-0.049796,...,0.514868,0.139191,0.183938,-1.667788,-0.356834,2.620512,0.295454,-0.233992,0.197363,0.768304
1,9Q6TA,0.0,2017-01-07 16:00:00+00:00,Taipei,2017-01-07,-0.552611,0.114593,-0.116292,0.256839,-0.049796,...,0.514868,0.139191,0.183938,-1.667788,-0.356834,2.620512,0.295454,-0.233992,0.197363,0.768304
2,KW43U,0.0,2017-01-07 16:00:00+00:00,Taipei,2017-01-07,-0.552611,0.114593,-0.116292,0.256839,-0.049796,...,0.514868,0.139191,0.183938,-1.667788,-0.356834,2.620512,0.295454,-0.233992,0.197363,0.768304
3,VR4WG,0.0,2017-01-07 16:00:00+00:00,Taipei,2017-01-07,-0.552611,0.114593,-0.116292,0.256839,-0.049796,...,0.514868,0.139191,0.183938,-1.667788,-0.356834,2.620512,0.295454,-0.233992,0.197363,0.768304
4,XJF9O,0.0,2017-01-07 16:00:00+00:00,Taipei,2017-01-07,-0.552611,0.114593,-0.116292,0.256839,-0.049796,...,0.514868,0.139191,0.183938,-1.667788,-0.356834,2.620512,0.295454,-0.233992,0.197363,0.768304


In [15]:
# import torch
# print(torch.rand(1, device="cuda"))

In [None]:
class LSTM(nn.Module):
    def __init__(self, hidden_layers=64):
        super(LSTM, self).__init__()
        self.hidden_layers = hidden_layers
        # lstm1, lstm2, linear are all layers in the network
        self.lstm1 = nn.LSTMCell(1, self.hidden_layers)
        self.lstm2 = nn.LSTMCell(self.hidden_layers, self.hidden_layers)
        self.linear = nn.Linear(self.hidden_layers, 1)
        
    def forward(self, y, future_preds=0):
        outputs, num_samples = [], y.size(0)
        h_t = torch.zeros(n_samples, self.hidden_layers, dtype=torch.float32)
        c_t = torch.zeros(n_samples, self.hidden_layers, dtype=torch.float32)
        h_t2 = torch.zeros(n_samples, self.hidden_layers, dtype=torch.float32)
        c_t2 = torch.zeros(n_samples, self.hidden_layers, dtype=torch.float32)
        
        for time_step in y.split(1, dim=1):
            # N, 1
            h_t, c_t = self.lstm1(input_t, (h_t, c_t)) # initial hidden and cell states
            h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2)) # new hidden and cell states
            output = self.linear(h_t2) # output from the last FC layer
            outputs.append(output)
            
        for i in range(future_preds):
            # this only generates future predictions if we pass in future_preds>0
            # mirrors the code above, using last output/prediction as input
            h_t, c_t = self.lstm1(output, (h_t, c_t))
            h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))
            output = self.linear(h_t2)
            outputs.append(output)
        # transform list to tensor    
        outputs = torch.cat(outputs, dim=1)
        return outputs