In [55]:
import numpy as np
import pandas as pd
import tensorflow as tf
tf.random.set_seed(42)
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.metrics import mean_squared_error,r2_score
pd.set_option("display.precision", 2)

In [56]:
train=pd.read_csv("/content/dengue_features_train_with_out.csv")

In [57]:
train.head()

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,total_cases
0,sj,1990,18,1990-04-30,0.12,0.1,0.2,0.18,12.42,297.57,297.74,292.41,299.8,295.9,32.0,73.37,12.42,14.01,2.63,25.44,6.9,29.4,20.0,16.0,4
1,sj,1990,19,1990-05-07,0.17,0.14,0.16,0.16,22.82,298.21,298.44,293.95,300.9,296.4,17.94,77.37,22.82,15.37,2.37,26.71,6.37,31.7,22.2,8.6,5
2,sj,1990,20,1990-05-14,0.03,0.17,0.16,0.17,34.54,298.78,298.88,295.43,300.5,297.3,26.1,82.05,34.54,16.85,2.3,26.71,6.49,32.2,22.8,41.4,4
3,sj,1990,21,1990-05-21,0.13,0.25,0.23,0.24,15.36,298.99,299.23,295.31,301.4,297.0,13.9,80.34,15.36,16.67,2.43,27.47,6.77,33.3,23.3,4.0,3
4,sj,1990,22,1990-05-28,0.2,0.26,0.25,0.25,7.52,299.52,299.66,295.82,301.9,297.5,12.2,80.46,7.52,17.21,3.01,28.94,9.37,35.0,23.9,5.8,6


In [58]:
train=train.interpolate(kind='linear',limit_direction='forward')

In [59]:
def preprocess_data(data,norm_cols=[],scale_cols=[],train_scale=None):
  df=data.copy()
  if train_scale is None:
    train_scale=data
  if norm_cols:
    df[norm_cols]=StandardScaler().fit(train_scale[norm_cols]).transform(df[norm_cols])
  if scale_cols:
    df[scale_cols]=MinMaxScaler(feature_range=(0,1)).fit(train_scale[scale_cols]).transform(df[scale_cols])
  return df
def generate_multivariate_data(data,history_size=12,target_size=1,train_fraction=1,target_col=-1):
  datasets=[]
  labels=data[:,target_col]
  data=data[:,:target_col]
  data_size=len(data)
  start_idx=history_size
  train_to_idx=int(data_size*train_fraction) if train_fraction!=1 else data_size-target_size
  val_idx=train_to_idx+history_size
  end_idx=data_size-target_size
  indexes=[(start_idx,train_to_idx)]
  if train_fraction!=1:
    indexes.append((val_idx,end_idx))
  for start,end in indexes:
    d=[]
    y=[]
    for i in range(start,end):
      indices=range(i-history_size,i)
      d.append(data[indices])
      y.append(labels[i-1])
    datasets.append((np.array(d),np.array(y)))
  return datasets
def generate_lstm_data(dataframe,cols=[],scale_cols=[],norm_cols=[],history_size=12,target_size=1,train_fraction=1,target_col=-1,prepend_file=None,train_scale=None):
  df=dataframe[cols].copy()
  if "total_cases" not in df.columns:
    df["total_cases"]=np.zeros((len(df),1))
  if prepend_file is not None:
    prepend=prepend_file[cols].copy()
  datasets=[]
  for city in ["sj","iq"]:
    city_df=df[df['city']==city]
    if prepend_file is not None:
      city_df=prepend[prepend['city']==city].iloc[-(history_size+1):].append(city_df,ignore_index=True)
    train_scale=city_df.copy()
    city_df.index=city_df['week_start_date']
    city_df=preprocess_data(city_df[norm_cols+scale_cols+["total_cases"]],norm_cols=norm_cols,scale_cols=scale_cols,
                            train_scale=train_scale)
    datasets.append(city_df.values)
  return list(map(lambda x: generate_multivariate_data(x,history_size=history_size,train_fraction=train_fraction),datasets))




  

In [60]:
new_iq_norm = [
                'reanalysis_tdtr_k',
                'reanalysis_precip_amt_kg_per_m2',
                'reanalysis_relative_humidity_percent',
                'station_avg_temp_c',
                'station_min_temp_c',
                'reanalysis_dew_point_temp_k',
                'reanalysis_specific_humidity_g_per_kg',
                'reanalysis_min_air_temp_k'
]
new_iq_scale = [
                   'year',
]

new_sj_norm = [
                'precipitation_amt_mm',
                'reanalysis_air_temp_k',
                'reanalysis_avg_temp_k',
                'reanalysis_max_air_temp_k',
                'reanalysis_min_air_temp_k',
                'reanalysis_precip_amt_kg_per_m2',
                'reanalysis_relative_humidity_percent',
                'reanalysis_sat_precip_amt_mm',
                'station_avg_temp_c',
                'station_max_temp_c',
                'station_min_temp_c',
                
]
new_sj_scale = [
                   'weekofyear'
                
]
sj_cols=new_sj_norm+new_sj_scale+["total_cases","city","week_start_date"]
iq_cols=new_iq_norm+new_iq_scale+["total_cases","city","week_start_date"]

In [61]:
sj_datasets=generate_lstm_data(train,cols=sj_cols,scale_cols=new_sj_scale,norm_cols=new_sj_norm,history_size=32,prepend_file=train)
iq_datasets=generate_lstm_data(train,cols=iq_cols,scale_cols=new_iq_scale,norm_cols=new_iq_norm,history_size=32,prepend_file=train)

In [62]:
(sj_train_x, sj_train_y) = sj_datasets[0][0]
(iq_train_x, iq_train_y) = iq_datasets[1][0]
sj_train_x = sj_train_x.reshape(sj_train_x.shape[0], sj_train_x.shape[1] * sj_train_x.shape[2])
iq_train_x = iq_train_x.reshape(iq_train_x.shape[0], iq_train_x.shape[1] * iq_train_x.shape[2])

In [63]:
def build_model(optimizer = None, nodes=256, input_shape=sj_train_x.shape[-1]):
  model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=input_shape),
    tf.keras.layers.Dense(nodes, activation='selu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(nodes/2, activation='selu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1)
  ])

  if not optimizer:
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.9999, amsgrad=False)

  model.compile(loss='mae',
                optimizer=optimizer,
                metrics=['mae', 'mse'])
  return model

print(sj_train_x.shape[-2:])

(936, 384)


In [64]:
EVALUATION_INTERVAL = 200
EPOCHS = 8
BUFFER_SIZE=500
BATCH_SIZE=16
# opt = tf.keras.optimizers.Adam(learning_rate=0.1, beta_1=0.9, beta_2=0.9999, amsgrad=False)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor="mae", factor=0.8, patience=3, min_lr=1e-6, verbose=1,
                                                     mode="max")

train_sj_data_single = tf.data.Dataset.from_tensor_slices((sj_train_x, sj_train_y))
train_sj_data_single = train_sj_data_single.cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE).repeat()

opt = tf.keras.optimizers.RMSprop(
    learning_rate=0.01, rho=0.9, momentum=0.0, epsilon=1e-07, centered=False,
    name='RMSprop'
)
sj_model = build_model(optimizer=opt, nodes=80)
history = sj_model.fit(
    train_sj_data_single,
    epochs=EPOCHS, 
    steps_per_epoch=EVALUATION_INTERVAL,
    verbose=1,
    callbacks=[reduce_lr])

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8

Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.007999999821186066.
Epoch 5/8
Epoch 6/8
Epoch 7/8

Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.006399999558925629.
Epoch 8/8


In [65]:
train_iq_data_single = tf.data.Dataset.from_tensor_slices((iq_train_x, iq_train_y))
train_iq_data_single = train_iq_data_single.cache().shuffle(500).batch(16).repeat()

EPOCHS = 22
# opt = tf.keras.optimizers.Adam(learning_rate=0.1, beta_1=0.9, beta_2=0.9999, amsgrad=False)
opt = tf.keras.optimizers.RMSprop(
    learning_rate=0.001, rho=0.9, momentum=0.0, epsilon=1e-07, centered=False,
    name='RMSprop'
)
iq_model = build_model(optimizer=opt, nodes=80, input_shape=iq_train_x.shape[-1])
train_iq_data_single = tf.data.Dataset.from_tensor_slices((iq_train_x, iq_train_y))
train_iq_data_single = train_iq_data_single.cache().batch(BATCH_SIZE).repeat()

reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor="mae", factor=0.8, patience=5, min_lr=1e-6, verbose=1,
                                                     mode="max")
history = iq_model.fit(
    train_iq_data_single,
    epochs=EPOCHS, 
    steps_per_epoch=EVALUATION_INTERVAL,
    verbose=1,
    callbacks=[ reduce_lr])

Epoch 1/22
Epoch 2/22
Epoch 3/22
Epoch 4/22
Epoch 5/22
Epoch 6/22

Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.000800000037997961.
Epoch 7/22
Epoch 8/22
Epoch 9/22
Epoch 10/22
Epoch 11/22

Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.0006400000303983689.
Epoch 12/22
Epoch 13/22
Epoch 14/22
Epoch 15/22
Epoch 16/22

Epoch 00016: ReduceLROnPlateau reducing learning rate to 0.0005120000336319208.
Epoch 17/22
Epoch 18/22
Epoch 19/22
Epoch 20/22
Epoch 21/22

Epoch 00021: ReduceLROnPlateau reducing learning rate to 0.00040960004553198815.
Epoch 22/22


In [66]:
test_sj_cols=['precipitation_amt_mm',
 'reanalysis_air_temp_k',
 'reanalysis_avg_temp_k',
 'reanalysis_max_air_temp_k',
 'reanalysis_min_air_temp_k',
 'reanalysis_precip_amt_kg_per_m2',
 'reanalysis_relative_humidity_percent',
 'reanalysis_sat_precip_amt_mm',
 'station_avg_temp_c',
 'station_max_temp_c',
 'station_min_temp_c',
 'weekofyear',
 'city',
 'week_start_date']
test_iq_cols=['reanalysis_tdtr_k', 'reanalysis_precip_amt_kg_per_m2', 'reanalysis_relative_humidity_percent', 'station_avg_temp_c', 'station_min_temp_c', 'reanalysis_dew_point_temp_k', 'reanalysis_specific_humidity_g_per_kg', 'reanalysis_min_air_temp_k', 'year', 'city', 'week_start_date']


In [67]:
test=pd.read_csv("/content/dengue_features_test.csv")
test=test.interpolate(kind='linear',limit_direction='forward')
sj_test=generate_lstm_data(test,cols=test_sj_cols,scale_cols=new_sj_scale,norm_cols=new_sj_norm,history_size=32,prepend_file=train)
iq_test=generate_lstm_data(test,cols=test_iq_cols,scale_cols=new_iq_scale,norm_cols=new_iq_norm,history_size=32,prepend_file=train)
(sj_test_x, sj_test_y), = sj_test[0]
(iq_test_x, iq_test_y), = iq_test[1]
sj_test_x = sj_test_x.reshape(sj_test_x.shape[0], sj_test_x.shape[1] * sj_test_x.shape[2])
iq_test_x = iq_test_x.reshape(iq_test_x.shape[0], iq_test_x.shape[1] * iq_test_x.shape[2])
sj_test_set = tf.data.Dataset.from_tensor_slices((sj_test_x, sj_test_y)).batch(len(sj_test_y))

sj_pred = []
for x, y in sj_test_set.take(1):
    predictions = sj_model.predict(x)
    sj_pred = predictions.flatten()
    print(len(predictions.flatten()))

260


In [68]:
iq_test_set = tf.data.Dataset.from_tensor_slices((iq_test_x, iq_test_y)).batch(len(iq_test_y))
iq_pred = []
for x, y in iq_test_set.take(1):
    predictions = iq_model.predict(x)
    iq_pred = predictions.flatten()
    print(len(predictions.flatten()))

156


In [69]:
preds = np.concatenate((sj_pred, iq_pred), axis=None)

In [70]:
t=pd.read_csv("/content/dengue_features_test.csv")
t['total_cases']=preds
t['total_cases']=t['total_cases'].apply(lambda x: int(x) if x>0 else 0)
t[['city','year','weekofyear','total_cases']].to_csv("submission.csv",index=False)
