# In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd


import missingno as msno
from scipy.special import boxcox1p, inv_boxcox1p


from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import math
from sklearn.svm import SVR  
import xgboost as xgb
from pickle import dump,load

from preprocessor import preprocess,NumericalPreprocess,CategoryPreprocess


def train_sj():
    """Train the regression model for city 'sj' and pickle it to disk.

    Steps:
      1. Read the label and feature CSVs from ``data/`` and inner-join them
         on ``city``, ``year`` and ``weekofyear``.
      2. Keep the rows whose ``city`` is ``'sj'`` and run them through
         ``preprocess(..., 'train')``.
      3. Replace ``total_cases`` with ``boxcox1p(total_cases, 0.6)``.
      4. Drop the rows whose ``year`` is in 1990-1999.
      5. Fit an ``xgb.XGBRegressor`` on the columns listed in ``imp_cols_sj``.
      6. Write the fitted model to ``model/model_sj.pkl``.

    Returns:
        None. The only output is the pickle file.
    """
    labels = pd.read_csv('data/dengue_labels_train.csv')
    features = pd.read_csv('data/dengue_features_train.csv')
    merged = features.merge(labels, how='inner', on=['city', 'year', 'weekofyear'])
    city_rows = merged[merged.city == 'sj']

    train_df = preprocess(city_rows, 'train')
    train_df['total_cases'] = boxcox1p(train_df['total_cases'], 0.6)

    # The original evaluated this filter but discarded the result, so the
    # listed years were still used for training. Assign it so the exclusion
    # actually takes effect.
    # NOTE(review): assumes `year` still holds calendar years after
    # preprocess() - TODO confirm; if it is rescaled this keeps every row.
    excluded_years = list(range(1990, 2000))
    train_df = train_df[~train_df['year'].isin(excluded_years)]

    imp_cols_sj = ['year', 'ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw', 'precipitation_amt_mm', 
                'reanalysis_air_temp_k', 'reanalysis_avg_temp_k', 'reanalysis_dew_point_temp_k', 
                'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k', 'reanalysis_precip_amt_kg_per_m2', 
                'reanalysis_relative_humidity_percent', 'reanalysis_specific_humidity_g_per_kg', 'reanalysis_tdtr_k', 
                'station_avg_temp_c', 'station_diur_temp_rng_c', 'station_max_temp_c', 'station_min_temp_c', 
                'station_precip_mm', 'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 
                'month_9', 'month_10', 'month_11', 'month_12', 'weekofyear_1', 'weekofyear_2', 'weekofyear_3', 'weekofyear_4', 
                'weekofyear_5', 'weekofyear_6', 'weekofyear_8', 'weekofyear_9', 'weekofyear_11', 'weekofyear_12', 
                'weekofyear_13', 'weekofyear_14', 'weekofyear_15', 'weekofyear_16', 'weekofyear_17', 'weekofyear_19',
                'weekofyear_20', 'weekofyear_21', 'weekofyear_22', 'weekofyear_23', 'weekofyear_24', 'weekofyear_25', 
                'weekofyear_26', 'weekofyear_27', 'weekofyear_28', 'weekofyear_29', 'weekofyear_30', 'weekofyear_31', 
                'weekofyear_32', 'weekofyear_33', 'weekofyear_34', 'weekofyear_35', 'weekofyear_36', 'weekofyear_37', 
                'weekofyear_38', 'weekofyear_39', 'weekofyear_40', 'weekofyear_41', 'weekofyear_42', 'weekofyear_43', 
                'weekofyear_44', 'weekofyear_45', 'weekofyear_46', 'weekofyear_48', 'weekofyear_49', 'weekofyear_50', 
                'weekofyear_51', 'weekofyear_52', 'weekofyear_53', 'ndvi_avg']

    # `gamma` was previously misspelled as `gammma`, which is not an XGBoost
    # parameter name, so the intended value was never applied.
    model = xgb.XGBRegressor(
        n_estimators=1000,
        tree_method='hist',
        max_depth=6,
        learning_rate=0.05,
        min_child_weight=2,
        gamma=1,
        eval_metric='mae',
    )

    model.fit(train_df[imp_cols_sj], train_df['total_cases'])

    # Context manager guarantees the file is flushed and closed even if
    # pickling raises.
    with open('model/model_sj.pkl', 'wb') as model_file:
        dump(model, model_file)
    
    
    
def train_iq():
    """Train the regression model for city 'iq' and pickle it to disk.

    Steps:
      1. Read the label and feature CSVs from ``data/`` and inner-join them
         on ``city``, ``year`` and ``weekofyear``.
      2. Keep the rows whose ``city`` is ``'iq'`` and run them through
         ``preprocess(..., 'train')``.
      3. Replace ``total_cases`` with ``boxcox1p(total_cases, 0.2)``.
      4. Drop the rows whose ``year`` is 2000 or 2001.
      5. Fit an ``SVR`` on the columns listed in ``imp_cols_iq``.
      6. Write the fitted model to ``model/model_iq.pkl``.

    Returns:
        None. The only output is the pickle file.
    """
    labels = pd.read_csv('data/dengue_labels_train.csv')
    features = pd.read_csv('data/dengue_features_train.csv')
    merged = features.merge(labels, how='inner', on=['city', 'year', 'weekofyear'])
    city_rows = merged[merged.city == 'iq']

    train_df = preprocess(city_rows, 'train')
    train_df['total_cases'] = boxcox1p(train_df['total_cases'], 0.2)

    # The original evaluated this filter but discarded the result, so the
    # listed years were still used for training. Assign it so the exclusion
    # actually takes effect.
    # NOTE(review): assumes `year` still holds calendar years after
    # preprocess() - TODO confirm; if it is rescaled this keeps every row.
    excluded_years = [2000, 2001]
    train_df = train_df[~train_df['year'].isin(excluded_years)]

    imp_cols_iq = ['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw', 'precipitation_amt_mm', 'reanalysis_air_temp_k', 
               'reanalysis_avg_temp_k', 'reanalysis_dew_point_temp_k', 'reanalysis_max_air_temp_k', 
               'reanalysis_min_air_temp_k', 'reanalysis_precip_amt_kg_per_m2', 'reanalysis_relative_humidity_percent', 
               'reanalysis_specific_humidity_g_per_kg', 'reanalysis_tdtr_k', 'station_avg_temp_c', 'station_diur_temp_rng_c', 
               'station_max_temp_c', 'station_min_temp_c', 'station_precip_mm', 'month_1', 'month_2',
               'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11',
               'month_12', 'weekofyear_1', 'weekofyear_2', 'weekofyear_3', 'weekofyear_4', 'weekofyear_5', 'weekofyear_6',
               'weekofyear_7', 'weekofyear_8', 'weekofyear_9', 'weekofyear_10', 'weekofyear_11', 'weekofyear_12',
               'weekofyear_13', 'weekofyear_14', 'weekofyear_15', 'weekofyear_16', 'weekofyear_17', 'weekofyear_18', 
               'weekofyear_19', 'weekofyear_20', 'weekofyear_21', 'weekofyear_22', 'weekofyear_23', 'weekofyear_24',
               'weekofyear_25', 'weekofyear_26', 'weekofyear_27', 'weekofyear_28', 'weekofyear_29', 'weekofyear_30', 
               'weekofyear_31', 'weekofyear_32', 'weekofyear_33', 'weekofyear_34', 'weekofyear_35', 'weekofyear_36', 
               'weekofyear_37', 'weekofyear_38', 'weekofyear_39', 'weekofyear_40', 'weekofyear_41', 'weekofyear_42', 
               'weekofyear_43', 'weekofyear_44', 'weekofyear_45', 'weekofyear_46', 'weekofyear_47', 'weekofyear_48', 
               'weekofyear_49', 'weekofyear_50', 'weekofyear_51', 'ndvi_avg']

    model = SVR(gamma='scale', C=1, epsilon=0.2)

    model.fit(train_df[imp_cols_iq], train_df['total_cases'])

    # Context manager guarantees the file is flushed and closed even if
    # pickling raises.
    with open('model/model_iq.pkl', 'wb') as model_file:
        dump(model, model_file)
    
def main():
    """Run every per-city training routine in order: 'sj' first, then 'iq'."""
    for train_city in (train_sj, train_iq):
        train_city()
    

# Run the training routines only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()