In [5]:
import numpy as np
import shogun as sg

In [6]:
from pathlib import Path

COUNTRIES = ['austria', 'belgium', 'germany', 'italy', 'netherlands']
LANGUAGE = {'austria'    : 'german',
            'belgium'    : 'dutch',
            'germany'    : 'german',
            'italy'      : 'italian',
            'netherlands': 'dutch'}

PREFIX = {'german': 'de', 'dutch': 'nl', 'italian': 'it'}

POPULATION = {'austria'    : 9003354,
              'belgium'    : 11586640,
              'germany'    : 83768122,
              'italy'      : 60467045,
              'netherlands': 17132636}

processed_data_path = Path.cwd() / 'data' / 'processed'
# keywords_path = Path.cwd() / 'revised_keywords'
# models_path = Path.cwd() / 'influenza_estimator' / 'models'

years = [2007 + i for i in range(13)]

LOG_FILENAME = str(
    (Path.cwd() / 'influenza_estimator' / 'information.log').absolute())

In [7]:
import pandas as pd


def load_features(path):
    if path.exists() and path.is_file():
        df = pd.read_csv(path)
        features = df.drop(columns=['incidence'])
        return features.values


def load_labels(path):
    if path.exists() and path.is_file():
        df = pd.read_csv(path)
        labels = pd.Series(df['incidence'])
        return labels.values


def load(path, is_labels=False):
    if path.exists() and path.is_file():
        df = pd.read_csv(path)
        if is_labels:
            df = pd.Series(df['incidence'])
        return df.values

In [8]:

alpha = {
    'austria':19.5,
    'belgium':84,
    'germany':37.5,
    'italy':57,
    'netherlands':84
}

main_data_path = Path.cwd() / 'data'
df = {}

for country in COUNTRIES:
            print('For '+country)
            main_file_path = main_data_path / (country + '.csv')
            df[country] = pd.read_csv(main_file_path)
            print(df[country].shape)


            x_train_file_path = processed_data_path / (country + '_features.csv')
            y_train_file_path = processed_data_path / (country + '_labels.csv')
            features_train = sg.create_features(load(x_train_file_path).T)
            labels_train = sg.create_labels(load(y_train_file_path, is_labels=True))

            lrr = sg.create_machine("LinearRidgeRegression", tau=alpha[country], labels=labels_train, use_bias=False)
            lrr.train(features_train)
            labels_train_predict = lrr.apply(features_train)
            y_train_predicted = labels_train_predict.get("labels").reshape(labels_train_predict.get("labels").shape[0])
            print(y_train_predicted.shape)
            df[country]['estimate_lrr'] = y_train_predicted

            mean_rule = sg.create_combination_rule("MeanRule")
            rand_forest = sg.create_machine("RandomForest",
                                            labels=labels_train,
                                            num_bags=5, seed=1,
                                            combination_rule=mean_rule)

            rand_forest.train(features_train)
            labels_train_predict = rand_forest.apply_regression(features_train)
            y_train_predicted = labels_train_predict.get("labels").reshape(labels_train_predict.get("labels").shape[0])
            df[country]['estimate_rf'] = y_train_predicted

            glm = sg.create_machine("GLM", labels=labels_train, alpha=0.17, learning_rate=0.002, max_iterations=1000, tolerance=0.000001, eta=0.2)
            glm.put("lambda", 0.001)
            glm.train(features_train)
            labels_train_predict = glm.apply(features_train)
            y_train_predicted = labels_train_predict.get("labels").reshape(labels_train_predict.get("labels").shape[0])
            print(y_train_predicted)
            df[country]['estimate_p'] = y_train_predicted

#             random_forest[country] = rand_forest


For austria
(156, 305)
(156,)
[nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan]
For belgium
(260, 296)
(260,)
[nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan n

In [9]:
df['italy'].head()

Unnamed: 0.1,Unnamed: 0,incidence,Acrocianosi,Acroosteolisi,Adiadococinesia,Alfuy_virus,Alitosi,Allucinazione_uditiva,Anatossina,Anemia_infettiva_equina,...,Xantoma,Xerosi,Yaounde_virus,Yokose_virus,date,estimate,week,estimate_lrr,estimate_rf,estimate_p
0,0,0.19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2007-10-21,,2007-42,0.002335,0.424473,
1,1,0.32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2007-10-28,,2007-43,0.004615,0.544473,
2,2,0.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2007-11-04,,2007-44,0.005049,0.438473,
3,3,0.41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2007-11-11,,2007-45,0.005587,0.544473,
4,4,0.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2007-11-18,,2007-46,0.008879,0.544473,


In [11]:
final_data_path = Path.cwd() / 'data' / 'final'

for country in COUNTRIES:
    final_data_file = final_data_path / (country + '.csv')
    df[country].to_csv(final_data_file);