In [1]:
import os
import sys
import tqdm
import random
import numpy as np
import pandas as pd
from numpy.random import seed
from skopt import BayesSearchCV
from sklearn.base import BaseEstimator
from sklearn.metrics import mean_squared_error

sys.path.append('.')
sys.path.append('../../')
sys.path.append('../network_model/')

notebook_path = os.getcwd()
project_path = os.path.abspath(os.path.join(notebook_path, '..'))
sys.path.append(project_path)

from network_model.network_classes import Network
from networks_container import NetworksContainer
from estimator import Estimator

In [2]:
# Seed Python's `random` module and NumPy's legacy global generator with the
# same value so repeated runs draw the same pseudo-random numbers.
cherrypicked_seed = 42

random.seed(cherrypicked_seed)
np.random.seed(cherrypicked_seed)

In [3]:
# Load the raw dataset; dtype=str keeps every column as text (no numeric parsing).
dataset_palin = pd.read_csv("../datasets/palinAnonimized.csv", dtype=str)

In [4]:
# Read the header-less integer CSV and flatten it, row by row, into a single
# plain Python list of ints.
endorsers_percentage_list = (
    pd.read_csv("../datasets/palin_endorsers_hours_percentage_list.csv", dtype=int, header=None)
    .to_numpy()
    .ravel()
    .tolist()
)

In [5]:
# Length of the flattened endorser series (later used as the fit target).
print(len(endorsers_percentage_list))

3634


In [6]:
# Subtract 1 because the first epoch is treated as the initial state
epochs = len(endorsers_percentage_list) - 1

In [7]:
# Row count of the raw dataset.
print(len(dataset_palin))

4423


In [8]:
# Number of nodes given to each simulated network (passed to NetworksContainer).
# NOTE(review): hard-coded; how it relates to the dataset row count printed
# earlier is not shown here — confirm where this value comes from.
n_nodes = 3181
n_nodes

3181

In [9]:
# Bots are 2% of the nodes, rounded to the nearest integer.
n_bots = round(0.02 * n_nodes)
n_bots

64

In [10]:
# Fact checkers are 2% of the nodes, rounded to the nearest integer.
n_fact_checkers = round(0.02 * n_nodes)
n_fact_checkers

64

In [11]:
# Number of influencer nodes: a fixed count rather than a percentage of n_nodes.
n_influencers = 25

In [12]:
# Every node that is not a bot, a fact checker or an influencer is a common user.
n_commons = n_nodes - (n_bots + n_fact_checkers + n_influencers)
n_commons

3028

In [13]:
# 3% of the nodes (rounded) minus the bots.
# NOTE(review): presumably the bots already count towards the initially
# infected share — confirm against NetworksContainer.create_network_list.
n_initial_infected_nodes = round(0.03 * n_nodes) - n_bots
n_initial_infected_nodes

31

In [14]:
# Empty results table; the grid-search loop appends one row per evaluated
# (alpha, beta) combination.
df = pd.DataFrame(
    columns=[
        'alpha', 'beta', 'homophily',
        'prob_infection', 'prob_vaccination', 'prob_cure',
        'prob_influencer', 'prob_echo',
        'rmse',
    ]
)

In [15]:
# Each tuned probability is searched between 0.0 and 1.0.
search_spaces = {
    name: (0.0, 1.0)
    for name in (
        'prob_influencer',
        'prob_infection',
        'prob_vaccination',
        'prob_cure',
        'prob_echo',
    )
}

# Trackers updated by on_step_callback. 1000 is a starting sentinel that any
# recorded value must beat.
best_rmse = 1000
best_params = []

In [16]:
def on_step_callback(result):
    """Keep track of the best RMSE and parameters after every search step.

    Looks at the most recent entry of ``result.func_vals``. When it is
    strictly lower than the module-level ``best_rmse``, that value is stored
    in ``best_rmse`` and ``result.x`` is stored in the module-level
    ``best_params``. Returns ``None`` in every case.
    """
    global best_rmse, best_params
    latest_val = result.func_vals[-1]
    # Guard clause: leave the trackers untouched unless the newest value is an improvement.
    if not (latest_val < best_rmse):
        return
    best_rmse, best_params = latest_val, result.x

In [17]:
def check_values(alpha, beta):
    """Return True when the (alpha, beta) pair is usable, False otherwise.

    A pair is rejected when ``alpha + beta`` exceeds 1.0, or when
    ``alpha == 0`` and ``beta == 1`` (with those values no new nodes are added).
    """
    return not (alpha + beta > 1.0 or (alpha == 0 and beta == 1))

In [18]:
def run_simulations(alpha, beta, homophily):
    """Build the networks for one (alpha, beta, homophily) setting and tune the
    five probabilities listed in ``search_spaces`` against the endorser series.

    Parameters
    ----------
    alpha, beta, homophily
        Forwarded unchanged to ``NetworksContainer``.

    Returns
    -------
    tuple
        ``(prob_infection, prob_vaccination, prob_cure, prob_influencer,
        prob_echo)`` read from the module-level ``best_params`` that
        ``on_step_callback`` fills in during the search.

    Raises
    ------
    RuntimeError
        If ``best_params`` does not hold one value per searched parameter
        after the fit (e.g. no step ever beat the starting ``best_rmse``).
    """
    # NOTE(review): the prob_* values passed here look like placeholders that the
    # search overrides through Estimator — confirm in estimator.py.
    container = NetworksContainer(n_networks=3, n_nodes=n_nodes,
                                  alpha=alpha, beta=beta,
                                  delta_in=1, delta_out=1,
                                  homophily=homophily, n_commons=n_commons,
                                  n_influencers=n_influencers, n_bots=n_bots,
                                  n_fact_checkers=n_fact_checkers,
                                  prob_complaint=0.1,
                                  prob_infection=0.5,
                                  prob_vaccination=0.1,
                                  prob_cure=0.1,
                                  prob_influencer=0.1,
                                  exp_decay=True, user_block=False,
                                  prob_echo=0.0, epochs=epochs)
    container.create_network_list(n_initial_infected_nodes=n_initial_infected_nodes)
    estimator = Estimator()
    clf = BayesSearchCV(estimator,
                        search_spaces=search_spaces,
                        # Single "split" that uses all the data for both train and
                        # test, i.e. no real cross-validation.
                        cv=[(slice(None), slice(None))],
                        verbose=0,
                        n_iter=8)
    clf.fit(np.array([container]),
            np.array([endorsers_percentage_list]),
            callback=on_step_callback)

    # skopt orders the search dimensions by sorted(search_spaces.keys()), so
    # result.x (stored in best_params) follows that alphabetical order:
    # cure, echo, infection, influencer, vaccination. Map values to names by
    # sorted key instead of hard-coded positions, which previously swapped
    # prob_infection and prob_influencer.
    param_names = sorted(search_spaces)
    if len(best_params) != len(param_names):
        raise RuntimeError(
            "No parameter vector was recorded by on_step_callback: expected "
            f"{len(param_names)} values, got {len(best_params)}."
        )
    tuned = dict(zip(param_names, best_params))

    return (tuned['prob_infection'], tuned['prob_vaccination'], tuned['prob_cure'],
            tuned['prob_influencer'], tuned['prob_echo'])

In [19]:
# Candidate values tried for both alpha and beta; invalid pairs are filtered out by check_values.
values_alpha_beta = [0.0, 0.25, 0.33, 0.50, 0.75, 1.0]

In [20]:
# Grid over (alpha, beta): for every valid pair, tune the probabilities and
# append the outcome to `df`. The CSV is rewritten after each pair so that an
# interrupted run (each pair takes a long time) keeps the rows computed so far.
homophily = 0.25
for alpha in tqdm.tqdm(values_alpha_beta):
    for beta in tqdm.tqdm(values_alpha_beta):
        if not check_values(alpha, beta):
            continue
        # Reset the callback trackers BEFORE each search. Resetting only after a
        # run leaves stale values behind when the cell is interrupted, and they
        # would then leak into the first pair of the next execution.
        best_params = []
        best_rmse = 1000
        prob_infection, prob_vaccination, prob_cure, prob_influencer, prob_echo = run_simulations(alpha, beta, homophily)
        row = pd.DataFrame([{'alpha': alpha, 'beta': beta,
                             'homophily': homophily,
                             'prob_infection': prob_infection,
                             'prob_vaccination': prob_vaccination,
                             'prob_cure': prob_cure,
                             'prob_influencer': prob_influencer,
                             'prob_echo': prob_echo,
                             # best_rmse was updated by on_step_callback during the fit.
                             'rmse': best_rmse}])
        df = pd.concat([df, row], ignore_index=True)
        df.to_csv("parameters_optimization_palin_hours_homophily_0.25.csv", index=False)

  0%|          | 0/6 [00:00<?, ?it/s]

100%|██████████| 6/6 [2:39:57<00:00, 1599.51s/it]
100%|██████████| 6/6 [2:38:52<00:00, 1588.82s/it]it]
100%|██████████| 6/6 [2:02:26<00:00, 1224.42s/it]  ]
 50%|█████     | 3/6 [1:34:09<1:34:09, 1883.02s/it] 
 50%|█████     | 3/6 [8:55:25<8:55:25, 10708.52s/it]


KeyboardInterrupt: 