In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from flask import Flask, render_template
from sklearn.datasets import make_blobs 
from mpl_toolkits.mplot3d import Axes3D
import random
from itertools import groupby
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import seaborn as sns
import itertools
from functools import reduce
from itertools import product
from itertools import combinations
import scipy.stats as stats
import pickle
import os
import time
import linucb
import json
from kmodes.kmodes import KModes





In [3]:
def choix_compo_features(nb_feature,nb_min_categorie,nb_max_categorie):
    """Draw a random number of categories for each feature.

    Parameters
    ----------
    nb_feature : int
        Number of features to create; they are named 'feature0', 'feature1', ...
    nb_min_categorie, nb_max_categorie : int
        Inclusive bounds passed to ``random.randint`` for each feature.

    Returns
    -------
    tuple
        ``(dict_composition_features, nombre_individu_distinct)`` where the
        dict maps each feature name to its number of categories and the int is
        the product of those numbers (1 when ``nb_feature`` is 0).
    """
    dict_composition_features = {}  # number of categories for each feature
    nombre_individu_distinct = 1
    for i in range(nb_feature):
        nb_categories = random.randint(nb_min_categorie, nb_max_categorie)
        dict_composition_features['feature' + str(i)] = nb_categories
        nombre_individu_distinct *= nb_categories
    return dict_composition_features, nombre_individu_distinct

def generer_des_groupes(dict_composition_features,nb_groupes):
    """Build ``nb_groupes`` random groups.

    A group is a list with one entry per feature (in the dict's iteration
    order); each entry is a random, non-empty sample without replacement of
    the category values ``1..nb_categories`` of that feature.

    Parameters
    ----------
    dict_composition_features : dict
        Maps a feature name to its number of categories.
    nb_groupes : int
        Number of groups to generate.

    Returns
    -------
    list
        ``nb_groupes`` groups, each a list of lists of ints.
    """
    groupes = []
    for _ in range(nb_groupes):
        groupe = []
        # `nb_categories` replaces a local previously called `max`, which
        # shadowed the builtin.
        for nb_categories in dict_composition_features.values():
            taille = random.randint(1, nb_categories)
            groupe.append(random.sample(range(1, nb_categories + 1), taille))
        groupes.append(groupe)
    return groupes

def nb_possibilite(groupes):
    """Return, for each group, the number of distinct individuals it admits.

    For one group this is the product of the number of allowed values of each
    feature. A group with no feature yields 1 (empty product) instead of
    raising, thanks to the explicit initial value given to ``reduce``.

    Parameters
    ----------
    groupes : list
        Groups, each a list of per-feature lists of allowed values.

    Returns
    -------
    list of int
        One count per group, in the same order.
    """
    return [
        reduce(lambda produit, valeurs: produit * len(valeurs), groupe, 1)
        for groupe in groupes
    ]

def distance(x,y):
    """Jaccard distance between two collections, compared as sets.

    Returns ``1 - |x ∩ y| / |x ∪ y|``. When both inputs are empty the sets are
    identical, so the distance is 0.0 (the ratio would otherwise divide by
    zero).
    """
    set1, set2 = set(x), set(y)
    union = set1 | set2
    if not union:
        return 0.0
    return 1 - len(set1 & set2) / len(union)

def distance_groupes(x,y):
    """Mean of the per-feature ``distance`` between two groups.

    ``x`` and ``y`` are indexed position by position over ``len(x)``.
    """
    ecarts = []
    for position in range(len(x)):
        ecarts.append(distance(x[position], y[position]))
    return sum(ecarts) / len(ecarts)

def distance_moyenne_entre_paires_groupes(groupes):
    """Average ``distance_groupes`` over every unordered pair of groups.

    Returns
    -------
    tuple
        ``(mean_distance, pairs)`` where ``pairs`` is a list of
        ``(index_a, index_b, distance)`` tuples, one per pair.

    Notes
    -----
    With fewer than two groups there is no pair and the division raises
    ``ZeroDivisionError``.
    """
    paires = []
    total = 0
    indices = list(range(len(groupes)))
    for premier, second in itertools.combinations(indices, 2):
        ecart = distance_groupes(groupes[premier], groupes[second])
        total = total + ecart
        paires.append((premier, second, ecart))
    return total / len(paires), paires


def appartenance_groupe(individu,groupes):
    """Return the indices of the groups that ``individu`` belongs to.

    An individual belongs to a group when, for every feature position of the
    group, its value at that position is among the group's allowed values.
    Extra trailing values of ``individu`` are ignored.
    """
    return [
        numero
        for numero, groupe in enumerate(groupes)
        if all(individu[pos] in groupe[pos] for pos in range(len(groupe)))
    ]


def nombre_individu_par_groupes(df,groupes):
    """Count, row by row, how the individuals of ``df`` fall into the groups.

    Returns
    -------
    tuple
        ``(cmpt_gr, intersection_cmpt, sans)`` where ``cmpt_gr[k]`` is the
        number of rows matching group ``k``, ``intersection_cmpt[j]`` is the
        number of rows matching exactly ``j + 1`` groups, and ``sans`` is the
        number of rows matching no group.
    """
    nb_groupes = len(groupes)
    cmpt_gr = [0] * nb_groupes
    intersection_cmpt = [0] * nb_groupes
    sans = 0
    for position in range(len(df)):
        trouves = appartenance_groupe(list(df.iloc[position]), groupes)
        if trouves == []:
            sans += 1
            continue
        intersection_cmpt[len(trouves) - 1] += 1
        for numero in trouves:
            cmpt_gr[numero] += 1
    return cmpt_gr, intersection_cmpt, sans

def predict_nombre_individu_par_groupes_sum(taille_df,nombre_individu_distinct,nb_possibilite):
    """Scale the share ``nb_possibilite / nombre_individu_distinct`` to ``taille_df`` rows."""
    return (nb_possibilite / nombre_individu_distinct) * taille_df


def choisir_alea(groupe,nb):
    """Draw up to ``nb`` distinct random individuals from one group.

    Each individual is built by picking one allowed value per feature with
    ``random.sample``. Draws are repeated until
    ``min(nb, number of distinct individuals of the group)`` different
    individuals have been collected.

    Parameters
    ----------
    groupe : list
        Per-feature lists of allowed values.
    nb : int
        Maximum number of individuals wanted.

    Returns
    -------
    list
        Distinct individuals (lists), in the order they were first drawn.

    Notes
    -----
    Already-seen individuals are tracked in a set of tuples so the duplicate
    check is constant-time instead of a scan of the whole result list. This
    requires the category values to be hashable (they are ints when the
    groups come from ``generer_des_groupes``).
    """
    max_unique_combi = nb_possibilite([groupe])[0]
    num_combos = min(max_unique_combi, nb)
    combos = []
    deja_vus = set()
    while len(combos) < num_combos:
        combo = [random.sample(valeurs, 1)[0] for valeurs in groupe]
        cle = tuple(combo)
        if cle not in deja_vus:
            deja_vus.add(cle)
            combos.append(combo)
    return combos


def generer_all_unique_possi_df(groupe, id_groupe,taille):
    """Build a DataFrame of up to ``taille`` distinct individuals of one group.

    Columns are 'feature0', 'feature1', ... (one per feature of ``groupe``)
    followed by 'id_groupe', which holds ``id_groupe`` on every row.
    """
    noms_colonnes = []
    for numero in range(len(groupe)):
        noms_colonnes.append('feature' + str(numero))
    tirages = choisir_alea(groupe, taille)
    return pd.DataFrame(tirages, columns=noms_colonnes).assign(id_groupe=id_groupe)

def generer_all_unique_possi_df_for_groups(groupes,list_id,taille) :
    """Stack the per-group individual frames of the groups listed in ``list_id``.

    Parameters
    ----------
    groupes : list
        All groups; ``groupes[0]`` fixes the number of feature columns.
    list_id : iterable of int
        Indices of the groups to generate individuals for.
    taille : int
        Maximum number of individuals drawn per group.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature0'... then 'id_groupe', with a fresh 0..n-1 index.
        Empty (but with those columns) when ``list_id`` is empty.
    """
    columns = ['feature'+str(f) for f in range(len(groupes[0]))]
    columns.append('id_groupe')
    # Collect the pieces and concatenate once: growing a frame inside the loop
    # copies it at every step, and seeding it with an empty frame relies on a
    # deprecated pandas behaviour for result dtypes.
    morceaux = [generer_all_unique_possi_df(groupes[i], i, taille) for i in list_id]
    if not morceaux:
        return pd.DataFrame(columns=columns)
    return pd.concat(morceaux, axis=0, ignore_index=True)

def reblance_df(df,nb_groupes):
    """Oversample every group up to the size of the largest one.

    For each group id in ``range(nb_groupes)`` that has fewer rows than the
    largest group, whole copies of the group plus a random remainder are
    appended so that it reaches that size. Returns the enlarged frame with a
    fresh index.
    """
    # Original author note: should also guarantee at least 10 rows, this is
    # the base used to train the model (not enforced by the code below).
    cible = max(df['id_groupe'].value_counts())
    for numero in range(nb_groupes):
        masque = df['id_groupe'] == numero
        effectif = masque.sum()
        if effectif >= cible:
            continue
        copies_entieres = (cible // effectif) - 1
        reste = cible % effectif
        complement = df[masque]
        if copies_entieres > 0:
            complement = pd.concat([complement] * copies_entieres, ignore_index=True)
            complement = pd.concat([complement, complement.sample(n=reste)], ignore_index=True)
        if copies_entieres == 0:
            complement = complement.sample(n=reste)
        df = pd.concat([df, complement], ignore_index=True)
    return df


def shape_train_df(df,nb_rows,len_group,ratio_balanced,dist='uniform'):
    """Resample ``df`` so that the groups follow a chosen size imbalance.

    A share ``ratio_balanced`` of ``nb_rows`` is split equally between the
    ``len_group`` groups; the remaining ``1 - ratio_balanced`` is split
    unequally, following random weights drawn from the distribution ``dist``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'id_groupe' column with ids ``0..len_group-1``.
    nb_rows : int
        Target total number of rows (the result can be slightly smaller
        because each group's target is truncated with ``int``).
    len_group : int
        Number of groups.
    ratio_balanced : float
        Must be within [0, 1].
    dist : str
        One of 'expon', 'poisson', 'gamma', 'pareto', 'lognorm', 'uniform'.

    Returns
    -------
    pandas.DataFrame or None
        The resampled frame, or ``None`` (after printing a message) when
        ``ratio_balanced`` is outside [0, 1].

    Raises
    ------
    KeyError
        If ``dist`` is not a known distribution name.
    ValueError
        If rows are wanted for a group that has no row in ``df``.
    """
    distributions = {"expon": stats.expon(loc=0, scale=1),"poisson": stats.poisson(mu=1),"gamma": stats.gamma(a=1, loc=0, scale=1),"pareto": stats.pareto(b=1),"lognorm": stats.lognorm(s=1, loc=0, scale=1),"uniform": stats.uniform(loc=0, scale=1)}
    if (ratio_balanced<0 or ratio_balanced>1 ):
        print("Ratio not between 0 and 1")
        return None
    base_ratios = [ratio_balanced/len_group for _ in range(len_group)]
    unblanced_ratio = 1 - ratio_balanced
    dist_ratios = distributions[dist].rvs(size=len_group)
    dist_ratios = dist_ratios / sum(dist_ratios)
    dist_ratios = sorted(dist_ratios, reverse=True)
    dist_ratios = [r * unblanced_ratio for r in dist_ratios]
    wanted_groups_number = [int((x + y)*nb_rows) for x, y in zip(base_ratios, dist_ratios)]
    # Shuffle so that the largest share is not always given to group 0.
    random.shuffle(wanted_groups_number)

    morceaux = []
    for id_g in range(len_group):
        wanted = wanted_groups_number[id_g]
        tmp_df = df[df['id_groupe']==id_g]
        # Use this group's own size: relying on the size of group 0 for every
        # group is only right when the input is perfectly balanced.
        disponibles = len(tmp_df)
        if wanted <= disponibles:
            morceaux.append(tmp_df.sample(n=wanted))
        else:
            if disponibles == 0:
                raise ValueError("No row available for group " + str(id_g))
            duppli, duppli_rand = divmod(wanted, disponibles)
            tmp_df = pd.concat([tmp_df] * duppli, ignore_index=True)
            morceaux.append(pd.concat([tmp_df, tmp_df.sample(n=duppli_rand)], ignore_index=True))

    if not morceaux:
        return pd.DataFrame(columns=df.columns)
    # Single concatenation instead of growing a frame inside the loop.
    return pd.concat(morceaux, ignore_index=True)


def compter_diversite(df):
    """Report, per group, the percentage of duplicated feature rows.

    The last column of ``df`` is excluded from the duplicate check. Returns a
    list of ``(group_id, '<int>% duplique', group_size)`` tuples, in order of
    first appearance of the group ids in the 'id_groupe' column.
    """
    def _resume(numero):
        lignes = df[df['id_groupe'] == numero]
        traits = lignes[lignes.columns[:-1]]
        pourcentage = traits.duplicated().sum() * 100 / len(traits)
        return (numero, str(int(pourcentage)) + '% duplique', len(traits))

    return [_resume(numero) for numero in list(df['id_groupe'].unique())]

In [4]:
def reshape_dictionnaire_data(data):
    """Reorganise, in place, the 'other_info' entry of a dataset dictionary.

    The feature composition, the group composition, the per-group number of
    possible individuals and the per-group diversity are gathered in a single
    DataFrame stored under 'info_df'. The remaining entries of 'other_info'
    are stored under 'Other Information' and 'other_info' itself is removed.
    The same (mutated) dictionary is returned.
    """
    infos = data['other_info']
    compo_groupes = infos['composition des groupes']

    f = pd.DataFrame(infos['composition des features'], index=['nb of class'])
    etiquettes = []
    for numero in range(len(compo_groupes)):
        etiquettes.append('compo groupe ' + str(numero))
    df_groupes = pd.DataFrame(compo_groupes, columns=f.columns, index=etiquettes)

    # First row describes the features, so per-group columns get a '-' filler.
    general = pd.concat([f, df_groupes])
    general['nb disctint person possible'] = ['-'] + infos['nombre possibilite individu dans les groupes']
    general['diversite des groupes'] = ['-'] + infos['diversite des groupes']
    data['info_df'] = general

    del data['other_info']
    for cle in ('composition des features',
                'composition des groupes',
                'nombre possibilite individu dans les groupes',
                'diversite des groupes'):
        del infos[cle]
    data['Other Information'] = infos
    return data

def ordonner(Datas):
    """Sort ``(data, file_name)`` pairs by the number that ends the file name.

    The key is the integer after the last underscore of the name (the group
    count in names such as 'DataSimuRl_c_5_f_5_g_30'), so it works for any
    number of digits and not only for two-digit group counts.

    Note: a later cell of this notebook defines another ``ordonner``; once
    that cell has run, it replaces this one.
    """
    return sorted(Datas, key=lambda x: int(x[1].rsplit('_', 1)[-1]))

def generete_data(nombre_feature,nombre_div_cat_min,nombre_div_cat_max,nombre_groupe,nb_lignes,ratio_de_disparite_inter_group,dist):
    """Generate one synthetic categorical dataset together with its description.

    Steps: draw the number of categories of each feature, draw the groups,
    enumerate up to 20000 distinct individuals per group, drop individuals
    shared by several groups, rebalance the groups, resample them to the
    requested imbalance and finally one-hot encode the features.

    Parameters
    ----------
    nombre_feature : number of features, forwarded to ``choix_compo_features``.
    nombre_div_cat_min, nombre_div_cat_max : bounds on the number of
        categories per feature, forwarded to ``choix_compo_features``.
    nombre_groupe : number of groups, forwarded to ``generer_des_groupes``.
    nb_lignes : target number of rows, forwarded to ``shape_train_df``.
    ratio_de_disparite_inter_group : forwarded to ``shape_train_df`` as its
        ``ratio_balanced`` argument.
    dist : distribution name, forwarded to ``shape_train_df``.

    Returns
    -------
    tuple
        ``(final_df, df_encoded, dict_info_data)``: the resampled categorical
        frame, its shuffled one-hot encoded version whose label column is
        renamed 'cluster', and a dictionary of statistics about the run.

    Notes
    -----
    The ``while`` loop only ends once every group keeps at least one
    individual after de-duplication; nothing bounds the number of retries.
    """
    dict_info_data = {}
    ############################
    dict_info_data['nombre de ligne']=nb_lignes
    dict_info_data['ratio_de_disparite_inter_group'] = ratio_de_disparite_inter_group
    dict_info_data['Loi de distribution de dispartibe'] = dist
    ############################
    dict_composition_features , nombre_individu_distinct = choix_compo_features(nombre_feature,nombre_div_cat_min,nombre_div_cat_max)
    dict_info_data['nb de col avec one hot'] = sum(dict_composition_features.values())
    dict_info_data['indiviud distinct'] = nombre_individu_distinct
    dict_info_data['composition des features'] = dict_composition_features
    #print(nombre_individu_distinct)
    #print(sum(dict_composition_features.values()))
    ############################
    groupes = generer_des_groupes(dict_composition_features,nombre_groupe)
    dict_info_data['composition des groupes'] = groupes
    moy , list_dist = distance_moyenne_entre_paires_groupes(groupes)
    nb_possi = nb_possibilite(groupes)
    dict_info_data['distance moyenne entre paires groupes'] = moy
    dict_info_data['nombre possibilite individu dans les groupes'] = nb_possi
    dict_info_data['distance entre paires groupes '] = list_dist
    ############################
    test = True
    while test :
        df = generer_all_unique_possi_df_for_groups(groupes,list(range(len(groupes))),20000) # NOTE: per-group cap, to be raised if many more rows are needed
        df = df.sample(frac=1).reset_index(drop=True)
        df.drop_duplicates(subset=list(df.columns[:-1]), keep='last', inplace=True) # Remove identical individuals so that one individual never belongs to two different groups
        # (author note) shared individuals give bad results for the algorithms; to be checked whether this can happen in reality.
        # It occurs when there are very few distinct individuals.
        if len(df['id_groupe'].unique()) == len(groupes) : # fewer ids means a group is included in another one, which is not wanted; equal is fine
            test = False
            # All groups are still represented: move on to the next step
    ############################
    df = reblance_df(df,len(groupes))
    final_df = shape_train_df(df,nb_lignes,len(groupes),ratio_de_disparite_inter_group,dist)
    dict_info_data['diversite des groupes'] = compter_diversite(final_df)
    ############################
    cmpt_gr , intersection_cmpt, sans= nombre_individu_par_groupes(final_df,groupes)
    dict_info_data['nb individu ds chaque groupes'] = cmpt_gr
    dict_info_data['nb de cross'] = intersection_cmpt
    ############################
    # One-hot encode every column except the last one (the group id).
    df_encoded = pd.get_dummies(final_df,columns=list(final_df.columns)[:-1]).astype(int)
    df_encoded['cluster']= df_encoded['id_groupe']
    df_encoded=df_encoded.drop('id_groupe',axis=1)
    df_encoded = df_encoded.sample(frac=1).reset_index(drop=True)
    dict_info_data['nombre d individu duplique en % dans final df'] =(df_encoded.duplicated().sum())*100/len(df_encoded)

    # Sanity check: one encoded column per category plus the label column.
    if  (df_encoded.shape[1] != sum(dict_composition_features.values()) + 1):
        print('Il y aura probleme de noramlisation dans méthode 2 dans dict_composition_features')

    return final_df , df_encoded , dict_info_data
    


In [5]:
# Generate one pickled dataset per (number of columns, max categories per
# feature, number of groups) combination.
nb_de_colonne = [5,10,15]
nb_categorie_feature =[5,12]
nombre_groupe = [10,30,50]
avancement = len(nb_de_colonne)*len(nb_categorie_feature)*len(nombre_groupe)
chemin = '/Users/soufiane/Documents/GitHub/universe/My_universe/Data' # WARNING: output directory, to be configured for each environment

i=1
for c, f, g in itertools.product(nb_de_colonne, nb_categorie_feature, nombre_groupe):
    final_df , df_encoded , dict_info_data = generete_data(c,2,f,g,30000,1,'uniform')
    # Join with a path separator: plain concatenation wrote the files next to
    # the directory (as '...DataDataSimuRl_c_...') instead of inside it.
    fichier_name = os.path.join(chemin, 'DataSimuRl_c_'+str(c)+'_f_'+str(f)+'_g_'+str(g))
    data = {'nb_de_colonne':c,'nb_categorie_max_par_colonne':f,'nombre_groupe':g,'df': final_df, 'df_encoded':df_encoded,'other_info': dict_info_data}
    with open(fichier_name, 'wb') as fichier:
        pickle.dump(data, fichier)
    print(fichier_name,' | DONE ',' | Avancement : ',i,' sur ',avancement)
    i=i+1



"nb_de_colonne = [5,10,15]\nnb_categorie_feature =[5,12]\nnombre_groupe = [10,30,50] \navancement = len(nb_de_colonne)*len(nb_categorie_feature)*len(nombre_groupe)\nchemin = '/Users/soufiane/Documents/GitHub/universe/My_universe/Data' # ATTENTION A CONFIGURER SELON ENVIRONEMENT\n\ni=1\nfor c in nb_de_colonne:\n    for f in nb_categorie_feature :\n        for g in nombre_groupe:\n            final_df , df_encoded , dict_info_data = generete_data(c,2,f,g,30000,1,'uniform')\n            fichier_name = chemin+'DataSimuRl_c_'+str(c)+'_f_'+str(f)+'_g_'+str(g)\n            data = {'nb_de_colonne':c,'nb_categorie_max_par_colonne':f,'nombre_groupe':g,'df': final_df, 'df_encoded':df_encoded,'other_info': dict_info_data}\n            with open(fichier_name, 'wb') as fichier:\n                pickle.dump(data, fichier)\n            print(fichier_name,' | DONE ',' | Avancement : ',i,' sur ',avancement)\n            i=i+1"

In [6]:
def simulate_reward(cluster_id,arm_id,nb_arm):
    """Draw a 0/1 reward for playing ``arm_id`` on an individual of ``cluster_id``.

    The preferred arm of a cluster is ``cluster_id % nb_arm``. Playing it pays
    1 with probability 0.8; any other arm pays 1 with probability 0.1.
    """
    penality = 0
    # Choosing the preferred arm only makes a positive reward likely, not certain.
    bon_bras = (cluster_id % nb_arm == arm_id)
    p = 0.8 if bon_bras else 0.1
    return random.choices([penality, 1], [1 - p, p])[0]
    

In [7]:
def simulate_reward_shift(cluster_id,arm_id,nb_gps,nb_arm):
    """Draw a 0/1 reward after the preference shift.

    The preferred arm is ``shift(cluster_id, nb_gps) % nb_arm``. Playing it
    pays 1 with probability ``0.6 + U(-0.1, 0.2)``; any other arm pays 1 with
    probability ``0.1 + U(-0.05, 0.05)``.
    """
    penality = 0
    prefered_arm = shift(cluster_id, nb_gps)
    # Choosing the preferred arm only makes a positive reward likely, not certain.
    if prefered_arm % nb_arm == arm_id:
        base, bas, haut = 0.6, -0.1, 0.2
    else:
        base, bas, haut = 0.1, -0.05, 0.05
    p = base + random.uniform(bas, haut)
    return random.choices([penality, 1], [1 - p, p])[0]
    

def shift(c_id,nb_gps):
    """Mirror a cluster id: return ``(nb_gps - 1) - c_id``.

    When the result falls outside ``0..nb_gps-1`` a diagnostic is printed, but
    the value is returned anyway.
    """
    dernier = nb_gps - 1
    t = dernier - c_id
    if t < 0 or t > dernier:
        print("nb_gps-1 : ",dernier,"c_id",c_id," t:",t)
        print('erreur dans shift')
    return t


In [8]:
def initiate_ucb_ev_dict(listt):
    """Return a dict mapping each item of ``listt`` to fresh empty 'p', 'theta' and 'Incert' lists."""
    return {cle: {'p': [], 'theta': [], 'Incert': []} for cle in listt}


def evolution_paramètre_arm(arm_id,listt,linucb_policy_object):
    """Group the ``ucb_evo`` history of one arm by its first field.

    Every entry of ``linucb_policy_object.linucb_arms[arm_id].ucb_evo`` is
    read as ``(key, p, theta, incert)``; the three values are converted to
    float and appended to the 'p', 'theta' and 'Incert' lists of ``key``.
    ``listt`` gives the keys that are expected to appear.
    """
    historique = linucb_policy_object.linucb_arms[arm_id].ucb_evo
    result = initiate_ucb_ev_dict(listt)
    for entree in historique:
        series = result[entree[0]]
        for cle, position in (('p', 1), ('theta', 2), ('Incert', 3)):
            series[cle].append(float(entree[position]))
    return result

# FOR EVOLUTION OF ARM : THETA , P and INCERTITUDE OF POLICY

def Evolution_Arm(arm,df_encoded,linucb_policy_object,fromm=0,to=-1):
    """Plot the evolution of 'Incert', 'theta' and 'p' of one arm, per group.

    One figure is shown per quantity, with one curve per value of the
    'cluster' column of ``df_encoded``.

    Parameters
    ----------
    arm : int
        Index of the arm to inspect. It is now honoured; it used to be
        overwritten with 0 inside the function.
    df_encoded : pandas.DataFrame
        Frame with a 'cluster' column listing the groups to plot.
    linucb_policy_object : policy object read by ``evolution_paramètre_arm``.
    fromm, to : int
        Slice bounds applied to each history; also honoured now (they used to
        be reset to 0 and -1).
    """
    listt = list(df_encoded['cluster'].value_counts().index)
    result = evolution_paramètre_arm(arm, listt, linucb_policy_object)
    # Only the first figure shows its legend, as before.
    for quantite, avec_legende in (('Incert', True), ('theta', False), ('p', False)):
        for i in listt:
            # Label each curve with the quantity actually plotted.
            plt.plot(result[i][quantite][fromm:to], label=quantite + ' of group' + str(i))
        plt.title("Evolution du " + quantite + " pour chaque feature de l'arm " + str(arm) + " ")
        if avec_legende:
            plt.legend()
        plt.show()

# FOR EVOLUTION OF FEATURES ARM :  THETA , A and B FOR A POLICY

def Evoluion_feature_arm(arm,linucb_policy_object,fromm=0,to=-1):
    """Plot the evolution of theta, b and A of one arm, one curve set per feature.

    Parameters
    ----------
    arm : int
        Index of the arm to inspect. It is now honoured; it used to be
        overwritten with 0 inside the function.
    linucb_policy_object : policy object exposing ``linucb_arms[arm]`` with
        ``theta_list``, ``b_list`` and ``A_theta_list`` histories.
    fromm, to : int
        Slice bounds applied to each history; also honoured now (they used to
        be reset to 0 and -1).
    """
    axe = 0  # x position of the vertical marker drawn on the theta figure
    bras = linucb_policy_object.linucb_arms[arm]
    # Build the three data arrays before plotting anything.
    dataT = np.array(bras.theta_list[fromm:to])
    dataB = np.array(bras.b_list[fromm:to])
    dataA = np.array(bras.A_theta_list[fromm:to])

    figures = (
        (dataT, "EVOLUTION DU THETA", True),
        (dataB, "EVOLUTION DU B", False),
        (dataA, "EVOLUTION DU A", False),
    )
    for donnees, titre, avec_reperes in figures:
        # Show the evolution of each feature as a curve.
        for i in range(donnees.shape[1]):
            plt.plot(donnees[:,i,:], label='feature {}'.format(i))
        # Legend and vertical marker are only drawn on the theta figure, as before.
        if avec_reperes:
            plt.legend()
            plt.axvline(x=axe, color='black')
        plt.title(titre)
        plt.show()

In [9]:
def ctr_simulator_policies(df,df_clustered,nb_groups,policies,shift):
    """Replay the data through every policy, before and after a preference shift.

    Phase 1 walks through all rows of ``df`` with rewards drawn by
    ``simulate_reward``; phase 2 walks through the first ``shift`` rows again
    with rewards drawn by ``simulate_reward_shift``. In both phases the last
    value of a row is used as the cluster id and the other values as the
    context vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Encoded data used by every policy except index 3.
    df_clustered : pandas.DataFrame
        Encoded data used by the policy at index 3.
    nb_groups : int
        Number of groups, forwarded to ``simulate_reward_shift``.
    policies : list
        Index 2 is treated as a uniformly random policy over 10 arms and is
        never called; the other entries must expose ``select_arm``,
        ``K_arms`` and ``linucb_arms[k].reward_update``.
    shift : int
        Number of rows replayed in phase 2. Inside this function the name
        hides the module-level ``shift`` function; ``simulate_reward_shift``
        is unaffected since it resolves that name at module level.

    Returns
    -------
    tuple
        ``(cumulative_rewards_b_a, aligned_ctr, aligned_time, policies)``:
        the per-policy cumulative rewards of each phase (list of two arrays),
        the running CTR of each policy (restarted at the shift), the total
        time spent per policy and the updated policies.
    """
    n_policies = len(policies)
    n_random_arms = 10  # the random policy picks among 10 arms
    aligned_ctr = [[] for _ in range(n_policies)]
    aligned_time = np.zeros(n_policies)
    # Last chosen arm and last reward of each policy
    arm_index = np.zeros(n_policies)
    data_reward = np.zeros(n_policies)

    def _run_phase(n_steps, reward_fn):
        """Replay rows 0..n_steps-1 for every policy; return the phase's cumulative rewards."""
        cumulative_rewards = np.zeros(n_policies)
        aligned_time_steps = np.zeros(n_policies)
        for i in range(n_steps):
            # Select an arm, simulate the reward and update, for each policy
            for p in range(n_policies):
                # The time spent on each iteration is measured per policy
                start_time = time.time()
                source = df_clustered if p == 3 else df
                array = np.array(source.iloc[i])
                data_x_array = np.delete(array, -1)  # drop the last element (cluster id)
                if p == 2:
                    arm_index[p] = random.randint(0, n_random_arms - 1)
                    data_reward[p] = reward_fn(array[-1], arm_index[p], n_random_arms)
                else:
                    arm_index[p] = policies[p].select_arm(data_x_array)
                    data_reward[p] = reward_fn(array[-1], arm_index[p], policies[p].K_arms)
                    # Use the reward of the chosen arm to update it
                    tmp_arm = int(arm_index[p])
                    policies[p].linucb_arms[tmp_arm].reward_update(data_reward[p], data_x_array)
                aligned_time[p] = aligned_time[p] + (time.time() - start_time)

                # For CTR calculation
                aligned_time_steps[p] += 1
                cumulative_rewards[p] += data_reward[p]
                aligned_ctr[p].append(cumulative_rewards[p] / aligned_time_steps[p])
        return cumulative_rewards

    cumulative_rewards_b_a = []
    # Before the shift
    cumulative_rewards_b_a.append(_run_phase(len(df), simulate_reward))
    # After the shift
    cumulative_rewards_b_a.append(
        _run_phase(
            shift,
            lambda cluster_id, arm_id, nb_arm: simulate_reward_shift(cluster_id, arm_id, nb_groups, nb_arm),
        )
    )

    return (cumulative_rewards_b_a, aligned_ctr ,aligned_time,policies)


def RunSimuOnData(Data,l,l_shift):
    """Build the policies for one dataset and run the CTR simulation on it.

    Parameters
    ----------
    Data : dict
        Dataset dictionary with 'df_encoded', 'nombre_groupe',
        'data_clustered_encoded' and 'info_df' entries.
    l : int
        Number of leading rows of 'df_encoded' to use, or -1 for all rows.
    l_shift : int
        Number of rows replayed after the preference shift.

    Returns
    -------
    tuple
        The ``(cumulative_rewards, aligned_ctr, aligned_time, policies)``
        tuple returned by ``ctr_simulator_policies``.
    """
    df_encoded = Data['df_encoded']
    nb_groups = Data['nombre_groupe']
    # Shuffle the order of the cluster indicator columns only. The label must
    # stay in last position because the simulator reads it with `array[-1]`;
    # shuffling every column could move it anywhere.
    clustered = Data['data_clustered_encoded']
    colonnes_melangees = clustered.iloc[:, :-1].sample(frac=1, axis=1)
    df_clustered_encoded = pd.concat([colonnes_melangees, clustered.iloc[:, -1:]], axis=1)
    if l != -1 : # otherwise the whole data frame is used
        df_encoded = df_encoded.head(l)
    # Keep only the per-feature number of classes.
    dict_info_data = dict(Data['info_df'].loc['nb of class'])
    del dict_info_data['nb disctint person possible']
    del dict_info_data['diversite des groupes']

    # Choice of policies
    L_UCB = linucb.linucb_policy(K_arms = 10, d = df_encoded.shape[1]-1, alpha=1,version= -1,df_encoded=df_encoded)
    L_UCB_BIS = linucb.linucb_policy_bis(K_arms = 10, d = df_encoded.shape[1]-1, alpha=0.5,version= 1,compo_feature=dict_info_data,df_encoded=df_encoded)
    L_UCB_BIS_clustered = linucb.linucb_policy_bis(K_arms = 10, d = df_clustered_encoded.shape[1]-1, alpha=0.5,version= 1,compo_feature=-1,df_encoded=df_clustered_encoded)
    policies = [L_UCB,L_UCB_BIS,'Random',L_UCB_BIS_clustered]

    # Run the simulation and collect its results
    cumulative_rewards, aligned_ctr ,aligned_time ,policies = ctr_simulator_policies(df_encoded,df_clustered_encoded,nb_groups,policies,l_shift)

    return cumulative_rewards, aligned_ctr ,aligned_time ,policies


def export_simu_info(cumulative_rewards, aligned_ctr ,aligned_time ,policies):
    """Package the simulation results of the four policies in a dictionary.

    Returns a dict with 'Aligned CTR' (list of ``(ctr_history, label)``
    pairs) and 'Information sur Reward et execution' (a DataFrame with one
    column per policy and one row each for the execution time and the
    cumulative rewards before the shift, after it, and in total).
    ``policies`` is accepted but not used.
    """
    etiquettes_ctr = ['Linear UCB', 'Linear UCB BIS', 'Random', 'BIS Clustering']
    ctr = [(aligned_ctr[position], etiquette) for position, etiquette in enumerate(etiquettes_ctr)]

    noms = ['Linear UCB Disjoint','Linear UCB Disjoint BIS','Random','BIS Clustering']
    lignes = []
    for valeurs, titre in (
        (aligned_time, "Execution time"),
        (cumulative_rewards[0], "Cumulative Rewards Before Shift"),
        (cumulative_rewards[1], "Cumulative Rewards After Shift"),
        (sum(cumulative_rewards), "Cumulative Rewards Total"),
    ):
        lignes.append(pd.DataFrame(valeurs, index=noms, columns=[titre]).T)

    return {
        'Aligned CTR': ctr,
        'Information sur Reward et execution': pd.concat(lignes),
    }

In [10]:
def ordonner(Datas):
    """Sort ``(data, file_name)`` pairs by the third '_'-separated token of the name.

    For names such as 'DataSimuRl_c_5_f_12_g_30' that token is the value that
    follows 'c' (5 here). The sort is stable, so pairs sharing that value keep
    their input order.
    """
    def _cle(element):
        morceaux = element[1].split('_')
        return int(morceaux[2])

    return sorted(Datas, key=_cle)

In [11]:
# Load every pickled dataset of the clustered-data directory as
# (data, file_name) pairs, then order them with `ordonner`.
# NOTE(review): hard-coded absolute path, to be adapted to the environment.
chemin = '/Users/soufiane/Documents/GitHub/universe/My_Universe/Data_with_clustering'
fichiers = os.listdir(chemin)
Datas = []
for nom_fichier in fichiers:
    with open(chemin+'/'+nom_fichier, 'rb') as f:
        # SECURITY: pickle.load executes arbitrary code from the file; only load trusted files.
        data = pickle.load(f)
        #data = reshape_dictionnaire_data(data) , not used for Data_with_clustering , already reshaped
        Datas.append((data,nom_fichier))
Datas = ordonner(Datas)

In [13]:
# Run the simulation on the first five datasets and pickle each enriched
# dataset in the output directory.
chemin = '/Users/soufiane/Documents/GitHub/universe/My_universe/TT/'
i=1
for d in Datas[:5]:
    print('Debut de Simulation pour fichier : '+d[1])
    cumulative_rewards, aligned_ctr ,aligned_time ,policies = RunSimuOnData(d[0],-1,15000)
    info_simu = export_simu_info(cumulative_rewards, aligned_ctr ,aligned_time ,policies)
    d[0]['Inforamtion sur la Simulation'] = info_simu
    # Save the data in the directory
    print('Done now we dump')
    fichier_name = chemin + d[1]
    with open(fichier_name, 'wb') as fichier:
        try:
            pickle.dump(d[0], fichier)
            # Progress is reported against the number of loaded datasets
            # rather than a hard-coded total.
            print('Fin de Simulation pour fichier : '+d[1],' | Avancement : ',i,' sur ',len(Datas))
            print('------------------------------------------------------------------------------------')
            i=i+1
        except Exception as error:
            # Report what failed instead of discarding the exception.
            print('Problem de dump', fichier_name, ':', repr(error))
            break
        


Debut de Simulation pour fichier : DataSimuRl_c_5_f_5_g_30
Done now we dump
Fin de Simulation pour fichier : DataSimuRl_c_5_f_5_g_30  | Avancement :  1  sur  18
------------------------------------------------------------------------------------
Debut de Simulation pour fichier : DataSimuRl_c_5_f_12_g_30
Done now we dump
Fin de Simulation pour fichier : DataSimuRl_c_5_f_12_g_30  | Avancement :  2  sur  18
------------------------------------------------------------------------------------
Debut de Simulation pour fichier : DataSimuRl_c_5_f_5_g_50
Done now we dump
Fin de Simulation pour fichier : DataSimuRl_c_5_f_5_g_50  | Avancement :  3  sur  18
------------------------------------------------------------------------------------
Debut de Simulation pour fichier : DataSimuRl_c_5_f_12_g_50
Done now we dump
Fin de Simulation pour fichier : DataSimuRl_c_5_f_12_g_50  | Avancement :  4  sur  18
------------------------------------------------------------------------------------
Debut de Sim

FOR CLUSTERING

In [None]:
def cluster_data(dt):
    """Cluster the categorical rows of a dataset with K-Modes.

    Parameters
    ----------
    dt : dict
        Dataset dictionary with a 'df' frame (feature columns plus
        'id_groupe') and a 'nombre_groupe' entry used as number of clusters.

    Returns
    -------
    tuple
        ``(data_clustered_encoded, count_cluster)``: the one-hot encoded
        cluster labels with the original 'id_groupe' as last column, and a
        one-row frame with the number of rows per cluster.
    """
    data = dt['df']
    nb_group = dt['nombre_groupe']
    km = KModes(n_clusters=nb_group, init='Huang', n_init=5)
    km.fit(data.drop('id_groupe',axis=1))
    tmp_df = pd.DataFrame(pd.Series(km.labels_, name='Cluster'))
    data_clustered_encoded = pd.get_dummies(tmp_df,columns=['Cluster']).astype(int)
    # Attach the group ids by position: the cluster labels follow the row
    # order of `data`, whereas assigning the Series itself would align on the
    # index and produce NaN whenever `data` does not have a default index.
    data_clustered_encoded['id_groupe'] = data['id_groupe'].values
    count_cluster = pd.DataFrame(tmp_df['Cluster'].value_counts()).T
    return data_clustered_encoded , count_cluster

In [233]:
# Load every pickled dataset of the raw data directory, reshape its
# 'other_info' entry and keep (data, file_name) pairs ordered with `ordonner`.
# NOTE(review): hard-coded absolute path, to be adapted to the environment.
chemin = '/Users/soufiane/Documents/GitHub/universe/My_Universe/Data'
fichiers = os.listdir(chemin)
Datas = []
for nom_fichier in fichiers:
    with open(chemin+'/'+nom_fichier, 'rb') as f:
        # SECURITY: pickle.load executes arbitrary code from the file; only load trusted files.
        data = pickle.load(f)
        data = reshape_dictionnaire_data(data)
        Datas.append((data,nom_fichier))
Datas = ordonner(Datas)

In [243]:
# Cluster the datasets from position 15 onwards and pickle each enriched
# dataset in the clustered-data directory.
chemin = '/Users/soufiane/Documents/GitHub/universe/My_universe/Data_with_clustering/'
i=1
for d in Datas[15:]:
    print(d[1])
    data_clustered_encoded , count_cluster = cluster_data(d[0])
    d[0]['data_clustered_encoded'] = data_clustered_encoded
    d[0]['count_cluster'] = count_cluster
    # Save the data in the directory
    print('Done now we dump')
    fichier_name = chemin + d[1]
    with open(fichier_name, 'wb') as fichier:
        try:
            pickle.dump(d[0], fichier)
            print(i)
            i=i+1
        except Exception as error:
            # Report what failed instead of discarding the exception.
            print('Problem de dump', fichier_name, ':', repr(error))
            break

DataSimuRl_c_15_f_12_g_50
Done now we dump
1
DataSimuRl_c_15_f_5_g_30
Done now we dump
2
DataSimuRl_c_15_f_12_g_30
Done now we dump
3
