### Importación de todas las librerías utilizadas.

In [1]:
import sys
import pandas as pd
import BayesLibUtils as blU
import ModelosLib as modelos

from sklearn.preprocessing import KBinsDiscretizer

# Flag that controls whether each operation prints its intermediate output.
outInfo = False

# Selects which implementation is used to build the Bayesian network:
# "Python" == 0
# "R" == 1
flagModelo = 1

# This flag is only designed to take effect when "flagModelo == 1".
# True  = run the model with all variables discrete
# False = run the model with mixed variables, i.e. discrete and continuous
discreta = True 

# This flag is only designed to take effect when "flagModelo == 1".
# Tells the parameter learning done in "R" which scoring algorithm to use.
# Available scoring algorithms:
#   1. score = "aic"       -> discrete variables only, i.e. when "discreta = True"
#   2. score = "bic"       -> discrete variables only, i.e. when "discreta = True"
#   3. score = "loglik"    -> discrete variables only, i.e. when "discreta = True"
#   4. score = "aic-cg"    -> mixed variables only, i.e. when "discreta = False"
#   5. score = "bic-cg"    -> mixed variables only, i.e. when "discreta = False"
#   6. score = "loglik-cg" -> mixed variables only, i.e. when "discreta = False"
score = "aic"

# Selects whether the balanced or the unbalanced data is used.
# balanceado == True  -> use balanced data
# balanceado == False -> use unbalanced data
balanceado = True

### Validación de parámetros para el funcionamiento del modelo

In [2]:
# Sanity-check the configuration flags before any data is touched.
# The score / variable-type pairing is only checked for the R model
# (flagModelo == 1); every other value takes the "Python" branch.
if flagModelo != 1:
    print("Ejecutando el modelo en Python")
else:
    print("Validando parámetros para ejecución de modelo en R")
    scoresDiscretos = ("aic", "bic", "loglik")
    scoresMixtos = ("aic-cg", "bic-cg", "loglik-cg")
    # A score is valid only when it belongs to the family that matches "discreta".
    parametrosOk = (
        (discreta == True and score in scoresDiscretos)
        or (discreta == False and score in scoresMixtos)
    )
    if not parametrosOk:
        sys.exit("Parámetros del modelo incorrectos, corrija el problema y vuelva a ejecutar")
    print("Parámetros del modelo OK")

Validando parámetros para ejecución de modelo en R
Parámetros del modelo OK


### Abriendo el conjunto de datos desde un archivo CSV y asignándolo al objeto DataFrame "df"

In [3]:
# Load the raw dataset (';'-separated CSV) into the DataFrame "df".
# Malformed rows are skipped with a warning instead of aborting the load.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0;
# its replacement is `on_bad_lines`. The value 'warn' reproduces the old
# `error_bad_lines=False` behaviour (with the default `warn_bad_lines=True`).
try:
    df = pd.read_csv('dataset_a.csv', sep=';', on_bad_lines='warn')
except TypeError:
    # pandas < 1.3 does not accept `on_bad_lines`; fall back to the legacy flag.
    df = pd.read_csv('dataset_a.csv', sep=';', error_bad_lines=False)

## Transformación de variables cualitativas a continuas

### Etapas de la transformación

1. Se realiza la normalización de la variable "programa", que corresponde a una variable del tipo cualitativa/nominal. También se realiza la normalización de la variable "estado", que corresponde a una variable del tipo cualitativa/binaria.

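2. Se realiza el reemplazo de los datos de las variables "programa" y "estado", utilizando la normalización obtenida en la etapa anterior: cada categoría se sustituye por su posición en el orden de frecuencia ("programa" recibe los códigos 1..N y "estado" los códigos 0..N-1). Con este reemplazo de datos las variables pasan a ser cuantitativas/continuas.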

3. Una vez que las variables mutaron, los datos quedan como tipo Object; por lo tanto, se realiza un cast a entero (`astype('int')`) para poder usarlos.

In [4]:
# Stage 1: relative frequency (in %) of every category.
# "programa" is ordered from least to most frequent, "estado" from most to
# least frequent; that order defines the numeric codes assigned in stage 2.
normalizePrograma = df['programa'].value_counts(normalize=True, ascending=True)*100
if outInfo == True:
    print('Pre-Normalización de variable "programa"')
    print('=======================================')
    print(normalizePrograma)
    print('\n')

normalizeEstado = df['estado'].value_counts(normalize=True, ascending=False)*100
if outInfo == True:
    print('Pre-Normalización de variable "estado"')
    print('=====================================')
    print(normalizeEstado)
    print('\n')

# Stage 2: replace every category by the code given by its position in the
# frequency ranking ("programa" -> "1".."N", "estado" -> "0".."N-1").
# An exact-match lookup is used instead of chained Series.str.replace calls:
# str.replace substitutes *substrings* (and interprets the label as a regular
# expression by default in pandas < 2.0), so a label contained in another
# label, a label with regex metacharacters, or a label containing a digit that
# was already emitted as a code would be silently corrupted.
codigosPrograma = {
    categoria: str(posicion + 1)
    for posicion, categoria in enumerate(normalizePrograma.index)
}
codigosEstado = {
    categoria: str(posicion)
    for posicion, categoria in enumerate(normalizeEstado.index)
}
df['programa'] = df['programa'].map(codigosPrograma)
df['estado'] = df['estado'].map(codigosEstado)

# Stage 3: the codes are still strings (object dtype); cast them to integers.
# Missing values in either column make this cast fail.
df['programa'] = df['programa'].astype('int')
df['estado'] = df['estado'].astype('int')

# Check that the transformation did not alter the original frequency distribution.
if outInfo == True:
    normalizePrograma = df['programa'].value_counts(normalize=True, ascending=True)*100
    print('Post-Normalización de variable "programa"')
    print('========================================')
    print(normalizePrograma)
    print('\n')

    normalizeEstado = df['estado'].value_counts(normalize=True, ascending=False)*100
    print('Post-Normalización de variable "estado"')
    print('======================================')
    print(normalizeEstado)

## Limpieza de variables

Se limpia el dataframe dejando solo las variables que, según un estudio previo de la información, son relevantes y entregan un aporte real al modelo.

In [5]:
# Restrict the DataFrame to the variables kept for the model, in this order.
# The list is assembled from its logical groups so each group is easy to spot.
columnasRelevantes = (
    ['lt', 'tt', 'pt', 'game_score']
    + ['op%d' % k for k in range(1, 7)]      # op1 .. op6
    + ['sv%d' % k for k in range(1, 7)]      # sv1 .. sv6
    + ['score', 'score_a', 'score_p', 'score_d', 'score_s']
    + ['programa', 'sol1', 'estado']
)
df = df.loc[:, columnasRelevantes]

if outInfo == True:
    print(df.columns)

## Discretizando variables continuas

Proporciona una forma de dividir características continuas en valores discretos. La estrategia utilizada, "kmeans", elige un centroide por cada "bin" establecido y asigna cada muestra a su centroide más cercano, repitiendo el proceso hasta llegar a un umbral, es decir, hasta que los centroides no se mueven significativamente.

En esta etapa se discretizaron las variables "lt", "tt", "pt", "game_score" y "sol1". La cantidad de intervalos o "bins" no es fija: se calcula para cada variable con `blU.bayesBlock` antes de discretizar.

https://scikit-learn.org/stable/modules/clustering.html#k-means

In [6]:
# Columns discretized in this cell, processed in this order.
columnasADiscretizar = ['lt', 'tt', 'pt', 'game_score', 'sol1']

if outInfo == True:
    print('Pre-Discretización de variables')
    print('===============================')
    print(df.loc[:, columnasADiscretizar])
    print("\n")

# The same steps are applied to every column, so they live in a single loop
# instead of one copy-pasted stanza per column.
for columna in columnasADiscretizar:
    # Number of bins for this column, as returned by the project helper.
    # Alternative previously tried by the author:
    #   bins = int(round(blU.freedman_diaconis(df[columna], returnas="width")))
    bins = blU.bayesBlock(df[columna])
    if outInfo == True:
        print("variable " + columna)
        print("===========")
        print("min : "+str(df[columna].min()))
        print("max : "+str(df[columna].max()))
        print("sum : "+str(df[columna].sum()))
        print("bins: "+str(bins))
        print("\n")
    # Replace the column with the ordinal bin index produced by a k-means
    # based discretizer fitted on that same column.
    valorDiscreto = KBinsDiscretizer(n_bins=bins, encode='ordinal', strategy="kmeans").fit_transform(df[[columna]])
    df[[columna]] = valorDiscreto

if outInfo == True:
    print('Post-Discretización de variables')
    print('===============================')
    print(df.loc[:, columnasADiscretizar])

### Algoritmo que permite obtener la causalidad Bayesiana

In [7]:
# Run the Bayesian-network pipeline with the implementation selected by
# flagModelo (0 -> Python, 1 -> R). Any other value runs nothing.
# NOTE(review): the literal 5 is presumably the number of folds (the cell
# output mentions "FOLD") -- confirm against ModelosLib.
columnaObjetivo = 'estado'
if flagModelo == 0:
    modelos.modeloPython(df, columnaObjetivo, 5)
elif flagModelo == 1:
    modelos.modeloR(df, columnaObjetivo, 5, discreta, score, balanceado)

INICIO DE SECCION DE ENTRENAMIENTO, FOLD:  1
Balanceando porción de entrenamiento
FILA N°: 1 -> P(estado == "1") | (lt = "0",tt = "2",pt = "1",game_score = "1",op1 = "0",op2 = "1",op3 = "1",op4 = "1",op5 = "1",op6 = "1",sv1 = "1",sv2 = "1",sv3 = "1",sv4 = "1",sv5 = "1",sv6 = "1",score = "9",score_a = "6",score_p = "4",score_d = "3",score_s = "3",programa = "4",sol1 = "0")

A: 0.0
R: 1.0

FILA N°: 2 -> P(estado == "1") | (lt = "0",tt = "1",pt = "2",game_score = "1",op1 = "1",op2 = "0",op3 = "1",op4 = "1",op5 = "0",op6 = "0",sv1 = "1",sv2 = "1",sv3 = "1",sv4 = "1",sv5 = "1",sv6 = "1",score = "5",score_a = "4",score_p = "2",score_d = "2",score_s = "1",programa = "5",sol1 = "0")

A: 0.0
R: 1.0

FILA N°: 3 -> P(estado == "1") | (lt = "1",tt = "1",pt = "1",game_score = "1",op1 = "1",op2 = "1",op3 = "0",op4 = "1",op5 = "0",op6 = "1",sv1 = "1",sv2 = "1",sv3 = "0",sv4 = "1",sv5 = "1",sv6 = "1",score = "8",score_a = "5",score_p = "4",score_d = "2",score_s = "2",programa = "2",sol1 = "0")

A: 0.0

R[write to console]: Error in check.evidence(evidence, fitted) : 
  the evidence for node 'sol1' must be valid levels.

R[write to console]: Error in check.evidence(evidence, fitted) : 
  the evidence for node 'sol1' must be valid levels.



A: nan
R: nan

FILA N°: 3 -> P(estado == "1") | (lt = "3",tt = "2",pt = "0",game_score = "2",op1 = "1",op2 = "0",op3 = "0",op4 = "0",op5 = "0",op6 = "0",sv1 = "1",sv2 = "1",sv3 = "0",sv4 = "0",sv5 = "0",sv6 = "1",score = "7",score_a = "5",score_p = "2",score_d = "3",score_s = "3",programa = "4",sol1 = "2")

A: 0
R: 0

FILA N°: 4 -> P(estado == "1") | (lt = "0",tt = "2",pt = "1",game_score = "1",op1 = "1",op2 = "1",op3 = "0",op4 = "1",op5 = "0",op6 = "1",sv1 = "1",sv2 = "1",sv3 = "0",sv4 = "1",sv5 = "1",sv6 = "1",score = "7",score_a = "4",score_p = "3",score_d = "2",score_s = "3",programa = "4",sol1 = "9")

A: 0.988169931262849
R: 0.009600053616674105

FILA N°: 5 -> P(estado == "1") | (lt = "0",tt = "1",pt = "1",game_score = "0",op1 = "0",op2 = "1",op3 = "1",op4 = "1",op5 = "1",op6 = "1",sv1 = "1",sv2 = "1",sv3 = "1",sv4 = "1",sv5 = "1",sv6 = "1",score = "6",score_a = "3",score_p = "4",score_d = "1",score_s = "2",programa = "3",sol1 = "5")

A: 0.6460675412525997
R: 0.34624667826263494



R[write to console]: Error in check.evidence(evidence, fitted) : 
  the evidence for node 'score_a' must be valid levels.

R[write to console]: Error in check.evidence(evidence, fitted) : 
  the evidence for node 'score_a' must be valid levels.



FILA N°: 50 -> P(estado == "0") | (lt = "0",tt = "2",pt = "0",game_score = "1",op1 = "1",op2 = "0",op3 = "0",op4 = "0",op5 = "1",op6 = "0",sv1 = "1",sv2 = "1",sv3 = "0",sv4 = "1",sv5 = "1",sv6 = "1",score = "9",score_a = "6",score_p = "4",score_d = "3",score_s = "2",programa = "5",sol1 = "9")

A: 1.0
R: 0.0

FILA N°: 51 -> P(estado == "0") | (lt = "0",tt = "0",pt = "2",game_score = "1",op1 = "0",op2 = "1",op3 = "1",op4 = "0",op5 = "0",op6 = "1",sv1 = "1",sv2 = "1",sv3 = "1",sv4 = "0",sv5 = "1",sv6 = "1",score = "6",score_a = "2",score_p = "4",score_d = "1",score_s = "2",programa = "5",sol1 = "8")

A: 1.0
R: 0.0

FILA N°: 52 -> P(estado == "0") | (lt = "1",tt = "1",pt = "2",game_score = "2",op1 = "1",op2 = "1",op3 = "0",op4 = "1",op5 = "0",op6 = "1",sv1 = "1",sv2 = "1",sv3 = "0",sv4 = "1",sv5 = "0",sv6 = "1",score = "4",score_a = "3",score_p = "1",score_d = "2",score_s = "1",programa = "4",sol1 = "8")

A: nan
R: nan

FILA N°: 53 -> P(estado == "0") | (lt = "0",tt = "1",pt = "1",game_sco

R[write to console]: Error in check.evidence(evidence, fitted) : 
  the evidence for node 'tt' must be valid levels.

R[write to console]: Error in check.evidence(evidence, fitted) : 
  the evidence for node 'tt' must be valid levels.



FILA N°: 17 -> P(estado == "0") | (lt = "6",tt = "5",pt = "6",game_score = "5",op1 = "0",op2 = "0",op3 = "0",op4 = "0",op5 = "0",op6 = "0",sv1 = "0",sv2 = "0",sv3 = "0",sv4 = "0",sv5 = "0",sv6 = "0",score = "7",score_a = "4",score_p = "2",score_d = "3",score_s = "3",programa = "5",sol1 = "6")

A: 0
R: 0

FILA N°: 18 -> P(estado == "0") | (lt = "0",tt = "1",pt = "0",game_score = "0",op1 = "0",op2 = "1",op3 = "1",op4 = "1",op5 = "0",op6 = "0",sv1 = "1",sv2 = "1",sv3 = "1",sv4 = "1",sv5 = "1",sv6 = "1",score = "8",score_a = "6",score_p = "3",score_d = "3",score_s = "3",programa = "5",sol1 = "6")

A: 0.975320140271887
R: 0.02498621629803919

FILA N°: 19 -> P(estado == "0") | (lt = "5",tt = "0",pt = "4",game_score = "4",op1 = "0",op2 = "1",op3 = "0",op4 = "0",op5 = "0",op6 = "0",sv1 = "1",sv2 = "1",sv3 = "0",sv4 = "0",sv5 = "0",sv6 = "0",score = "6",score_a = "2",score_p = "4",score_d = "1",score_s = "1",programa = "4",sol1 = "5")

A: nan
R: nan

FILA N°: 20 -> P(estado == "0") | (lt = "0",

R[write to console]: Error in check.evidence(evidence, fitted) : 
  the evidence for node 'tt' must be valid levels.

R[write to console]: Error in check.evidence(evidence, fitted) : 
  the evidence for node 'tt' must be valid levels.



A: 0
R: 0

FILA N°: 68 -> P(estado == "0") | (lt = "1",tt = "0",pt = "1",game_score = "1",op1 = "0",op2 = "0",op3 = "1",op4 = "0",op5 = "1",op6 = "1",sv1 = "1",sv2 = "1",sv3 = "1",sv4 = "1",sv5 = "1",sv6 = "1",score = "9",score_a = "7",score_p = "3",score_d = "4",score_s = "3",programa = "5",sol1 = "9")

A: 1.0
R: 0.0

FILA N°: 69 -> P(estado == "0") | (lt = "2",tt = "2",pt = "2",game_score = "2",op1 = "1",op2 = "1",op3 = "0",op4 = "0",op5 = "0",op6 = "1",sv1 = "1",sv2 = "1",sv3 = "0",sv4 = "0",sv5 = "1",sv6 = "1",score = "4",score_a = "3",score_p = "2",score_d = "0",score_s = "1",programa = "5",sol1 = "9")

A: 0.9999219924304714
R: 5.3638994370082995e-05

FILA N°: 70 -> P(estado == "0") | (lt = "2",tt = "0",pt = "1",game_score = "1",op1 = "0",op2 = "0",op3 = "0",op4 = "0",op5 = "0",op6 = "0",sv1 = "1",sv2 = "1",sv3 = "0",sv4 = "1",sv5 = "1",sv6 = "0",score = "10",score_a = "7",score_p = "4",score_d = "3",score_s = "3",programa = "4",sol1 = "9")

A: 1.0
R: 0.0

FILA N°: 71 -> P(estado 