# 5. Ensambling de los diferentes algoritmos

###### En este notebook, a partir de los resultados obtenidos por los diferentes algoritmos, se genera un ensamblado (ensambling) de los que han obtenido mayor porcentaje, ponderando cada uno según el peso indicado.

In [None]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

## Inicialización de variables

In [None]:
# Machine learning predictions. None marks a model whose CSV has not been
# loaded; those models are replaced by the sample frame with weight 0.0 later.
dfLR = dfLSVC = dfNB = dfSGD = dfDT = None

# Deep learning predictions
dfCNN = dfLSTM = None

> Lectura de todos los ficheros a tratar. Se requiere un archivo CSV de ejemplo con los mismos campos que el de test: si un df no tiene valores porque su lectura se ha comentado, se utilizará este archivo en su lugar, sin peso alguno en los resultados finales.

In [None]:
path = '../data/processed/ensambling/'


def _read_predictions(file_name):
    """Read the UTF-8 encoded CSV called ``file_name`` located under ``path``."""
    return pd.read_csv(path + file_name, encoding='utf-8')


# A sample CSV is needed as a template: its prediction columns are meant to be
# overwritten entirely, so its own values are not supposed to matter.
sample = _read_predictions('novalidSample.csv')

# Machine learning (comment a line out to leave that model unloaded)
dfLR = _read_predictions('lr.csv')
dfLSVC = _read_predictions('lsvc.csv')
dfNB = _read_predictions('nb.csv')
dfSGD = _read_predictions('sgd.csv')
dfDT = _read_predictions('dt.csv')

# Deep learning
dfCNN = _read_predictions('cnn.csv')
dfLSTM = _read_predictions('lstm.csv')

In [None]:
# Label columns that the ensembles below read from every prediction frame.
columns = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

## Pesos para los diferentes algoritmos

In [None]:
# Blend weight of each model; the check further down expects them to add up to 1.
# Machine learning
wLR = 0.5
wLSVC = wNB = wSGD = wDT = 0.0

# Deep learning
wCNN = 0.5
wLSTM = 0.0

In [None]:
# A model whose predictions were not loaded (still None) is replaced by the
# sample frame and forced to weight 0.0, so it cannot influence the blend.
if dfLR is None:
    dfLR = sample
    wLR = 0.0
if dfLSVC is None:
    dfLSVC = sample
    wLSVC = 0.0
if dfNB is None:
    dfNB = sample
    wNB = 0.0
if dfSGD is None:
    dfSGD = sample
    wSGD = 0.0
if dfDT is None:
    dfDT = sample
    wDT = 0.0
if dfCNN is None:
    dfCNN = sample
    wCNN = 0.0
if dfLSTM is None:
    dfLSTM = sample
    wLSTM = 0.0

# Every weight is counted exactly once: wLSVC must be part of the total and
# wLSTM must not be added twice.
wTotal = sum([wLR, wLSVC, wNB, wSGD, wDT, wCNN, wLSTM])

# Compare with a tolerance: sums of decimal weights in binary floating point
# can differ from 1.0 by a rounding error even when the weights are correct.
if abs(wTotal - 1.0) > 1e-9:
    print("El peso total no es de 1. Es de --> " + str(wTotal))
else:
    print("El peso es correcto")

In [None]:
# Weighted blend of all models: each entry pairs a prediction frame with its weight.
_weighted_models = [
    (dfLR, wLR),
    (dfLSVC, wLSVC),
    (dfNB, wNB),
    (dfSGD, wSGD),
    (dfDT, wDT),
    (dfCNN, wCNN),
    (dfLSTM, wLSTM),
]

# Accumulate left to right, starting from the first weighted frame.
_first_df, _first_weight = _weighted_models[0]
_blend = _first_df[columns] * _first_weight
for _model_df, _model_weight in _weighted_models[1:]:
    _blend = _blend + _model_df[columns] * _model_weight

# The sample frame supplies the remaining columns; only the label columns are replaced.
dfEnd = sample.copy()
dfEnd[columns] = _blend

> Exportación final del resultado

In [None]:
# Write the weighted blend to <path>ensambled/<name>.csv without the index column.
name = "ensambled"
_output_file = '{}ensambled/{}.csv'.format(path, name)
dfEnd.to_csv(_output_file, index=False)

### Ensambling por correlación de las predicciones

In [None]:
# Share of each merge weight that is split evenly between the two merged models.
coefDensidad = 0.1
# Correlation at or above which a merged pair is reported as over-correlated.
maxCorrelacion = 0.98

# Neither maxCorrelacion nor coefDensidad may be above 1 or below 0.
assert 0.0 <= maxCorrelacion <= 1.0
assert 0.0 <= coefDensidad <= 1.0


In [None]:
def loadPredictions(directory=None, exclude=('novalidSample.csv',)):
    """Load every prediction CSV found directly inside a directory.

    Args:
        directory: Folder to scan. Defaults to the module-level ``path``.
        exclude: File names to skip. By default the template file
            ``novalidSample.csv`` is skipped: it lives in the same folder but
            its values are placeholders and must not take part in the ensemble.

    Returns:
        dict mapping each CSV file name to its DataFrame sorted by ``id``.
        File names are processed in alphabetical order so the result does not
        depend on the arbitrary ordering of ``os.listdir``.
    """
    if directory is None:
        directory = path
    csvNames = sorted(
        f for f in os.listdir(directory)
        if f.endswith(".csv") and f not in exclude
    )
    return {
        f: pd.read_csv(os.path.join(directory, f)).sort_values('id')
        for f in csvNames
    }


def getCorrelationMatrix(col,frames):
    """Build the correlation matrix of one column across all prediction frames.

    Args:
        col: Name of the column to compare.
        frames: dict mapping a model name to a DataFrame containing ``col``.

    Returns:
        Square DataFrame with the pairwise correlations (``DataFrame.corr``
        defaults) between the models' ``col`` values. The diagonal is set to
        0.0 so a model's correlation with itself is not the largest entry.
    """
    corDf = pd.DataFrame()
    for name, df in frames.items():
        corDf[name] = df[col]
    cor = corDf.corr()
    for name in cor.columns:
        # DataFrame.set_value was removed in pandas 1.0; .at is the scalar setter.
        cor.at[name, name] = 0.0
    return cor


def getMaxCorrelation(matrixCorr):
    """Locate the largest entry of a correlation matrix.

    Args:
        matrixCorr: DataFrame whose values are searched; its column labels are
            used to name both the row and the column of the winning cell.

    Returns:
        Tuple ``(corr, x1, x2)``: the largest value, followed by the column
        label at the winning row position and the column label at the winning
        column position.
    """
    values = np.asarray(matrixCorr.values)
    # The flat argmax is turned back into a (row, column) position.
    rowPos, colPos = np.unravel_index(values.argmax(), values.shape)
    labels = matrixCorr.columns
    return values[rowPos, colPos], labels[rowPos], labels[colPos]


def mergeDensitiesWeights(m1,m2,densities):
    """Compute the blend weights of two models from their densities.

    Each weight mixes an even split (0.5) with the model's share of the
    combined density; the global ``coefDensidad`` is the fraction given to the
    even split.

    Args:
        m1: Key of the first model in ``densities``.
        m2: Key of the second model in ``densities``.
        densities: dict mapping model name to its density.

    Returns:
        Tuple ``(weights1, weights2)`` for ``m1`` and ``m2``.
    """
    first, second = densities[m1], densities[m2]
    combined = first + second

    def _weight(density):
        return 0.5*coefDensidad + (density/combined)*(1-coefDensidad)

    return _weight(first), _weight(second)


def ensambleByColumn(col,frames,densities):
    """Collapse all prediction frames into a single series for one column.

    While more than one frame is left, the two models with the highest
    correlation on ``col`` are merged into a weighted combination stored under
    the key ``"<first>_<second>"``. Both ``frames`` and ``densities`` are
    modified in place.

    Args:
        col: Name of the column being ensembled.
        frames: dict mapping model name to a DataFrame containing ``col``.
        densities: dict mapping model name to its density.

    Returns:
        The ``col`` series of the only frame left once everything is merged.
    """
    while len(frames) != 1:
        corrMatrix = getCorrelationMatrix(col,frames)
        topCorr,left,right = getMaxCorrelation(corrMatrix)
        mergedName = left + '_' + right

        wLeft,wRight = mergeDensitiesWeights(left,right,densities)
        merged = pd.DataFrame()
        merged[col] = (frames[left][col]*wLeft) + (frames[right][col]*wRight)
        del frames[left]
        del frames[right]
        frames[mergedName] = merged

        if topCorr >= maxCorrelacion:
            # Over-correlated pair: the merged model keeps the larger density
            # instead of the sum of both.
            print('\t',left,right,' (OVER CORR)')
            densities[mergedName] = max(densities[left],densities[right])
        else:
            print('\t',left,right)
            densities[mergedName] = densities[left] + densities[right]

        del densities[left]
        del densities[right]

    _, remaining = frames.popitem()
    return remaining[col]


# The sample frame, sorted by id, is the template whose label columns are overwritten.
# NOTE(review): assignments below align on the index labels, not on 'id';
# this presumably relies on every CSV listing its rows in the same order — confirm.
ensambled = sample.sort_values('id')

# Read the prediction CSVs once instead of once per label column.
allPredictions = loadPredictions()

for col in tqdm(columns):
    # ensambleByColumn deletes and adds entries in the dict it receives (the
    # DataFrames themselves are left untouched), so each column gets its own
    # shallow copy of the loaded frames.
    frames = dict(allPredictions)
    print('\n\n',col)
    # Every model starts with the same density.
    densities = {k:1.0 for k in frames.keys()}
    ensambled[col] = ensambleByColumn(col,frames,densities)

ensambled.to_csv(path + 'ensambled/ensambledByCorrelations.csv', index=False)