In [4]:
import random
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.preprocessing import StandardScaler

In [5]:
# Load the full dataset from the CSV file in the working directory.
data = pd.read_csv('Datos_completos.csv')
# Distinct values of the 'NIT' column, in order of first appearance
# (presumably one identifier per company -- confirm against the data dictionary).
empresas = data['NIT'].unique()

In [6]:
### Number of companies (distinct 'NIT' values)
len(empresas)

33

In [9]:
def diff_media(data, empresas1, empresas2):
    """Distance between the standardized mean profiles of two company groups.

    Every column of ``data`` other than ``'NIT'`` and ``'Year'`` is standardized
    with a ``StandardScaler`` fitted on all rows of ``data``. Rows are then split
    by ``'NIT'`` membership, the column-wise mean of each group is taken, and the
    Euclidean (L2) norm of the difference between the two mean vectors is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'NIT'`` and ``'Year'``; all remaining columns
        are fed to ``StandardScaler``.
    empresas1, empresas2 : list-like
        ``'NIT'`` values selecting the rows of each group (passed to
        ``Series.isin``).

    Returns
    -------
    float
        L2 norm of ``mean(group 1) - mean(group 2)`` in standardized units.
        NaN when a group selects no rows (mean of an empty selection).
    """
    nit = data['NIT']
    features = data.drop(columns=['NIT', 'Year'])

    # Scale the whole table once and pick each group's rows with a boolean
    # mask, instead of dropping columns and transforming once per group.
    scaled = np.asarray(StandardScaler().fit_transform(features))

    mean_empresa1 = scaled[nit.isin(empresas1).to_numpy()].mean(axis=0)
    mean_empresa2 = scaled[nit.isin(empresas2).to_numpy()].mean(axis=0)

    # The means are 1-D vectors, so ord=2 is the plain Euclidean norm. With a
    # 1 x d matrix (keepdims=True) numpy would compute the spectral norm via an
    # SVD: same value for a single row, but needlessly expensive.
    return np.linalg.norm(mean_empresa1 - mean_empresa2, ord=2)

In [20]:
%%time

np.random.seed(42)
best_norm = np.inf
best_empresa_comb = None
lim = 25000
cont = 0
for p in combinations(empresas, 22):
    
    emp1 = [emp for emp in empresas if emp in p]
    emp2 = [emp for emp in empresas if not emp in p]
    
    norm_value = diff_media(data, emp1, emp2)
    
    if norm_value < best_norm:
        print(norm_value, cont)
        best_empresa_com = (emp1,emp2)
        best_norm = norm_value
    
    cont += 1
    if cont == lim:
        break

0.7524247131773183 0
0.6026785273047448 2
0.5308748373922801 11
0.521793226443431 41
0.49113344704939704 49
0.4797184437104368 115
0.43732363381613154 251
0.4274903204860129 391
0.3843035611474173 401
0.37004716424802814 537
0.36389454090224005 542
0.3608021673553538 590
0.3481642282591769 757
0.3379794012417565 810
0.3320549602998513 1179
0.3182847668174496 1198
0.31682615830286504 1201
0.28173750480595394 2914
0.2414346026674287 4203
0.19260511462028024 5917
0.15002449965494935 7206
0.14282382992678203 11287
0.13773476960242892 23508
0.13018024420622315 24300
0.09521724933277259 716892
Wall time: 1h 58s


In [21]:
### Best combination of companies: first group
# NOTE(review): this displays whatever `emp1` currently holds after the search
# cell -- confirm it is the best split found and not merely the last one evaluated.
emp1

[800015615,
 800045720,
 800081030,
 800112440,
 800118660,
 800157469,
 800232356,
 800236890,
 801002644,
 805012368,
 830030574,
 860009694,
 860030360,
 860050956,
 890117431,
 890311366,
 890904459,
 900173460,
 900184722,
 900234565,
 900364670,
 900389088]

In [22]:
# Second group of companies (the complement of the first group).
# NOTE(review): confirm `emp2` holds the best split found rather than the last one evaluated.
emp2

[806014553,
 830037495,
 830052054,
 860033653,
 860501682,
 890300012,
 890909034,
 890929951,
 900204182,
 900378893,
 900437650]