In [1]:
import random
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.preprocessing import StandardScaler

In [5]:
data = pd.read_csv('Datos_completos.csv')
data = data.drop(columns = ['Costo de ventas','Gastos de ventas'])
empresas = data['NIT'].unique()

In [6]:
### Cantidad de empresas
len(empresas)

33

In [6]:
def diff_media(data, empresas1, empresas2):
    
    scaler = StandardScaler()
    scaler.fit(data.drop(columns = ['NIT','Year']))
    
    data_empresa1 = scaler.transform(data[data['NIT'].isin(empresas1)].drop(columns = ['NIT','Year'])) 
    data_empresa2 = scaler.transform(data[data['NIT'].isin(empresas2)].drop(columns = ['NIT','Year']))
  
    mean_empresa1 = np.mean(data_empresa1, axis = 0, keepdims=True)
    mean_empresa2 = np.mean(data_empresa2, axis = 0, keepdims=True)
    
    norm_dif = np.linalg.norm(mean_empresa1-mean_empresa2,  ord = 2 )
    
    return norm_dif

In [7]:
%%time

np.random.seed(42)
best_norm = np.inf
best_empresa_comb = None
lim = 25000
cont = 0
for p in combinations(empresas, 22):
    
    emp1 = [emp for emp in empresas if emp in p]
    emp2 = [emp for emp in empresas if not emp in p]
    
    norm_value = diff_media(data, emp1, emp2)
    
    if norm_value < best_norm:
        print(norm_value, cont)
        best_empresa_com = (emp1,emp2)
        best_norm = norm_value
    
    cont += 1
    if cont == lim:
        break

0.4991720227046567 0
0.25796725839653883 2
0.1917444339373911 34
0.17387259040662428 41
0.11394047341459353 100
0.056198358893702435 255
0.03671743927293176 541
0.036606140686489645 665
0.016988808836539027 1158
0.01268843986451623 7121
0.011849296980295446 11202
0.007085957868787957 11669
0.006209847858171331 22674
Wall time: 5min 41s


In [9]:
### Mejor combinacion de empresas
emp1

[800015615,
 800045720,
 800081030,
 800112440,
 800118660,
 800157469,
 800232356,
 800236890,
 801002644,
 805012368,
 806014553,
 830030574,
 830037495,
 830052054,
 860009694,
 860050956,
 890904459,
 890909034,
 900173460,
 900204182,
 900234565,
 900378893]

In [22]:
emp2

[806014553,
 830037495,
 830052054,
 860033653,
 860501682,
 890300012,
 890909034,
 890929951,
 900204182,
 900378893,
 900437650]