<a href="https://colab.research.google.com/github/simonezambonim/codenation_data_science_desafios/blob/master/Projeto_Final/main_Content_base_Recommender_Gower_coefficient.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install gower

Collecting gower
  Downloading https://files.pythonhosted.org/packages/98/62/dea557ca74253ff35afa6dce17c6f950ff8b7fbd3636a4df2ef0877bcf65/gower-0.0.5.tar.gz
Building wheels for collected packages: gower
  Building wheel for gower (setup.py) ... [?25l[?25hdone
  Created wheel for gower: filename=gower-0.0.5-cp36-none-any.whl size=4232 sha256=63db4d684838b3884edd32dfd9fc74483d667280fa1fd954704ba0d430dccfff
  Stored in directory: /root/.cache/pip/wheels/c0/09/9b/072d54d6ced0f43a179852e3f09532d0131e25ff7cb4e5ee75
Successfully built gower
Installing collected packages: gower
Successfully installed gower-0.0.5


In [0]:
import pandas as pd
import gower
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split

sns.set()

# Utils functions

In [0]:
def get_info(df):
    '''
    Get basic info from data set
    '''
    print('Shape :: ', df.shape)
    return pd.DataFrame({'types': df.dtypes,
                            'nan': df.isna().sum(), 
                            'nan%': ((df.isna().sum()/len(df))*100).round(1), 
                            'unique':df.nunique()
                            })

In [0]:
def dropna_cols(df,info,max_na): 
    '''
    Function to drop columns above threshold max_na
    '''
    return df.drop(info[info['nan%']  > max_na].index, axis=1).copy()

In [0]:
def categorical_types(df):
    '''
    Our model requires to know where are the positions of each categorical variable
    Here we extract this info!

    returns
    : all_features - complete set of columns
    : categ_features -  boolean and object features 
    : num_features - numeric features
    : categ_bool - the whole index of columns stating True for categorical variables
    '''
    
    all_features = df.columns.to_list()
      
    categ_features = df.select_dtypes(['bool','object']).columns.to_list()
    num_features = df.select_dtypes(['int64','float']).columns.to_list()

    categ_loc = [df.columns.get_loc(x) for x in categ_features]
    categ_func = lambda x: True if x in categ_features else False
    categ_bool = [categ_func(x) for x in all_features]
    categ_idx = dict(zip(categ_features, categ_bool))

    print(f'{df.shape} \n\nNum:{num_features} \n\nCateg: {categ_features}')
    
    return (all_features,categ_features,num_features,categ_bool)

In [0]:
def get_user_companies(path,file):
    return pd.read_csv(path + file, index_col=0)

# Load data

In [0]:
path = '/content/drive/My Drive/codenation/'

In [0]:
market = pd.read_csv(path+'estaticos_market.csv', index_col =0)

In [11]:
infoDB = get_info(market)

Shape ::  (462298, 181)


# Clean data

In [12]:
# drop columns with more than 40% of missing values
max_percentual_na = 40
df = dropna_cols(market,infoDB,max_percentual_na).copy()

## drop companies withou 'setor',  'nm_divisao', 'nm_segmento'
print('Numero de empresas sem informações básicas (setor, nm_divisao, nm_segmento)',
df.loc[df.setor.isna(),['setor','de_ramo', 'nm_divisao', 'nm_segmento']].shape[0])

df.dropna(subset = ['setor',  'nm_divisao', 'nm_segmento'], axis = 0, inplace = True)

Numero de empresas sem informações básicas (setor, nm_divisao, nm_segmento) 1927


Since we have deleted some  companies from the markt dataset, we need to make sure we reset the index in order to avoid confusions when using ```df.iloc``` and ```df.loc```. 

In [0]:
df.reset_index(inplace=True, drop=True)

In [14]:
to_drop_age = ['idade_empresa_anos', 'dt_situacao']
to_drop_rescencia = ['nu_meses_rescencia']
to_drop_idade = ['qt_socios', 'idade_maxima_socios', 'idade_minima_socios']
to_drop_natureza = ['fl_optante_simei', 'fl_optante_simples']
to_drop_faturamento = ['vl_faturamento_estimado_aux','vl_faturamento_estimado_grupo_aux']
to_drop_extra = [ 'fl_email','fl_telefone','sg_uf_matriz']

to_drop = to_drop_age + to_drop_rescencia + to_drop_idade + to_drop_natureza + to_drop_faturamento + to_drop_extra

df.drop(columns = to_drop, inplace=True)

print('Number of cols dropped', len(to_drop))

Number of cols dropped 13


Let's get some info about our data:

In [15]:
features, categ_features, num_features, categ_bool = categorical_types(df)

(460371, 37) 

Num:['vl_total_veiculos_pesados_grupo', 'vl_total_veiculos_leves_grupo', 'empsetorcensitariofaixarendapopulacao', 'qt_socios_pf', 'qt_socios_pj', 'idade_media_socios', 'qt_socios_st_regular', 'qt_filiais'] 

Categ: ['id', 'fl_matriz', 'de_natureza_juridica', 'sg_uf', 'natureza_juridica_macro', 'de_ramo', 'setor', 'idade_emp_cat', 'fl_me', 'fl_sa', 'fl_epp', 'fl_mei', 'fl_ltda', 'fl_st_especial', 'fl_rm', 'nm_divisao', 'nm_segmento', 'fl_spa', 'fl_antt', 'fl_veiculo', 'de_saude_tributaria', 'de_saude_rescencia', 'de_nivel_atividade', 'fl_simples_irregular', 'nm_meso_regiao', 'nm_micro_regiao', 'fl_passivel_iss', 'de_faixa_faturamento_estimado', 'de_faixa_faturamento_estimado_grupo']


# Imputation of missing values

This is such a hard task, specially in the case where we are looking for similarities because every moment we input values we are approximating observations, and creating relations that may not really exist. So the ideal case would be to use a metric able to handle these missing values by ignoring them. For that we've tried to use the HEOM metric, this example is in the other file in the same folder, check it out!

Also, instead of just simple input it is interesting to try out iterative and knn impute, they are available in sklearn! 

Here we have chosen simple input for it's simplicity!

In [0]:
# Fill numeric values with the mean\median
df['idade_media_socios'].fillna(int(df['idade_media_socios'].mean()), inplace=True)
df['empsetorcensitariofaixarendapopulacao'].fillna(df['empsetorcensitariofaixarendapopulacao'].median(), inplace=True)
df['empsetorcensitariofaixarendapopulacao'] = np.log(df['empsetorcensitariofaixarendapopulacao'])

# Fill these with bussiness info
df['qt_socios_pf'].fillna(1, inplace=True)
df['qt_socios_pj'].fillna(0, inplace=True)
df['qt_socios_st_regular'].fillna(df['qt_socios_pf'], inplace=True)

# Fill nominal/ordinal categorical data with 'SEM INFORMACAO'
to_fill = ['de_faixa_faturamento_estimado','de_faixa_faturamento_estimado_grupo',
'de_nivel_atividade', 'de_saude_rescencia','nm_meso_regiao','nm_micro_regiao']

df['de_saude_tributaria'].fillna('CINZA')

for x in to_fill:
    df[x].fillna('SEM INFORMACAO', inplace=True)

# Encode Variables

Altough the metric we chose does not require any encoding, we put some efforts in encoding in order to evaluate other metrics later. 

In [0]:
#Encode the ordinal variables, following a ordinal ogic
dic_faturamento_estimado = {
    'SEM INFORMACAO' : 0,
    'ATE R$ 81.000,00' : 1,
    'DE R$ 81.000,01 A R$ 360.000,00' : 2,
    'DE R$ 360.000,01 A R$ 1.500.000,00':3,
    'DE R$ 1.500.000,01 A R$ 4.800.000,00':4,
    'DE R$ 4.800.000,01 A R$ 10.000.000,00':5,
    'DE R$ 10.000.000,01 A R$ 30.000.000,00':6, 
    'DE R$ 30.000.000,01 A R$ 100.000.000,00':7,
    'DE R$ 100.000.000,01 A R$ 300.000.000,00':8,
    'DE R$ 300.000.000,01 A R$ 500.000.000,00':9,
    'DE R$ 500.000.000,01 A 1 BILHAO DE REAIS':10,
    'ACIMA DE 1 BILHAO DE REAIS':11}

dic_idade = {
    '<= 1': 1,
    '1 a 5': 2,
    '5 a 10': 3,
    '10 a 15':4,
    '15 a 20':5,
    '> 20' :6}

dic_de_nivel_atividade = {
    'SEM INFORMACAO': 0,
    'MUITO BAIXA': 1,
    'BAIXA':2,
    'MEDIA':3,
    'ALTA': 4
    }    

dic_de_saude_rescencia = {'SEM INFORMACAO': 0,
  'ATE 3 MESES' : 1,
  'ATE 6 MESES' :2,
  'ATE 1 ANO':3,
  'ACIMA DE 1 ANO':4}

dic_de_saude_tributaria = {
  'CINZA': 0,
  'VERMELHO':1,
  'LARANJA': 2,
  'AMARELO':3,
  'AZUL':4,
  'VERDE':5}

# Map 
df['de_faixa_faturamento_estimado'] = df['de_faixa_faturamento_estimado'].map(dic_faturamento_estimado)
df['de_faixa_faturamento_estimado_grupo'] = df['de_faixa_faturamento_estimado_grupo'].map(dic_faturamento_estimado)
df['idade_emp_cat'] = df['idade_emp_cat'].map(dic_idade)
df['de_nivel_atividade'] = df['de_nivel_atividade'].map(dic_de_nivel_atividade)
df['de_saude_rescencia'] = df['de_saude_rescencia'].map(dic_de_saude_rescencia)
df['de_saude_tributaria'] = df['de_saude_tributaria'].map(dic_de_saude_tributaria)

In [0]:
# Encode the bool features
dic_bool = {
    'SIM': 1,
    'NAO': 0}

df['fl_rm'] = df['fl_rm'].map(dic_bool)

bool_features = [col for col in df.columns if col.startswith('fl_')]

# Treat them as integers
for feat in bool_features:    
    df[feat] = df[feat].astype(int)

In [0]:
# Encode the remaining categorical variables 
categorical_features = [
    'natureza_juridica_macro',
    'de_natureza_juridica',
    'de_ramo',
    'setor',
    'nm_divisao',
    'nm_segmento',
    'nm_meso_regiao',
    'nm_micro_regiao',
    'sg_uf']

# keep track of the encoding in the map_categories variable
map_categories = dict()
for feat in categorical_features:
    df[feat] = df[feat].astype('category')
    encode_categ = df[feat].cat.codes
    map_categories[feat] = dict(zip(encode_categ, df[feat]))
    df[feat] = encode_categ

In [20]:
df.head(5)

Unnamed: 0,id,fl_matriz,de_natureza_juridica,sg_uf,natureza_juridica_macro,de_ramo,setor,idade_emp_cat,fl_me,fl_sa,fl_epp,fl_mei,fl_ltda,fl_st_especial,fl_rm,nm_divisao,nm_segmento,fl_spa,fl_antt,fl_veiculo,vl_total_veiculos_pesados_grupo,vl_total_veiculos_leves_grupo,de_saude_tributaria,de_saude_rescencia,de_nivel_atividade,fl_simples_irregular,empsetorcensitariofaixarendapopulacao,nm_meso_regiao,nm_micro_regiao,fl_passivel_iss,qt_socios_pf,qt_socios_pj,idade_media_socios,qt_socios_st_regular,de_faixa_faturamento_estimado,de_faixa_faturamento_estimado_grupo,qt_filiais
0,a6984c3ae395090e3bee8ad63c3758b110de096d5d8195...,1,59,4,2,11,2,4,0,0,0,0,0,0,1,32,10,0,0,0,0.0,0.0,5.0,4,4,0,6.852961,6,48,1,2.0,0.0,44.0,2.0,4,4,0
1,6178f41ade1365e44bc2c46654c2c8c0eaae27dcb476c4...,1,17,3,5,20,4,2,0,0,0,1,0,0,1,3,3,0,0,0,0.0,0.0,0.0,3,2,0,6.150454,4,70,1,1.0,0.0,27.0,1.0,2,2,0
2,4a7e5069a397f12fdd7fd57111d6dc5d3ba558958efc02...,1,17,1,5,31,4,3,0,0,0,1,0,0,1,86,20,0,0,0,0.0,0.0,3.0,4,3,0,6.570182,2,43,1,1.0,0.0,32.0,1.0,1,1,0
3,3348900fe63216a439d2e5238c79ddd46ede454df7b9d8...,1,17,1,5,24,4,3,0,0,0,0,0,0,1,74,17,0,0,0,0.0,0.0,3.0,4,3,0,6.16485,2,43,1,1.0,0.0,36.0,1.0,2,2,0
4,1f9bcabc9d3173c1fe769899e4fac14b053037b953a1e4...,1,17,4,5,26,4,2,0,0,0,0,0,0,1,77,8,0,0,0,0.0,0.0,5.0,4,4,0,7.963474,6,48,1,1.0,0.0,42.0,1.0,2,2,0


In [21]:
get_info(df)

Shape ::  (460371, 37)


Unnamed: 0,types,nan,nan%,unique
id,object,0,0.0,460371
fl_matriz,int64,0,0.0,2
de_natureza_juridica,int8,0,0.0,66
sg_uf,int8,0,0.0,6
natureza_juridica_macro,int8,0,0.0,7
de_ramo,int8,0,0.0,32
setor,int8,0,0.0,5
idade_emp_cat,int64,0,0.0,6
fl_me,int64,0,0.0,2
fl_sa,int64,0,0.0,2


# Recommender

The function ```recommender_leads``` is the fnction responsible to select the leads thar are similar to the ```user_companies```. The metric used is the Gower coefficient, which is explained later.

```neighbors``` is the 'size' or number of companies to be considered close while we are evaluating the distances among companies from the market and the user portifolio.

Four different methods to *rate* the companies were proposed! Here it is relevant to say that the smaller the rating the better (they are more similar.)


```func0``` considers the average distance of one company thom the market to the user_portifolio

```func1``` considers the companies that apperead more times (in the neighbors region) to be upvoted

```func2``` considers the norm of the distance vector of each company in the neighborhood to the entire portifolio

```func3```  similarly to ```func3``` it considers the norm of the distance vector of each company to the entire portifolio, but it is now  weighted by the number of times it has been in the neighborgood (as ```func1```). By weight we mean "reducing" the distance. 

In [0]:
def gower_metric(user_companies, market, categ_bool):
  '''
  Calculates pairwise distance between the market database and the user portfolio 
  using the Gower Coefficient
  '''
  return gower.gower_matrix(market.iloc[:,1:], user_companies.iloc[:,1:], cat_features=categ_bool[1:])

In [0]:
def recommender_leads(user_companies, market, categ_bool, function='func3', neighbors = 50, max_leads = None):
    '''
    Rocommend leads based on similarities
    Uses Gower Coefficiente as metric

    param market :: market dataframe 
    param user_companies: dataframe with companies in the user porfolio
    param categ_bool: list of boolean required to indicate if a feature is categorical or not
    param k : numero de recomendações a serem geradas
    returns: index of recommended leads

    # Hyperparameters that define precision
    max_leads :  list size (the larger, more likely one of the portfolio companies to be in the recommended list)
    neighbors : number of nearby companies to be considered ''in the rating''

    '''
    gower_dist =  gower_metric(user_companies, market, categ_bool)
    pickle.dump(gower_dist,open(path+f'/gower_{len(user_companies)}.pkl','wb'))

    ind = []
    dist = []
    for i in range(len((user_companies.id))):
      ind_aux = np.argpartition(gower_dist[:,i], neighbors, axis=-1)[:neighbors]

      #remove user items
      valid_mask = np.isin(ind_aux , user_companies.index.values, assume_unique=True, invert=True)

      dist_aux = gower_dist[ind_aux[valid_mask],i]
      ind.append(ind_aux[valid_mask])
      dist.append(dist_aux)
  
    concat_ind = np.concatenate(ind)
    concat_dist = np.concatenate(dist)  

    ## Check for reapeted recomendation and count
    unique, counts = np.unique(concat_ind, return_counts=True) 

    # Score candidates
    relevance = [] 


    for n_times,i in sorted(zip(counts, unique), reverse=True):
      if function == 'func0':
        index_ = np.where(concat_ind == i)
        rating = np.mean(concat_dist[index_[0]])
      if function == 'func1':
        rating = 1/n_times
      elif function == 'func2':
        rating =  np.linalg.norm(gower_dist[i,:]) 
      elif function == 'func3':
        rating =  ((1/n_times)**2)*np.linalg.norm(gower_dist[i,:])  
      relevance.append([i, rating])

    ## Let's sort the best candidates and get the k best candidates
    best_candidates = sorted(relevance, key=lambda x: x[1])

    # Retrieving the indexes
    top_index = []
    for idx,val in best_candidates: 
        top_index.append(idx)

    if max_leads == None or max_leads > len(top_index):
      return top_index
    else:
      return top_index[:max_leads]

## Metrics

Although the solution for this reccomender system is quite simplistic, the approach to find the best metric involved a lot of research. 

In this dataset there are both categorical and numerical data types, and the two main questions are:
- How do we quantify composite (mixed) data types similarities in one setting?
  In case of label encoding categorical data we would end up giving numerical relations and weights that don't really exist and in this case, euclidean metric would not be suitable. 
- How to measure similarity/dissimilarity among categorical data?  
  One option is to encode all nominal features to dummies variables. That sounds very interesting, however we have categorical data with high cardinality, so it means we would be dealing with a problem of hundreds of variables and we would probabily have to face the curse of dimensionality. 
- Another key point in finding similarities is that we have to deal with transformation/normalization of the numerical data.

Said all that, in my research I have found **Gower’s coefficient**

>*Gower’s distance/measure/coefficient/similarity is a measure to find the similarity between two rows of a dataset consisting of mixed type attributes. It uses the concept of **Manhattan distance** for continuous variables and **Dice distance** for measuring similarity between Binary variables.*

[More details about it!](https://medium.com/analytics-vidhya/concept-of-gowers-distance-and-it-s-application-using-python-b08cf6139ac2)

[Why Gower distance?](https://stats.stackexchange.com/questions/15287/hierarchical-clustering-with-mixed-type-data-what-distance-similarity-to-use)

[](https://github.com/matchado/Misc/blob/master/gower_dist.py)

The interesting part about the Gower coefficient is that it does not require any encoding (not even transform nominal data) and numerical data are scale accordinly, and we can give weights to these variables in a very natural way. However, one must do value imputation.

A package develloped by [Michal Yan](https://github.com/wwwjk366/gower) from [Marcelo Beckmann work](https://sourceforge.net/projects/gower-distance-4python/files/).

You can install the package:
```
! pip install gower
```

# Testing  Hyperparameters

In [0]:
def test_model(user, df, categ_bool, function, neighbors, max_leads):
    '''
    Function to evaluate the model (recommended leads)
    '''
    user_companies = pd.merge(user[['id']], df, on='id', how='inner', left_index=True)
        
    all_companies = list(df['id'].unique())

    companies_train, companies_test = train_test_split(user_companies, test_size=0.3, random_state=0)

    leads_idx = recommender_leads(companies_train, df, categ_bool, function, neighbors, max_leads)

    leads = df.iloc[leads_idx,:]

    # evaluate the model
    precision = model_metrics(leads, companies_test)

    return leads.id, precision

In [0]:
def model_metrics(leads, user_companies):
    '''
    Evaluate the model according to precision
    '''
    TP = sum(leads['id'].isin(user_companies.id))  
    FP = user_companies.shape[0] -TP
    precision = (100*TP/(TP + FP))
    return precision

We have some parameter that can affect the results of our model evaluation.
They are:

```max_leads```  :: the number of leads to be generated

```nn``` :: the number of neighbors to be considered "close" (it can change the frequency companies appear as close)

```function``` :: this is related to the rating function where it gives a neighbor company a weight if it is at [x] times close to all the portifolio



In [0]:
user = 'estaticos_portfolio{}.csv'
function = 'func{}'
# max_leads = number of leads to be recommended
# nn = number of companies to be considered close

Obviously, bigger the number of leads higher are the chances of a current customer be part of the current portifolio. 

Okay then, since we have different datasets lenghts what about we parametrized some of these variables?

Let's define:

```
max_leads = 10*len(user_items)
NN = [0.1*len(user_items), len(user_items), 2*len(user_items), 5*len(user_items), 10*len(user_items)]
```


Now let's verify our best configuration testing!

There's no need to keep calculating the distance matrix every iteration, since it is only one matrix for the same user. You may save it in a file and later we will only load it as requested. 

In [29]:
for i in range(1,4):
  user_items = get_user_companies(path, user.format(i))
  
  print(f"\nPortfolio : {user.format(i)}")
  
  n_items = len(user_items)
  max_leads = 10*n_items
  param_grid = [int(0.1*n_items), int(0.5*n_items), n_items, 5*n_items, 10*n_items]
   
  for j in range(0,4):
    print(f'========== FUNCTION {j} ===========')
    
    for nn in param_grid:
        leads , precision = test_model(user_items, df, categ_bool, function.format(j) , nn, max_leads)
        
        # Save the leads
        #pd.DataFrame(leads).to_csv(f'portfolio{i}_{function.format(j)}_nn{nn}.csv', index = None)
        
        print(f'LEN {len(leads)} NN {nn}: {precision}')


Portfolio : estaticos_portfolio1.csv
LEN 5550 NN 55: 3.592814371257485
LEN 5550 NN 277: 4.191616766467066
LEN 5550 NN 555: 4.790419161676646
LEN 5550 NN 2775: 2.9940119760479043
LEN 5550 NN 5550: 3.592814371257485
LEN 5550 NN 55: 5.9880239520958085
LEN 5550 NN 277: 2.395209580838323
LEN 5550 NN 555: 2.9940119760479043
LEN 5550 NN 2775: 4.191616766467066
LEN 5550 NN 5550: 2.9940119760479043
LEN 5550 NN 55: 4.191616766467066
LEN 5550 NN 277: 1.7964071856287425
LEN 5550 NN 555: 1.7964071856287425
LEN 5550 NN 2775: 1.1976047904191616
LEN 5550 NN 5550: 1.1976047904191616
LEN 5550 NN 55: 3.592814371257485
LEN 5550 NN 277: 2.395209580838323
LEN 5550 NN 555: 2.9940119760479043
LEN 5550 NN 2775: 3.592814371257485
LEN 5550 NN 5550: 2.9940119760479043

Portfolio : estaticos_portfolio2.csv
LEN 5660 NN 56: 67.6470588235294
LEN 5660 NN 283: 8.823529411764707
LEN 5660 NN 566: 1.1764705882352942
LEN 5660 NN 2830: 0.0
LEN 5660 NN 5660: 6.470588235294118
LEN 5660 NN 56: 83.52941176470588
LEN 5660 NN 28

## Analysing the results 

### Portfolios
Three different portfolios bring very distinct results, it may be because of the variance of features that characterize a user's customer. 
It may also happen because of the many features that have been excluded due to missing values and they may have had important attributes that could have described those customers better.

In any case, two *takeaways* from what we have done here are:
1.   to consider the integration of  **knowledge-based recommender systems** where qualitative information can be used to improve feature selection;
2.   Apply weight functions based on feature variance (lower variance, means it characterizes better the user --make sure to find a meaningful metric for categorical features). In this case, weights can be inserted very naturaly in the ```gower.gower_matrix(weights = [weight1, weight2, weight3...])```.


### Neighbors:

An interesting conclusion is that increase of the number os companies to be considered neighbors does not garantee an improvement in the results, mainly because many companies that may appear frequently are relatively far and it brings noise to the *rating* ```functions```. 

### Functions:
```func0``` using only the average distance does not seem to be a realible metric, as the number of companies in the neighborhood increases, the average distance looses its meaning and the results tend to get worse

```func1``` this function shows good results, upvoting companies that appear frequently is one of the principles of Nearest Neighbors

```func2``` the norm of the distance may bring a more homogeneous  overral result and we can see that there isn't much difference among the 4 differents number of neighbors tested.

```func3``` this function is also based in the norm (```func2```) but it also brings the features from ```func1```, increasing the relevance if a company has been present more times in the neighborhood than others.

```func1, func2, func3``` all have shown good performance (except ```func0```)! And any of them would be a good choice. 

### Conclusions


 ```'estaticos_portfolio1.csv'``` shows a very poor precison comparatively to others, by the reasons we have explained above. In this setting the best configuration was:
 ```' FUNCTION 1 LEN 5550 NN 55: 5.9880239520958085'```

```'estaticos_portfolio2.csv'``` shows a good precision, even though it has $\approx$ the same size of ```'estaticos_portfolio1.csv'```, its users share more similarites, and in a bussiness perspective it means they have a very well defined customer profile. In this setting the best configuration was:
```' FUNCTION 3 LEN 5660 NN 56: 88.82352941176471'```

```'estaticos_portfolio3.csv'``` presented the best results among all and it is also the smallest dataset. In this setting the best configuration was:
```' FUNCTION 3 LEN 2650 NN 132: 91.25'```

Finally ```func1, func3``` have performed the best, it is because both consider the number of times a company has been present. Obviously the size of this neighborhood influences these results, and in that case ```func2``` shows a more homogenous result while the neighborhood increases.

# Recommend leads

As parameter for our leads recommender system we set ```func3``` as our main function and the number of neighbors as ```neighbors = 0.5*len(user_items)```, and ```max_leads = 10*len(user_items)```

In [0]:
import pickle

In [29]:
for i in range(1,4):

  user_items = get_user_companies(path, user.format(i))
  
  print(f"\nFinding leads to : {user.format(i)}")

  n_items = len(user_items)
  neighbors = int(0.5*len(user_items))
  max_leads = 10*n_items

  user_companies = pd.merge(user_items[['id']], df, on='id', how='inner', left_index=True)

  leads_index = recommender_leads(user_companies, df, categ_bool, 'func3' , neighbors, max_leads)
  
  leads = df.iloc[leads_index,:]

  # Save leads
  print(f'Saving {len(leads_index)} leads in ~/portfolio{i}_leads.csv')
  #leads[['id']].to_csv(path+f'portfolio{i}_leads.csv', index = None)


Finding leads to : estaticos_portfolio1.csv
Saving 5550 leads in ~/portfolio1_leads.csv

Finding leads to : estaticos_portfolio2.csv
Saving 5660 leads in ~/portfolio2_leads.csv

Finding leads to : estaticos_portfolio3.csv
Saving 2650 leads in ~/portfolio3_leads.csv


### Profiling the recommended leads 

In [67]:


a = get_user_companies(path, portfolio_leads.format(3))
a 

545338bfcb8b249a9a949df9e5ff87febe7eabe501831784e6017e6a7d5b03d4
8ecbdec73b7b8c4d89eb8e9eb7caf21d16851294d4daca94302234a13504b45e
3eeab91fce96724fa85bb6f5125325f87e8d29af2cde656042c662e359b335f7
e54e3b10d13af88b1796ba477aaa355856798fa5b96bd2ed318b28aa3874cffc
ad2d9879a8b2ef5328ec3bc710c95828b4c43fb64ec9ae1fe68f4f1d572861c6
...
49338f7e64fc96aaf54b240b0830839c09e6cb7bc7b34d5d261d0df11f6a2f61
2dc2cddb1c520997189e1c786e0ac9cc0ae60de1a1a60f7994c8d94b4a875355
4c88b828740c3a82306d10db67ffa660fa399f1d6bc7fa478008d71032cd348f
64066bc083dd66c75c98234d4254d40140e7e5d98350f499330cd94234dd2dfc
5321d7a4618de003c45a72b7c1dc6505162b09ee465c135af1d70f3d216be7c7


## Future developments!

- Create weights for features  according to user historic data (evaluate low variance - meaning it is a feature that represents well its profile)!

- Add filter criterias after selecting leads, according to the users need, for example:

> `de_saude_tributaria`: character, indicator of health tax status, Green if all tax are OK, Red if none are OK

> `de_saude_rescencia`: character, indicates time of update of the most lagged input of the indicator

> `de_nivel_atividade`:character, probability of being operating, ALTA high probality, BAIXA, low probality

> `fl_st_especial`: boolean value, true if dt_situacao_especial is not null. If it is not null means that some extraordinary situation is identified by the IRS (ESPOLIO DE EMPRESARIO EMPRESA INDIVIDUAL OU EIRELI, FALIDO, EM LIQUIDACAO, LIQUIDACAO JUDICIAL, LIQUIDACAO EXTRA JUDICIAL, REGISTRO NA JUNTA COMERCIAL EM ANDAMENTO, EM LIQUID EXTRA JUDICIAL, RECUPERACAO JUDICIAL, INTERVENCAO)

In [0]:
import plotly.express as px
px.defaults.width = 1000
px.defaults.height = 500

In [63]:
user_items = get_user_companies(path, user.format(1))
cols = ['setor', 'de_ramo', 'natureza_juridica_macro']
value = 'idade_empresa_anos'  

user_companies = pd.merge(user_items[['id']], market, on='id', how='inner', left_index=True)

px.parallel_categories(user_companies , 
                       dimensions = cols,
                       color=value, 
                       color_continuous_scale=px.colors.sequential.Inferno)

In [64]:
portfolio_leads = 'portfolio{}_leads.csv'

cols = ['setor', 'de_ramo', 'natureza_juridica_macro']
value = 'idade_empresa_anos'

portfolio_leads_ = get_user_companies(path, portfolio_leads.format(1)).reset_index()
user_leads = pd.merge(portfolio_leads_[['id']], market, on='id', how='inner', left_index=True)
px.parallel_categories(user_leads.iloc[:2*len(user_companies)] , 
                       dimensions = cols,
                       color=value, 
                       color_continuous_scale=px.colors.sequential.Inferno)

In [65]:
user_items = get_user_companies(path, user.format(2))
cols = ['setor', 'de_ramo', 'natureza_juridica_macro']
value = 'idade_empresa_anos'  

user_companies = pd.merge(user_items[['id']], market, on='id', how='inner', left_index=True)

px.parallel_categories(user_companies , 
                       dimensions = cols,
                       color=value, 
                       color_continuous_scale=px.colors.sequential.Inferno)

In [67]:
portfolio_leads = 'portfolio{}_leads.csv'

cols = ['setor', 'de_ramo', 'natureza_juridica_macro']
value = 'idade_empresa_anos'

portfolio_leads_ = get_user_companies(path, portfolio_leads.format(2)).reset_index()
user_leads = pd.merge(portfolio_leads_[['id']], market, on='id', how='inner', left_index=True)
px.parallel_categories(user_leads.iloc[:len(user_companies)] , 
                       dimensions = cols,
                       color=value, 
                       color_continuous_scale=px.colors.sequential.Inferno)

In [60]:
user_items = get_user_companies(path, user.format(3))
cols = ['setor', 'de_ramo', 'natureza_juridica_macro']
value = 'idade_empresa_anos'  

user_companies = pd.merge(user_items[['id']], market, on='id', how='inner', left_index=True)

px.parallel_categories(user_companies , 
                       dimensions = cols,
                       color=value, 
                       color_continuous_scale=px.colors.sequential.Inferno)

In [62]:
portfolio_leads = 'portfolio{}_leads.csv'

cols = ['setor', 'de_ramo', 'natureza_juridica_macro']
value = 'idade_empresa_anos'

portfolio_leads_ = get_user_companies(path, portfolio_leads.format(3)).reset_index()
user_leads = pd.merge(portfolio_leads_[['id']], market, on='id', how='inner', left_index=True)
px.parallel_categories(user_leads.iloc[:2*len(user_companies)] , 
                       dimensions = cols,
                       color=value, 
                       color_continuous_scale=px.colors.sequential.Inferno)

In [38]:
portfolio_leads_

Unnamed: 0,id
0,545338bfcb8b249a9a949df9e5ff87febe7eabe5018317...
1,8ecbdec73b7b8c4d89eb8e9eb7caf21d16851294d4daca...
2,3eeab91fce96724fa85bb6f5125325f87e8d29af2cde65...
3,e54e3b10d13af88b1796ba477aaa355856798fa5b96bd2...
4,ad2d9879a8b2ef5328ec3bc710c95828b4c43fb64ec9ae...
...,...
2645,49338f7e64fc96aaf54b240b0830839c09e6cb7bc7b34d...
2646,2dc2cddb1c520997189e1c786e0ac9cc0ae60de1a1a60f...
2647,4c88b828740c3a82306d10db67ffa660fa399f1d6bc7fa...
2648,64066bc083dd66c75c98234d4254d40140e7e5d98350f4...
