In [None]:
# Identifier column name(s) — presumably the key column of the data frames
# loaded below; TODO confirm (not referenced by the cells shown here).
ID_COLS = ['rut']

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

### Data

In [None]:
# Input tables read from the local data/ directory.
# NOTE(review): unpickling can execute arbitrary code; only load these
# pickle files if they come from a trusted source.
X_seed = pd.read_pickle("data/X_seed.pkl")
X_poblacion = pd.read_pickle("data/X_poblacion.pkl")
# Scores table (the cells below read its 'score' column).
df_scores = pd.read_csv('data/df_scores.csv')
# Feature-importance table (the cells below read its 'feature' column).
df_importance = pd.read_csv('data/df_importance.csv')

### Feature importance

In [None]:
# Preview the first ten rows of the feature-importance table.
df_importance.head(10)

### Distribución del score

Analizamos la distribución del score en la población

In [None]:
# Keep the 'score' column on its own for the distribution cells that follow.
scores = df_scores['score']

In [None]:
# Summary statistics of the score column.
scores.describe()

In [None]:
# Number of scores falling in each of 10 equal-width bins, listed in bin
# order rather than by frequency.
scores.value_counts(bins=10, sort=False)

In [None]:
# Same 10 equal-width bins as above, expressed as a share of all scores.
scores.value_counts(bins=10, sort=False, normalize=True)

In [None]:
# Distribution plot of the scores.
# NOTE(review): distplot is deprecated in seaborn >= 0.11; kept as-is because
# the notebook appears to target an older seaborn API.
sns.distplot(scores)

In [None]:
# The same option dict is handed to both the histogram and the KDE layers so
# that each of them is drawn cumulatively.
kwargs = dict(cumulative=True)
sns.distplot(scores, kde_kws=kwargs, hist_kws=kwargs)

### Cantidad de clones según el score

Visualizamos cuántos clones obtendríamos para cada posible punto de corte del score (*score umbral*)

In [None]:
def plot_score_clones(df_scores):
    """
    Plot how many clones each candidate score threshold would yield.

    Clones are the highest-scored records (the same convention used by
    ``score_umbral`` and ``long_data``), so the count for a threshold is the
    number of records whose score is greater than or equal to it.

    Parameters
    ----------
    df_scores : pandas.DataFrame
        Table with a 'score' column.

    Returns
    -------
    The axes object returned by ``sns.lineplot``.
    """
    # Rank from highest to lowest score: the running position of a row is
    # then the number of records scoring at least as high as that row.
    df_plt = df_scores.sort_values('score', ascending=False)
    df_plt['clones'] = np.arange(1, df_plt.shape[0] + 1)
    # Tied scores share one threshold; keep the last (largest) count of each.
    df_plt = df_plt.groupby('score').tail(1)
    ax = sns.lineplot(data=df_plt, x='score', y='clones', ci=None)
    plt.title('Clones segun umbral de score')
    return ax

In [None]:
# Draw the clones-versus-threshold curve for the loaded scores.
plot_score_clones(df_scores)

Definimos qué umbral de score se corresponde con una cantidad determinada de clones.

In [None]:
def score_umbral(df_scores, n_clones):
    """Return the threshold score that corresponds to n_clones.

    The 'score' column is ordered from highest to lowest and the last value
    among the first ``n_clones`` entries is returned, i.e. the lowest score
    that is still inside the selection.
    """
    ranked = df_scores['score'].sort_values(ascending=False)
    selected = ranked.iloc[:n_clones].tolist()
    return selected[-1]

In [None]:
# Report the score threshold that yields the desired number of clones.
n_clones = 25_000
print(f'{n_clones} clones: score = {score_umbral(df_scores, n_clones)}')

### Distribuciones comparadas

Visualizamos las distribuciones de los atributos entre semilla-clones-resto, para una cantidad de clones determinada.
En este caso lo hacemos para los atributos más importantes en los clasificadores bagging.

In [None]:
# Clone budget used for the group comparisons in this section.
n_clones = 25_000
# Feature names taken from the first six rows of the importance table
# (presumably already ordered by importance — verify upstream).
top_features = df_importance['feature'].iloc[:6].tolist()
print(top_features)

#### Estadísticos

In [None]:
def long_data(X_poblacion, X_seed, df_scores, n_clones, feature_name):
    """
    Build a long-format table (grupo, variable, value) for the given features.

    Population rows are ranked by descending score; the first ``n_clones``
    are labelled 'clones' and the remaining ones 'resto'. Every seed row is
    labelled 'semilla'. The selected feature columns are then melted into
    variable/value pairs.

    Parameters
    ----------
    X_poblacion : pandas.DataFrame
        Population features. It is joined column-wise with
        ``df_scores['score']``, so both are aligned on their index.
    X_seed : pandas.DataFrame
        Seed features. Not modified by this function.
    df_scores : pandas.DataFrame
        Table with a 'score' column.
    n_clones : int
        Number of top-scored population rows labelled as clones.
    feature_name : str or list of str
        Feature column(s) to keep in the output.

    Returns
    -------
    pandas.DataFrame
        Columns 'grupo', 'variable' and 'value'. 'grupo' is categorical with
        categories in the order semilla, clones, resto.
    """
    # Accept a single column name as well as a collection of names.
    if isinstance(feature_name, str):
        features = [feature_name]
    else:
        features = list(feature_name)

    df_poblacion = pd.concat(
        [X_poblacion, df_scores['score']], axis=1
    ).sort_values('score', ascending=False)
    # 1-based position in the descending-score ranking.
    rank = np.arange(1, df_poblacion.shape[0] + 1)
    df_poblacion['grupo'] = np.where(rank <= n_clones, 'clones', 'resto')

    # assign() returns a new frame, so the caller's X_seed is left untouched.
    df_seed = X_seed.assign(score=np.nan, grupo='semilla')

    df_tmp = pd.concat([df_poblacion, df_seed])[['grupo'] + features]
    df_long = pd.melt(df_tmp, id_vars='grupo', var_name='variable', value_name='value')
    df_long['grupo'] = pd.Categorical(
        df_long['grupo'], categories=['semilla', 'clones', 'resto']
    )
    return df_long

In [None]:
# Long-format table for the top features, then descriptive statistics of
# 'value' for every (variable, grupo) pair.
df_tmp = long_data(X_poblacion, X_seed, df_scores, n_clones, top_features)
df_tmp.groupby(['variable', 'grupo']).describe()

#### Densidades

In [None]:
def densities_lookalike(X_poblacion, X_seed, df_scores, n_clones, feature_name):
    """
    Draw per-variable distribution plots, one layer per group, for n_clones.

    The inputs are reshaped with ``long_data`` and laid out as one facet per
    variable (three facets per row), coloured by group in the fixed order
    semilla / clones / resto. Neither axis is shared between facets.

    Parameters
    ----------
    X_poblacion, X_seed, df_scores, n_clones, feature_name
        Forwarded unchanged to ``long_data``.

    Returns
    -------
    seaborn.FacetGrid
        The grid holding the plots, with its legend added.
    """
    df_plt = long_data(X_poblacion, X_seed, df_scores, n_clones, feature_name)
    g = sns.FacetGrid(
        df_plt, col='variable', hue='grupo',
        hue_order=['semilla', 'clones', 'resto'],
        sharex=False, sharey=False, col_wrap=3,
    )
    g.map(sns.distplot, 'value')
    g.add_legend()
    return g

In [None]:
# Distribution plots of the top features, split by group.
densities_lookalike(X_poblacion, X_seed, df_scores, n_clones, top_features)

#### Boxplots

In [None]:
def boxplots_lookalike(X_poblacion, X_seed, df_scores, n_clones, feature_name):
    """
    Draw per-variable boxplots, one box per group, for the given n_clones.

    The inputs are reshaped with ``long_data`` and drawn as one facet per
    variable (three facets per row) with the group on the x axis. Outliers
    are hidden and the y axis is not shared between facets.

    Parameters
    ----------
    X_poblacion, X_seed, df_scores, n_clones, feature_name
        Forwarded unchanged to ``long_data``.

    Returns
    -------
    The grid object returned by ``sns.catplot``.
    """
    df_plt = long_data(X_poblacion, X_seed, df_scores, n_clones, feature_name)
    g = sns.catplot(
        x='grupo', y='value', col='variable',
        data=df_plt, kind='box', showfliers=False,
        sharey=False, col_wrap=3,
    )
    g.add_legend()
    return g

In [None]:
# Boxplots of the top features, split by group.
boxplots_lookalike(X_poblacion, X_seed, df_scores, n_clones, top_features)