In [1]:
import os
import pandas as pd
import numpy as np
import networkx as nx
import scipy.stats
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import itertools
import time
import umap
import paths


## FLUJO:

### Procesamiento

Inputs: 
        Dataframe de interacción (Playlists, id)
        Dataframe de información (id, nombre)
Procesamiento:
        Usando las funciones de muestreo se descartan las interacciones (playlists) con demasiados o muy pocos elementos, conservando los percentiles intermedios (sample_middle_users)
        Posteriormente se descartan los elementos con demasiadas o muy pocas interacciones, conservando los percentiles intermedios (sample_middle_elements)
        El proceso se encuentra empaquetado en la función particular_elements_sample, que retorna el diccionario elements_map
        keys: elementos
        Values: Usuarios que lo evaluaron bien (playlists que contienen la canción)

### Construcción de grafo

La clase Cover contiene todos los elementos para realizar la filtración de Steinhaus:
    Las coberturas serán el diccionario elements_map, es decir, la cobertura de un elemento será el conjunto de usuarios que evaluaron bien el elemento (la cobertura de una canción es el conjunto de playlists que la contienen)
    La distancia entre dos elementos es la distancia de Jaccard entre sus coberturas
    Este proceso está empaquetado en el método build, que retorna los símplices (lista de conjuntos de elementos, cada uno con la distancia de Jaccard entre sus coberturas)
    
Finalmente el grafo se construye con la función Build Graph:
    La distancia entre los elementos (tiempo que tardan en aparecer en la filtración) será el valor de las aristas
    El id de los elementos serán nodos
    
### Procesamiento de caminos

Esto se encuentra en el módulo paths, particularmente en la función most_stable. En las funciones de utilidad se encuentran element_names_in_path y print_names_in_path, que convierten los id a nombres de elemento con base en el dataframe de información

### Loaders

Se agregan funciones utilizadas para cargar los dataframes utilizados



## Steinhaus Filtration Functions

In [None]:


#from cechmate import Cover
class BaseFiltration:
    """Common base for filtrations; only stores the maximum simplex dimension."""

    def __init__(self, max_dim=3):
        # Largest simplex dimension to build (a d-simplex has d + 1 vertices).
        self.max_dim = max_dim


class Cover(BaseFiltration):
    """
    Class that makes the filtration for the graph.

    Every element owns a "cover" (a collection of hashable items). A simplex
    on a group of elements is born at the Jaccard distance of their covers.

    Attributes set by ``build``:
        covers:    the dict of covers that was used.
        simplices: the list returned by ``build``.
        dists:     every distance computed during the last ``build`` call.
        d:         the last distance computed, or None if none was computed.
    """

    def __init__(self, max_dim=3):
        """
        max_dim: max dim of simplices
        """
        super().__init__(max_dim)
        self.covers = None
        self.simplices = None
        self.dists = []
        self.d = None

    def build(self, covers: dict):
        """
        Input: Dictionary mapping each element to its cover. Any other
               iterable of covers is accepted too and is keyed by position.
        Returns: Simplices, as a list of (simplex, distance) pairs. Vertices
                 are ([key], 0.0); higher simplices are (tuple_of_keys, d)
                 and are only included when d < 1.
        """
        # Give each cover element a name.
        if not isinstance(covers, dict):
            covers = dict(enumerate(covers))

        # Convert every cover to a set once, instead of once per combination.
        cover_sets = {key: set(members) for key, members in covers.items()}

        simplices = [([key], 0.0) for key in covers]

        # Start from a clean slate so repeated calls do not mix their results.
        self.dists = []
        last_dist = None

        # TODO: be more intelligent about which combos we check

        # A simplex of dimension max_dim has max_dim + 1 vertices, so the
        # group size must run up to max_dim + 1 inclusive: range(2, max_dim + 2).
        for size in range(2, self.max_dim + 2):
            for potentials in itertools.combinations(covers, size):
                dist = self.jaccard([cover_sets[key] for key in potentials])
                self.dists.append(dist)
                last_dist = dist

                # TODO: Do we want to include all of these simplices as well?
                if dist < 1:
                    simplices.append((potentials, dist))

        self.covers = covers
        self.simplices = simplices
        # Stays None when no combination was evaluated (e.g. fewer than two
        # covers); previously this raised UnboundLocalError.
        self.d = last_dist

        return simplices

    def jaccard(self, covers):
        """
        Input: covers (a sequence of collections of hashable items)
        Returns: Jaccard Distance between covers, i.e.
                 1 - |intersection| / |union|. When the union is empty
                 (no covers, or all covers empty) 1.0 is returned: nothing
                 is shared, so the group is treated as maximally distant.
        """
        # Reuse sets that are already sets to avoid copying them.
        cover_sets = [
            cover if isinstance(cover, (set, frozenset)) else set(cover)
            for cover in covers
        ]

        union = set().union(*cover_sets)
        if not union:
            # Avoids ZeroDivisionError on empty covers.
            return 1.0

        intersection = cover_sets[0].intersection(*cover_sets[1:])
        return 1 - len(intersection) / len(union)



## Sample of interaction matrix functions

In [2]:
def sample_middle_elements(ratings: pd.DataFrame, lower=0.5, upper=0.95):
    """
    Keep only the elements whose number of interactions lies between two
    quantiles of the per-element interaction count.

    Input:
        ratings: interaction dataframe with an 'id' column (element) and a
                 'pid' column; one row per interaction.
        lower, upper: quantiles in [0, 1]; both bounds are inclusive.
    Returns: dict mapping element id -> list of pids, restricted to the
             elements inside the range.
    """
    movies_map = {name: list(pids) for name, pids in ratings.groupby('id')['pid']}

    movies_ratings_n = [len(pids) for pids in movies_map.values()]
    # dtype=float keeps the quantile well defined (NaN) for an empty input.
    lower_bound, upper_bound = (
        pd.Series(movies_ratings_n, dtype=float).quantile([lower, upper]).values
    )
    # The bounds themselves are kept, so only strictly-outside counts are removed.
    print(f"Remove movies with # ratings < {lower_bound} and # ratings > {upper_bound}")

    movies_map = {
        name: pids
        for name, pids in movies_map.items()
        if lower_bound <= len(pids) <= upper_bound
    }
    print(f"Was {len(movies_ratings_n)}, now {len(movies_map)} movies in the range.")

    return movies_map

def sample_middle_users(ratings: pd.DataFrame, lower=0.6, upper=0.95):
    """
    Keep only the users whose number of interactions lies between two
    quantiles of the per-user interaction count.

    Input:
        ratings: interaction dataframe with a 'pid' column (user) and an
                 'id' column (element); one row per interaction.
        lower, upper: quantiles in [0, 1]; both bounds are inclusive.
    Returns: dict mapping pid -> list of element ids, restricted to the
             users inside the range.
    """
    users_map = {name: list(ids) for name, ids in ratings.groupby('pid')['id']}

    users_ratings_n = [len(ids) for ids in users_map.values()]
    # dtype=float keeps the quantile well defined (NaN) for an empty input.
    lower_bound, upper_bound = (
        pd.Series(users_ratings_n, dtype=float).quantile([lower, upper]).values
    )
    # The bounds themselves are kept, so only strictly-outside counts are removed.
    print(f"Remove users with # ratings < {lower_bound} and # ratings > {upper_bound}")

    users_map = {
        name: ids
        for name, ids in users_map.items()
        if lower_bound <= len(ids) <= upper_bound
    }
    print(f"Was {len(users_ratings_n)}, now {len(users_map)} users in the range.")

    return users_map

def particular_elements_sample(ratings: pd.DataFrame, elements_info: pd.DataFrame,
                               genre_to_use='hard rock', n=15000, sample=True, genre=True):
    """
    Sample and filter the elements to use down to ~n.

    Steps: keep middle-percentile users, keep only elements present in
    elements_info, keep middle-percentile elements, optionally keep a single
    genre, and optionally sub-sample at random to about n elements.

    Input:
        ratings: interaction dataframe with 'pid' (user) and 'id' (element)
                 columns.
        elements_info: information dataframe; needs an 'id' column, and a
                       'genres' string column when genre filtering is on.
        genre_to_use: pattern looked up in 'genres' (passed to str.contains).
        n: approximate number of elements to keep when sample is on.
        sample: whether to randomly sub-sample the elements.
        genre: whether to filter by genre_to_use.
    Returns: dict mapping element id -> list of pids.

    Note: this reseeds numpy's global random generator with 1.
    """
    # limit to only middle percentile users
    users_map = sample_middle_users(ratings)
    ok_users = list(users_map.keys())
    # users_map is keyed by 'pid', so the filter has to be on the 'pid' column
    # (filtering 'id' compared element ids against user ids).
    sub_ratings = ratings[ratings['pid'].isin(ok_users)]

    # Filter only particular movies
    print(f"--{len(sub_ratings)} movie reviews after filtering users")
    sub_ratings = sub_ratings[sub_ratings['id'].isin(elements_info['id'])]
    print(f"--{len(sub_ratings)} movie reviews after filtering good movies")

    # Get movies with middle percentiles
    movies_map = sample_middle_elements(sub_ratings)
    print(len(movies_map))
    print("Sample movies")

    # then randomly sample the movies
    np.random.seed(1)

    if genre:
        # na=False: rows without genre information simply do not match.
        genre_mask = elements_info['genres'].str.contains(genre_to_use, na=False)
        # A set gives constant-time membership tests.
        genre_ids = set(elements_info.loc[genre_mask, 'id'])
        movies_map = {m: l for m, l in movies_map.items() if m in genre_ids}

    # Nothing to sample (and no division by zero) when no element is left.
    if sample and movies_map:
        rate = n / len(movies_map)
        print(rate)

        movies_map = {m: l for m, l in movies_map.items() if np.random.random() <= rate}

    print(f"Resulting dataset has {len(movies_map)} movies")
    return movies_map

## Graph generation functions

In [None]:
def thresh_filt(filtration, thresh):
    """Return the filtration entries whose value at index 1 is strictly below thresh."""
    kept = []
    for entry in filtration:
        if entry[1] < thresh:
            kept.append(entry)
    return kept
    
def build_graph(filtration, thresh=None):
    """ Build a networkx graph out of the 1-skeleton in the filtration

    filtration: iterable of (simplex, value) entries. Entries whose simplex
                has one vertex become nodes; entries with two vertices become
                edges carrying the value in the 'dist' attribute. Larger
                simplices are ignored.
    thresh: when not None, only entries with value < thresh are used.
            Only None disables the threshold, so thresh=0 is honoured.
    Returns: networkx.Graph
    """
    if thresh is not None:
        filtration = thresh_filt(filtration, thresh)
    else:
        # Materialize once: the entries are traversed twice below.
        filtration = list(filtration)

    vertices = [s[0][0] for s in filtration if len(s[0]) == 1]
    # (u, v, attribute_dict) triples let add_edges_from set 'dist' directly.
    edges = [(s[0][0], s[0][1], {'dist': s[1]}) for s in filtration if len(s[0]) == 2]

    g = nx.Graph()
    g.add_nodes_from(vertices)
    g.add_edges_from(edges)

    return g

## Utility functions

In [None]:
def plot_pareto2(most_stable, all_stable_paths, save=False):
    """
    Scatter path length against path instability and draw the optimal paths.

    most_stable: dict mapping a path length to an entry whose index 1 is the
                 instability of the optimal path of that length.
    all_stable_paths: dict mapping a path length to a list of entries whose
                      index 1 is the instability of each candidate path.
    save: falsy to skip saving, otherwise the target passed to savefig.
    """
    # Local import: MaxNLocator was used without ever being imported.
    from matplotlib.ticker import MaxNLocator

    # reshape(-1, 2) keeps the arrays two-dimensional even when they are empty.
    unoptimal_paths = np.array(
        [(n, s[1]) for n, ps in all_stable_paths.items() for s in ps], dtype=float
    ).reshape(-1, 2)
    optimal_paths = np.array(
        [(n, ps[1]) for n, ps in most_stable.items()], dtype=float
    ).reshape(-1, 2)

    fig, ax = plt.subplots(figsize=(20, 10))
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    # Size the tick labels on this axes only, instead of changing the global
    # matplotlib configuration for every later figure.
    ax.tick_params(axis='both', labelsize=20)

    cmap = plt.get_cmap('tab10')
    ax.scatter(unoptimal_paths[:, 0], unoptimal_paths[:, 1], color=cmap(1), label='Nonoptimal paths')
    ax.scatter(optimal_paths[:, 0], optimal_paths[:, 1], label='Optimal paths')
    ax.plot(optimal_paths[:, 0], optimal_paths[:, 1], label='Pareto frontier')

    ax.set_xlabel("Length of path")
    ax.set_ylabel("Instability of path")

    ax.legend()
    if len(optimal_paths):
        # One tick per integer length covered by the optimal paths.
        lengths = optimal_paths[:, 0]
        ax.set_xticks(range(int(lengths.min()), int(lengths.max()) + 1))
    if save:
        fig.savefig(save)
        
        
def element_names_in_path(ms, elements_info):
    """
    Translate the element ids of a path into track names.

    ms: path entry whose first item is the sequence of element ids.
    elements_info: dataframe with 'id' and 'track_name' columns.
    Returns: list with the track name of the first row matching each id,
             in path order.
    """
    names = []
    for element_id in ms[0]:
        matching_rows = elements_info.loc[elements_info['id'] == element_id]
        names.append(matching_rows['track_name'].values[0])
    return names


def print_names_in_path(pms, elements_info=None):
    """
    Print every path in pms, one element per line, under a header.

    pms: dict mapping a key (printed in the header) to a path entry whose
         first item is the sequence of element ids.
    elements_info: optional information dataframe. When given, ids are
                   translated with element_names_in_path; when None the raw
                   ids are printed.
    """
    for i, path in pms.items():
        print("\n {0} elements path: \n".format(i))
        if elements_info is None:
            names = path[0]
        else:
            names = element_names_in_path(path, elements_info)
        # str() so non-string ids or names do not break the join.
        print("\n".join(str(name) for name in names))
    

## Loaders

In [None]:
def load_ratings(path, csv):
    """
    Load and preprocess ratings dataframe (interaction matrix).

    Reads the comma-delimited file `csv` located in directory `path` and
    drops its 'Unnamed: 0' column.
    """
    csv_file = os.path.join(path, csv)
    return pd.read_csv(csv_file, sep=",").drop(columns=['Unnamed: 0'])

def load_info(path, csv):
    """Load and preprocess information dataframe """
    info = pd.read_csv(os.path.join(path_to_ml_20m, csv), delimiter=",")
    info = info.dropna()
    info = info.drop(columns = 'id.1')