In [1]:
import pandas as pd
import numpy as np

import ipywidgets as widgets
from ipywidgets import AppLayout
import IPython.display as pyDis

from context import algorithms
from algorithms.clustering.similarityCommunityDetection import SimilarityCommunityDetection
from algorithms.clustering.explainedCommunitiesDetection import ExplainedCommunitiesDetection
from algorithms.visualization.gephiVisualization import GephiVisualization

from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the user profiles (one row per user)
data_df = pd.read_csv('../../data/MNCN/user_profiles.csv')

# Data needed by the filters: positional column indexes (for DataFrame.iloc)
# of the answer columns that belong to each question.
q103_index = [*range(1, 7)]
q107a_index = [*range(7, 13)]
q107b_index = [*range(13, 16)]
q110a_index = [*range(16, 18)]
# q110b takes every remaining column up to the end of the frame.
# NOTE(review): if the CSV stores non-answer columns (e.g. the School / Grade /
# Type / Zone columns read later in this notebook) after position 18, they are
# included here as well -- confirm the column order of the file.
q110b_index = [*range(18, data_df.shape[1])]

# Questions: question id -> column positions of its answers
questions = dict(
    q103=q103_index,
    q107a=q107a_index,
    q107b=q107b_index,
    q110a=q110a_index,
    q110b=q110b_index,
)

# Answer dictionary: readable text for every answer column, keyed as
# '<question id>_<option number>'. The texts are grouped per question and the
# flat lookup table is derived from them, preserving the declaration order.
_answer_texts = {
    'q103': (
        'Reduce shower time',
        'Buying less clothes',
        'Do not use products with a lot of packaging',
        'Walking to more places',
        'Reduce my waste generation',
        'Recycle correctly',
    ),
    'q107a': (
        'Transport: By car',
        'Transport: Walking',
        'Transport: By bike',
        'Transport: By bus',
        'Transport: By underground',
        'Transport: By En scooter',
    ),
    'q107b': (
        'I would be willing to change means of transport',
        'I would not be willing to change means of transport',
        'Perhaps I would be willing to change means of transport',
    ),
    'q110a': (
        'I had an exotic pet',
        'I did not have an exotic pet',
    ),
    'q110b': (
        'Adopted: florida tortoise',
        'Adopted: common dog',
        'Adopted: common cat',
        'Adopted: Argentine parrot',
        'Adopted: cockatoo',
        'Adopted: capuchin monkey',
    ),
}

answers = {
    '{}_{}'.format(question, option): text
    for question, texts in _answer_texts.items()
    for option, text in enumerate(texts)
}

In [3]:
# Notebook-level placeholder. No function visible in this notebook declares
# `global distances`, so nothing shown here rebinds this name; it stays None.
distances = None

In [4]:
def visualize_in_Gephi(communities, indexes=None):
    """Send every user, tagged with its community, to the Gephi workspace 'mncn-1'.

    Parameters
    ----------
    communities : dict
        Mapping whose values are the community labels. The values are written
        positionally into ``data_df['community']``, so they must be in the same
        order (and number) as the rows of ``data_df``.
    indexes : list of int, optional
        Positional column indexes of ``data_df`` used to compute the pairwise
        cosine similarity. When omitted, the columns of every question in
        ``questions`` are used.

    Side effects
    ------------
    Adds / overwrites the ``community`` column of the notebook-level ``data_df``.
    """
    if indexes is None:
        # The previous body read a bare `indexes` name that no notebook-level
        # cell defines, which raised NameError. Default to all question columns.
        indexes = [col for cols in questions.values() for col in cols]

    gv = GephiVisualization(workspace='mncn-1')

    # Materialise the dict view so pandas receives a plain list-like.
    data_df['community'] = list(communities.values())
    users = data_df[['UserId', 'School', 'Grade', 'Type', 'Zone', 'community']].values

    # cosine_similarity returns similarities (not distances) between user rows.
    similarities = cosine_similarity(data_df.iloc[:, indexes].values)
    gv.load_community(users, similarities, users_properties=['School', 'Grade', 'Type', 'Zone', 'community'])

In [5]:
def search_communities(data, percentage, indexes):
    """Detect communities, print their explanation and send them to Gephi.

    Parameters
    ----------
    data : pandas.DataFrame
        Answer columns handed to ``ExplainedCommunitiesDetection``.
    percentage : float
        Forwarded as ``percentage`` to ``search_all_communities``.
    indexes : list of int
        Positional column indexes of ``data_df`` used to compute the pairwise
        cosine similarity of the users that are sent to Gephi.

    Side effects
    ------------
    Prints a summary of every community with more than one member, adds /
    overwrites the ``community`` column of the notebook-level ``data_df`` and
    loads the remaining users into the Gephi workspace 'mncn-1'.
    """
    # Apply the algorithm to detect communities
    community_detection = ExplainedCommunitiesDetection(data, SimilarityCommunityDetection, 'cosine')
    n_communities, users_communities = community_detection.search_all_communities(
        answer_binary=True, percentage=percentage)

    # Explain communities
    users_without_community = []
    for c in range(n_communities):
        community_data = community_detection.get_community(c, answer_binary=True)
        members = community_data['members']

        # Communities with a single member are only counted, not described.
        if len(members) <= 1:
            users_without_community.extend(members)
            continue

        print('---------------------')
        print('COMMUNITY -', community_data['name'])
        print('\t- N. Members:', len(members))
        print('\t- Properties:')

        for k in community_data['properties'].keys():
            # Fall back to the raw key when no readable text is registered,
            # instead of aborting the whole report with a KeyError.
            print('\t\t-', answers.get(k, k))

    print('---------------------')
    print('N. USERS WITHOUT COMMUNITY -', len(users_without_community))

    # Add the detected communities to the data
    data_df['community'] = list(users_communities.values())

    # Filter out the communities with fewer than 2 users
    community_sizes = data_df.groupby(by='community')['UserId'].count()
    groups_to_filter = community_sizes.index.values[(community_sizes < 2).values]
    users_out = list(data_df[data_df['community'].isin(groups_to_filter)]['UserId'].values)

    # Prepare the user and similarity data
    filtered_data_df = data_df[~data_df['UserId'].isin(users_out)]
    if filtered_data_df.empty:
        # cosine_similarity rejects an input with zero rows; stop with a clear message.
        print('No community with at least 2 users - nothing to send to Gephi.')
        return

    users = filtered_data_df[['UserId', 'School', 'Grade', 'Type', 'Zone', 'community']].values
    # Select the columns on the already filtered frame. The previous code fed
    # index *labels* to the positional indexer of data_df, which is only
    # correct while data_df keeps a default 0..n-1 index.
    similarities = cosine_similarity(filtered_data_df.iloc[:, indexes])

    # Draw in Gephi
    # NOTE(review): the recorded notebook output ends with
    # "HandshakeError: Invalid response status: b'404'", presumably raised while
    # connecting to Gephi -- confirm the 'mncn-1' workspace exists and is reachable.
    gv = GephiVisualization(workspace='mncn-1')
    gv.load_community(users, similarities, users_properties=['School', 'Grade', 'Type', 'Zone', 'community'])

In [6]:
def btn_event(obj):
    """Click handler of the search button: run the community search on the selected questions.

    Parameters
    ----------
    obj
        Argument supplied by ``button.on_click``; not used.

    Reads the selected questions from ``questions_wid`` and the threshold from
    ``percentage_wid``, then delegates to ``search_communities``.
    """
    # Collect the column positions of the selected questions
    indexes = [col for q in questions_wid.value for col in questions[q]]

    if not indexes:
        # Without any selected question the filtered dataset has no columns,
        # so there is nothing to cluster; report it instead of failing downstream.
        print('No questions selected - select at least one question.')
        return

    # Filter the dataset, keeping only the requested questions
    data = data_df.iloc[:, indexes]
    print(data)

    search_communities(data, percentage_wid.value, indexes)

In [7]:
# Build the user interface

# Multi-selection list with the available questions.
# `description` / `disabled` are the trait names; the previous `descriptions` /
# `disable` spellings did not match them, so the label was never applied.
questions_wid = widgets.SelectMultiple(
    options=list(questions.keys()),
    description='Seleccionar preguntas',
    disabled=False
)

# Value forwarded as `percentage` to the community search.
percentage_wid = widgets.FloatSlider(
    value=0.94,
    min=0.0,
    max=1.0,
    step=0.01,
    description='Min % respuestas comunes'
)

button = widgets.Button(
    description = 'Buscar'
)

# Run the search every time the button is clicked.
button.on_click(btn_event)

# `header` is the AppLayout slot name (previously misspelled as `hader`).
AppLayout(header=None, left_sidebar=questions_wid, center=None, right_sidebar=percentage_wid, footer=button)

AppLayout(children=(Button(description='Buscar', layout=Layout(grid_area='footer'), style=ButtonStyle()), Sele…

    q103_0  q103_1  q103_2  q103_3  q103_4  q103_5  q107a_0  q107a_1  q107a_2  \
0      1.0     0.0     1.0     0.0     0.0     1.0      1.0      0.0      0.0   
1      0.0     0.0     1.0     1.0     0.0     1.0      1.0      0.0      0.0   
2      0.0     0.0     1.0     1.0     0.0     1.0      1.0      0.0      0.0   
3      1.0     0.0     1.0     1.0     0.0     0.0      0.0      1.0      0.0   
4      1.0     0.0     1.0     1.0     0.0     0.0      0.0      1.0      0.0   
..     ...     ...     ...     ...     ...     ...      ...      ...      ...   
56     1.0     1.0     0.0     1.0     0.0     0.0      0.0      0.0      0.0   
57     1.0     0.0     1.0     0.0     0.0     1.0      0.0      0.0      0.0   
58     0.0     1.0     0.0     0.0     0.0     0.0      0.0      0.0      0.0   
59     1.0     0.0     0.0     1.0     1.0     0.0      0.0      0.0      0.0   
60     0.0     0.0     0.0     0.0     0.0     0.0      0.0      1.0      0.0   

    q107a_3  q107a_4  q107a

HandshakeError: Invalid response status: b'404' b'Not Found'