In [None]:
# Install
!pip install prince -q

In [None]:
#Setup
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

import itertools
import re
import unidecode
import math
from IPython.display import display

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from prince import MFA

# Show every column / row and never truncate cell contents when displaying frames.
# The option names are fully qualified on purpose: a bare 'max_columns' or
# 'max_rows' is treated as a pattern, and on pandas >= 1.4 it also matches the
# 'styler.render.*' options, which makes set_option raise OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [None]:
# General functions
def select_columns(dataset, app_dict_cols, to_dummy=True):
    """Build a 0/1 indicator matrix for the selected survey questions.

    Relies on the module-level ``all_questions_dict`` to know, for every
    question key, its ``'type'`` (``'multiple'`` or ``'unique'``) and its
    ``'columns'`` mapping.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Survey answers, one row per respondent, with the raw question columns.
    app_dict_cols : dict
        Maps a question key (e.g. ``'Q7'``) to the short dimension name used as
        column prefix (e.g. ``'lang'``).
    to_dummy : bool, default True
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``dataset``; columns are named ``'<dimension>__<Option>'``
        and hold integer 0/1 flags. Multiple-choice columns come first.
    """
    multiple_choice = {q: name for q, name in app_dict_cols.items()
                       if all_questions_dict[q]['type'] == 'multiple'}
    unique_choice = {q: name for q, name in app_dict_cols.items()
                     if all_questions_dict[q]['type'] == 'unique'}

    # Multiple choice questions: one raw column per option; a non-null cell
    # means the option was selected.
    renamer = {}
    for question, name in multiple_choice.items():
        for column, option in all_questions_dict[question]['columns'].items():
            new_name = name
            if option:
                # "Some option (details)" -> "SomeOption"
                new_name += '__' + option.lower().strip().title().replace(' ', '').split('(')[0]
            renamer[column] = new_name

    # One vectorised cast gives real integer columns on every pandas version.
    multiple_df = dataset.loc[:, list(renamer)].notnull().astype(int).rename(columns=renamer)

    # Single choice questions: normalise the answer text the same way, then
    # one-hot encode it.
    single_raw = dataset.loc[:, list(unique_choice)].rename(columns=unique_choice).copy()
    for c in single_raw.columns.tolist():
        single_raw[c] = (
            single_raw[c].str.split('(', expand=True).iloc[:, 0]
            .str.lower().str.strip().str.title().str.replace(' ', '', regex=False)
        )

    if single_raw.shape[1] > 0:
        # dtype=int keeps the flags numeric (newer pandas defaults to bool)
        single_df = pd.get_dummies(single_raw, prefix_sep='__', dtype=int)
    else:
        # Nothing to encode: keep an empty frame with the right index
        single_df = single_raw

    # Both parts are selected from `dataset`, so they already share its index;
    # no global frame is needed to align them.
    final_df = pd.concat([multiple_df, single_df], axis=1)

    return final_df


def complete_nones(original_df):
    """Flag the ``None`` option for users who ticked nothing in a dimension.

    Columns are grouped by the prefix before ``'__'``. For every group that has
    a ``'<prefix>__None'`` column, rows whose group total is 0 get that column
    set to 1. Groups without a ``None`` column are left untouched.

    Returns a modified copy; ``original_df`` itself is not changed.
    """
    # dimension prefix -> list of its indicator columns
    columns_by_dim = {}
    for column in original_df.columns.tolist():
        columns_by_dim.setdefault(column.split('__')[0], []).append(column)

    new_df = original_df.copy()

    for dim, members in columns_by_dim.items():
        none_col = dim + '__None'
        if none_col not in new_df.columns.tolist():
            continue

        # Index labels of the rows where no option of this dimension is set
        row_totals = original_df.loc[:, members].sum(axis=1)
        unanswered = row_totals.index[row_totals == 0].tolist()

        new_df.loc[new_df.index.isin(unanswered), none_col] = 1

    return new_df


def test_cluster_size(dataset, to_print=False):
    """Fit K-Means for k = 1..24 and report the inertia of each fit.

    Parameters
    ----------
    dataset : array-like
        Feature matrix handed to ``KMeans.fit``.
    to_print : bool, default False
        When true, prints k every time it is a multiple of 5 (progress trace).

    Returns
    -------
    pandas.DataFrame
        Columns ``cluster_size`` and ``inertia``, one row per k.
    """
    sizes = []
    scores = []
    for n_clusters in range(1, 25):
        if to_print and n_clusters % 5 == 0:
            print(n_clusters)
        fitted = KMeans(n_clusters=n_clusters, random_state=42)
        fitted.fit(dataset)
        sizes.append(n_clusters)
        scores.append(fitted.inertia_)

    return pd.DataFrame({'cluster_size': sizes, 'inertia': scores})
    
    
def mfa_method(n_components, dataset, groups):
    """Run prince's MFA on ``dataset`` and return the transformed result.

    ``groups`` and ``n_components`` are forwarded to the ``MFA`` constructor;
    the number of iterations (5) and the random seed (42) are fixed.
    """
    reducer = MFA(groups=groups, n_components=n_components, n_iter=5, random_state=42)
    components = reducer.fit_transform(dataset)
    return components
    

def str_normalize(x):
    """Turn a name into a join key.

    The text is transliterated to ASCII with ``unidecode``, cut at the first
    comma, trimmed, lower-cased, and inner spaces are replaced by underscores
    (e.g. ``'Iran, Islamic Republic of...'`` -> ``'iran'``).

    Parameters
    ----------
    x : str
        Name to normalise.

    Returns
    -------
    str
        The normalised key.
    """
    # Trim *before* replacing spaces: once spaces are underscores, strip() can no
    # longer remove the padding around the name.
    head = unidecode.unidecode(x).split(',')[0].strip()
    return head.lower().replace(' ', '_')

In [None]:
# Data Load and Prep

## Survey Info

### Load raw data and dump it without question names
# The first data row of the raw file holds the question text (used below to build
# questions_dict). That row and the first column are dropped, and the file is
# written out and read again so pandas parses the answers without the text row.
survey_raw_df = pd.read_csv('/kaggle/input/kaggle-survey-2020/kaggle_survey_2020_responses.csv')
survey_raw_df.iloc[1:, 1:].to_csv('/kaggle/working/kaggle_survey_2020_responses_no_header.csv', index=False)

### Load data with question reference
survey_df = pd.read_csv('/kaggle/working/kaggle_survey_2020_responses_no_header.csv')

### Generate dataframe with main informations of each question
# column name -> question text (first data row of the raw file)
questions_dict = {k: v[0] for k, v in survey_raw_df.head(1).to_dict().items()}

#### Questions with multiple choices
# Select columns with exactly the markers that are parsed below, so that every
# selected column yields a question key (no unbound or stale `question_key`).
select_choice = [k for k in questions_dict if '_Part' in k or '_OTHER' in k]

select_choice_dict = {}
for k in select_choice:
    marker = '_Part' if '_Part' in k else '_OTHER'
    question_key = k.split(marker)[0]
    # Question text has the form "<question> - ... - <option>"
    question = questions_dict[k].split(' - ')[0]
    value = questions_dict[k].split(' - ')[-1]
    entry = select_choice_dict.setdefault(
        question_key, {'question': question, 'columns': {}, 'type': 'multiple'}
    )
    entry['columns'][k] = value

#### Questions with only one choice
# NOTE: despite their names, `multiple_choice` / `multiple_choice_dict` hold the
# single-answer questions (type 'unique'). The first remaining column is skipped.
multiple_choice_dict = {}
multiple_choice = [a for a in questions_dict.keys() if a not in select_choice][1:]
for k in multiple_choice:
    multiple_choice_dict[k] = {'question':questions_dict[k], 'columns':{k:survey_df[k].unique().tolist()}, 'type':'unique'}


all_questions_dict = multiple_choice_dict.copy()
all_questions_dict.update(select_choice_dict)

# One row per question, ordered by the number that follows the leading 'Q'
all_questions_df = pd.DataFrame.from_dict(all_questions_dict, orient='index').reset_index()
all_questions_df.loc[:, 'order'] = all_questions_df['index'].apply(lambda x: int(str(x).split('_')[0][1:]))
all_questions_df.sort_values(by='order', inplace=True)

## Identity Info
identity_questions = ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']

# .copy() so the column assignments below write to an independent frame instead
# of a slice of survey_df
identity_raw_df = survey_df.loc[:, identity_questions].copy()
identity_raw_df.columns = ['age', 'gender', 'country', 'schooling', 'profession']

identity_raw_df['country_key'] = identity_raw_df['country'].apply(str_normalize)

# alpha-2 code -> name to use instead of the reference file's name, so that the
# normalised key matches the country label used in the survey
country_rematch = {'IR': 'Iran', 'TW': 'Taiwan', 'KR':'South Korea', 'KP':'Republic of Korea', 'RU': 'Russia'}

countries_infos = pd.read_csv('/kaggle/input/countries-infos/countries_info.csv')

countries_infos['name_adj'] = countries_infos['alpha-2'].map(country_rematch).fillna(countries_infos['name'])

countries_infos['country_key'] = countries_infos['name_adj'].apply(str_normalize)

# str_normalize keeps only the text before the first comma, so distinct names can
# collapse to the same key. Keep one row per key: duplicated keys would multiply
# respondent rows in the left merge and break the row-wise alignment with
# survey_df that the later pd.concat(axis=1) depends on.
country_lookup = countries_infos\
                    .loc[:, ['region', 'sub-region', 'country_key']]\
                    .drop_duplicates(subset='country_key')

identity_df = pd.merge(left=identity_raw_df,
                       right=country_lookup,
                       on='country_key',
                       how='left',
                       validate='many_to_one'
                      )

# Ordinal position of every age bracket (brackets sorted as strings)
all_ages = identity_df['age'].sort_values().unique().tolist()
all_ages_dict = {age: position for position, age in enumerate(all_ages)}

identity_df['age_adj'] = identity_df['age'].apply(lambda x: all_ages_dict[x])


## Apparatus Info

# Survey question -> short dimension name used as column prefix
app_dict_cols = {
    'Q7': 'lang',
    'Q9': 'ide',
    'Q10': 'notebook',
    'Q11': 'platform',
    'Q12': 'hardware',
    'Q14': 'viz',
    'Q36': 'deploy',
}

### Get dataset ready to clustering
apparatus_raw_df = select_columns(survey_df, app_dict_cols)

### To users who didn't answer a question, attribute them to None option of question
apparatus_df = complete_nones(apparatus_raw_df)

### Column groups per dimension (prefix before '__'), as passed to the MFA
app_groups = {}
for c in apparatus_df.columns.tolist():
    app_groups.setdefault(c.split('__')[0], []).append(c)

In [None]:
# Clustering

# ## Finding cluster size, with different component sizes
# size = apparatus_df.shape[1]
# pace = math.floor(size/5)
# inertias = []
# for s in range(pace, size, pace):
#     if s <= size:
#         dataset = mfa_method(s, apparatus_df, app_groups)
# #         print('{} colunas - {:.0f}%'.format(s, 100*s/size))
#         df = test_cluster_size(dataset)
#         df.loc[:, 'num_cols'] = s
#         inertias.append(df)
        
# inertia_df = pd.concat(inertias, axis=0)

# fig = px.line(inertia_df, x='cluster_size', y='inertia', facet_col='num_cols')
# fig.update_yaxes(matches=None)
# fig.show()

## Given results, choose cluster size = 9
# Project the indicator matrix onto 14 MFA components, then fit K-Means with
# 9 clusters (random_state fixed at 42).
dataset = mfa_method(14, apparatus_df, app_groups)
model = KMeans(n_clusters=9, random_state=42)
model.fit(dataset)

labels = model.labels_


## Create dataframe with cluster features information
cluster_df = apparatus_df.copy()

# The aggregation spec is built before the helper columns are added, so only the
# indicator columns get mean / median.
cols = cluster_df.columns.tolist()
op_dict = {c:['mean', 'median'] for c in cols}

cluster_df.loc[:, 'cluster_num'] = labels
cluster_df.loc[:, 'counter'] = 1

op_dict['counter'] = 'sum'

grouped = cluster_df.groupby('cluster_num').agg(op_dict)
# Flatten the (column, aggregation) pairs into 'column=aggregation' names
grouped.columns = ['{}={}'.format(c[0], c[1]) for c in grouped.columns.tolist()]
# Long format: one row per (cluster, column=aggregation)
melt = pd.melt(grouped.reset_index(), id_vars=['cluster_num'], value_vars=grouped.columns.tolist())
# Split '<dim>__<option>=<aggregation>' back into its parts
melt.loc[:, 'var_type'] = melt['variable'].str.split('=', expand=True).iloc[:, 1]
melt.loc[:, 'var'] = melt['variable'].str.split('=', expand=True).iloc[:, 0]
melt.loc[:, 'dim'] = melt['var'].str.split('__', expand=True).iloc[:, 0]
melt.loc[:, 'dim_value'] = melt['var'].str.split('__', expand=True).iloc[:, 1]

# Keep only the means, i.e. the share of the cluster that selected each option
cluster_infos = melt.loc[melt['var_type'] == 'mean'].drop(columns=['variable'])
# rank: position of the cluster among all clusters for this option (1 = highest share)
cluster_infos.loc[:, 'rank'] = cluster_infos.groupby(['var'])['value'].rank(ascending=False)
# rank_var: position of the option inside its dimension for this cluster
cluster_infos.loc[:, 'rank_var'] = cluster_infos.groupby(['cluster_num', 'dim'])['value'].rank(ascending=False)
cluster_infos.head()

## Cluster main informations understanding, to define names
clusters = cluster_infos['cluster_num'].unique().tolist()

all_clusters_summary = []
all_clusters_detail = []

for c in clusters:
    df = cluster_infos.loc[cluster_infos['cluster_num'] == c].sort_values(by='value', ascending=False)
    # rank_presence: position of the option among all options of this cluster
    df.loc[:, 'rank_presence'] = df['value'].rank(ascending=False)
    # Highlighted options: (top 5 of the cluster OR cluster leads on the option)
    # with a share of at least 10%, plus the top option of every dimension.
    top = df.loc[(((df['rank_presence'] <= 5) | (df['rank'] == 1)) & (df['value'] >= 0.1)) | (df['rank_var'] == 1)]
    
    # One row per dimension holding the list of highlighted options
    group = top.groupby(['dim'])['dim_value'].agg(lambda x: list(x)).to_frame().sort_index()
    
    group.columns = [c]
    all_clusters_summary.append(group)
    all_clusters_detail.append(top)
    
# Rows = dimensions, columns = cluster numbers, cells = lists of highlighted options
all_clusters_df = pd.concat(all_clusters_summary, axis=1).sort_index()
# all_clusters_df.head(7)

## Find conditions to define name, despite order
# Ordered naming rules: (dimension, option that must be among the cluster's
# highlighted options for that dimension, cluster name). The first rule that
# matches wins; a cluster matching none of them is named 'core'.
naming_rules = [
    ('lang', 'R', 'pirate'),
    ('platform', 'ADeepLearningWorkstation', 'brainy'),
    ('platform', 'ACloudComputingPlatform', 'cloudy'),
    ('lang', 'Matlab', 'mathy'),
    ('lang', 'Bash', 'root'),
    ('hardware', 'Tpus', 'pro'),
    ('lang', 'None', 'fresh'),
    ('ide', 'Notepad++', 'dev'),
]

cluster_map = {}
for cluster, infos in all_clusters_df.to_dict().items():
    cluster_map[cluster] = next(
        (label for dim, option, label in naming_rules if option in infos[dim]),
        'core'
    )


## Aggregate information back to users
cluster_df.loc[:, 'cluster'] = cluster_df['cluster_num'].apply(lambda x: cluster_map[x])

## Bring other information to generate insights and remove users with no complete answers of identity questions
other_info = survey_df.loc[:, ['Q6', 'Q20', 'Q21', 'Q24']]
other_info.columns = ['experience', 'company_size', 'team_size', 'salary']

identity_info = identity_df.loc[:, ['age', 'age_adj', 'gender', 'country', 'schooling', 'profession', 'region', 'sub-region']]

# Column-wise concat: the three frames are aligned on their row index
all_cluster_infos = pd.concat([identity_info, other_info, cluster_df], axis=1)
all_cluster_infos.index.name='user_id'

# Vectorised missing-value test. isnull() is used rather than `np.nan in [...]`
# because list membership only finds NaN through object identity (NaN != NaN),
# and because it avoids running a Python lambda for every row.
required_answers = ['experience', 'schooling', 'profession']
all_cluster_infos['has_empy_answer'] = all_cluster_infos[required_answers].isnull().any(axis=1)

cluster_infos = all_cluster_infos.loc[~all_cluster_infos['has_empy_answer']].copy()

## Create dict to define order of some categories
experience_dict = {'I have never written code':0, '< 1 years': 1, '1-2 years':2, '3-5 years':3, 
                   '5-10 years': 4, '10-20 years':5, '20+ years':6}

company_size_dict = {'0-49 employees': 0, '50-249 employees': 1, '250-999 employees':2, '1000-9,999 employees':3,
                     '10,000 or more employees':4, 'Not Answered':-1}

cluster_infos.loc[:, 'region'] =  cluster_infos['region'].fillna('Not Disclosed')
cluster_infos.loc[:, 'sub-region'] =  cluster_infos['sub-region'].fillna('Not Disclosed')

cluster_infos.loc[:, 'team_size'] =  cluster_infos['team_size'].fillna('Not Answered')
cluster_infos.loc[:, 'company_size'] =  cluster_infos['company_size'].fillna('Not Answered')
cluster_infos.loc[:, 'salary'] =  cluster_infos['salary'].fillna('Not Answered')

# Sortable keys for the ordered categories (-1 = not answered)
cluster_infos.loc[:, 'exp_order'] = cluster_infos['experience'].apply(lambda x: experience_dict[x])
cluster_infos.loc[:, 'company_order'] = cluster_infos['company_size'].apply(lambda x: company_size_dict[x])
# Lower bound of the team-size bracket, i.e. the number before the first '-' or '+'
cluster_infos.loc[:, 'team_order'] = cluster_infos['team_size'].apply(lambda x: int(re.split('[-+]', x)[0]) if x != 'Not Answered' else -1)
# NOTE(review): inside the character class, '$-/' is a *range* that also contains
# ',', so a bracket such as '1,000-1,999' is split at the comma and yields 1, not
# 1000 (the trailing .replace(',', '') never sees a comma). Left unchanged because
# code using salary_order may depend on the current values -- confirm before
# escaping the hyphen.
cluster_infos.loc[:, 'salary_order'] = cluster_infos['salary'].apply(lambda x: int([f for f in re.split('[$-/>]', x) if f.strip()][0].replace(',', '')) if x != 'Not Answered' else -1)

cluster_infos.to_csv('/kaggle/working/cluster_infos.csv', index=True)

In [None]:
# Analysis

## Data Load
# user_id comes back as a regular column (it is used as an id_var below)
cluster_infos = pd.read_csv('/kaggle/working/cluster_infos.csv')#.set_index('user_id')

cluster_infos.loc[:, 'Cluster'] = cluster_infos['cluster'].str.title()


## Color Definitions
cluster_colors = ["ffcad4","17bebb","f0cf65","d62246","2B3A64","647BA6","f3663f","6c0e23","6A449C"]
cluster_key = cluster_infos['cluster'].unique().tolist()
cluster_name = cluster_infos['Cluster'].unique().tolist()

# Pair names and colours in order of first appearance. zip stops at the shorter
# list, so having fewer distinct cluster names than colours no longer raises
# IndexError. The '' key is the grey used for non-highlighted bars.
color_dict_key = {key: '#' + color for key, color in zip(cluster_key, cluster_colors)}
color_dict_key[''] = '#A9A9A9'
color_dict_name = {name: '#' + color for name, color in zip(cluster_name, cluster_colors)}
color_dict_name[''] = '#A9A9A9'

color_map = {'cluster':color_dict_key, 'Cluster':color_dict_name}

palette = px.colors.qualitative.T10

## Data Prep
# Long format: one row per (user, indicator column); indicator columns are the
# ones whose name contains '__'.
cluster_melt = pd.melt(cluster_infos, id_vars=['user_id', 'age', 'age_adj', 'gender', 'country', 'schooling', 
                                               'profession', 'region', 'sub-region', 'experience', 'exp_order', 
                                               'company_size', 'company_order', 'team_size', 'team_order', 
                                               'salary', 'salary_order', 'cluster_num', 'cluster'
                                              ], value_vars = [c for c in cluster_infos.columns.tolist() if '__' in c])


# Split '<dim>__<option>' into dimension and option
cluster_melt.loc[:, 'dim'] = cluster_melt['variable'].str.split('__', expand=True).iloc[:, 0]
cluster_melt.loc[:, 'dim_value'] = cluster_melt['variable'].str.split('__', expand=True).iloc[:, 1]

In [None]:
def plot(fig):
    """Apply the notebook's common styling to ``fig`` and display it.

    Sets the font family, the ``plotly_white`` template, the title font size and
    the y-axis tick font size, then calls ``fig.show()``.

    Parameters
    ----------
    fig : plotly figure
        Figure to style (modified in place) and show.
    """
    # `title_font_size` is the current spelling of the deprecated `titlefont_size`
    fig.update_layout(font_family='Avenir', template='plotly_white', title_font_size=22)
    fig.update_yaxes(tickfont_size=14)
    fig.show()
    

def fmt_cluster_name(cluster_list, bold=False, italic=False):
    """Return display labels for cluster keys.

    Every key is title-cased and its underscores become spaces; the label is
    then wrapped in ``<b>`` and/or ``<i>`` tags according to the flags.
    """
    if bold and italic:
        template = '<i><b>{}</b></i>'
    elif bold:
        template = '<b>{}</b>'
    elif italic:
        template = '<i>{}</i>'
    else:
        template = '{}'

    return [template.format(label.title().replace('_', ' ')) for label in cluster_list]


def split_break(name, upper=True, size=15):
    """Insert ``<br>`` line breaks into a label.

    The label is cut into words (before every capital letter when ``upper`` is
    true, at spaces otherwise). Words are joined with spaces; once the current
    line is longer than ``size`` characters the next word starts a new line.
    """
    pieces = re.split('(?=[A-Z])', name) if upper else name.split(' ')

    text = ''
    for piece in pieces:
        # Text written since the last line break
        current_line = text.rsplit('<br>', 1)[-1]
        separator = '<br>' if len(current_line) > size else ' '
        text = text + separator + piece

    return text.strip()

# Who are you at the Toolset Zoo?

##  Intro
When we take a closer look at the Kaggle Survey data, there are a lot of hidden patterns to uncover.  
In this notebook, I've decided to focus on the apparatus that different Kagglers use all over the world: which common groups exist and how they relate to other interesting dimensions such as gender, professional experience and world region.

Let's take a look!

## Defining apparatus and Clustering

First and foremost, let's define what the Data Apparatus is. We consider the apparatus to be the main tools that every data application needs in order to deliver value. The dimensions we'll analyze for each user are:
* Programming Languages (Q7)
* IDE's (Q9)
* Computing Platform (Q11)
* Notebook products (Q10)
* Hardware (Q12)
* Visualization Libraries (Q14)
* Deploy (Q36)

After selecting this data for all users, we can use Multiple Factor Analysis (with Prince) to transform our categorical variables into continuous features, and then cluster them with the K-Means algorithm. Users who didn't answer the questions about coding experience, educational background and profession are then removed from the analysis. The details of the clustering step are defined in a hidden cell above.

# Analysis

## There are 9 main groups of users on Kaggle 

In order of proportion among users: **Core, Dev, Pirate, Fresh, Cloudy, Mathy, Pro, Brainy and Root.**

In [None]:
# Number of users per cluster ('counter' is 1 for every user) and their share of the total
df = cluster_infos\
        .groupby(['Cluster'], as_index=False)\
        .agg({'counter':'sum'})

df.loc[:, 'percentage'] = 100*df['counter']/df['counter'].sum()
 
# Horizontal bars: length = number of users, label = percentage of all users
fig = px.bar(df, y='Cluster', text='percentage', color='Cluster', color_discrete_map=color_map['Cluster'],
            x='counter')
# The x range is fixed at 7000, which leaves room for the labels drawn outside the bars
fig.update_layout(
    showlegend=False, 
    yaxis_title='', 
    xaxis_title='Total of users',
    title='Distribution of 9 clusters among all users',
    xaxis_range=[0, 7000]

)
fig.update_yaxes(categoryorder='total ascending')
fig.update_traces(texttemplate='%{text:.1f}%', textposition='outside')


plot(fig)

In [None]:
# Share of all users that selected each programming language
all_langs = (
    cluster_melt[cluster_melt['dim'] == 'lang']
    .groupby(['dim_value'], as_index=False)
    .agg({'value': 'mean'})
)

# all_langs.sort_values(by='value', ascending=False)

## What does each of these clusters mean?

To define these clusters' attributes, let's analyze their programming language preferences.

In [None]:
## Language

# Per cluster and language: number of users who selected it (sum), share of the
# cluster (mean) and number of rows, i.e. users in the cluster (count).
lang = cluster_melt.loc[cluster_melt['dim'] == 'lang']\
        .groupby(['cluster', 'dim_value'], as_index=False)\
        .agg({'value':['sum', 'mean'], 'user_id':'count'})

# Flatten the aggregated columns in the order produced above (sum, mean, count)
lang.columns = ['cluster', 'dim_value', 'users', 'value', 'total_users']

lang.loc[:, 'name'] = lang['dim_value'].str.title()
lang.loc[:, 'cluster_name'] = lang['cluster'].str.replace('_', '<br>').str.title()

lang.loc[:, 'predominance'] = 100*lang['value']
# rank_cluster: position of the language inside its cluster
# rank_lang: position of the cluster among all clusters for that language
lang.loc[:, 'rank_cluster'] = lang.groupby(['cluster'])['value'].rank(ascending=False)
lang.loc[:, 'rank_lang'] = lang.groupby(['name'])['value'].rank(ascending=False)

# Option 1
# Keep the three most used languages of every cluster
df = lang\
        .loc[lang['rank_cluster'] <= 3]\
        .sort_values(by='rank_cluster')

df.loc[:, 'Language'] = df['name']
# df.loc[:, 'text'] = df.apply(lambda x: '{} - {:.0f}%'.format(x['name'], x['predominance']), axis=1)

# Fixed display order of the clusters on the y axis
cluster_order = {'cluster':['fresh', 'pirate', 'mathy', 'brainy', 'core', 'cloudy', 'root', 'dev', 'pro']}

# One facet per rank (top 1, 2, 3); bars are labelled with the language name
fig = px.bar(df, x='predominance', y='cluster', facet_col='rank_cluster', color='Language', 
             text='name', facet_col_spacing=0.1,
             category_orders=cluster_order,
             color_discrete_sequence=palette
            )

fig.update_layout(
    showlegend=False, 
    title='Top 3 languages of each cluster',
    height=600,
    margin_t=120
)

fig.update_yaxes(title='', tickvals=cluster_order['cluster'], ticktext=fmt_cluster_name(cluster_order['cluster'], True))

fig.update_xaxes(
    tickvals=[0, 25, 50, 75, 100],
    title='Share of users<br>who know it (%)'
)

fig.update_traces(textposition='auto')

# Facet titles arrive as 'rank_cluster=<n>'; show them as 'Top <n>'
fig.for_each_annotation(lambda a: a.update(text = '<i>Top {:.0f}</i>'.format(float(a.text.split('=')[1]))))              
                
plot(fig)

There are three clusters that stand out due to their programming language choices:
* 68% of users from the **Fresh** cluster aren't used to any programming language, or know only a little Python
* **Pirates** love R (95% use it frequently)
* Beyond Python, Matlab is widely used by **Mathy** users (more than 86% of them)

As for the remaining clusters:
* All others have Python and SQL as their two most used languages
* Javascript is a relevant language for **Devs** and **Pros**

In [None]:
# Lang x Cluster
# For every language keep the cluster with the highest share of users (rank_lang 1)
top = lang\
        .loc[lang['rank_lang'] <= 1].copy()

# others = lang\
#             .loc[lang['rank_lang'] > 1]\
#             .groupby(['dim_value'])\
#             .agg({'users':'sum', 'total_users':'sum'})

# others.loc[:, 'total'] = others.groupby(['dim_value'])['users'].transform('sum')
# others.loc[:, 'value'] = others['users']/others['total']

# df = pd.concat([top, others], axis=0)
df = top.copy()

df.loc[:, 'Cluster'] = df['cluster'].str.title()
df.loc[:, 'Language'] = df['name'].str.title()

order = df.sort_values(by='predominance', ascending=False)['Language'].tolist()

# One facet per language (in the fixed order given below), one bar for its leading cluster
fig = px.bar(df.sort_values(by='cluster'), y='predominance', x='Cluster', color='Cluster', color_discrete_sequence=palette
#              text='name', 
             ,facet_col='name', color_discrete_map=color_map['Cluster']
#              ,category_orders={'Cluster':['Core', 'Pirate', 'Fresh','Mathy', 'Pro', 'Root', 'Dev']}
             ,category_orders={'name':['Python', 'R', 'Matlab','C++', 'C','None', 'Sql', 'Javascript','Swift', 'Java', 'Other', 'Bash', 'Julia']}
#              facet_col_wrap=3, facet_col_spacing=0.1
            )

fig.update_layout(
    showlegend=False, title='Group most skilled in each language (proportionally)',
    yaxis_title='Percentage of group<br>that uses it (%)',
    margin_t=100,
#     height=700
    )
# Independent x axes so every facet shows its own cluster label
fig.update_xaxes(matches=None, 
                 showticklabels=True, 
#                  categoryorder='total descending', 
                 title='')
# fig.update_xaxes(
#     tickvals=[0, 50, 100], 
#     range=[0, 120])

# fig.update_traces(textposition='outside', texttemplate='%{x:.0f}%')
# Facet titles arrive as 'name=<language>'; keep only the language, in italics
fig.for_each_annotation(lambda a: a.update(text = '<i>{}</i>'.format(a.text.split('=')[1].title())))
plot(fig)

When we look at each language's largest group (proportionally), some other patterns emerge:
* The **Core** group is the most Python-skilled
* **Mathy** users are also the most skilled in C and C++, languages known for their speed and scientific usage
* **Root** users are the ones who love Bash the most
* **Pros** are the heaviest users of SQL and Javascript
* **Devs** are the ones who use Java and non-mapped languages the most

In [None]:
melt = cluster_melt.copy()

# count = 1 when the user selected the option; options that mean "nothing"
# ('None', 'I do not share my work publicly') are not counted as tools.
melt.loc[:, 'count'] = 1
melt.loc[melt['value'] == 0, 'count'] = 0
melt.loc[melt['dim_value'].isin(['None', 'I do not share my work publicly']) , 'count'] = 0

# Number of tools per user and dimension
dim_user = melt.copy()\
            .groupby(['user_id', 'cluster', 'dim'], as_index=False)\
            .agg({'count': 'sum'})

# Average / median number of tools per cluster and dimension
dim_cluster = dim_user\
                .groupby(['cluster', 'dim'], as_index=False)\
                .agg({'count':['mean', 'median'], 'user_id':'nunique'})

dim_cluster.columns = ['cluster', 'dim', 'avg_value', 'med_value', 'users']

# Only the dimensions shown in the chart
df = dim_cluster\
        .loc[dim_cluster['dim'].isin(['deploy', 'lang', 'notebook', 'viz'])].copy()

df.loc[:, 'Cluster'] = df['cluster'].str.title()
df.loc[:, 'rank'] = df.groupby(['dim'])['avg_value'].rank(ascending=False)

# 'main' carries the cluster name for the top 2 of each dimension and '' for the
# rest; '' is mapped to grey in color_map, which produces the highlight.
df.loc[:, 'main'] = ''
df.loc[df['rank'] <= 2, 'main'] = df['Cluster']


fig = px.bar(df, x='avg_value', y='Cluster', facet_col='dim', 
             color='main', color_discrete_map=color_map['Cluster'])

# Facet titles: dimension key -> readable title
renamer = {'deploy': 'Deploy tools<br>(9 options)', 'lang':'Programming languages<br>(12 options)', 'notebook':'Notebook solutions<br>(13 options)',
            'viz':'Visualization libraries<br>(11 options)'
           }

# lang = 12
# ide = 11
# notebook = 13
# viz = 11
# deploy = 9

fig.update_layout(
    title='Average of user tools, per Cluster - Top 2 highlighted',
    yaxis_title='',
    showlegend=False,
    margin_t=120
)

fig.update_xaxes(title='Average<br>per user')

fig.update_traces(texttemplate='%{x:.1f}')

# Facet titles arrive as 'dim=<key>'; replace them with the readable title
fig.for_each_annotation(lambda x: x.update(text=renamer[x.text.split('=')[1]]))
plot(fig)

When we look at multiple apparatus dimensions at the same time, we see that **Pros** are the users with the most tools under their belt, from deploy tools to visualization libraries.

In [None]:
## Visualization libraries

# Share of every cluster that uses each visualization library
dim_df = cluster_melt.loc[cluster_melt['dim'] == 'viz']\
        .groupby(['cluster', 'dim_value'], as_index=False)\
        .agg({'value':'mean'})

dim_df.loc[:, 'name'] = dim_df['dim_value']
dim_df.loc[:, 'cluster_name'] = dim_df['cluster'].str.replace('_', '<br>').str.title()

dim_df.loc[:, 'predominance'] = 100*dim_df['value']
# rank_cluster: position of the library inside its cluster
# rank_dim: position of the cluster among all clusters for that library
dim_df.loc[:, 'rank_cluster'] = dim_df.groupby(['cluster'])['value'].rank(ascending=False)
dim_df.loc[:, 'rank_dim'] = dim_df.groupby(['name'])['value'].rank(ascending=False)


# Option 1
# Keep the three most used libraries of every cluster
df = dim_df\
        .loc[dim_df['rank_cluster'] <= 3]\
        .sort_values(by='name')

df.loc[:, 'Cluster'] = df['cluster'].str.title()
df.loc[:, 'Lib'] = df['name'].str.title()

# df.loc[:, 'text'] = df.apply(lambda x: '{:.0f}%'.format(x['predominance']), axis=1)

# Fixed display order of the clusters on the y axis
cluster_order = {'Cluster':['Fresh', 'Dev', 'Pirate', 'Core', 'Pro', 'Cloudy', 'Brainy', 'Mathy', 'Root']}

# One facet per rank (top 1, 2, 3); bars are labelled with the library name
fig = px.bar(df, x='predominance', y='Cluster', color='name', 
             text='Lib', facet_col_spacing=0.05, facet_col='rank_cluster',
             category_orders=cluster_order, color_discrete_sequence=palette
            )

fig.update_layout(
    showlegend=False, 
    title='Top 3 Visualization Libraries of each cluster', 
    yaxis_title='',
    margin=dict(t=120),
    height=600)

# The range goes past 100 so the labels drawn outside the bars fit
fig.update_xaxes(
    range=[0,130], matches=None, tickvals=[0, 25, 50, 75, 100], 
    title='Share of cluster<br>that uses it (%)'
)

fig.update_traces(textposition='outside')

# Facet titles arrive as 'rank_cluster=<n>'; show them as 'Top <n>'
fig.for_each_annotation(lambda a: a.update(text = "Top {:.0f}".format(float(a.text.split('=')[1]))))

plot(fig)

When we dive into each cluster's visualization tools, we see that:
* **Devs** don't use them as much
* More than half of **Pro** users use at least 3 different visualization libraries
* **Core** users really focus on Matplotlib
* **Pirates** know their Ggplot

In [None]:
## Platform

# Share of every cluster that uses each computing platform
dim_df = cluster_melt.loc[cluster_melt['dim'] == 'platform']\
        .groupby(['cluster', 'dim_value'], as_index=False)\
        .agg({'value':'mean'})

dim_df.loc[:, 'name'] = dim_df['dim_value']
dim_df.loc[:, 'cluster_name'] = dim_df['cluster'].str.replace('_', '<br>').str.title()

dim_df.loc[:, 'predominance'] = 100*dim_df['value']
# rank_cluster: position of the platform inside its cluster
# rank_dim: position of the cluster among all clusters for that platform
dim_df.loc[:, 'rank_cluster'] = dim_df.groupby(['cluster'])['value'].rank(ascending=False)
dim_df.loc[:, 'rank_dim'] = dim_df.groupby(['name'])['value'].rank(ascending=False)


# Option 1
# Keep only the most used platform of every cluster
df = dim_df\
        .loc[dim_df['rank_cluster'] == 1]\
        .sort_values(by='name')

df.loc[:, 'Platform'] = df['name']
df.loc[:, 'text'] = df.apply(lambda x: '{:.0f}%'.format(x['predominance']), axis=1)

# Clusters ordered by the share of their main platform
cluster_order = {'cluster':df.sort_values(by=['value'])['cluster'].tolist()}

# One facet per platform, one bar per cluster whose main platform it is
fig = px.bar(df, y='predominance', x='cluster', color='Platform', 
             text='text', facet_col_spacing=0.05, facet_col='Platform',
             category_orders=cluster_order, color_discrete_sequence=palette
            )

fig.update_layout(
    showlegend=False, 
    title='Main computing platform used by each cluster', 
    yaxis_title='Share of cluster<br>that uses it (%)',
    margin=dict(t=120),
    yaxis_range=[0,110],
    height=600)

fig.update_xaxes(title='', tickvals=cluster_order['cluster'], ticktext=fmt_cluster_name(cluster_order['cluster'], True))
fig.update_xaxes(matches=None, tickangle=45, categoryorder='total descending', tickfont_size=14)

fig.update_traces(textposition='outside')

# fig.for_each_annotation(lambda a: a.update(text = split_break(a.text.split('=')[1])))
# Facet titles arrive as 'Platform=<name>'; break the CamelCase name into lines and make it bold
fig.for_each_annotation(lambda a: a.update(text = f"<b>{split_break(a.text.split('=')[1])}</b>"))

plot(fig)

Looking at each cluster's computing platform preferences, we see that:
* Nearly 100% of **Brainy** users use a Deep Learning Workstation
* Almost 100% of **Cloudy** users use a Cloud Computing Platform
* Other groups rely on their own computers **(Core, Pirate, Mathy, Dev and Pro)**, on another type of platform **(Root)**, or simply haven't started to code yet **(Fresh).**

In [None]:
## Hardware

# Share of every cluster that uses each kind of hardware
dim_df = cluster_melt.loc[cluster_melt['dim'] == 'hardware']\
        .groupby(['cluster', 'dim_value'], as_index=False)\
        .agg({'value':'mean'})

dim_df.loc[:, 'name'] = dim_df['dim_value']
dim_df.loc[:, 'Cluster'] = dim_df['cluster'].str.title()

dim_df.loc[:, 'predominance'] = 100*dim_df['value']
# rank_cluster: position of the hardware option inside its cluster
# rank_dim: position of the cluster among all clusters for that option
dim_df.loc[:, 'rank_cluster'] = dim_df.groupby(['Cluster'])['value'].rank(ascending=False)
dim_df.loc[:, 'rank_dim'] = dim_df.groupby(['name'])['value'].rank(ascending=False)

# Every cluster and every hardware option is plotted (no rank filter). Work on an
# explicit copy so the plotting columns added below do not leak into dim_df.
df = dim_df.copy()

df.loc[:, 'Hardware'] = df['name']

# 'main' carries the cluster name for the leader of each option and '' for the
# rest; '' is mapped to grey in color_map, which produces the highlight.
df.loc[:, 'main'] = ''
df.loc[df['rank_dim'] == 1, 'main'] = df['Cluster']

# Fixed left-to-right order of the facets
cluster_order = {'Hardware':['Gpus', 'Tpus', 'Other', 'None']}

fig = px.bar(df, x='predominance', y='Cluster', color='main', 
             facet_col_spacing=0.1, facet_col='Hardware',
             category_orders=cluster_order, color_discrete_map=color_map['Cluster'],
            )

fig.update_layout(
    showlegend=False, 
    title='Hardware usage of each cluster, main group highlight', 
    height=500
)

fig.update_yaxes(title='', categoryorder='total ascending')

fig.update_xaxes(
    title='Share of cluster<br>that uses it (%)'
)


fig.update_traces(texttemplate='%{x:.0f}%', textposition='auto')
# Facet titles arrive as 'Hardware=<option>'; keep only the option, in bold
fig.for_each_annotation(lambda a: a.update(text = '<b>{}</b>'.format(a.text.split('=')[1])))
plot(fig)

When considering hardware usage, we see that:
* **Brainy** users are the biggest fans of GPUs (which are great for running Deep Learning)
* **Pro** users are the heaviest users of TPU
* **Root** users go wild again with other hardware choices

In [None]:
## IDE

# Share of every cluster that uses each IDE
dim_df = cluster_melt.loc[cluster_melt['dim'] == 'ide']\
        .groupby(['cluster', 'dim_value'], as_index=False)\
        .agg({'value':'mean'})

dim_df.loc[:, 'name'] = dim_df['dim_value']
dim_df.loc[:, 'cluster_name'] = dim_df['cluster'].str.replace('_', '<br>').str.title()

dim_df.loc[:, 'predominance'] = 100*dim_df['value']
# rank_cluster: position of the IDE inside its cluster
# rank_dim: position of the cluster among all clusters for that IDE
dim_df.loc[:, 'rank_cluster'] = dim_df.groupby(['cluster'])['value'].rank(ascending=False)
dim_df.loc[:, 'rank_dim'] = dim_df.groupby(['name'])['value'].rank(ascending=False)

# Option 1
# Keep only the most used IDE of every cluster
df = dim_df\
        .loc[dim_df['rank_cluster'] == 1]\
        .sort_values(by='rank_cluster')

df.loc[:, 'IDE'] = df['name']
# df.loc[:, 'text'] = df.apply(lambda x: '{} - {:.0f}%'.format(x['name'], x['predominance']), axis=1)

# Clusters ordered by the share of their main IDE
cluster_order = {'cluster':df.sort_values(by='value')['cluster'].tolist()}

# One bar per cluster, labelled and coloured by its main IDE
fig = px.bar(df, x='predominance', y='cluster', color='IDE', 
             text='IDE', facet_col_spacing=0.1,
             category_orders=cluster_order, color_discrete_sequence=palette
            )

fig.update_layout(
    showlegend=False, 
    title='Main IDE used by each group', 
    height=500
)

fig.update_yaxes(title='', tickvals=cluster_order['cluster'], ticktext=fmt_cluster_name(cluster_order['cluster'], True))

# The range goes past 100 so the labels drawn outside the bars fit
fig.update_xaxes(
    tickvals=[0, 25, 50, 75, 100], 
    range=[0, 130],
    title='Share of cluster<br>that uses it (%)'
)

   
# Extra right margin for the commentary placed beside the plot area
fig.update_layout(margin_r=100)

text = '''
<b>Jupyter</b> is the most common choice,<br>
but <b>Devs</b> prefer others IDE's.<br>
Others groups opt for<br>
their respective language IDE's.
'''
# Paper coordinates: x=1.1 puts the note to the right of the plot area, y=1 at its top
fig.add_annotation(
                text=text,
                align='right',
                showarrow=False,
                xref='paper',
                yref='paper',
                x=1.1,
                y=1,
                font_size=13
            )


fig.update_traces(textposition='outside')
# fig.for_each_annotation(lambda a: a.update(text = '<i>Top {:.0f}</i>'.format(float(a.text.split('=')[1]))))
plot(fig)

Here, we see that:
* As expected, **Mathy** and **Pirates** use Matlab and RStudio, respectively, as their main IDEs
* **Devs** really like their own choices of IDE
* The other groups, apart from the **Fresh** one, opt for the Jupyter Suite

### How do these clusters relate to the users' personal information?

In [None]:

## Age group x cluster

# Distinct users per (cluster, age group). `age_adj` is averaged only to carry
# a sortable key for the age groups into the aggregated frame.
df = cluster_infos\
        .groupby(['Cluster', 'age'], as_index=False)\
        .agg({'user_id':'nunique', 'age_adj':'mean'})

df.columns = ['Cluster', 'age_group', 'num_users', 'age_order']

# Share of each age group inside its cluster (percentages sum to 100 per cluster).
df.loc[:, 'total_users'] = df.groupby(['Cluster'])['num_users'].transform('sum')
df.loc[:, 'percentage'] = 100*df['num_users']/df['total_users']

# method='first' guarantees exactly one rank-1 cluster per age group. With the
# default method ('average') a tie at the top yields rank 1.5 for both clusters
# and the age group would be dropped from the chart.
df.loc[:, 'rank'] = df.groupby(['age_group'])['percentage'].rank(ascending=False, method='first')

# For every age group, keep the cluster in which that group weighs the most.
top = df.loc[df['rank'] == 1]

fig = px.bar(top.sort_values(by='age_order'), x='age_group', y='percentage', color='Cluster', 
            color_discrete_map=color_map['Cluster'], text='Cluster')

fig.update_layout(
    title='Cluster with largest proportion on age group',
    xaxis_title='Age group',
    yaxis_title='Share of users of cluster (%)'
)

fig.update_traces(textposition='outside')

plot(fig)

Looking at the age distribution inside each cluster, we can see that:
* **Mathy**s have the youngest users
* **Core** and **Pro** users have more 20-somethings
* **Brainy**'s predominance from 30-34 shows that Deep Learning takes experience
* **Root** users were here before everyone (with largest proportion from 35 to 59 years old) 

In [None]:
## Coding experience x cluster

# Distinct users per (cluster, experience level). `exp_order` is averaged only
# to carry a sortable key, which fixes the row order used by the chart.
df = cluster_infos\
        .groupby(['Cluster', 'experience'], as_index=False)\
        .agg({'user_id':'nunique', 'exp_order':'mean'})\
        .sort_values(by='exp_order')

df.columns = ['Cluster', 'dim', 'num_users', 'order']

# Share of each experience level inside its cluster.
df.loc[:, 'total_users'] = df.groupby(['Cluster'])['num_users'].transform('sum')
df.loc[:, 'percentage'] = 100*df['num_users']/df['total_users']

# method='first' guarantees exactly one rank-1 cluster per experience level.
# With the default method ('average') a tie at the top yields rank 1.5 for both
# clusters and the level would be dropped from the chart.
df.loc[:, 'rank'] = df.groupby(['dim'])['percentage'].rank(ascending=False, method='first')

top = df.loc[df['rank'] == 1]

fig = px.bar(top, y='dim', x='percentage', color='Cluster', text='Cluster', 
             facet_col='rank', color_discrete_map=color_map['Cluster'])
fig.update_yaxes(showticklabels=True)
fig.update_xaxes(ticksuffix='%')

fig.update_traces(texttemplate='<b>%{text}</b>', textposition='outside')
fig.update_layout(
    yaxis_title='',
    xaxis_title='Share of users on cluster',
    xaxis_range=[0,70],
    title='Top cluster of each coding experience level',
    showlegend=False
)

# Only one rank is plotted, so the facet title generated by `facet_col` is blanked out.
fig.for_each_annotation(lambda a: a.update(text=''))
plot(fig)

When we investigate coding experience:
* **Root** users have been working for very long (10-20+ years)
* Again, Deep Learning is most common among users with greater experience (5-10 years)
* Cloud is popular with the more recent crowd (3-5 years)
* **Core** users are fairly new, but as the name says, are good at the basics
* **Fresh** users are the ones with least coding experience


In [None]:
# Gender x cluster: share of every gender inside each cluster, with the
# leading cluster of each gender highlighted.
df = (
    cluster_infos
    .groupby(['Cluster', 'gender'], as_index=False)
    .agg({'user_id': 'nunique'})
    .rename(columns={'user_id': 'num_users'})
)

# Percentage of the cluster's users that declared each gender.
df['total_users'] = df.groupby('Cluster')['num_users'].transform('sum')
df['percentage'] = 100 * df['num_users'] / df['total_users']

# Position of each cluster within a gender, largest share first.
df['rank'] = df.groupby('gender')['percentage'].rank(ascending=False)

# `text` drives both the bar label and the colour: only the leading cluster of
# each gender gets its name, every other bar keeps an empty label.
is_leader = df['rank'] <= 1
df['text'] = ''
df.loc[is_leader, 'text'] = df.loc[is_leader, 'Cluster']

gender_order = ['Man', 'Woman', 'Nonbinary', 'Prefer to self-describe', 'Prefer not to say']

fig = px.bar(
    df.sort_values(by=['gender', 'percentage']),
    x='percentage',
    y='Cluster',
    facet_col='gender',
    color='text',
    text='text',
    color_discrete_map=color_map['Cluster'],
    color_discrete_sequence=palette,
    category_orders={'gender': gender_order},
)

# Each facet keeps its own x scale, since the gender shares differ by orders of magnitude.
fig.update_xaxes(matches=None, ticksuffix='%', title='Share of users<br>on cluster')


def _bold_facet_title(annotation):
    # Facet titles come as "gender=<value>"; keep only the value, in bold.
    return annotation.update(text=f"<b>{annotation.text.split('=')[1]}</b>")


fig.for_each_annotation(_bold_facet_title)
fig.update_layout(
    title='Gender distribution highlights - Top cluster',
    yaxis_title='',
    showlegend=False,
)

plot(fig)

* Among all clusters, the gender gap is the highest on Deep Learning (**Brainy**)
* Women are coming on strong to the field, with the **Fresh** cluster (27%) having the largest proportion of them among all clusters
* **Root** users have the largest participation of non-binary respondents

In [None]:
## Education background: the two clusters holding the largest share of each schooling level
dim_df = (
    cluster_infos
    .groupby(['cluster', 'schooling'], as_index=False)
    .agg({'user_id': 'nunique'})
    .rename(columns={'schooling': 'dim', 'user_id': 'num_users'})
)

# Totals per schooling level and overall, then the derived percentages.
dim_df['total_users_dim'] = dim_df.groupby('dim')['num_users'].transform('sum')
dim_df['total_users'] = dim_df['num_users'].sum()
dim_df['percentage_dim'] = 100 * dim_df['num_users'] / dim_df['total_users_dim']
dim_df['percentage_gen'] = 100 * dim_df['total_users_dim'] / dim_df['total_users']

# Rank the share both inside each cluster and inside each schooling level;
# method='first' keeps the ranks unique even when shares tie.
for group_col, rank_col in [('cluster', 'rank_cluster'), ('dim', 'rank_dim')]:
    dim_df[rank_col] = dim_df.groupby(group_col)['percentage_dim'].rank(ascending=False, method='first')

# Top 2 clusters of every schooling level.
is_top2 = dim_df['rank_dim'] <= 2
df = dim_df[is_top2].sort_values(by=['rank_dim', 'percentage_dim'])

df['Cluster'] = fmt_cluster_name(df['cluster'].tolist())
df['text'] = df.apply(
    lambda row: '<b>{}</b> ({:.0f}%)'.format(row['Cluster'], row['percentage_dim']),
    axis=1,
)
df['dim_adj'] = df['dim'].apply(lambda label: split_break(label, False))

fig = px.bar(
    df,
    x='percentage_dim',
    y='dim_adj',
    facet_col='rank_dim',
    color='Cluster',
    text='text',
    facet_col_spacing=0.1,
    color_discrete_map=color_map['Cluster'],
)

fig.update_layout(
    showlegend=False,
    title='Top 2 groups of each educational background',
    margin_t=120,
    height=700,
)

fig.update_yaxes(categoryorder='total descending', row=1, col=1, title='')

# The bars carry their own percentage label, so ticks and grid are hidden.
fig.update_xaxes(
    showticklabels=False,
    showgrid=False,
    title='Share of users<br>on cluster (%)',
)

fig.update_traces(textposition='auto')


def _rank_facet_title(annotation):
    # Facet titles come as "rank_dim=<n>"; show them as "Top <n>" in italics.
    rank_value = float(annotation.text.split('=')[1])
    return annotation.update(text='<i>Top {:.0f}</i>'.format(rank_value))


fig.for_each_annotation(_rank_facet_title)
plot(fig)

* **Devs** don't need a degree (They are the top 1 group among those who didn't have formal education past high school and top 2 among those who didn't finish college.)
* Academia is apparently a **Pirate** training ground, with this cluster being the second most common among users with Master's or Doctoral degrees.

In [None]:
## Professional background: the two clusters holding the largest share of each profession
dim_df = cluster_infos\
        .groupby(['cluster', 'profession'], as_index=False)\
        .agg({'user_id':'nunique'})

dim_df.columns = ['cluster', 'dim', 'num_users']

dim_df.loc[:, 'total_users_dim'] = dim_df.groupby(['dim'])['num_users'].transform('sum')
dim_df.loc[:, 'total_users'] = dim_df['num_users'].sum()


# percentage_dim: share of a profession's users that fall in the cluster.
# percentage_gen: share of all users that have the profession.
dim_df.loc[:, 'percentage_dim'] = 100*dim_df['num_users']/dim_df['total_users_dim']
dim_df.loc[:, 'percentage_gen'] = 100*dim_df['total_users_dim']/dim_df['total_users']

dim_df.loc[:, 'rank_cluster'] = dim_df.groupby(['cluster'])['percentage_dim'].rank(ascending=False, method='first')
dim_df.loc[:, 'rank_dim'] = dim_df.groupby(['dim'])['percentage_dim'].rank(ascending=False, method='first')

# Top 2 clusters of every profession.
df = dim_df\
        .loc[dim_df['rank_dim'] <= 2]\
        .sort_values(by=['rank_dim', 'percentage_dim'])

df.loc[:, 'Cluster'] = fmt_cluster_name(df['cluster'].tolist())
df.loc[:, 'text'] = df.apply(lambda x: '<b>{}</b> ({:.0f}%)'.format(x['Cluster'], x['percentage_dim']), axis=1)

df.loc[:, 'dim_adj'] = df['dim'].apply(lambda x: split_break(x, False))

# Single source of truth for the professions shown on the y axis. It used to be
# written out twice (category order and tick values) and the two copies had
# already drifted apart in ordering. The labels must match the `dim_adj`
# values, i.e. the output of split_break, including any '<br>'.
profession_order = [
    'Statistician', 'Currently not employed',
    'Data Analyst', 'Business Analyst', 'Data Scientist',
    'Research Scientist', 'Data Engineer', 'Machine Learning<br>Engineer',
    'Product/Project<br>Manager', 'Other', 'Student', 'Software Engineer',
    'DBA/Database Engineer'
]

fig = px.bar(df, x='percentage_dim', 
             y='dim_adj', facet_col='rank_dim', color='Cluster'
             ,text='text', facet_col_spacing=0.1, 
             color_discrete_map=color_map['Cluster'],
             category_orders={'dim_adj': profession_order}
            )

fig.update_layout(
    showlegend=False, 
    title='Top 2 groups of each profession', 
    margin_t=120,
    height=800,
    yaxis_title='Profession'
)

# Force a tick for every profession (tick order does not affect the bar order).
fig.update_yaxes(tickvals=profession_order)

# The bars carry their own percentage label, so ticks and grid are hidden.
fig.update_xaxes(
    showticklabels=False,
    showgrid=False,
    title='Share of professionals<br>on cluster (%)'
)

fig.update_traces(textposition='auto')
# Facet titles come as "rank_dim=<n>"; show them as "Top <n>".
fig.for_each_annotation(lambda a: a.update(text = '<i>Top {:.0f}</i>'.format(float(a.text.split('=')[1]))))
plot(fig)


* Statisticians are mainly **Pirates**
* Not employed users are rocking those **Core** skills
* The most Analytics and Data Science oriented jobs (Data Analyst, Business Analyst, Data Scientist and Research Scientist) are really rocking that **Core** and **Pirate** toolset
* The most DataOps-oriented jobs (Data Engineer and Machine Learning Engineer) are really keen on the **Cloudy** toolset
* Software Engineers and DBA's are mainly fond of the **Dev** toolset
* When in doubt of how to enter the market, PMs and Students are going mainly to the **Core** and **Dev** toolsets.

## Where does each cluster thrive?

In [None]:
## Continent: region where each cluster holds its largest share of users
dim_df = cluster_infos\
        .groupby(['cluster', 'region'], as_index=False)\
        .agg({'user_id':'nunique'})

dim_df.columns = ['cluster', 'dim', 'num_users']

dim_df.loc[:, 'total_users_dim'] = dim_df.groupby(['dim'])['num_users'].transform('sum')
dim_df.loc[:, 'total_users_cluster'] = dim_df.groupby(['cluster'])['num_users'].transform('sum')

dim_df.loc[:, 'total_users'] = dim_df['num_users'].sum()


# percentage_dim: share of a region's users that fall in the cluster.
# percentage_gen: share of all users that come from the region.
dim_df.loc[:, 'percentage_dim'] = 100*dim_df['num_users']/dim_df['total_users_dim']
dim_df.loc[:, 'percentage_gen'] = 100*dim_df['total_users_dim']/dim_df['total_users']

dim_df.loc[:, 'rank_cluster'] = dim_df.groupby(['cluster'])['percentage_dim'].rank(ascending=False, method='first')
dim_df.loc[:, 'rank_dim'] = dim_df.groupby(['dim'])['percentage_dim'].rank(ascending=False, method='first')

# Keep, for every cluster, the region where it is most represented.
# The copy has to come AFTER the filter: copying first still leaves `df` as a
# slice of a frame, and the column assignments below would then raise
# SettingWithCopyWarning.
df = dim_df\
        .loc[dim_df['rank_cluster'] <= 1]\
        .copy()

df.loc[:, 'Cluster'] = fmt_cluster_name(df['cluster'].tolist())

# Every remaining row is a rank-1 row (ranks are unique integers with
# method='first'), so the colour key is simply the cluster name.
df.loc[:, 'main'] = df['Cluster']

df.loc[:, 'dim_adj'] = df['dim'].apply(lambda x: split_break(x, False))

category_orders={'dim_adj':['Africa', 'Oceania', 'Europe', 'Not Disclosed']}

fig = px.bar(df, y='percentage_dim', 
             x='Cluster', color='main', facet_col='dim_adj'
             ,text='num_users',
             category_orders=category_orders,
             color_discrete_map=color_map['Cluster']
            )

fig.update_layout(
    showlegend=False, 
    title='Continent with the largest proportion of each cluster', 
    margin_t=120,
    yaxis_title='Proportion of cluster<br> on region (%)'
)

fig.update_yaxes(showgrid=False, showticklabels=False)

# Each facet only lists its own clusters on the x axis.
fig.update_xaxes(
    matches=None,
    title=''
)

# The bar label shows the plotted percentage (it overrides the `text` column).
fig.update_traces(texttemplate='%{y:.0f}%', textposition='outside')
# Facet titles come as "dim_adj=<region>"; keep only the region, in bold.
fig.for_each_annotation(lambda a: a.update(text = '<b>{}</b>'.format(a.text.split('=')[1])))
plot(fig)

* Africa is coming strong, ruling the **Core** and **Pro** game
* Oceania is wild, with the largest proportion of **Devs** and **Pirates**
* Europe goes beyond its laptop, with lots of **Brainys** and **Cloudys**
* **Fresh, Mathy and Root** coming from little places everywhere (so little that their location data is not disclosed)

In [None]:
## Sub-region: sub-region where each cluster holds its largest share of users
dim_df = cluster_infos\
        .groupby(['cluster', 'sub-region'], as_index=False)\
        .agg({'user_id':'nunique'})

dim_df.columns = ['cluster', 'dim', 'num_users']

dim_df.loc[:, 'total_users_dim'] = dim_df.groupby(['dim'])['num_users'].transform('sum')
dim_df.loc[:, 'total_users_cluster'] = dim_df.groupby(['cluster'])['num_users'].transform('sum')

dim_df.loc[:, 'total_users'] = dim_df['num_users'].sum()


# percentage_dim: share of a sub-region's users that fall in the cluster.
# percentage_gen: share of all users that come from the sub-region.
dim_df.loc[:, 'percentage_dim'] = 100*dim_df['num_users']/dim_df['total_users_dim']
dim_df.loc[:, 'percentage_gen'] = 100*dim_df['total_users_dim']/dim_df['total_users']

dim_df.loc[:, 'rank_cluster'] = dim_df.groupby(['cluster'])['percentage_dim'].rank(ascending=False, method='first')
dim_df.loc[:, 'rank_dim'] = dim_df.groupby(['dim'])['percentage_dim'].rank(ascending=False, method='first')

# Keep, for every cluster, the sub-region where it is most represented.
# The copy has to come AFTER the filter: copying first still leaves `df` as a
# slice of a frame, and the column assignments below would then raise
# SettingWithCopyWarning.
df = dim_df\
        .loc[dim_df['rank_cluster'] <= 1]\
        .copy()

df.loc[:, 'Cluster'] = fmt_cluster_name(df['cluster'].tolist())

# Every remaining row is a rank-1 row (ranks are unique integers with
# method='first'), so the colour key is simply the cluster name.
df.loc[:, 'main'] = df['Cluster']

df.loc[:, 'dim_adj'] = df['dim'].apply(lambda x: split_break(x, False))

# Facet order. Labels absent from the data are ignored by plotly and unlisted
# labels are appended after the listed ones.
# NOTE(review): the last entry used to read 'Disclosed'; it now matches the
# 'Not Disclosed' label used by the continent chart - confirm against the data.
category_orders={'dim_adj':['Sub-Saharian Africa','Northern Africa', 'Australia and New Zealand', 
                            'Western Europe', 'Eastern Asia', 'South-eastern Asia', 'Latin America and the Caribbean',
                            'Not Disclosed']}

fig = px.bar(df, y='percentage_dim', 
             x='Cluster', color='main', facet_col='dim_adj'
             ,text='num_users',
             category_orders=category_orders,
             color_discrete_map=color_map['Cluster']
            )

fig.update_layout(
    showlegend=False, 
    title='Sub-region with the largest proportion of each cluster', 
    margin_t=150,
    yaxis_title='Proportion of cluster<br> on sub-region(%)'
)

fig.update_yaxes(showgrid=False, showticklabels=False, range=[0,50])

# Each facet only lists its own clusters on the x axis.
fig.update_xaxes(
    matches=None,
    title=''
)

# The bar label shows the plotted percentage (it overrides the `text` column).
fig.update_traces(texttemplate='%{y:.0f}%', textposition='outside')
# Facet titles come as "dim_adj=<sub-region>"; keep the sub-region name, passed
# through split_break, in bold.
fig.for_each_annotation(lambda a: a.update(text = '<b>{}</b>'.format(split_break(a.text.split('=')[1], False, 5))))
plot(fig)


When we investigate sub-regions inside each continent, the picture shifts:
* Asia is ruling the Deep Learning and newcomer game **(Brainy and Fresh)**;
* Africa is a highlight to **Core** and **Mathy**; 
* **Cloudy** and **Root** focus on Western Europe;
* Latin America is the land of the **Pro** users

# TL; DR

## Wanna know what is the toolset closest to yours?
## To make it easy for you to find it, here's a quick recap on the main findings!

* Has never written code? **Fresh**
* Loves R? **Pirate**
* Uses a lot of Matlab and C/C++? **Mathy**
* Python running in the Cloud? **Cloudy**
* Python for Deep Learning? **Brainy**
* Focusing on that Python + Jupyter + Matplotlib combo? **Core**
* Jack of all trades, uses a lot of Python and SQL and knows a lot of tools for visualization and deployment? **Pro**
* Knows Python and other languages like Javascript or Java, picky with IDEs and doesn't usually work with visualizations? **Dev**
* Not running things locally, but also not in the cloud? (and probably with some Bash?) **Root**