In [None]:
! pip install pycountry_convert

The challenge objective: tell a data story about a subset of the data science community represented in this survey, through a combination of both narrative text and data exploration. A “story” could be defined any number of ways, and that’s deliberate. The challenge is to deeply explore (through data) the impact, priorities, or concerns of a specific group of data science and machine learning practitioners. That group can be defined in the macro (for example: anyone who does most of their coding in Python) or the micro (for example: female data science students studying machine learning in masters programs). This is an opportunity to be creative and tell the story of a community you identify with or are passionate about! 

# Introduction

The challenge is to tell a story rather than to focus on any particular result. Here we try to tell a story based on continent, gender and age.

A few interesting ideas that we had but have not implemented yet:
- Graph signal processing to identify subgroups and hidden relationships
- Subspace representation of the data using a GNN, which could then be used as a recommender system

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from os import path, makedirs
from matplotlib import pyplot as plt
import plotly.express as px
import seaborn as sns
from pycountry_convert import country_alpha2_to_continent_code, country_name_to_country_alpha2
from geopy.geocoders import Nominatim
import bqplot as bq

import cufflinks as cf
import plotly.offline
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import plotly.express as px
pio.templates.default = "plotly_white"

# Raise pandas' display limits so wide/long frames are shown instead of truncated.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 5000)

import os
# Print the full path of every file found under /kaggle/input.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Module-level Nominatim geocoder; geolocate() calls its .geocode() method.
geolocator = Nominatim(user_agent='nom123')
def geolocate(country):
    """Return the ``(latitude, longitude)`` the geocoder reports for ``country``.

    The lookup is best-effort: if the geocoder raises an ordinary error or
    finds no match (``geocode`` returns ``None``), ``np.nan`` is returned
    instead of a tuple. Interrupts such as ``KeyboardInterrupt`` are *not*
    swallowed, so a long-running ``apply`` can still be cancelled.

    Parameters
    ----------
    country : str
        Name (or any query string) passed straight to ``geolocator.geocode``.

    Returns
    -------
    tuple of float, or float
        ``(latitude, longitude)`` on success, ``np.nan`` otherwise.
    """
    try:
        loc = geolocator.geocode(country)
    except Exception:
        # Service/network/parse failure: report a missing value.
        return np.nan

    if loc is None:
        # The geocoder answered but had no match for this query.
        return np.nan

    return (loc.latitude, loc.longitude)
    
def get_continent(col):
    """Map an ISO alpha-2 country code to a continent code.

    Parameters
    ----------
    col : str
        Value passed to ``country_alpha2_to_continent_code``.

    Returns
    -------
    str
        The continent code returned by the converter, or ``'Unknown'`` when
        the converter raises an ordinary exception for this value. Interrupts
        (``KeyboardInterrupt``/``SystemExit``) are allowed to propagate.
    """
    try:
        return country_alpha2_to_continent_code(col)
    except Exception:
        return 'Unknown'

def get_country(col):
    """Map a country name to its ISO alpha-2 code.

    Parameters
    ----------
    col : str
        Value passed to ``country_name_to_country_alpha2``.

    Returns
    -------
    str
        The alpha-2 code returned by the converter, or ``'Unknown'`` when the
        converter raises an ordinary exception for this value. Interrupts
        (``KeyboardInterrupt``/``SystemExit``) are allowed to propagate.
    """
    try:
        return country_name_to_country_alpha2(col)
    except Exception:
        return 'Unknown'

In [None]:
import re
df = pd.read_csv('/kaggle/input/kaggle-survey-2021/kaggle_survey_2021_responses.csv', low_memory=False)

# The first row of the file holds the full question text for every column,
# not a response; keep it as a lookup and drop it from the data below.
questions_list = df.iloc[0].to_dict()
column_remap = {}
multiple_questions = []
single_questions = []
for k, v in questions_list.items():
    if 'Part' in k:
        # Multi-select question: one column per answer option (e.g. "Q7_Part_1").
        multiple_questions.append(k.split('_')[0])
        # Take the text after the last " - " separator as the answer label.
        # Splitting on a bare "-" would cut hyphenated answers short
        # (e.g. "Scikit-learn" -> "learn").
        answer = v.rsplit(' - ', 1)[-1].strip()
        column_remap[k] = re.sub('[^A-Za-z0-9]+_', '', f'{"_".join(k.split("_")[:-2])}_{answer}')
    else:
        column_remap[k] = k
        single_questions.append(k)

multiple_questions = np.unique(multiple_questions)
column_remap['Time from Start to Finish (seconds)'] = 'Duration'
df.rename(columns = column_remap, inplace=True)
# Drop the question-text row; .copy() makes df an independent frame so the
# column assignments below do not trigger SettingWithCopyWarning.
df = df.iloc[1:].copy()

# Survey duration converted from seconds to whole minutes.
df['Duration'] = np.floor(df['Duration'].astype('int') / 60)
df['Country'] = df['Q3'].apply(get_country)
df['Continent'] = df['Country'].apply(get_continent)
#df['Centroid'] = df['Country'].apply(geolocate)
#df['Lat'] = df['Centroid'].apply(lambda x: x[0])
#df['Lon'] = df['Centroid'].apply(lambda x: x[1])

education_remap = {'Bachelor’s degree': 'BSc', 'Master’s degree': 'MSc', 'Doctoral degree':'PhD',
       'I prefer not to answer':'unknown',
       'Some college/university study without earning a bachelor’s degree':'No-Degree',
       'No formal education past high school': 'High School', 'Professional doctorate': 'P-PhD'}

# Assign the result back: an inplace replace on df['Q4'] acts on a temporary
# Series and does not update df when pandas copy-on-write is active.
df['Q4'] = df['Q4'].replace(education_remap)
df['UserId'] = [f'user{i}' for i in range(df.shape[0])]
for col in multiple_questions:
    # Prefix match (not regex substring) so only this question's own answer
    # columns are selected.
    columns = list(df.columns[df.columns.str.startswith(f'{col}_')])
    # A multi-select cell is non-null only if the option was ticked -> 0/1 flag.
    df[columns] = df[columns].notnull().astype(int)

print('Number of Users', df.shape[0])
print('Number of countries', df['Q3'].nunique())


In [None]:
#questions_list

In [None]:
# Short question text per question id; all parts of a multi-part question
# collapse onto one key and the first text seen for that key is kept.
question_dict = {}
for column_name, question_text in questions_list.items():
    key = '_'.join(column_name.split('_')[:-2]) if 'Part' in column_name else column_name
    question_dict.setdefault(key, question_text.split('-')[0])
#question_dict

In [None]:
pd.DataFrame.from_dict(question_dict, orient='index').T

# General Properties

- 90% of respondents finished the survey within an hour; the majority took less than 10 minutes


In [None]:
# Survey duration at every 2nd percentile, plotted on a log-scaled y axis.
quantile_levels = np.arange(0, 1, .02)
duration_by_quantile = {i: round(df['Duration'].quantile(i), 2) for i in quantile_levels}
pd.DataFrame.from_dict(duration_by_quantile, orient='index').iplot(
    kind='scatter', theme='white', yTitle='Duration', xTitle='Quantile', logy=True)

# Histogram of durations below 60.
under_sixty = df['Duration'] < 60
df.loc[under_sixty, 'Duration'].iplot(kind='hist', bins=100, theme='white')

- Asia accounts for the most users, but this can be attributed to its large user pool; a comparison is only meaningful when we compare relative numbers
- The younger demographic (~60%) uses Kaggle more than the older one
- Men : Women ≈ 4 : 1
- The majority of users have a bachelor's, master's or doctoral degree
- Students are the largest user group, which fits the demographic observed above and also shows that Kaggle is both a playground for new users and a platform where people come to learn
- 35% don't use ML, which could indicate that they use the platform for learning and experimentation; this is also reflected in their coding experience

In [None]:
# One horizontal stacked bar per question: share of all users per answer.
legend_layout = dict(yanchor="bottom", y=-0.3, xanchor="left", x=0.01, orientation="h")
for col in ['Continent', 'Q1', 'Q2', 'Q4', 'Q5', 'Q6', 'Q15']:
    user_counts = df.groupby(col)[['UserId']].count().sort_values(by='UserId')
    shares = user_counts / df.shape[0]
    bar_labels = [[round(row[0], 2) for row in shares.values]]
    fig = px.bar(
        shares.T,
        orientation='h',
        labels={'value': '', 'index': ''},
        text=bar_labels,
        # Fall back to the column name when there is no question text for it.
        title=question_dict.get(col, col),
    )
    fig.update_layout(legend=legend_layout)
    fig.show()

In [None]:
#px.bar(df.groupby('Q3')['UserId'].count().sort_values(), orientation='h')

In [None]:
#px.choropleth(df.groupby('Q3')['UserId'].count().reset_index(), locations='Q3', 
              #locationmode='country names', color='UserId')

- Surprisingly, the EU has lower participation by women than NA, Asia and Africa
- SA has the lowest participation by women
- SA has the fewest users who either preferred not to disclose their gender or identified as non-binary, which may indicate a reluctance to accept a third gender

In [None]:
# Share of all respondents per gender answer (Q2).
df_gender = (df.groupby('Q2')['UserId'].count()/df.shape[0]).reset_index()

# Reuse df_gender instead of recomputing the same aggregation for the chart.
fig = px.pie(df_gender, values='UserId', names='Q2', title='Gender Distribution', height=300)
fig.show()

# Gender share within each continent: counts per (continent, gender) divided
# by that continent's total, so every continent's bar sums to 1.
df_continent_gender = df.groupby(['Continent', 'Q2'])['UserId'].count().reset_index()
df_continent_gender['total'] = df_continent_gender.groupby(['Continent'])['UserId'].transform('sum')
df_continent_gender['UserId'] = df_continent_gender['UserId']/df_continent_gender['total']
fig = px.bar(df_continent_gender.sort_values('Continent'), y='Continent', x='UserId', color='Q2',
        title='Continent vs Gender Distribution')
fig.show()

#fig = make_subplots(rows=3, cols=3,specs=[[{}, {}, {}], [{},{}, {}], [{},{},{}]])
#fig.add_trace(,row=1, col=1)
#fig.show()


In [None]:
#df_country_gender =  df.groupby(['Q3', 'Q2'])['UserId'].count().reset_index()
#df_country_gender['total'] = df_country_gender.groupby(['Q3'])['UserId'].transform('sum')
#df_country_gender['UserId'] = df_country_gender['UserId'] /df_country_gender['total']
#
#px.bar(df_country_gender[df_country_gender['Q2'].isin(['Man', 'Woman'])].sort_values('Q3'), y='Q3', x='UserId', 
#       facet_col='Q2', facet_col_wrap=2, height=500)


In [None]:
#for i, group in df_country_gender.groupby('Q2'):
 #   fig = px.choropleth(group, locations='Q3', locationmode='country names', color='UserId', title=i)
 #   fig.show()

# Analysis of Questions where only one option can be selected

In [None]:
def _share_within(df, group_cols, within_cols):
    """Count users per ``group_cols`` combination, normalised within ``within_cols``.

    Returns a frame with the ``group_cols`` columns plus ``'UserId'``, where
    ``'UserId'`` holds the count divided by the total count of the row's
    ``within_cols`` group (so shares sum to 1 inside each such group).
    """
    counts = df.groupby(group_cols)['UserId'].count().reset_index()
    totals = counts.groupby(within_cols)['UserId'].transform('sum')
    counts['UserId'] = counts['UserId'] / totals
    return counts


def plot_gender_conti(df , col, p_analysis_col=('Continent', 'Q2')):
    """Plot how the answers to a single-choice question ``col`` are distributed.

    Draws, in order:

    1. a bar chart with the share of all users per answer of ``col``;
    2. for every column in ``p_analysis_col``, a stacked bar chart showing how
       each answer of ``col`` splits across that column's values;
    3. two heatmaps: answer shares of ``col`` within each continent and within
       each gender (``Q2``);
    4. a bar chart faceted by gender showing the continent split of every
       answer of ``col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'UserId'``, ``'Continent'``, ``'Q2'``, ``col`` and every
        column named in ``p_analysis_col``.
    col : str
        Column holding the single-choice answers to analyse.
    p_analysis_col : sequence of str, optional
        Columns to break ``col`` down by in step 2.
    """
    # Use the question text as title when known; fall back to the column name
    # instead of raising KeyError.
    title = question_dict.get(col, col)

    overall = (df.groupby(col)['UserId'].count()/df.shape[0]).reset_index().sort_values('UserId')
    fig = px.bar(overall, x='UserId', y=col, title=title, height=300)
    fig.show()

    # For each answer of `col`: how it splits across the values of `col_`.
    for col_ in p_analysis_col:
        split = _share_within(df, [col_, col], [col])
        fig = px.bar(split, y=col, x='UserId', color=col_, barmode='relative',
                     color_discrete_sequence=px.colors.qualitative.Dark24)
        fig.show()

    _, axes = plt.subplots(1, 2, figsize=(26, 6))

    # Answer shares of `col` within each continent (each row sums to 1).
    by_continent = _share_within(df, ['Continent', col], ['Continent'])
    sns.heatmap(by_continent.pivot(columns=col, values='UserId', index='Continent').fillna(0), ax=axes[0],
                annot=True, fmt='.2f')
    axes[0].set_title(f'{col} share within each continent')

    # Answer shares of `col` within each gender (each row sums to 1).
    by_gender = _share_within(df, ['Q2', col], ['Q2'])
    sns.heatmap(by_gender.pivot(columns=col, values='UserId', index='Q2').fillna(0), ax=axes[1],
                annot=True, fmt='.2f')
    axes[1].set_title(f'{col} share within each gender')
    # Render the heatmaps here so they appear before the final plotly chart.
    plt.show()

    # Continent split of every (answer of `col`, gender) combination.
    by_continent_gender = _share_within(df, ['Continent', 'Q2', col], [col, 'Q2'])
    fig = px.bar(by_continent_gender, x=col, y='UserId', color='Continent', facet_col='Q2',
                 facet_col_wrap=4, barmode='relative')
    fig.show()
    
    

## Education 

- The majority of users (92%) have a degree
- Asia leads in all metrics; this can be attributed to the larger population of Asian countries. The numbers only make sense when considered relative to the overall population
- The majority of users in Europe have a master's degree
- The majority of users in Asia have a master's or bachelor's degree
- Asia and Africa lag in the number of users with a PhD or master's degree
- Among users whose highest qualification is high school, NA has the fewest and the EU has more. This can be attributed to programmes such as the Realschule or work-study schemes in Germany

In [None]:
plot_gender_conti(df, 'Q4', ['Continent', 'Q5'])

## Age

- Asia and Africa have more young users than the other continents
- The share of Asia and Africa decreases with age, reflecting a younger population and a growing economy
- Participation by women is higher in the younger age groups, showing the influence of policies and changing times


In [None]:
plot_gender_conti(df, 'Q1', ['Continent', 'Q2'])

## Role

- Students and data scientists are the major users of the platform
- A lot of students are from Asia, but when it comes to professional roles Europe and NA take the lead, showing early adoption of the technology in these regions
- The majority of research engineers have a PhD
- Users with an MSc make up the majority in many roles
- Non-coders are mostly found in management, product owner and business analyst roles

In [None]:
plot_gender_conti(df, 'Q5', ['Continent', 'Q4', 'Q6'])

## Coding Experience

- Asia has more novice coders than the EU and NA
- Participation by women increases as coding experience decreases, showing the impact of policies to include women in STEM

In [None]:
plot_gender_conti(df, 'Q6', ['Continent','Q2'])

## ML Experience 

- The EU and NA have more experienced users, which confirms their early adoption of ML
- Asia has the most users who haven't used ML; this could be due to the emphasis on the software and consultancy industries outweighing R&D
- Looking only at women with >10 years of experience, the EU and NA have more experienced users than Asia. Asia's share increases as experience decreases, showing the increasing popularity of ML among women
- ML experience is mainly restricted to research, student and data science roles
- ML methods are more popular among younger people

In [None]:
plot_gender_conti(df, 'Q15', ['Continent','Q2', 'Q5', 'Q1'])

- 13% of reported compensations are < 1000, which could be due to entry-level jobs in Asia and student jobs, which usually pay around 1000
- Above 200k USD, users from NA start to dominate, which could be due to salary ceilings in the EU and lower salaries in Asia and other regions
- Except for NA and Africa, no continent has women earning >500k, which may point to gender pay discrimination
- The EU dominates the 30-80k pay range, which is the ballpark of the majority of salaries in the EU


In [None]:
plot_gender_conti(df, 'Q25', ['Continent','Q2','Q1','Q4','Q5', 'Q15', 'Q20'])

- The popularity of ML is growing, but it is still at a nascent stage
- 

In [None]:
plot_gender_conti(df, 'Q23', ['Continent', 'Q5','Q6','Q20' ,'Q25'])

- NA leads the way in military and security applications
- Asia leads the way in e-commerce applications and the manufacturing industry
- Asia leads the way in the adoption of ML in education (fierce competition in Asia and the emergence of e-education companies might be the reason)

In [None]:
plot_gender_conti(df, 'Q20', ['Continent', 'Q2','Q5','Q6', 'Q15', 'Q25'])

In [None]:
plot_gender_conti(df, 'Q21')

# Multiple Options

In [None]:
def plot_gender_conti_expr_field(data , col):
    """Plot the selection rates of a multi-select question across user groups.

    First draws a bar chart with the share of all users who selected each
    option of question ``col``. Then, for every grouping column (continent,
    gender, education, role, age, coding experience, ML experience), draws a
    heatmap whose cells hold the fraction of users in that group who selected
    each option.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``'UserId'``, the grouping columns listed in
        ``meta_columns`` below, and 0/1 indicator columns named
        ``f'{col}_<option>'``.
    col : str
        Question id prefix, e.g. ``'Q7'``.

    Raises
    ------
    ValueError
        If ``data`` has no column starting with ``f'{col}_'``.
    """
    # Prefix match so e.g. 'Q1' cannot pick up 'Q10_...' or the 'Q15' meta column.
    selected_columns = list(data.columns[data.columns.str.startswith(f'{col}_')])
    if not selected_columns:
        raise ValueError(f'No answer columns found for question {col!r}')

    meta_columns = ['Continent', 'Q2', 'Q4', 'Q5', 'Q1', 'Q6', 'Q15']
    agg_dict = {i: 'sum' for i in selected_columns}
    agg_dict.update({'UserId': 'count'})
    df = data[selected_columns + meta_columns + ['UserId']]

    # Share of all users who selected each option.
    fig = px.bar((df[selected_columns].sum()/df.shape[0]).sort_values().T,
                 title=f'{col}: share of users selecting each option', height=300,
                 orientation='h', labels={'value': 'user ratio'})
    fig.show()

    # Size the grid from the number of grouping columns: a fixed 3x2 grid has
    # only six axes, so zip() would silently drop the seventh heatmap (Q15).
    n_cols = 2
    n_rows = -(-len(meta_columns) // n_cols)  # ceiling division
    _, axes = plt.subplots(n_rows, n_cols, figsize=(20, 6 * n_rows), sharey=False, sharex=False)
    flat_axes = np.array(axes).reshape(-1)

    for ax, meta_col in zip(flat_axes, meta_columns):
        grouped = df.groupby(meta_col).agg(agg_dict)
        # Selections per group divided by the group's user count.
        rates = grouped[selected_columns].div(grouped['UserId'], axis=0)
        sns.heatmap(rates, ax=ax, fmt='.2f', annot=True)

    # Hide any axes left over when the grid has more cells than heatmaps.
    for ax in flat_axes[len(meta_columns):]:
        ax.set_visible(False)
    plt.show()
    
    

In [None]:
plot_gender_conti_expr_field(df, 'Q7')


In [None]:
plot_gender_conti_expr_field(df, 'Q9')

In [None]:
plot_gender_conti_expr_field(df, 'Q16')

In [None]:
def plot_sankey_diagram(data, meta_columns , columns):
    """Draw a Sankey diagram flowing through grouping columns and multi-select questions.

    The flow is built from left to right:

    * a synthetic ``'Users'`` node to each value of ``meta_columns[0]``;
    * each consecutive pair of ``meta_columns`` (user counts per value pair);
    * ``meta_columns[-1]`` to every option of question ``columns[0]``
      (number of selections per group);
    * each consecutive pair of questions in ``columns`` (number of users who
      selected both options).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``'UserId'``, every column in ``meta_columns`` and 0/1
        indicator columns named ``f'{question}_<option>'`` for every question
        in ``columns``.
    meta_columns : sequence of str
        Single-valued grouping columns, in flow order.
    columns : sequence of str
        Question id prefixes (e.g. ``'Q7'``), in flow order.
    """
    def answer_columns(question):
        # Prefix match so 'Q1' does not also select 'Q10_...' columns.
        return list(data.columns[data.columns.str.startswith(f'{question}_')])

    def stacked_links(table):
        # Wide (source x target) table -> long source/target/value frame.
        stacked = table.stack()
        stacked.index.names = ['source', 'target']
        return stacked.rename('value').reset_index()

    link_frames = []

    # Root level: 'Users' -> each value of the first grouping column.
    first_meta = meta_columns[0]
    root_links = data.groupby(first_meta)['UserId'].count().reset_index().rename(
        columns={first_meta: 'target', 'UserId': 'value'})
    root_links['source'] = 'Users'
    link_frames.append(root_links)

    # Grouping column -> next grouping column. All aggregations read `data`
    # (the argument), never a same-named global frame.
    for src_col, dst_col in zip(meta_columns[:-1], meta_columns[1:]):
        pair_links = data.groupby([src_col, dst_col])['UserId'].count().reset_index().rename(
            columns={src_col: 'source', dst_col: 'target', 'UserId': 'value'})
        link_frames.append(pair_links)

    # Last grouping column -> options of the first question.
    first_question_cols = answer_columns(columns[0])
    link_frames.append(stacked_links(data.groupby(meta_columns[-1])[first_question_cols].sum()))

    # Question -> next question: with 0/1 indicators, (A^T . B)[i, j] is the
    # number of users who selected both option i of A and option j of B.
    for src_question, dst_question in zip(columns[:-1], columns[1:]):
        src = data[answer_columns(src_question)]
        dst = data[answer_columns(dst_question)]
        co_selected = pd.DataFrame(np.dot(src.values.T, dst.values), index=src.columns,
                                   columns=dst.columns)
        link_frames.append(stacked_links(co_selected))

    links = pd.concat(link_frames, ignore_index=True)

    # One node per distinct label (sources first, then new targets), with a
    # dict for O(1) label -> index lookups.
    all_nodes = list(pd.unique(pd.concat([links['source'], links['target']], ignore_index=True)))
    node_index = {label: position for position, label in enumerate(all_nodes)}
    source_idx = [node_index[label] for label in links['source']]
    target_idx = [node_index[label] for label in links['target']]
    weights = links['value'].values

    fig = go.Figure(data=go.Sankey(node={'label': all_nodes},
                                   link={'source': source_idx, 'target': target_idx,
                                         'value': weights}))
    fig.show()
    

In [None]:
plot_sankey_diagram(df, ['Continent', 'Q25'], ['Q7'])