In [None]:
import os

import numpy as np 
import pandas as pd 
pd.set_option('colheader_justify', 'center')
from IPython.core.display import HTML
from scipy import stats
from statsmodels.stats.weightstats import _tconfint_generic
from statsmodels.stats.proportion import proportions_ztest, proportion_confint
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.graph_objects as go

# Directory holding the 2019 survey CSV files; joined with file names via os.path.join when loading.
folder = '/kaggle/input/kaggle-survey-2019'
# Plot colors per gender: plot_interval_diff uses index 0 for the male marker and index 1 for the female marker.
gender_colors = ['cornflowerblue','lightcoral']

def calculate_percent(series):
    """Return the share of each distinct value among all entries of *series*.

    The denominator is ``len(series)``, so missing entries count toward the
    total even though ``value_counts`` leaves them out of the result.
    """
    total = len(series)
    counts = series.value_counts()
    return counts / total

def create_countplot(data, x, xlabel=None, ylabel=None, edgecolor=None, params=None):
    """Draw a seaborn count plot of column *x* with categories in sorted order.

    Parameters
    ----------
    data : DataFrame
        Source data.
    x : str
        Column whose values are counted.
    xlabel, ylabel : str, optional
        Axis labels applied to the current axes.
    edgecolor : color, optional
        Bar edge color passed to ``sns.countplot``.
    params : dict, optional
        Extra keyword arguments forwarded to ``sns.countplot``.
    """
    # params defaults to None, which cannot be unpacked with ** directly.
    extra_kwargs = params or {}
    # Missing values are not a category and cannot be sorted together with strings.
    categories = sorted(data[x].dropna().unique())
    sns.countplot(data=data, x=x, order=categories, edgecolor=edgecolor, **extra_kwargs)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    
def plot_pie(size, labels, colors, title):
    """Draw a donut chart (a pie with a white disc over its centre) on the current axes."""
    plt.title(title, fontsize=14)
    plt.pie(size, labels=labels, colors=colors)
    # A white circle of radius 0.7 over the middle turns the pie into a ring.
    hole = plt.Circle((0, 0), 0.7, color='white')
    plt.gcf().gca().add_artist(hole)
    
def plot_interval_diff(groups, labels, gender_colors, title=''):
    """Plot, per category, the male and female percentages and the gap between them.

    Parameters
    ----------
    groups : DataFrame
        Percentages indexed by category. The first column is read as the
        female value and the second as the male value (the column order
        produced by ``create_percentage_df``).
    labels : sequence
        Categories (index labels of *groups*) to draw, bottom to top.
    gender_colors : sequence
        ``gender_colors[0]`` is used for men, ``gender_colors[1]`` for women.
    title : str, optional
        Figure title.
    """
    marker_style = {'marker': 11, 's': 150}
    male_color, female_color = gender_colors[0], gender_colors[1]

    plt.figure(figsize=(10,5))
    # One tick per plotted label: matplotlib requires as many ticks as labels.
    plt.yticks(ticks=range(1, len(labels)+1), labels=labels)
    plt.xlim(0, groups.max().max()+10)
    plt.ylim(-1, len(labels)+1)

    # Empty labelled scatters give the legend fixed, correctly coloured entries
    # regardless of which branch the first category takes below.
    plt.scatter([], [], color=male_color, label='Male', **marker_style)
    plt.scatter([], [], color=female_color, label='Female', **marker_style)

    for i, label in enumerate(labels):
        values = groups.loc[groups.index==label, ].values[0]
        female_value, male_value = values[0], values[1]
        y = i + 1
        if np.isnan(np.min(values)):
            # One gender has no respondents in this category: draw only the
            # value that exists, without a gap line or a difference label.
            if not np.isnan(male_value):
                plt.scatter(male_value, y, color=male_color, **marker_style)
            if not np.isnan(female_value):
                plt.scatter(female_value, y, color=female_color, **marker_style)
        else:
            plt.scatter(male_value, [y], color=male_color, **marker_style)
            plt.scatter(female_value, [y], color=female_color, **marker_style)
            plt.hlines(y=i+0.85, xmin=values.min()+0.5, xmax=values.max()-0.5, linewidth=3, color='grey')
            diff = round(abs(male_value - female_value), 2)
            plt.text(values.max()+0.8, i+0.7, 'Δ {}%'.format(diff), fontsize=12)
    plt.legend(ncol=2, mode="expand", loc='lower left',
               borderpad=1, bbox_to_anchor=(0., 0., 0.5, 0.5))
    plt.xlabel('% in group')
    plt.title(title+'\n', fontsize=20)
    plt.suptitle('\nThe absolute difference between male and female percentage is printed after interval')

def create_percentage_df(df, question, labels):
    """Return, per gender, the percentage of answers to *question* falling in each label.

    The result is indexed by *labels* (in that order) with columns
    ``Female`` and ``Male``; each column is normalised by its own sum over
    the listed labels.
    """
    answer_counts = df.groupby('Q2')[question].value_counts()
    # Rows become answers and columns become genders.
    by_answer = answer_counts.unstack().T
    groups = by_answer[['Female', 'Male']].reindex(labels)
    column_totals = groups.sum()
    return groups.div(column_totals) * 100

def proportions_diff_confint_ind(sample1, sample2, alpha = 0.05):
    """Normal-approximation confidence interval for the difference of two proportions.

    *sample1* and *sample2* are independent sequences of 0/1 outcomes.
    Returns ``(left, right)`` bounds for ``p1 - p2`` at confidence level
    ``1 - alpha``.
    """
    n1, n2 = len(sample1), len(sample2)
    p1 = float(sum(sample1)) / n1
    p2 = float(sum(sample2)) / n2
    z = stats.norm.ppf(1 - alpha / 2.)
    # Half-width of the interval: z times the unpooled standard error.
    margin = z * np.sqrt(p1 * (1 - p1)/ n1 + p2 * (1 - p2)/ n2)
    difference = p1 - p2
    return (difference - margin, difference + margin)

def compute_multiple_choices(df, q_number):
    """Return the percentage of women and of men who selected each option of a multi-select question.

    Parameters
    ----------
    df : DataFrame
        Survey responses with a ``Q2`` gender column and one column per option.
    q_number : str
        Substring identifying the option columns. Matching is by substring,
        and columns containing ``'OTHER'`` are skipped.

    Returns
    -------
    DataFrame
        Indexed by option text, with columns ``Female`` and ``Male`` holding
        the percentage of each gender that selected the option.
    """
    columns = [column for column in df.columns if q_number in column and 'OTHER' not in column]
    is_female = df['Q2'] == 'Female'
    is_male = df['Q2'] == 'Male'
    group_sizes = pd.Series({'Female': is_female.sum(), 'Male': is_male.sum()})
    result = pd.DataFrame(data={'Female': df.loc[is_female, columns].notnull().sum(),
                                'Male': df.loc[is_male, columns].notnull().sum()})
    option_labels = []
    for column in columns:
        answers = df[column].dropna().unique()
        # An option nobody selected has no answer text; fall back to the column name.
        option_labels.append(answers[0] if len(answers) else column)
    result.index = option_labels
    # Series division aligns on the Female/Male column names.
    return result / group_sizes * 100

def multiple_choices_to_df(df, q_number):
    """Return one indicator column per option of a multi-select question, plus gender.

    Parameters
    ----------
    df : DataFrame
        Survey responses with a ``Q2`` gender column and one column per option.
    q_number : str
        Substring identifying the option columns. Matching is by substring,
        and columns containing ``'OTHER'`` are skipped.

    Returns
    -------
    DataFrame
        Rows of respondents whose gender is ``Male`` or ``Female``, with a
        ``Gender`` column and a 0/1 column per answer text, indexed like *df*.
    """
    columns = [column for column in df.columns if q_number in column and 'OTHER' not in column]
    # Keep df's own index: the dummies below carry it, and the merges join on
    # the index, so a fresh RangeIndex would misalign or drop respondents.
    result = pd.DataFrame(data={'Gender': df['Q2'].values}, index=df.index)
    for column in columns:
        result = result.merge(df[column].str.get_dummies(), left_index=True, right_index=True)
    return result[result['Gender'].isin(['Male', 'Female'])]

def survey(results, category_names):
    """
    Draw a horizontal stacked bar chart with one bar per question.

    Parameters
    ----------
    results : dict
        A mapping from question labels to a list of answers per category.
        It is assumed all lists contain the same number of entries and that
        it matches the length of *category_names*.
    category_names : list of str
        The category labels.

    Returns
    -------
    fig, ax
        The created figure and axes.
    """
    row_labels = list(results.keys())
    values = np.array(list(results.values()))
    cumulative = np.cumsum(values, axis=1)
    palette = plt.get_cmap('bwr')(np.linspace(0.15, 0.85, values.shape[1]))

    fig, ax = plt.subplots(figsize=(9.2, 3))
    ax.invert_yaxis()
    ax.xaxis.set_visible(False)
    ax.set_xlim(0, values.sum(axis=1).max())

    for column, (category, segment_color) in enumerate(zip(category_names, palette)):
        segment_widths = values[:, column]
        # Each segment starts where the preceding categories end.
        segment_starts = cumulative[:, column] - segment_widths
        ax.barh(row_labels, segment_widths, left=segment_starts, height=0.5,
                label=category, color=segment_color)

        red, green, blue, _ = segment_color
        if red * green * blue < 0.5:
            text_color = 'white'
        else:
            text_color = 'darkgrey'

        segment_centers = segment_starts + segment_widths / 2
        for row, (center, width) in enumerate(zip(segment_centers, segment_widths)):
            ax.text(center, row, str(int(width)), ha='center', va='center',
                    color=text_color)

    ax.legend(ncol=len(category_names), bbox_to_anchor=(0, 1),
              loc='lower left', fontsize='small')

    return fig, ax

def plot_modal_salary(df, role_labels):
    """Mark the most frequent salary band(s) of each job role on the current axes.

    Parameters
    ----------
    df : DataFrame
        Responses with a ``Q5`` job-role column and a ``coded_salary`` column.
    role_labels : list of str
        Job roles to show; 'Student' and 'Not employed' are skipped.

    Notes
    -----
    Reads the module-level ``salary_labels`` list for the x-axis order.
    """
    job_role_modal_salary = {
        job_role: df[df['Q5']==job_role]['coded_salary'].mode().values.tolist()
        for job_role in role_labels
        if job_role not in ['Student', 'Not employed']
    }
    roles = list(job_role_modal_salary)
    # One tick per label: matplotlib requires the two counts to match.
    plt.yticks(ticks=range(1, len(roles)+1), labels=roles)
    plt.xticks(ticks=range(1, len(salary_labels)+1),
               labels=salary_labels, rotation=20)
    for y, modal_salaries in enumerate(job_role_modal_salary.values(), start=1):
        positions = [salary_labels.index(salary)+1 for salary in modal_salaries]
        if len(positions) > 1:
            # Several modes: connect the outermost ones (works for 2 or more).
            plt.hlines(y=y, xmin=min(positions), xmax=max(positions), linestyle='--', color='grey')
        for x in positions:
            # Raw string keeps the mathtext-escaped dollar sign without an invalid escape.
            plt.scatter(x, y, color='green', marker=r'$\$$', s=300)
    plt.xlim(0, len(salary_labels)+1)

![](https://sun9-13.userapi.com/c858520/v858520680/65025/5k5drYPk6JQ.jpg)
*Original logo Credits: [widsconference.org](https://www.widsconference.org). I took the liberty of updating it with something actually Russian.*

In [None]:
# 2018 survey: the first data row is dropped with .iloc[1:].
multiple_choice_responses_2018 = pd.read_csv("../input/kaggle-survey-2018/multipleChoiceResponses.csv", 
                                        low_memory=False).iloc[1:]
russian_subset_2018 = multiple_choice_responses_2018[multiple_choice_responses_2018['Q3']=='Russia']
# NOTE(review): .iloc[1:] is applied here as for the other years; confirm the
# 2017 file really has a question-text row, otherwise one respondent is lost.
multiple_choice_responses_2017 = pd.read_csv("../input/kaggle-survey-2017/multipleChoiceResponses.csv", 
                                        low_memory=False,encoding='ISO-8859-1').iloc[1:]
# Filter the 2017 frame by its own country column; the 2018 mask belongs to a
# different frame and does not describe these rows.
# NOTE(review): assumes the 2017 country column is named 'Country' — confirm against the file.
russian_subset_2017 = multiple_choice_responses_2017[multiple_choice_responses_2017['Country']=='Russia']

# 2019 survey files.
survey_schema = pd.read_csv(os.path.join(folder, 'survey_schema.csv'))
questions = pd.read_csv(os.path.join(folder, 'questions_only.csv'))
multiple_choice_responses = pd.read_csv(os.path.join(folder, 'multiple_choice_responses.csv'), 
                                        low_memory=False).iloc[1:]
# Mapping from the survey's Q10 salary bands to four coarse bands.
salary_groups = { '$0-999': '<$10 000', '1,000-1,999': '<$10 000', 
                  '2,000-2,999': '<$10 000',  '3,000-3,999': '<$10 000', 
                  '4,000-4,999': '<$10 000', '5,000-7,499': '<$10 000', 
                  '7,500-9,999': '<$10 000', '10,000-14,999': '$10 000-20 000',
                  '15,000-19,999': '$10 000-20 000', '20,000-24,999': '$20 000-70 000', 
                  '25,000-29,999': '$20 000-70 000', '30,000-39,999': '$20 000-70 000',  
                  '40,000-49,999': '$20 000-70 000', '50,000-59,999': '$20 000-70 000',
                  '60,000-69,999': '$20 000-70 000',  '70,000-79,999': '> $70 000', 
                  '80,000-89,999': '> $70 000', '90,000-99,999': '> $70 000', 
                  '100,000-124,999': '> $70 000', '125,000-149,999': '> $70 000',
                  '150,000-199,999': '> $70 000', '200,000-249,999': '> $70 000', 
                  '250,000-299,999': '> $70 000', '300,000-500,000': '> $70 000',
                  '> $500,000': '> $70 000'}
multiple_choice_responses['coded_salary'] = multiple_choice_responses['Q10'].map(salary_groups)
experience_groups = {}

# Taken after coded_salary is added so the Russian subset carries that column too.
russian_subset = multiple_choice_responses[multiple_choice_responses['Q3']=='Russia']

# Print every question id next to the text in the first row of questions_only.csv.
for q, text in zip(questions, questions.values.tolist()[0]):
    print(q, text)

# Study Objective

It is often stated as common knowledge in blogs and on social media that there are few women in IT in general and in data science in particular. We can suppose that, if a gender diversity problem really exists in this field, it may have regional traits shaped by each country's cultural background. The topic often comes up in interviews, and you can find plenty of expert (but personal) opinions about gender diversity; the experts are usually skeptical. "We did a lot, but more should be done", they say. But does real evidence for this opinion exist? I am particularly interested in the situation in my home country, Russia.

So study questions are:
> 
> ### Does gender diversity in Russian data science differ in any ways from gender diversity in data science worldwide?

> ### Does any inequality based on gender exist between female and male data scientists in Russia?

As a data scientist and a woman myself, I felt that researching these questions from a statistical and analytical point of view would be my honor and duty (said with some irony; actually, it is just fun to study with data and numbers something we talk about a lot). Worldwide initiatives that inspire women to learn Python and machine learning, such as PyLadies and Women in Data Science (also known as WiDS), are taking place in Russia too. One of the large data science conferences in Russia, DataFest, also has WiDS workshops in its schedule. 

But how effective is it? And does it matter at all?

# Dataset description

This is the third time Kaggle has conducted this survey among its users. 

It cannot be said that data scientists from all over the world are represented equally: the largest number of participants are from India (24.27%), and the second largest group is from the United States of America (15.65%). My home country, Russia, is represented by 3.17% of participants (see the table "Survey participants proportion per country, top-5" below). You can reasonably ask whether this sample is large enough for comparison.

In Russia, we do not have a strong tradition of answering online surveys, and the response rate in online public opinion surveys is usually low. However, it is higher for surveys within professional groups with high group loyalty (and I believe this can be said about data scientists), and it is especially higher for women than for men. So, theoretically, the survey may show more women than there really are among kagglers. We can also expect the country proportion to be close to the real one. Although this theory is naive in some ways, we have no better data for now.

In [None]:
# Top-5 named countries by respondent count. 'Other' is removed before taking
# the head, so the table has five rows whether or not 'Other' ranks in the top six.
country_counts = multiple_choice_responses['Q3'].value_counts().drop('Other', errors='ignore').head(5)
country_proportion = pd.DataFrame(data={'count': country_counts,
                                       '%': calculate_percent(multiple_choice_responses['Q3']).reindex(country_counts.index)})
# Table styles shared by the styled tables in this notebook.
styles = [
    dict(selector="th", props=[("text-align", "left")]),
    # row_heading is a CSS class on the row header cells, so it needs the leading dot.
    dict(selector=".row_heading", props=[("text-align", "right")]),
    dict(selector="caption", props=[("font-size", "150%"), ("text-align", "center"), ("color", "black")],
    )
]

country_proportion.style.set_table_styles(styles).set_caption('Survey participants proportion per country, top-5')\
                .set_properties(**{'width': '120px'})\
                .background_gradient(cmap=sns.light_palette('orange', as_cmap=True), subset='%').format("{:.2%}", subset='%')\
                .background_gradient(cmap=sns.light_palette('red', as_cmap=True), subset='count')

In fact, the proportion of kagglers from Russia in the whole data set has stayed the same for several years:

In [None]:
# (Russian subset, full response set) for every survey year.
survey_years = {'2017': (russian_subset_2017, multiple_choice_responses_2017),
                '2018': (russian_subset_2018, multiple_choice_responses_2018),
                '2019': (russian_subset, multiple_choice_responses)}
# Fraction of each year's respondents that come from Russia.
russian_share = {year: len(subset)/len(everyone)
                 for year, (subset, everyone) in survey_years.items()}
all_years_number = pd.Series(russian_share, name='%').to_frame()

blue_gradient = sns.light_palette('blue', as_cmap=True)
years_styler = all_years_number.style.set_table_styles(styles)
years_styler = years_styler.set_caption('Percent participants from Russia')
years_styler = years_styler.set_properties(**{'width': '230px'}).format("{:.2%}")
years_styler.background_gradient(cmap=blue_gradient)

Looking at the map below, we can say that, although the Russian Kaggle community isn't the biggest one, it is among the large ones. For example, its size is close to that of the Chinese community, even though [China's population is almost ten times larger](https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population).

It is interesting to keep the proportions in this plot in mind and compare them with the female proportion among data scientists in each country, which we will examine below.

In [None]:
# Share of respondents per country, in percent. The frame is built from the
# Series' index and values explicitly, so it does not depend on the column
# names reset_index() generates (they differ between pandas versions).
country_share = calculate_percent(multiple_choice_responses['Q3']) * 100
percent_per_country = pd.DataFrame({'country name': country_share.index,
                                    '%': country_share.values})

fig = go.Figure(data=go.Choropleth(
    locations=percent_per_country['country name'],
    z = percent_per_country['%'], 
    locationmode='country names', 
    colorscale='ylorrd', colorbar_title='%',
    # Cap the colour scale at 5% so the largest countries do not wash out the rest.
    zmax=5
))

fig.update_layout(title={
        'text': "Kaggle survey respondents, % in 2019 sample per country",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
)

fig.show()

# Gender proportion in data science worldwide and in Russia

## Female focus declaration

A few survey participants chose not to disclose their gender identity or preferred to self-describe rather than choose male/female (see the table "Gender proportion" below). In this study I leave such participants outside the scope of the analysis. 

In [None]:
# Gender shares in the full 2019 sample and in the Russian subset, side by side.
gender_share_by_scope = {
    '...worldwide': calculate_percent(multiple_choice_responses['Q2']),
    '...in Russian': calculate_percent(russian_subset['Q2']),
}
gender_proportion = pd.DataFrame(data=gender_share_by_scope)

green_gradient = sns.light_palette('green', as_cmap=True)
gender_styler = gender_proportion.style.set_table_styles(styles)
gender_styler = gender_styler.set_caption('Gender proportion...')
gender_styler = gender_styler.set_properties(**{'width': '120px'}).format("{:.2%}")
gender_styler.background_gradient(cmap=green_gradient)

On the one hand, I want to focus on the characteristics of female data scientists as a group in comparison with male data scientists. On the other hand, the representation of people with an alternative gender identity is not sufficient for reliable statistical conclusions.

## Gender proportion 

There are fewer women in Russian data science than there are around the world: in the Russian subset the female percentage is 4.18 percentage points lower than the worldwide female percentage. 

In [None]:
# The two most frequent gender answers worldwide define the slices of both pies.
# NOTE(review): gender_colors is applied positionally and the other cells treat
# index 0 as male, so this presumably expects 'Male' to be the most frequent answer.
worldwide_gender_counts = multiple_choice_responses['Q2'].value_counts()
names = worldwide_gender_counts.index[:2]
all_size = worldwide_gender_counts.values[:2]
all_names = ['{}, {}%'.format(sex, round(value/sum(all_size)*100, 2)) 
             for sex, value in zip(names, all_size)]
# Look the Russian counts up by the same labels, so sizes and labels agree
# even if the Russian ranking of answers differs from the worldwide one.
russian_size = russian_subset['Q2'].value_counts().reindex(names).fillna(0).values
russian_names = ['{}, {}%'.format(sex, round(value/sum(russian_size)*100, 2)) 
                 for sex, value in zip(names, russian_size)]

plt.figure(figsize=(12,10))
plt.suptitle('Gender proportion for kagglers...', fontsize=20)

plt.subplot(1,2,1)
plot_pie(all_size, all_names, gender_colors, '...worldwide')

plt.subplot(1,2,2)
plot_pie(russian_size, russian_names, gender_colors, '...in Russia')

plt.subplots_adjust(top=1.25)
plt.show()

Is it a big difference or just a statistical accident? Let's compute the confidence interval for the difference to see whether it contains zero (that is, no difference at all):

In [None]:
# 1 = female, 0 = male; every other answer maps to NaN and is dropped.
female_flag = {'Female': 1, 'Male': 0}
is_female_worldwide = multiple_choice_responses['Q2'].map(female_flag).dropna()
is_female_russia = russian_subset['Q2'].map(female_flag).dropna()

# One-sample z-test: Russian female share against the worldwide female share.
z_stat_for_proportion = proportions_ztest(is_female_russia.sum(),
                                 len(is_female_russia),
                                 is_female_worldwide.mean())
print('Statistic for z-test is {:0.2f} and p-value is {:0.3f}.'.format(*z_stat_for_proportion))

conf_interval_for_difference_in_women = proportions_diff_confint_ind(
                            is_female_worldwide,
                            is_female_russia)

# str.format does not treat '%' specially, so a single '%' prints a single percent sign.
print('95% confidence interval for a difference between women worldwide and from Russia: {}-{}%'
      .format(*[round(el * 100, 2) for el in conf_interval_for_difference_in_women]))

So the difference is statistically significant. The share of women in Russian data science is at least 1.53 percentage points lower than worldwide.

In comparison with other Kaggle country communities it is seen that there are fewer women among Russian kagglers:

In [None]:
# Female respondents as a percentage of ALL respondents of each country.
country_sizes = multiple_choice_responses['Q3'].value_counts().sort_index()
female_counts = multiple_choice_responses.groupby('Q3')['Q2'].value_counts().unstack()['Female'].sort_index()
female_share = female_counts / country_sizes * 100

# Keep only countries with at least 100 respondents.
large_countries = country_sizes[country_sizes >= 100].index
female_share = female_share[female_share.index.isin(large_countries)]

# Built from index and values explicitly, so the column names do not depend on
# what reset_index()/to_frame() generate in a given pandas version.
female_percent_per_country = pd.DataFrame({'country name': female_share.index,
                                           'female %': female_share.values})

fig = go.Figure(data=go.Choropleth(
    locations=female_percent_per_country['country name'], # Spatial coordinates
    z=female_percent_per_country['female %'], # Data to be color-coded
    locationmode='country names', # set of locations match entries in `locations`
    colorscale='reds',
    colorbar=dict(
        # Nested title form; the flat `titleside` attribute is not accepted by newer plotly releases.
        title=dict(text="female %", side="top"),
        tickmode="array",
        tickvals=list(range(0, 60, 10)),
        ticks="outside"
    )
))

fig.update_layout(title={
        'text': "Kaggle survey female respondents, % in 2019 survey sample per country<br>(for countries with 100+ participants)",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
        
fig.show()

The strange thing is that if we look back and compare the gender proportion with the 2017-2018 surveys, we find that the share of female kagglers from Russia has decreased.

In [None]:
category_names = ['Male', 'Female']

# Look counts up by label so each array is ordered like category_names,
# whichever answer happens to be the most frequent in a given year.
gender_2018 = russian_subset_2018['Q1'].value_counts().reindex(category_names).fillna(0).values
gender_2017 = russian_subset_2017['GenderSelect'].value_counts().reindex(category_names).fillna(0).values
gender_2019 = russian_subset['Q2'].value_counts().reindex(category_names).fillna(0).values

# Percentage of men and women among those two groups, per survey year.
results = {
    '2017': gender_2017 / gender_2017.sum() * 100,
    '2018':  gender_2018 / gender_2018.sum() * 100,
    '2019':  gender_2019 / gender_2019.sum() * 100,
}

survey(results, category_names)
plt.title('Gender proportion among Russian kagglers\nfor 2017-2019 surveys\n', fontsize=20)
plt.show()

# Age proportion for kagglers in Russia

The modal age category, both worldwide and in the Russian subset, is 25-29 years. Compared with the worldwide data set, Russian kagglers include more people from the age group "30-34".

In [None]:
# Transparent bars with a thick outline.
ageplot_params = dict(facecolor=(0, 0, 0, 0), linewidth=5)

# Percentage of respondents per age group, worldwide and in Russia.
worldwide_age_share = multiple_choice_responses['Q1'].value_counts().sort_index() \
                      / len(multiple_choice_responses) * 100
russian_age_share = russian_subset['Q1'].value_counts().sort_index() \
                    / len(russian_subset) * 100
ages_dist = pd.DataFrame(data={'in general': worldwide_age_share,
                               'from Russia': russian_age_share})

plt.figure(figsize=(15,5))
plt.suptitle('Age groups for kagglers...', fontsize=20)

age_panels = [(multiple_choice_responses, '...worldwide', 'gray'),
              (russian_subset, '...from Russia', 'blue')]
for panel_number, (responses, panel_title, outline) in enumerate(age_panels, start=1):
    plt.subplot(1, 2, panel_number)
    plt.title(panel_title, fontsize=14)
    create_countplot(responses, 'Q1', edgecolor=outline, params=ageplot_params)

plt.subplots_adjust(top=0.8)
plt.show()

print('Check for statistical significance in percentage for age groups:')
russian_ages = russian_subset['Q1']
russian_answered = len(russian_ages.dropna())
differentage_groups = []
# One-sample z-test per age group: Russian share against the worldwide share.
for age in sorted(multiple_choice_responses['Q1'].unique()):
    worldwide_share = multiple_choice_responses['Q1'].eq(age).mean()
    z_stat_for_age = proportions_ztest(russian_ages.eq(age).sum(),
                                       russian_answered,
                                       worldwide_share)
    if z_stat_for_age[1] < 0.05:
        differentage_groups.append(age)

print('Age groups: {} — has statistically significant differences in size worldwide and in Russia.'\
      .format(', '.join(differentage_groups)))

It is interesting that in the category "30-34 years" there are fewer women than in other age groups. Female data scientists from Russia seem to be younger, on average, than their male colleagues. 

In [None]:
# Men: transparent bars with an outline; women: filled bars drawn on top.
men_params = dict(facecolor=(0, 0, 0, 0), linewidth=3)
women_params = dict(linewidth=3, color=gender_colors[1])
age_labels = dict(xlabel='age category', ylabel='count')

russian_men = russian_subset[russian_subset['Q2']=='Male']
russian_women = russian_subset[russian_subset['Q2']=='Female']

plt.figure(figsize=(7,5))
create_countplot(russian_men, 'Q1', edgecolor=gender_colors[0],
                 params=men_params, **age_labels)
create_countplot(russian_women, 'Q1', edgecolor='black',
                 params=women_params, **age_labels)

plt.legend(['Men count', 'Women count'])
plt.title('Proportion for men and women \nfrom Russia in age groups\n', fontsize=20)
plt.show()

# Gender salary gap

The first reasonable question is whether it really exists. It is easy to check by plotting the absolute difference between men and women in data science for each salary group. For this study I chose to recode the salary categories of the original survey. I must admit that there is no scientific basis behind this recoding (but I hope that one day there will be fresh open data on data scientists' salaries in Russia). In recoding I relied on my knowledge of the professional field and a rough idea of how salaries are distributed depending on work experience and skills.

It is interesting to note that no Russian female data scientist reported a yearly compensation of more than $70 000, though we can't discuss the real reasons for this absence (and I don't want to speculate).

As you can see in the plot "Differences in salary for Russian kagglers" below, the salary gap definitely exists. After testing for statistical significance, we can say that women are more likely to be among the lower- and middle-paid data scientists, while men are more likely to be paid better. 

In [None]:
salary_labels = ['<$10 000', '$10 000-20 000', '$20 000-70 000', '> $70 000']
# Named differently from the salary_groups mapping dict so that re-running the
# data-loading cell after this one still finds the mapping intact.
salary_percentages = create_percentage_df(russian_subset, 'coded_salary', salary_labels)

plot_interval_diff(salary_percentages, salary_labels, gender_colors, 
                   title='Differences in salary for Russian kagglers')
plt.show()

# Two-sample z-test per salary band: share of women vs share of men in the band.
# Comparing the Russian subset with itself would not test the gender difference.
women_salary = russian_subset.loc[russian_subset['Q2']=='Female', 'coded_salary'].dropna()
men_salary = russian_subset.loc[russian_subset['Q2']=='Male', 'coded_salary'].dropna()

differentage_groups = []
# The top band is skipped, as before.
for salary in salary_labels[:-1]:
    z_stat = proportions_ztest([(women_salary == salary).sum(), (men_salary == salary).sum()],
                               [len(women_salary), len(men_salary)])
    if z_stat[1] < 0.05:
        differentage_groups.append(salary)
        
print('Salary groups: {} — has statistically significant differences in size.'\
      .format(', '.join(differentage_groups)))

Can we find any reasons for salary gap?

The first idea is a difference in work experience in general, and perhaps in educational level in particular. We have already seen that women in Russian data science are more often younger. Maybe they are just young professionals, and that is why they are paid less?

Male data scientists seem to have master's and professional degrees more often. Though the difference in the number of kagglers with a doctoral degree looks promising, it doesn't pass the test for statistical significance. 

In [None]:
education_labels = ['Bachelor’s degree', 'Professional degree', 'Master’s degree', 'Doctoral degree',        
                   'Some college/university study without earning a bachelor’s degree',
                    'No formal education past high school',
                   'I prefer not to answer']
educational_groups = create_percentage_df(russian_subset, 'Q4', education_labels)

# Only the four degree levels are plotted and tested.
plot_interval_diff(educational_groups, education_labels[:-3], gender_colors, 
                   title='Differences in education for Russian kaggler')
plt.show()

# Two-sample z-test per education level: share of women vs share of men.
# Comparing the Russian subset with itself would not test the gender difference.
women_education = russian_subset.loc[russian_subset['Q2']=='Female', 'Q4'].dropna()
men_education = russian_subset.loc[russian_subset['Q2']=='Male', 'Q4'].dropna()

differentage_groups = []
for education in education_labels[:-3]:
    z_stat = proportions_ztest([(women_education == education).sum(), (men_education == education).sum()],
                               [len(women_education), len(men_education)])
    if z_stat[1] < 0.05:
        differentage_groups.append(education)
if not differentage_groups:
    differentage_groups.append('no groups')
        
print('Education groups: {} — has statistically significant differences in size.'\
      .format(', '.join(differentage_groups)))

* * *

**Statistical note**: *Although for education level and in the following questions the tests do not find a statistically significant difference in the size of the groups, it cannot be said for sure that there is no difference. Based on the p-value we can't reject our null hypothesis — but this does not automatically mean that the null hypothesis of "no difference" is true.*

* * *

Could it be that female kagglers are still students and haven't yet finished their master's theses? 

Let's look at the job roles reported in the survey by men and women from Russia in the 'Differences in job role for Russian kagglers' plot below. We can see visual differences but can't speak about them from a statistical point of view. The plot gives some answers but also raises new questions. Female kagglers less often hold the job title of data scientist and more often are research scientists.

In [None]:
role_labels = ['Data Scientist', 'Software Engineer', 'Student', 'Research Scientist',
       'Data Analyst', 'Other', 'Not employed', 'Business Analyst',
       'Product/Project Manager', 'Data Engineer', 'DBA/Database Engineer',
       'Statistician']
# Percentages are computed over all roles first, then the dropped roles are removed.
role_groups = create_percentage_df(russian_subset, 'Q5', role_labels)

roles_to_drop = ['DBA/Database Engineer', 'Statistician', 'Other','Product/Project Manager']
role_groups = role_groups[~role_groups.index.isin(roles_to_drop)]
role_labels = [role for role in role_labels if role not in roles_to_drop]

plot_interval_diff(role_groups, role_labels[:-2], gender_colors, 
                   title='Differences in job role for Russian kagglers')
plt.show()

# Two-sample z-test per job role: share of women vs share of men in the role.
# Comparing the Russian subset with itself would not test the gender difference.
women_roles = russian_subset.loc[russian_subset['Q2']=='Female', 'Q5'].dropna()
men_roles = russian_subset.loc[russian_subset['Q2']=='Male', 'Q5'].dropna()

# The same list is filled and printed; a misspelled second list would leave the
# printed one holding stale results from an earlier cell.
differentage_groups = []
# NOTE(review): the plot shows role_labels[:-2] but the test covers
# role_labels[:-3]; confirm that skipping the last plotted role is intended.
for role in role_labels[:-3]:
    z_stat = proportions_ztest([(women_roles == role).sum(), (men_roles == role).sum()],
                               [len(women_roles), len(men_roles)])
    if z_stat[1] < 0.05:
        differentage_groups.append(role)
if not differentage_groups:
    differentage_groups.append('no groups') 
print('Job roles: {} — has statistically significant differences in size.'\
      .format(', '.join(differentage_groups)))

The plot "Modal salary categories for job roles" below shows that Russian data scientists actually have better salaries than research scientists. There is a lack of data for Russian specialists, which may be the reason why other job titles have a bimodal salary distribution. So the general trend is easier to see in the worldwide part of the plot.

In [None]:
plt.figure(figsize=(15,5))

# Left panel: every respondent; right panel: the Russian subset.
salary_panels = [(multiple_choice_responses, '...worldwide'),
                 (russian_subset, '...in Russia')]
for panel_number, (responses, panel_title) in enumerate(salary_panels, start=1):
    plt.subplot(1, 2, panel_number)
    plot_modal_salary(responses, role_labels)
    plt.title(panel_title, fontsize=14)

plt.subplots_adjust(top=0.8, wspace=0.3)
plt.suptitle('Modal salary categories for job roles\n', fontsize=20)
plt.show()

I suppose that one of the important reasons for the salary gap could be the difference in coding experience, as we can see below in the plot 'Differences in coding experience for Russian kagglers'.

In [None]:
experience_labels = ['I have never written code', '< 1 years', '1-2 years', '3-5 years', 
                     '5-10 years', '10-20 years', '20+ years']
experience_groups = create_percentage_df(russian_subset, 'Q15', experience_labels)

# Collapse every band from three years upwards into a single '3+ years' row.
ex_to_sum = ['3-5 years','5-10 years', '10-20 years', '20+ years']
long_experience_rows = experience_groups[experience_groups.index.isin(ex_to_sum)]
experience_groups.loc['3+ years'] = long_experience_rows.sum()
experience_groups = experience_groups.drop(index=ex_to_sum)

plot_interval_diff(experience_groups, experience_groups.index, gender_colors, 
                   title='Differences in coding experience for Russian kagglers')
plt.show()

Unfortunately, in the Russian subset, only a few respondents satisfied the condition for questions about programming languages or ML algorithms they use on a regular basis. So we can't analyze them. I hope in future surveys there will be more data, and we can study regional specifics.

# Conclusions

1. The Russian Kaggle community has a smaller percentage of women than kagglers worldwide: the female share is at least 1.53 percentage points lower than in the general data set.
2. There is a salary gap connected with gender. 
3. There are several reasons that could explain the salary gap (and some theories we can't test for now). Firstly, women from Russia on Kaggle seem to be younger than Russian male data scientists. Secondly, they often aren't professional data scientists, so their salary grades can be generally lower. Thirdly, female kagglers from Russia have less coding experience than their male colleagues.

So I can't say that this data set provides evidence of real gender inequality. But there are still only a few female data scientists, and there is a great opportunity to encourage more women in my home country to join the field.