In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/f/fe/Flag_of_Egypt.svg/800px-Flag_of_Egypt.svg.png" style="width: 60%; margin: auto; padding-bottom: 30px;">


# Egyptian Kagglers: The State of Data Science in Egypt.


Hello hello, My name is Ahmed Samir, and in this notebook we'll go on a journey to explore what Data Scientists (and Data professionals in general) in Egypt have to tell us.

**As an aspiring Data Scientist, I wanted to shed light on the practices and experience of those already in the field in Egypt, to understand how one should plan the next year's strides. I hope this will be helpful to all of my fellow aspiring Egyptian Data Scientists.**

I've tried making several analyses in the past, but unfortunately I didn't put much effort into any of them. In this one I'll try to put real effort into making it a good analysis, and to learn some new things along the way. 

**The work in this notebook will be incremental, so you might want to check back every other day if you are interested.**

## Reading Survey Data

In [None]:
# Load the four yearly Kaggle survey response files (oldest first).
# NOTE(review): Kaggle survey CSVs typically repeat the question text as the
# first data row - confirm whether that row should be dropped before counting.
df_2018, df_2019, df_2020, df_2021 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/kaggle-survey-2018/multipleChoiceResponses.csv',
        '/kaggle/input/kaggle-survey-2019/multiple_choice_responses.csv',
        '/kaggle/input/kaggle-survey-2020/kaggle_survey_2020_responses.csv',
        '/kaggle/input/kaggle-survey-2021/kaggle_survey_2021_responses.csv',
    )
)

In [None]:
# Preview the first rows of the 2021 survey responses
df_2021.head()

In [None]:
# Preview the first rows of the 2020 survey responses
df_2020.head()

In [None]:
# Preview the first rows of the 2019 survey responses
df_2019.head()

In [None]:
# Preview the first rows of the 2018 survey responses
df_2018.head()

## How many of the respondents were Egyptian this year? 

## And what is the rank of Egypt in terms of respondents count?

In [None]:
# Keep only the 2021 respondents whose country (Q3) is Egypt and report how many there are
egy_df = df_2021[df_2021['Q3'] == 'Egypt']
print("Egyptian Respondents:", len(egy_df))

In [None]:
import matplotlib.pyplot as plt

# Shorten long country names so the x tick labels stay readable.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a selected column is chained assignment, which pandas warns about and
# which does not modify df_2021 when copy-on-write is enabled.
df_2021['Q3'] = df_2021['Q3'].replace({
    'United States of America': 'USA',
    'United Kingdom of Great Britain and Northern Ireland': 'UK'
})

# Number of respondents per country in 2021, largest first
country_cnt_2021 = df_2021['Q3'].value_counts()

# Only the 21 largest countries are drawn
top_countries = country_cnt_2021.iloc[:21]

# Position of the emphasised bar. Both the colour layout below and the larger
# annotation assume the country of interest sits at this (11th) position.
highlight_pos = 10

# Plot figure object
fig, ax = plt.subplots(1, 1, figsize=(14.4, 4), dpi=120)

# One colour per bar: black / white / gold / white / red bands
bar_colors = ['#000000'] * 7 + ['#ffffff'] * 3 + ['#c19400'] + ['#ffffff'] * 3 + ['#cf0921'] * 7
ax.bar(top_countries.index,
       top_countries,
       color=bar_colors,
       edgecolor='black',
       width=0.9)

# Title and subtitle (the subtitle is positioned in data coordinates)
ax.set_title('Number of Respondents per Country',
             fontsize=16, fontfamily='serif', fontweight='bold', color='#4a4a4a')
ax.text(6.5, 7400, 'With Emphasis on Egypt\'s Respondents',
        fontsize=12, fontfamily='serif', color='#4a4a4a')

# Rotate the country labels. tick_params is used instead of set_xticklabels,
# which is only meant to be called after the tick positions have been fixed.
ax.tick_params(axis='x', labelrotation=60)

# Hide tick marks and the y tick labels (bar values are annotated instead)
ax.tick_params(left=False, bottom=False, labelleft=False)

# Remove spines
for spine in ['top', 'right', 'left']:
    ax.spines[spine].set_visible(False)

# Write each count above its bar; the emphasised bar gets a larger font
for i, bar in enumerate(ax.patches):
    ax.annotate(text=bar.get_height(),
                xy=(bar.get_x() + bar.get_width() / 2, bar.get_height() + 300),
                ha='center',
                va='center',
                size=14 if i == highlight_pos else 7);

We can see that the rank of Egypt is the **11th** in this survey, with **482** respondents. Now I don't remember Egypt being close to that rank last year, so let's look at the rank and number of respondents of Egypt in all the previous surveys.

In [None]:
# Define helper functions for preparation of respondents count and metric in every survey
def get_country_respondents_count(df, col, country):
    """Return how many rows of ``df`` have ``country`` in column ``col``."""
    is_country = df[col] == country
    return int(is_country.sum())

def get_country_respondents_rank(df, col, country):
    """Return the 1-based rank of ``country`` by number of rows in ``df[col]``.

    Rank 1 is the value with the most rows. The position is looked up
    directly in the ``value_counts`` index, so the result does not depend on
    the column names ``reset_index`` produces (these changed in pandas 2.0).

    Parameters
    ----------
    df : pandas.DataFrame
        Survey responses.
    col : str
        Name of the column holding the country of each respondent.
    country : str
        Country whose rank is requested.

    Returns
    -------
    int
        1-based rank of ``country``.

    Raises
    ------
    ValueError
        If ``country`` does not appear in ``df[col]``.
    """
    # value_counts sorts by frequency, largest first
    countries_counts = df[col].value_counts()

    if country not in countries_counts.index:
        raise ValueError(f"{country!r} does not appear in column {col!r}")

    # The index of value_counts is unique, so get_loc returns a single position
    return int(countries_counts.index.get_loc(country)) + 1

In [None]:
# Surveys in chronological order, paired with the year each one was run
survey_years = [2018, 2019, 2020, 2021]
dfs_list = [df_2018, df_2019, df_2020, df_2021]

# Collect Egypt's respondent count and rank for every survey
egy_respondents = []
egy_ranks = []
for survey_df in dfs_list:
    egy_respondents.append(get_country_respondents_count(survey_df, 'Q3', 'Egypt'))
    egy_ranks.append(get_country_respondents_rank(survey_df, 'Q3', 'Egypt'))

# One row per survey year
egy_per_yr = pd.DataFrame({
    'year': survey_years,
    'respondents': egy_respondents,
    'rank': egy_ranks,
})

egy_per_yr

In [None]:
# Bar chart of Egypt's respondents per survey year, with Egypt's rank drawn
# as a dotted line on a second y-axis whose direction is reversed

# Initialize plot figure object
fig, ax = plt.subplots(1, 1, figsize=(6, 4), dpi=140)

bar_color = '#c19400'

# Respondents count as bars
ax.bar(egy_per_yr.year,
       egy_per_yr.respondents,
       color=bar_color,
       edgecolor='#000000',
       width=0.8,
       label='Respondents')

# Fixed y range for the bars
ax.set_ylim([0, 650])

# Set axes labels
ax.set_xlabel('Year', fontsize=12)

# One tick per survey year
ax.set_xticks(ticks=egy_per_yr.year)
ax.set_xticklabels(labels=egy_per_yr.year)

# Write each count in the middle of its bar
for bar in ax.patches:
    ax.annotate(text=bar.get_height(),
                xy=(bar.get_x() + bar.get_width() / 2, bar.get_height() / 2),
                ha='center',
                va='center',
                size=10,
                color='#000000')

# Twin axis sharing the x-axis, used for the rank
ax2 = ax.twinx()

# Rank as a dotted line with markers
l2 = ax2.plot(egy_per_yr.year,
              egy_per_yr['rank'],
              color='#4a4a4a',
              linestyle=':',
              linewidth=1.0,
              marker='o',
              markersize=5,
              markerfacecolor='#c19400',
              label='Rank')

# Passing the limits in descending order reverses the axis, so smaller
# (better) ranks are drawn higher; no separate invert_yaxis() call is needed.
ax2.set_ylim([55, 5])

# Title and subtitle, positioned in the data coordinates of the bar axis
ax.text(2017.3, 677, 'Egyptian Presence in Kaggle\'s Surveys',
        fontsize=15, fontfamily='serif', fontweight='bold', color='#4a4a4a')
ax.text(2017.3, 630, 'Respondents and Rank (2018-2021)',
        fontsize=13, fontfamily='serif', color='#4a4a4a')

# Remove every spine except the bottom one on both axes
for axes in (ax, ax2):
    for spine in ('top', 'right', 'left'):
        axes.spines[spine].set_visible(False)

# Hide y tick marks and labels; the values are annotated on the plot instead
ax.tick_params(left=False, labelleft=False)
ax2.tick_params(right=False, labelright=False)

# Write each rank next to its marker (offset by 3 rank units)
for row in egy_per_yr.itertuples():
    ax2.annotate(text=row.rank,
                 xy=(row.year, row.rank - 3),
                 ha='center',
                 va='center',
                 size=10,
                 color='#000000')

# Make legend
fig.legend(loc='center left');



We can see a steady decrease in Egypt's rank number (which is a good thing): Egypt climbs by an average of about 10 places each year, alongside the increase in the number of respondents.

## How Mature is The Egyptian Data Science Field Compared to India and USA? 

We can compare Egypt to them in terms of their proportions of formal educations and how the programming experience and ages are distributed.

For this question, I'll look at Data Scientists and Machine Learning Engineers.

In [None]:
# Define Donut Chart helper function
def plt_donut(data, colors_dict, labels=True, fontsize=7, linewidth=2):
    """Draw a donut chart of ``data`` on the current matplotlib axes.

    Parameters
    ----------
    data : pandas.Series
        Values to plot, indexed by category. Entries that are not strictly
        positive are dropped before plotting.
    colors_dict : dict
        Maps every category in ``data.index`` to a matplotlib colour.
    labels : bool, default True
        If true, each wedge is labelled with its share of the total,
        formatted as a percentage with two decimals.
    fontsize : int, default 7
        Font size of the wedge labels.
    linewidth : int, default 2
        Width of the white line separating the wedges.

    Returns
    -------
    list
        The wedge patches created by ``plt.pie``, in the order of the
        remaining entries of ``data``.

    Raises
    ------
    KeyError
        If a remaining category has no entry in ``colors_dict``.
    """
    data = data[data > 0]

    # One colour per remaining wedge, in the same order as the data
    colors = [colors_dict[category] for category in data.index]

    pie_kwargs = {
        'colors': colors,
        'wedgeprops': {'linewidth': linewidth, 'edgecolor': 'white'},
        'textprops': {'fontsize': fontsize},
    }
    if labels:
        pie_kwargs['labels'] = ['{:.2f}%'.format(share * 100)
                                for share in (data / data.sum()).values]

    patches, _ = plt.pie(data, **pie_kwargs)

    # A white disc over the centre turns the pie into a donut
    hole = plt.Circle((0, 0), 0.7, color='white')
    plt.gcf().gca().add_artist(hole)
    return patches

In [None]:
# Create dataframe containing only Data Scientists and ML Engineers in Egypt, USA and India
india_usa_egy_ds_ml_df = df_2021[(df_2021['Q3'].isin(['Egypt', 'USA', 'India'])) &
                              (df_2021['Q5'].isin(['Data Scientist', 'Machine Learning Engineer']))]

# Initialize figure
fig = plt.figure(figsize=(12, 4), dpi=140)

# Set figure title
fig.suptitle('How Mature is The Egyptian Data Science Field Compared to India and The USA?', fontfamily='serif', fontsize=18)

# Colour of each education level; everything else is grouped under 'Other'
colors_dict = {'Bachelor’s degree': '#c19400',
               'Master’s degree': '#000000',
               'Doctoral degree': '#cf0921',
               'Other': 'silver'}

# Display order of the education levels (insertion order of the dict above)
edu_levels = list(colors_dict)

# One donut per country, drawn in subplots 2-4 (subplot 1 holds the legend)
for i, (country, country_ds_ml_df) in enumerate(india_usa_egy_ds_ml_df.groupby('Q3')):

    # Share of each formal education answer among the country's respondents
    country_edu_share = country_ds_ml_df['Q4'].value_counts() / country_ds_ml_df.shape[0]

    # Relabel every answer that has no colour of its own as 'Other'
    country_edu_share.index = [education if education in colors_dict else 'Other'
                               for education in country_edu_share.index]

    # Merge the now-duplicated 'Other' labels, then put the levels in display
    # order. reindex matches by label, so each share keeps its own level
    # (overwriting the index with a list would only rename the alphabetically
    # sorted rows) and a level with no respondents becomes 0.
    country_edu_share = (country_edu_share
                         .groupby(level=0).sum()
                         .reindex(edu_levels, fill_value=0))

    # Draw Donut Chart of country's education proportions
    plt.subplot(1, 4, i+2)
    plt_donut(country_edu_share, colors_dict)

    # Set country as title of each donut
    plt.title(country, fontfamily='serif', fontweight='bold')


# Adjust Plot
plt.subplot(1, 4, 1)
plt.axis('off')
fig.tight_layout()
fig.subplots_adjust(wspace=0.2, top=0.8)

# Legend built from the colour mapping itself, so it does not depend on which
# wedges the last donut happened to contain
legend_handles = [plt.Rectangle((0, 0), 1, 1, facecolor=colors_dict[level], edgecolor='white')
                  for level in edu_levels]
plt.legend(legend_handles, edu_levels, loc='best');

It is obvious how the educational landscapes are different between the 3 countries. If we assume that the USA is the most developed in Data Science, we can see that this development is correlated with larger proportions of graduates, which compose almost **82%** of the respondents. 

Also Master's degree holding Data Scientists are most abundant in the US (**24.5%** of respondents), while in Egypt (**5%**) and India (**4%**) they aren't as much.

## What about the programming experience of Data Scientists in Egypt?

In [None]:
# Stacked horizontal bar chart: share of each programming-experience answer (Q6)
# per country among the Data Scientists / ML Engineers selected above

# Order of the experience answers (stacking order) and of the countries (bars)
programming_experience_list = ['I have never written code', '< 1 years', '1-3 years', '3-5 years', '5-10 years', '10-20 years', '20+ years']
countries_list = ['Egypt', 'India', 'USA']
tab_colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', 'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive']

# Respondent counts with experience answers as rows and countries as columns.
# reindex enforces the custom order and fills combinations that have no
# respondents with 0 instead of failing on a missing label.
prog_countries_df = (
    pd.crosstab(india_usa_egy_ds_ml_df['Q6'], india_usa_egy_ds_ml_df['Q3'])
    .reindex(index=programming_experience_list, columns=countries_list, fill_value=0)
)

# Convert counts to percentages of each country's total
prog_countries_df = (prog_countries_df * 100) / prog_countries_df.sum()

# Plot Parameters
bar_height = 0.85
r = [0, 1, 2]
fig, ax = plt.subplots(1, 1, figsize=(15, 5), dpi=140)

for i, prog_exp in enumerate(programming_experience_list):

    # Each segment starts where the segments of the previous answers end
    left = prog_countries_df.iloc[:i].sum().values

    ax.barh(y=r,
            width=prog_countries_df.loc[prog_exp],
            left=left,
            color=tab_colors[i],
            edgecolor='#000000',
            height=bar_height,
            label=prog_exp)

# Axis limits (a little room is left above the bars for the subtitle)
ax.set_xlim(-1, 102)
ax.set_ylim(-0.5, 2.7)

# Legend outside the plot area, to the right
ax.legend(loc='upper left', bbox_to_anchor=(1,1), ncol=1)

# Remove axis
for spine in ['top', 'left', 'right']:
    ax.spines[spine].set_visible(False)

# Reset axes tick labels
ax.tick_params(left=False)
ax.set_yticks(r)
ax.set_yticklabels(countries_list, fontsize=12)
ax.set_xticks([0, 25, 50, 75, 100])
ax.set_xticklabels(['0%', '25%', '50%', '75%', '100%'])

# Set title
ax.set_title('How Mature is The Egyptian Data Science Field Compared to India and The USA?', fontfamily='serif', fontsize=16,
             fontweight='bold', color='#4a4a4a')
ax.text(31, 2.56, 'Comparing Programming Experience',
        fontsize=15, fontfamily='serif', color='#4a4a4a')

# Show graphic
plt.show()

We can see that the maturity of individuals in terms of their programming experience in the US is significantly higher than that of both Egypt and India. On the other hand, The majority of Data Scientists in Egypt have been coding for 3 years or less, and we can see a similar structure of experience in India.

#### This serves to show that the Data Science field in Egypt is still premature, and there is plenty of room for expansion and maturation of the field along with the individuals in it.

## What about the ages of Data Scientists in Egypt?

With the information we have so far, we can expect the Egyptian respondents to be younger than those in the US. 

In [None]:
# Stacked horizontal bar chart: share of each age group (Q1) per country

# Order of the age groups (stacking order) and of the countries (bars)
age_groups = ['18-21', '22-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59', '60-69', '70+']
countries_list = ['Egypt', 'USA']
tab_colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', 'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'white', 'black']

# Respondent counts with age groups as rows and countries as columns.
# NOTE(review): this counts every respondent in df_2021, not only the
# Data Scientist / ML Engineer subset used by the previous charts - confirm
# this is intended.
# reindex keeps exactly the listed age groups, in the listed order, so the
# positional stacking below always matches the label-based widths.
age_countries_df = (
    df_2021.pivot_table(index='Q1', columns='Q3', aggfunc='size', fill_value=0)[countries_list]
    .reindex(age_groups, fill_value=0)
)

# Convert counts to percentages of each country's total
age_countries_df = (age_countries_df * 100) / age_countries_df.sum()

# Plot Parameters
bar_height = 0.85
r = [0, 1]
fig, ax = plt.subplots(1, 1, figsize=(15, 5), dpi=140)

for i, age_group in enumerate(age_groups):

    # Each segment starts where the segments of the younger groups end
    left = age_countries_df.iloc[:i].sum().values

    ax.barh(y=r,
            width=age_countries_df.loc[age_group],
            left=left,
            color=tab_colors[i],
            edgecolor='#000000',
            height=bar_height,
            label=age_group)

# Axis limits (a little room is left above the bars for the subtitle)
ax.set_xlim(-1, 102)
ax.set_ylim(-0.5, 1.7)

# Legend outside the plot area, to the right
ax.legend(loc='upper left', bbox_to_anchor=(1,1), ncol=1)

# Remove axis
for spine in ['top', 'left', 'right']:
    ax.spines[spine].set_visible(False)

# Reset axes tick labels
ax.tick_params(left=False)
ax.set_yticks(r)
ax.set_yticklabels(countries_list, fontsize=12)
ax.set_xticks([0, 25, 50, 75, 100])
ax.set_xticklabels(['0%', '25%', '50%', '75%', '100%'])

# Set title
ax.set_title('How Mature is The Egyptian Data Science Field Compared to India and The USA?', fontfamily='serif', fontsize=16,
             fontweight='bold', color='#4a4a4a')
ax.text(38, 1.56, 'Comparing Age Groups',
        fontsize=15, fontfamily='serif', color='#4a4a4a')

# Show graphic
plt.show()

It's plainly obvious that the sample of Egyptian Data Scientists is small, but the sheer difference in age groups, programming experience and level of formal education between the respondents in Egypt and the US who claim to be Data Scientists is mind-boggling. 

**Here we see in Egypt, a field that is almost completely comprised by people below 35 years old, while in the US that age group doesn't even comprise 50% of the sample.**


## Is this good news? Probably yes.
I hope that the level of prematurity we see in this survey signals a potential for growth that we can witness the next years.

We can certainly see the government's attempts to boost the presence of Data Science in Egypt through scholarships like the <a href="https://mcit.gov.eg/en/Human_Capacity/MCIT/Practical_Data_Scientist_Academy_Amazon_Web_Services"> MCIT Practical Data Scientist Academy with AWS</a>, and <a href="https://egfwd.com/">ITIDA's egyFWD with Udacity</a>.


## Now let's explore how Egyptian Data Scientists do their work


### Programming Languages

In [None]:
def melt_columns_starting_with(df, col_start):
    """Melt the columns of ``df`` whose name starts with ``col_start``.

    Returns a long-format frame with the columns ``variable`` (original
    column name) and ``value``; rows containing a null are dropped.
    """
    wanted = df.columns.str.startswith(col_start)
    return df.loc[:, wanted].melt().dropna()

In [None]:
# Colour palette used by the charts in this notebook: bar fill, title text, subtitle text
bar_color, title_color, subtitle_color = '#c19400', '#404040', '#4a4a4a'


In [None]:
# Initialize plot figure object
fig, ax = plt.subplots(1, 1, figsize=(10, 4), dpi=140)

# Data Scientists and Machine Learning Engineers whose country (Q3) is Egypt
egy_ds_ml_df = df_2021[(df_2021['Q3'] == 'Egypt') &
                       (df_2021['Q5'].isin(['Data Scientist', 'Machine Learning Engineer']))]

# Count how often each answer appears across all columns starting with 'Q7'
egy_prog_langs_count = melt_columns_starting_with(egy_ds_ml_df, 'Q7')['value'].value_counts()

ax.bar(egy_prog_langs_count.index,
       egy_prog_langs_count,
       color=bar_color,
       edgecolor='#000000',
       width=0.9)

# Fixed y range that leaves room for the annotations and the subtitle
ax.set_ylim([0, 130])

# Shrink the x tick labels. tick_params is used instead of set_xticklabels,
# which is only meant to be called after the tick positions have been fixed.
ax.tick_params(axis='x', labelsize=9)
ax.set_yticks([])

# Write each count above its bar
for bar in ax.patches:
    ax.annotate(text=bar.get_height(),
                xy=(bar.get_x() + bar.get_width() / 2, bar.get_height() + 5),
                ha='center',
                va='center',
                size=10,
                color='#000000')

# Hide spines
for spine in ['top', 'right', 'left']:
    ax.spines[spine].set_visible(False)

# Hide ticks
ax.tick_params(left=False, bottom=False)

# Set title and subtitle (the subtitle is positioned in data coordinates)
ax.set_title('What programming languages do you use on a regular basis?', fontfamily='serif', fontsize=13,
             fontweight='bold', color=title_color)
ax.text(2.7, 123, 'Egyptian Data Scientists Answered',
        fontsize=12, fontfamily='serif', color=subtitle_color);

Python dominates as expected, followed by SQL and then ***C++ and Java?***. I've honestly never heard of anyone recommending C++ or Java for Data Science. Maybe for learning data structures and algorithms, but for Data Science, I have never heard of it. This can make a nice detour to see the interaction between the users of these languages and how they use other technologies.

### We can see that R seems to be significantly under represented with our Egyptian Data Scientists sample. But what about the other Data professionals? 

In [None]:
# Initialize plot figure object
fig, ax = plt.subplots(1, 1, figsize=(10, 3.5), dpi=140)

# Every respondent whose country (Q3) is Egypt, whatever their role.
# .copy() makes this an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
egy_df = df_2021[df_2021['Q3'] == 'Egypt'].copy()

# Count how often each answer appears across all columns starting with 'Q7'
egy_prog_langs_count = melt_columns_starting_with(egy_df, 'Q7')['value'].value_counts()

ax.bar(egy_prog_langs_count.index,
       egy_prog_langs_count,
       color=bar_color,
       edgecolor='#000000',
       width=0.9)

# Fixed y range that leaves room for the annotations and the subtitle
ax.set_ylim([0, 460])

# Shrink the x tick labels. tick_params is used instead of set_xticklabels,
# which is only meant to be called after the tick positions have been fixed.
ax.tick_params(axis='x', labelsize=9)
ax.set_yticks([])

# Write each count above its bar
for bar in ax.patches:
    ax.annotate(text=bar.get_height(),
                xy=(bar.get_x() + bar.get_width() / 2, bar.get_height() + 15),
                ha='center',
                va='center',
                size=10,
                color='#000000')

# Hide spines
for spine in ['top', 'right', 'left']:
    ax.spines[spine].set_visible(False)

# Hide ticks
ax.tick_params(left=False, bottom=False)

# Set title and subtitle (the subtitle is positioned in data coordinates)
ax.set_title('What programming languages do you use on a regular basis?', fontfamily='serif', fontsize=13,
             fontweight='bold', color=title_color)
ax.text(2.7, 440, 'Egyptian Data Professionals Answered',
        fontsize=12, fontfamily='serif', color=subtitle_color);

We can still see that the popularity of R is below languages that aren't particularly used in Data Science. And since R is the Python alternative, I would say that you shouldn't learn R because it seems 8x less popular than Python (*even though we use R for data analysis at my job right now hahaha*).

The dominance of **Python** makes me curious whether any data role prefers something else. Let's take a look at the distribution of different roles among the respondents, then look into which languages each role prefers.


### Distribution of Roles among Egyptian Respondents

In [None]:
# Initialize plot figure object
fig, ax = plt.subplots(1, 1, figsize=(10, 4), dpi=140)

# Respondents per role (Q5); reversed so the most common role is drawn at the top
egy_roles_count = egy_df['Q5'].value_counts()[::-1]

ax.barh(egy_roles_count.index,
        egy_roles_count,
        color=bar_color,
        edgecolor='#000000',
        height=0.8)

# Shrink the y tick labels. tick_params is used instead of set_yticklabels,
# which is only meant to be called after the tick positions have been fixed.
ax.tick_params(axis='y', labelsize=8)
ax.set_xticks([])

# Write each count just past the end of its bar
for bar in ax.patches:
    ax.annotate(text=bar.get_width(),
                xy=(bar.get_width() + 2, bar.get_y() + bar.get_height() / 2),
                ha='center',
                va='center',
                size=10,
                color='#000000')

# Hide spines
for spine in ['top', 'bottom', 'right', 'left']:
    ax.spines[spine].set_visible(False)

# Hide ticks
ax.tick_params(left=False, bottom=False)

# Title and subtitle, positioned in data coordinates above the bars
ax.text(12, 16, 'How are Roles Distributed among Respondents?', fontfamily='serif', fontsize=14,
        fontweight='bold', color=title_color)
ax.text(45, 15, 'In Egypt',
        fontsize=12, fontfamily='serif', color='#4a4a4a');

We can do some cleaning to make this data more comprehensible, as there are some roles with a low number of respondents, and they aren't particularly important since we are more interested in Data Scientists.

So we can make **Student** and **Currently not employed** into one category, and also we can merge anything with less than 10 respondents into the **Other** category.

In [None]:
# Add a simplified role column: 'Student' and 'Currently not employed' become
# 'Not employed', and the listed small roles are folded into 'Other'.
# assign() returns a new frame, which avoids writing into a filtered slice of
# df_2021 (the cause of pandas' SettingWithCopyWarning).
egy_df = egy_df.assign(
    Q5_cleaned=egy_df['Q5'].replace({'Currently not employed': 'Not employed',
                                     'Student': 'Not employed',
                                     'Program/Project Manager': 'Other',
                                     'Developer Relations/Advocacy': 'Other',
                                     'Product Manager': 'Other',
                                     'Statistician': 'Other',
                                     'DBA/Database Engineer': 'Other'})
)

### Now let's look at how languages are utilized by different roles

In [None]:
# Keep the columns starting with 'Q7' plus the cleaned role column
role_lang_df = egy_df.loc[:, (egy_df.columns.str.startswith('Q7')) | (egy_df.columns.isin(['Q5_cleaned']))]

# Melt from wide to long format (one row per role / answer pair) and drop rows with nulls
role_lang_long = pd.melt(role_lang_df, id_vars=['Q5_cleaned']).dropna()

# Number of times each answer was given, per role
role_lang_counts = pd.pivot_table(role_lang_long, index=['Q5_cleaned'], columns=['value'],
                                  aggfunc='size', fill_value=0)

# Normalize each row so the values of one role sum to 1
role_lang_share = role_lang_counts.div(role_lang_counts.sum(axis=1), axis=0)

fig, ax = plt.subplots(1, 1, figsize=(10, 4), dpi=140)

# Draw on the axes created above rather than relying on the current axes
sns.heatmap(role_lang_share,
            xticklabels=True,
            yticklabels=True,
            cmap='YlOrBr',
            annot=True,
            linewidths=0.005,
            linecolor='#4a4a4a',
            annot_kws={"fontsize":10},
            fmt='.3f',
            cbar=False,
            ax=ax)

# Title and subtitle, positioned in data coordinates above the heatmap
ax.text(2.2, -1.3, 'Who is using What in Egypt?', fontname='serif', fontsize=25, color=title_color)
ax.text(4, -0.5, 'Programming Languages', fontname='serif', fontsize=15, color="#4a4a4a")
ax.set_ylabel('Role', fontname='serif', fontsize=15, color=subtitle_color)
ax.set_xlabel('Language', fontname='serif', fontsize=15, color=subtitle_color)
plt.xticks(rotation=0, ha='center', fontsize=8)
plt.yticks(fontsize=8)
plt.show()

### Python
#### We can see that in terms of **Python** usage, *Data Scientists* use it the most, followed by *Data Analysts*, *Data Engineers* then *ML Engineers*.

### R
#### The respondents with the highest usage of **R** are *Data Analysts*, where only around 8% of them use it, and the rest of the roles don't exceed 4%.

### SQL
#### *Data Scientists*, *Data Engineers* and *Data Analysts* are the roles that use **SQL** the most.

### C++, C and Java
#### C++ seems to be prominent around *Data Scientists*, *Data Engineers* and *ML Engineers* which might indicate that these roles are more grounded in Software Engineering than the other roles. And through some digging, I found out that some <a href="https://www.smartdatacollective.com/c-plus-useful-for-data-science-applications/">*Data Scientists*</a> are actually advocating the use of C++, but I have never used it myself, so maybe we should keep an open mind to learning it. 



### Recommended Programming Language for Aspiring Data Scientists

For this question, I'll look into the whole sample of respondents.

In [None]:
# Number of Egyptian respondents giving each answer to Q8
egy_rec_lang = egy_df['Q8'].value_counts()

# Initialize plot figure object
fig, ax = plt.subplots(1, 1, figsize=(10, 3.5), dpi=140)

ax.bar(egy_rec_lang.index,
       egy_rec_lang,
       color=bar_color,
       edgecolor='#000000',
       width=0.9)

# Fixed y range that leaves room for the annotations and the subtitle
ax.set_ylim([0, 400])

# Shrink the x tick labels. tick_params is used instead of set_xticklabels,
# which is only meant to be called after the tick positions have been fixed.
ax.tick_params(axis='x', labelsize=9)
ax.set_yticks([])

# Write each count above its bar
for bar in ax.patches:
    ax.annotate(text=bar.get_height(),
                xy=(bar.get_x() + bar.get_width() / 2, bar.get_height() + 12),
                ha='center',
                va='center',
                size=10,
                color='#000000')

# Hide spines
for spine in ['top', 'right', 'left']:
    ax.spines[spine].set_visible(False)

# Hide ticks
ax.tick_params(left=False, bottom=False)

# Set title and subtitle (the subtitle is positioned in data coordinates)
ax.set_title('What programming language would you recommend an aspiring data scientist to learn first?', fontfamily='serif', fontsize=13,
             fontweight='bold', color=title_color)
ax.text(2.5, 380, 'Egyptian Data Professionals Answered',
        fontsize=12, fontfamily='serif', color='#4a4a4a');

Again **Python** is the first recommendation, so Egypt doesn't break the norm here. Most probably those who recommend **SQL** are more oriented towards Business Intelligence, while those who recommend **C++** are oriented towards Software Engineering. **R** shows some presence compared to **C++**, but is nowhere near **Python**, which just shows how unpopular it is in Egypt.

### Favorite IDEs

In [None]:
# Initialize plot figure object
fig, ax = plt.subplots(1, 1, figsize=(10, 4), dpi=140)

# Count how often each answer appears across all columns starting with 'Q9';
# reversed so the most common answer is drawn at the top
egy_ide_count = melt_columns_starting_with(egy_ds_ml_df, 'Q9')['value'].value_counts()[::-1]

ax.barh(egy_ide_count.index,
        egy_ide_count,
        color=bar_color,
        edgecolor='#000000',
        height=0.8)

# Fixed axis ranges that leave room for the annotations and the title
ax.set_xlim([0, 100])
ax.set_ylim([-1, 13.5])

# Shrink the y tick labels. tick_params is used instead of set_yticklabels,
# which is only meant to be called after the tick positions have been fixed.
ax.tick_params(axis='y', labelsize=6)
ax.set_xticks([])

# Write each count just past the end of its bar
for bar in ax.patches:
    ax.annotate(text=bar.get_width(),
                xy=(bar.get_width() + 2, bar.get_y() + bar.get_height() / 2),
                ha='center',
                va='center',
                size=10,
                color='#000000')

# Hide spines
for spine in ['top', 'bottom', 'right', 'left']:
    ax.spines[spine].set_visible(False)

# Hide ticks
ax.tick_params(left=False, bottom=False)

# Title and subtitle, positioned in data coordinates above the bars
ax.text(-10, 14, 'Which of the following IDE\'s do you use on a regular basis?', fontfamily='serif', fontsize=14,
        fontweight='bold', color=title_color)
ax.text(14, 13, 'Egyptian Data Scientists Answered',
        fontsize=12, fontfamily='serif', color='#4a4a4a');

### Hosted Notebook Products

In [None]:
# Initialize plot figure object
fig, ax = plt.subplots(1, 1, figsize=(10, 5), dpi=140)

# Count how often each answer appears across all columns starting with 'Q10';
# reversed so the most common answer is drawn at the top
egy_notebook_count = melt_columns_starting_with(egy_ds_ml_df, 'Q10')['value'].value_counts()[::-1]

ax.barh(egy_notebook_count.index,
        egy_notebook_count,
        color=bar_color,
        edgecolor='#000000',
        height=0.8)

# Fixed axis ranges that leave room for the annotations and the title
ax.set_xlim([0, 75])
ax.set_ylim([-1, 17])

# Shrink the y tick labels. tick_params is used instead of set_yticklabels,
# which is only meant to be called after the tick positions have been fixed.
ax.tick_params(axis='y', labelsize=6)
ax.set_xticks([])

# Write each count just past the end of its bar
for bar in ax.patches:
    ax.annotate(text=bar.get_width(),
                xy=(bar.get_width() + 1.5, bar.get_y() + bar.get_height() / 2),
                ha='center',
                va='center',
                size=10,
                color='#000000')

# Hide spines
for spine in ['top', 'bottom', 'right', 'left']:
    ax.spines[spine].set_visible(False)

# Hide ticks
ax.tick_params(left=False, bottom=False)

# Title and subtitle, positioned in data coordinates above the bars
ax.text(-10, 17,'Which of the following hosted notebook products do you use on a regular basis?', fontfamily='serif', fontsize=14,
        fontweight='bold', color=title_color)
ax.text(17, 16.1, 'Egyptian Data Scientists Answered',
        fontsize=12, fontfamily='serif', color='#4a4a4a');

### Computing Platforms

In [None]:
# Horizontal bar chart of the most-used computing platform (Q11) across all
# Egyptian respondents in egy_df.
fig, ax = plt.subplots(1, 1, figsize=(10, 5), dpi=140);

# value_counts() sorts descending; reversed so the most common answer ends up
# at the top (barh draws the first category at the bottom).
egy_platforms_count = egy_df['Q11'].value_counts()[::-1]

ax.barh(egy_platforms_count.index,
        egy_platforms_count,
        color=bar_color,
        edgecolor='#000000',
        height=0.9);

# Only the label font size changes; tick_params avoids re-setting labels on an
# axis without fixed tick locations (set_yticklabels is discouraged there and
# can emit a FixedFormatter/FixedLocator warning).
ax.tick_params(axis='y', labelsize=8);
ax.set_xticks([]);

# Print each bar's count just past its right-hand end, vertically centred
for bar in ax.patches:
    ax.annotate(text=bar.get_width(),
                xy=(bar.get_width() + 8, bar.get_y() + bar.get_height() / 2),
                ha='center',
                va='center',
                size=10,
                color='#000000')

# Remove the frame and the tick marks
for spine in ['top', 'bottom', 'right', 'left']:
    ax.spines[spine].set_visible(False)
ax.tick_params(left=False, bottom=False);

# Title and subtitle, hand-positioned in data coordinates
ax.text(-85, 6.3, 'What type of computing platform do you use most often for your data science projects?',
        fontsize=15, fontfamily='serif', color=title_color, fontweight='bold');
ax.text(70, 5.9, 'Egyptian Data Professionals Answered',
        fontsize=13, fontfamily='serif', color='#4a4a4a');

### Specialized Hardware

In [None]:
# Horizontal bar chart of specialized hardware usage (Q12) among Egyptian
# data scientists / ML engineers.
fig, ax = plt.subplots(1, 1, figsize=(10, 4), dpi=140);

# NOTE(review): this reuses the name `egy_notebook_count` from the hosted
# notebook chart although it now holds the Q12 hardware counts; the name is
# kept so any later reference keeps working.
# melt_columns_starting_with is defined earlier in the notebook; presumably it
# stacks every Q12* answer column into one 'value' column -- confirm there.
# Reversed so the most common answer is drawn at the top.
egy_notebook_count = melt_columns_starting_with(egy_ds_ml_df, 'Q12')['value'].value_counts()[::-1]

ax.barh(egy_notebook_count.index,
        egy_notebook_count,
        color=bar_color,
        edgecolor='#000000',
        height=0.9);

# Leave room on the right for the count labels
ax.set_xlim([0, 55]);

# Only the label font size changes; tick_params avoids re-setting labels on an
# axis without fixed tick locations (set_yticklabels is discouraged there and
# can emit a FixedFormatter/FixedLocator warning).
ax.tick_params(axis='y', labelsize=7);
ax.set_xticks([]);

# Print each bar's count just past its right-hand end, vertically centred
for bar in ax.patches:
    ax.annotate(text=bar.get_width(),
                xy=(bar.get_width() + 1, bar.get_y() + bar.get_height() / 2),
                ha='center',
                va='center',
                size=10,
                color='#000000')

# Remove the frame and the tick marks
for spine in ['top', 'bottom', 'right', 'left']:
    ax.spines[spine].set_visible(False)
ax.tick_params(left=False, bottom=False);

# Title and subtitle, hand-positioned in data coordinates
ax.text(0.7, 6.2, 'Which types of specialized hardware do you use on a regular basis?',
        fontsize=13, fontfamily='serif', color=title_color, fontweight='bold');
ax.text(13.4, 5.8, 'Egyptian Data Scientists Answered',
        fontsize=11, fontfamily='serif', color='#4a4a4a');

### Usage of TPUs

In [None]:
# Horizontal bar chart of TPU usage frequency (Q13) among Egyptian
# data scientists / ML engineers.
fig, ax = plt.subplots(1, 1, figsize=(10, 3), dpi=140);

# NOTE(review): this reuses the name `egy_platforms_count` from the computing
# platform chart although it now holds the Q13 counts; the name is kept so any
# later reference keeps working.
# Reversed so the most common answer is drawn at the top.
egy_platforms_count = egy_ds_ml_df['Q13'].value_counts()[::-1]

ax.barh(egy_platforms_count.index,
        egy_platforms_count,
        color=bar_color,
        edgecolor='#000000',
        height=0.9);

# Leave room on the right for the count labels
ax.set_xlim([0, 70]);

# Only the label font size changes; tick_params avoids re-setting labels on an
# axis without fixed tick locations (set_yticklabels is discouraged there and
# can emit a FixedFormatter/FixedLocator warning).
ax.tick_params(axis='y', labelsize=10);
ax.set_xticks([]);

# Print each bar's count just past its right-hand end, vertically centred
for bar in ax.patches:
    ax.annotate(text=bar.get_width(),
                xy=(bar.get_width() + 2, bar.get_y() + bar.get_height() / 2),
                ha='center',
                va='center',
                size=10,
                color='#000000')

# Remove the frame and the tick marks
for spine in ['top', 'bottom', 'right', 'left']:
    ax.spines[spine].set_visible(False)
ax.tick_params(left=False, bottom=False);

# Title and subtitle, hand-positioned in data coordinates
ax.text(-9.7, 5.3, 'Approximately how many times have you used a TPU (tensor processing unit)?',
        fontsize=15, fontfamily='serif', color=title_color, fontweight='bold');
ax.text(16, 4.8, 'Egyptian Data Scientists Answered',
        fontsize=13, fontfamily='serif', color='#4a4a4a');

### Data Visualization Tools

In [None]:
# Horizontal bar chart of data-visualization libraries/tools (Q14) across all
# Egyptian respondents in egy_df.
fig, ax = plt.subplots(1, 1, figsize=(10, 5), dpi=140);

# melt_columns_starting_with is defined earlier in the notebook; presumably it
# stacks every Q14* answer column into one 'value' column -- confirm there.
# Reversed so the most common answer is drawn at the top.
egy_dataviz_count = melt_columns_starting_with(egy_df, 'Q14')['value'].value_counts()[::-1]

ax.barh(egy_dataviz_count.index,
        egy_dataviz_count,
        color=bar_color,
        edgecolor='#000000',
        height=0.8);

# Only the label font size changes; tick_params avoids re-setting labels on an
# axis without fixed tick locations (set_yticklabels is discouraged there and
# can emit a FixedFormatter/FixedLocator warning).
ax.tick_params(axis='y', labelsize=8);
ax.set_xticks([]);

# Print each bar's count just past its right-hand end, vertically centred
for bar in ax.patches:
    ax.annotate(text=bar.get_width(),
                xy=(bar.get_width() + 8, bar.get_y() + bar.get_height() / 2),
                ha='center',
                va='center',
                size=10,
                color='#000000')

# Remove the frame and the tick marks
for spine in ['top', 'bottom', 'right', 'left']:
    ax.spines[spine].set_visible(False)
ax.tick_params(left=False, bottom=False);

# Title and subtitle, hand-positioned in data coordinates
ax.text(0, 12.5, 'What data visualization libraries or tools do you use on a regular basis?',
        fontsize=13, fontfamily='serif', color=title_color, fontweight='bold');
ax.text(85.9, 11.9, 'Egyptian Data Professionals Answered',
        fontsize=11, fontfamily='serif', color='#4a4a4a');

### Experience with Machine Learning

In [None]:
# Horizontal bar chart of years of machine-learning experience (Q15) among
# Egyptian data scientists / ML engineers.
fig, ax = plt.subplots(1, 1, figsize=(10, 4), dpi=140);

# value_counts() sorts descending; reversed so the most common answer ends up
# at the top (barh draws the first category at the bottom).
egy_ml_exp_count = egy_ds_ml_df['Q15'].value_counts()[::-1]

ax.barh(egy_ml_exp_count.index,
        egy_ml_exp_count,
        color=bar_color,
        edgecolor='#000000',
        height=0.8);

# Only the label font size changes; tick_params avoids re-setting labels on an
# axis without fixed tick locations (set_yticklabels is discouraged there and
# can emit a FixedFormatter/FixedLocator warning).
ax.tick_params(axis='y', labelsize=9);
ax.set_xticks([]);

# Print each bar's count just past its right-hand end, vertically centred
for bar in ax.patches:
    ax.annotate(text=bar.get_width(),
                xy=(bar.get_width() + 1.2, bar.get_y() + bar.get_height() / 2),
                ha='center',
                va='center',
                size=10,
                color='#000000')

# Remove the frame and the tick marks
for spine in ['top', 'bottom', 'right', 'left']:
    ax.spines[spine].set_visible(False)
ax.tick_params(left=False, bottom=False);

# Title and subtitle, hand-positioned in data coordinates
ax.text(0, 7.2, 'For how many years have you used machine learning methods?',
        fontsize=15, fontfamily='serif', color=title_color, fontweight='bold');
ax.text(14.3, 6.75, 'Egyptian Data Scientists Answered',
        fontsize=13, fontfamily='serif', color='#4a4a4a');

#### The answer to this question is really weird. The subset that I used for this visualization is supposedly made up of professional Data Scientists and Machine Learning Engineers, so how can the majority of them have only used ML methods in the past year?

It could mean that they have only been hired as Data Scientists or ML Engineers in the past year or so, and that's why their real experience with ML methods has barely reached the one-year mark.

This goes hand in hand with the assumption that the field is budding, and therefore the majority of the jobs have only been available for these people in the past 1-2 years.

#### And what about the people who don't use ML methods at all? Could they be the Data Scientists who are mainly involved with the analytics side (dashboards and maybe some statistical inference)? Sadly their number is really low, but we could still look into them to see if we can find anything.

### Machine Learning Frameworks

In [None]:
# Horizontal bar chart of machine-learning frameworks (Q16) among Egyptian
# data scientists / ML engineers.
fig, ax = plt.subplots(1, 1, figsize=(10, 5), dpi=140);

# NOTE(review): this reuses the name `egy_dataviz_count` from the data
# visualization chart although it now holds the Q16 framework counts; the name
# is kept so any later reference keeps working.
# melt_columns_starting_with is defined earlier in the notebook; presumably it
# stacks every Q16* answer column into one 'value' column -- confirm there.
# Reversed so the most common answer is drawn at the top.
egy_dataviz_count = melt_columns_starting_with(egy_ds_ml_df, 'Q16')['value'].value_counts()[::-1]

ax.barh(egy_dataviz_count.index,
        egy_dataviz_count,
        color=bar_color,
        edgecolor='#000000',
        height=0.8);

# Only the label font size changes; tick_params avoids re-setting labels on an
# axis without fixed tick locations (set_yticklabels is discouraged there and
# can emit a FixedFormatter/FixedLocator warning).
ax.tick_params(axis='y', labelsize=8);
ax.set_xticks([]);

# Print each bar's count just past its right-hand end, vertically centred
for bar in ax.patches:
    ax.annotate(text=bar.get_width(),
                xy=(bar.get_width() + 1.5, bar.get_y() + bar.get_height() / 2),
                ha='center',
                va='center',
                size=10,
                color='#000000')

# Remove the frame and the tick marks
for spine in ['top', 'bottom', 'right', 'left']:
    ax.spines[spine].set_visible(False)
ax.tick_params(left=False, bottom=False);

# Title and subtitle, hand-positioned in data coordinates
ax.text(-7, 14.7, 'Which of the following machine learning frameworks do you use on a regular basis?',
        fontsize=13, fontfamily='serif', color=title_color, fontweight='bold');
ax.text(20.9, 14, 'Egyptian Data Scientists Answered',
        fontsize=11, fontfamily='serif', color='#4a4a4a');

### Machine Learning Algorithms

In [None]:
# Horizontal bar chart of machine-learning algorithms (Q17) among Egyptian
# data scientists / ML engineers.
fig, ax = plt.subplots(1, 1, figsize=(10, 5), dpi=140);

# NOTE(review): this reuses the name `egy_dataviz_count` from the data
# visualization chart although it now holds the Q17 algorithm counts; the name
# is kept so any later reference keeps working.
# melt_columns_starting_with is defined earlier in the notebook; presumably it
# stacks every Q17* answer column into one 'value' column -- confirm there.
# Reversed so the most common answer is drawn at the top.
egy_dataviz_count = melt_columns_starting_with(egy_ds_ml_df, 'Q17')['value'].value_counts()[::-1]

ax.barh(egy_dataviz_count.index,
        egy_dataviz_count,
        color=bar_color,
        edgecolor='#000000',
        height=0.8);

# Leave room on the right for the count labels
ax.set_xlim([0, 85]);

# Only the label font size changes; tick_params avoids re-setting labels on an
# axis without fixed tick locations (set_yticklabels is discouraged there and
# can emit a FixedFormatter/FixedLocator warning).
ax.tick_params(axis='y', labelsize=7);
ax.set_xticks([]);

# Print each bar's count just past its right-hand end, vertically centred
for bar in ax.patches:
    ax.annotate(text=bar.get_width(),
                xy=(bar.get_width() + 1.5, bar.get_y() + bar.get_height() / 2),
                ha='center',
                va='center',
                size=10,
                color='#000000')

# Remove the frame and the tick marks
for spine in ['top', 'bottom', 'right', 'left']:
    ax.spines[spine].set_visible(False)
ax.tick_params(left=False, bottom=False);

# Title and subtitle, hand-positioned in data coordinates
ax.text(1, 12.5, 'Which of the following ML algorithms do you use on a regular basis?',
        fontsize=13, fontfamily='serif', color=title_color, fontweight='bold');
ax.text(24, 11.86, 'Egyptian Data Scientists Answered',
        fontsize=11, fontfamily='serif', color='#4a4a4a');

### Computer Vision

In [None]:
# Horizontal bar chart of computer-vision method categories (Q18) among
# Egyptian data scientists / ML engineers.
fig, ax = plt.subplots(1, 1, figsize=(10, 6), dpi=140);

# melt_columns_starting_with is defined earlier in the notebook; presumably it
# stacks every Q18* answer column into one 'value' column -- confirm there.
# Reversed so the most common answer is drawn at the top.
tmp = melt_columns_starting_with(egy_ds_ml_df, 'Q18')['value'].value_counts()[::-1]

ax.barh(tmp.index,
        tmp,
        color=bar_color,
        edgecolor='#000000',
        height=0.9);

# Only the label font size changes; tick_params avoids re-setting labels on an
# axis without fixed tick locations (set_yticklabels is discouraged there and
# can emit a FixedFormatter/FixedLocator warning).
ax.tick_params(axis='y', labelsize=9);
ax.set_xticks([]);

# Print each bar's count just past its right-hand end, vertically centred
for bar in ax.patches:
    ax.annotate(text=bar.get_width(),
                xy=(bar.get_width() + 0.7, bar.get_y() + bar.get_height() / 2),
                ha='center',
                va='center',
                size=10,
                color='#000000')

# Remove the frame and the tick marks
for spine in ['top', 'bottom', 'right', 'left']:
    ax.spines[spine].set_visible(False)
ax.tick_params(left=False, bottom=False);

# Title and subtitle, hand-positioned in data coordinates
ax.text(-12, 6, 'Which categories of computer vision methods do you use on a regular basis?',
        fontsize=20, fontfamily='serif', color=title_color, fontweight='bold');
ax.text(9, 5.7, 'Egyptian Data Scientists Answered',
        fontsize=15, fontfamily='serif', color='#4a4a4a');

### Natural Language Processing

In [None]:
# Horizontal bar chart of NLP methods (Q19) among Egyptian data scientists /
# ML engineers.
fig, ax = plt.subplots(1, 1, figsize=(10, 4), dpi=140);

# melt_columns_starting_with is defined earlier in the notebook; presumably it
# stacks every Q19* answer column into one 'value' column -- confirm there.
# Reversed so the most common answer is drawn at the top.
tmp = melt_columns_starting_with(egy_ds_ml_df, 'Q19')['value'].value_counts()[::-1]

ax.barh(tmp.index,
        tmp,
        color=bar_color,
        edgecolor='#000000',
        height=0.9);

# Only the label font size changes; tick_params avoids re-setting labels on an
# axis without fixed tick locations (set_yticklabels is discouraged there and
# can emit a FixedFormatter/FixedLocator warning).
ax.tick_params(axis='y', labelsize=9);
ax.set_xticks([]);

# Print each bar's count just past its right-hand end, vertically centred
for bar in ax.patches:
    ax.annotate(text=bar.get_width(),
                xy=(bar.get_width() + 0.3, bar.get_y() + bar.get_height() / 2),
                ha='center',
                va='center',
                size=10,
                color='#000000')

# Remove the frame and the tick marks
for spine in ['top', 'bottom', 'right', 'left']:
    ax.spines[spine].set_visible(False)
ax.tick_params(left=False, bottom=False);

# Title and subtitle, hand-positioned in data coordinates
ax.text(-4.8, 5.1, 'Which of the following natural language processing (NLP) methods do you use on a regular basis?',
        fontsize=15, fontfamily='serif', color=title_color, fontweight='bold');
ax.text(4.4, 4.7, 'Egyptian Data Scientists Answered',
        fontsize=13, fontfamily='serif', color='#4a4a4a');

### Current Employer

In [None]:
# Horizontal bar chart of the employer's industry (Q20) among Egyptian data
# scientists / ML engineers.
fig, ax = plt.subplots(1, 1, figsize=(10, 4), dpi=140);

# value_counts() sorts descending; reversed so the most common answer ends up
# at the top (barh draws the first category at the bottom).
tmp = egy_ds_ml_df['Q20'].value_counts()[::-1]

ax.barh(tmp.index,
        tmp,
        color=bar_color,
        edgecolor='#000000',
        height=0.8);

# Only the label font size changes; tick_params avoids re-setting labels on an
# axis without fixed tick locations (set_yticklabels is discouraged there and
# can emit a FixedFormatter/FixedLocator warning).
ax.tick_params(axis='y', labelsize=9);
ax.set_xticks([]);

# Print each bar's count just past its right-hand end, vertically centred
for bar in ax.patches:
    ax.annotate(text=bar.get_width(),
                xy=(bar.get_width() + 0.6, bar.get_y() + bar.get_height() / 2),
                ha='center',
                va='center',
                size=10,
                color='#000000')

# Remove the frame and the tick marks
for spine in ['top', 'bottom', 'right', 'left']:
    ax.spines[spine].set_visible(False)
ax.tick_params(left=False, bottom=False);

# Title and subtitle, hand-positioned in data coordinates
ax.text(3, 14.1, 'In what industry is your current employer/contract?',
        fontsize=15, fontfamily='serif', color=title_color, fontweight='bold');
ax.text(9.7, 13, 'Egyptian Data Scientists Answered',
        fontsize=13, fontfamily='serif', color='#4a4a4a');

### But what about other Data Professionals? Let's take a look at the proportion of employment of different roles in each sector.

In [None]:
# Heatmap: for each role (Q5_cleaned), the share of Egyptian respondents
# employed in each industry (Q20).
# (The earlier Q7 slice/melt was dropped: its result was overwritten by the
# pivot below before ever being used.)

# Count respondents per (role, industry) pair
tmp = pd.pivot_table(egy_df, index=['Q5_cleaned'], columns=['Q20'], aggfunc='size', fill_value=0)

# Transpose so industries are rows and roles are columns, then divide each
# role's column by its total so every column sums to 1
tmp = (tmp.T / tmp.T.sum())

fig, ax = plt.subplots(1, 1, figsize=(10, 4), dpi=140);

sns.heatmap(tmp,
            ax=ax,
            xticklabels=True,
            yticklabels=True,
            cmap='YlOrBr',
            annot=True,
            linewidths=0.005,
            linecolor='#4a4a4a',
            annot_kws={"fontsize":10},
            fmt='.3f',
            cbar=False)

# Title and subtitle, hand-positioned in data coordinates above the grid
ax.text(1.9, -2, 'Who is working Where?', fontname='serif', fontsize=25, color=title_color)
ax.text(3.6, -0.6, 'In Egypt', fontname='serif', fontsize=15, color="#4a4a4a")

# After the transpose the heatmap rows are industries and the columns are roles
ax.set_ylabel('Industry', fontname='serif', fontsize=15, color=subtitle_color)
ax.set_xlabel('Role', fontname='serif', fontsize=15, color=subtitle_color)
plt.xticks(rotation=30, ha='right', fontsize=8)
plt.yticks(fontsize=8)
plt.show()

#### First of all, it is obvious how *Data Scientists*, *Data Engineers* and *ML Engineers* are all crammed into fields like **Computers/Technology** and **Academics/Education** and how other fields haven't absorbed **Data Science** at all.

This goes hand in hand with the assumption that I made earlier, about how **Data Science** in Egypt is still immature. I don't think that the *Data Professionals* who answered with **Academics/Education** are working in the Data Science departments of academic institutions; rather, they are probably working at Data Science educational centers, developing educational materials for Data Science.

### Let's take a look at how this plot fares in the United States for example to see how it actually looks in a mature market. 

In [None]:
# Heatmap: for each role (Q5), the share of US respondents employed in each
# industry (Q20). Mirrors the Egyptian chart for comparison.

# US respondents only; us_df is a notebook-level name reused by later cells
us_df = df_2021[df_2021['Q3'].isin(['USA'])]

# Count respondents per (role, industry) pair.
# Fix: the pivot previously ran on egy_df, so the "USA" chart actually showed
# Egyptian data. (The unused Q7 slice/melt that preceded it was dropped too.)
tmp = pd.pivot_table(us_df, index=['Q5'], columns=['Q20'], aggfunc='size', fill_value=0)

# Transpose so industries are rows and roles are columns, then divide each
# role's column by its total so every column sums to 1
tmp = (tmp.T / tmp.T.sum())

fig, ax = plt.subplots(1, 1, figsize=(10, 4), dpi=140);

sns.heatmap(tmp,
            ax=ax,
            xticklabels=True,
            yticklabels=True,
            cmap='Blues',
            annot=True,
            linewidths=0.005,
            linecolor='#4a4a4a',
            annot_kws={"fontsize":10},
            fmt='.3f',
            cbar=False)

# Title and subtitle, hand-positioned in data coordinates above the grid
ax.text(2.9, -2.3, 'Who is working Where?', fontname='serif', fontsize=25, color=title_color)
ax.text(5.4, -0.8, 'In The USA', fontname='serif', fontsize=15, color="#4a4a4a")

# After the transpose the heatmap rows are industries and the columns are roles
ax.set_ylabel('Industry', fontname='serif', fontsize=15, color=subtitle_color)
ax.set_xlabel('Role', fontname='serif', fontsize=15, color=subtitle_color)
plt.xticks(rotation=30, ha='right', fontsize=8)
plt.yticks(fontsize=8)
plt.show()

#### We can definitely see the difference for **Data Scientists**, who are far less concentrated than in Egypt.

### Size of Current Company

In [None]:
# Horizontal bar chart of employer company size (Q21) among Egyptian data
# scientists / ML engineers.
fig, ax = plt.subplots(1, 1, figsize=(10, 3), dpi=140);

# value_counts() sorts descending; reversed so the most common answer ends up
# at the top (barh draws the first category at the bottom).
tmp = egy_ds_ml_df['Q21'].value_counts()[::-1]

ax.barh(tmp.index,
        tmp,
        color=bar_color,
        edgecolor='#000000',
        height=0.8);

# Only the label font size changes; tick_params avoids re-setting labels on an
# axis without fixed tick locations (set_yticklabels is discouraged there and
# can emit a FixedFormatter/FixedLocator warning).
ax.tick_params(axis='y', labelsize=9);
ax.set_xticks([]);

# Print each bar's count just past its right-hand end, vertically centred
for bar in ax.patches:
    ax.annotate(text=bar.get_width(),
                xy=(bar.get_width() + 1.2, bar.get_y() + bar.get_height() / 2),
                ha='center',
                va='center',
                size=10,
                color='#000000')

# Remove the frame and the tick marks
for spine in ['top', 'bottom', 'right', 'left']:
    ax.spines[spine].set_visible(False)
ax.tick_params(left=False, bottom=False);

# Title and subtitle, hand-positioned in data coordinates
ax.text(2, 5.4, 'What is the size of the company where you are employed?',
        fontsize=15, fontfamily='serif', color=title_color, fontweight='bold');
ax.text(19.7, 4.8, 'Egyptian Data Scientists Answered',
        fontsize=13, fontfamily='serif', color='#4a4a4a');

### Number of Data Scientists at Work

In [None]:
# Horizontal bar chart of how many people handle data-science workloads at the
# respondent's workplace (Q22), among Egyptian data scientists / ML engineers.
fig, ax = plt.subplots(1, 1, figsize=(10, 4), dpi=140);

# value_counts() sorts descending; reversed so the most common answer ends up
# at the top (barh draws the first category at the bottom).
tmp = egy_ds_ml_df['Q22'].value_counts()[::-1]

ax.barh(tmp.index,
        tmp,
        color=bar_color,
        edgecolor='#000000',
        height=0.8);

# Only the label font size changes; tick_params avoids re-setting labels on an
# axis without fixed tick locations (set_yticklabels is discouraged there and
# can emit a FixedFormatter/FixedLocator warning).
ax.tick_params(axis='y', labelsize=9);
ax.set_xticks([]);

# Print each bar's count just past its right-hand end, vertically centred
for bar in ax.patches:
    ax.annotate(text=bar.get_width(),
                xy=(bar.get_width() + 0.7, bar.get_y() + bar.get_height() / 2),
                ha='center',
                va='center',
                size=10,
                color='#000000')

# Remove the frame and the tick marks
for spine in ['top', 'bottom', 'right', 'left']:
    ax.spines[spine].set_visible(False)
ax.tick_params(left=False, bottom=False);

# Title and subtitle, hand-positioned in data coordinates
ax.text(-15, 7.5, 'Approximately how many individuals are responsible for data science workloads at your place of business?',
        fontsize=15, fontfamily='serif', color=title_color, fontweight='bold');
ax.text(8, 6.9, 'Egyptian Data Scientists Answered',
        fontsize=13, fontfamily='serif', color='#4a4a4a');

### Employer and Machine Learning 

In [None]:
# Horizontal bar chart of whether the employer uses machine learning (Q23),
# across all Egyptian respondents in egy_df.
fig, ax = plt.subplots(1, 1, figsize=(10, 4), dpi=140);

# value_counts() sorts descending; reversed so the most common answer ends up
# at the top (barh draws the first category at the bottom).
tmp = egy_df['Q23'].value_counts()[::-1]

ax.barh(tmp.index,
        tmp,
        color=bar_color,
        edgecolor='#000000',
        height=0.8);

# Only the label font size changes; tick_params avoids re-setting labels on an
# axis without fixed tick locations (set_yticklabels is discouraged there and
# can emit a FixedFormatter/FixedLocator warning).
ax.tick_params(axis='y', labelsize=9);
ax.set_xticks([]);

# Print each bar's count just past its right-hand end, vertically centred
for bar in ax.patches:
    ax.annotate(text=bar.get_width(),
                xy=(bar.get_width() + 1.5, bar.get_y() + bar.get_height() / 2),
                ha='center',
                va='center',
                size=10,
                color='#000000')

# Remove the frame and the tick marks
for spine in ['top', 'bottom', 'right', 'left']:
    ax.spines[spine].set_visible(False)
ax.tick_params(left=False, bottom=False);

# Title and subtitle, hand-positioned in data coordinates
ax.text(-20, 6.2, 'Does your current employer incorporate machine learning methods into their business?',
        fontsize=15, fontfamily='serif', color=title_color, fontweight='bold');
ax.text(22, 5.7, 'Egyptian Data Professionals Answered',
        fontsize=13, fontfamily='serif', color='#4a4a4a');

### Work Activities of each Data Profession

This question can give really good insight into who is called what in Egypt. I've heard numerous anecdotes in Egypt and outside about people who were hired in Data Science positions that listed specific ML skills, but all they ended up doing was building dashboards and querying data using SQL.

In [None]:
# Heatmap of work activities (Q24*) by profession (Q5_cleaned) for Egypt.
# Keep the profession column plus every Q24* column, reshape to long format,
# drop unanswered options, then count rows per (profession, activity).
tmp = (
    egy_df
    .loc[:, egy_df.columns.str.startswith('Q24') | (egy_df.columns == 'Q5_cleaned')]
    .melt(id_vars=['Q5_cleaned'])
    .dropna()
    .pivot_table(index=['Q5_cleaned'], columns=['value'], aggfunc='size', fill_value=0)
)

# Turn each profession's counts into shares of that profession's total, then
# flip so activities are rows and professions are columns
tmp = tmp.div(tmp.sum(axis=1), axis=0).T

fig, ax = plt.subplots(1, 1, figsize=(10, 4), dpi=140);

heatmap_style = dict(
    xticklabels=True,
    yticklabels=True,
    cmap='YlOrBr',
    annot=True,
    linewidths=0.005,
    linecolor='#4a4a4a',
    annot_kws={"fontsize":12},
    fmt='.3f',
    cbar=False,
)
sns.heatmap(tmp, ax=ax, **heatmap_style)

# Axis captions: rows are activities, columns are professions
ax.set_ylabel('Activities', fontname='serif', fontsize=15, color=subtitle_color)
ax.set_xlabel('Profession', fontname='serif', fontsize=15, color=subtitle_color)

# Title and subtitle, hand-positioned in data coordinates above the grid
ax.text(1, -1.5, 'Who is doing What in Egypt?', fontname='serif', fontsize=30, color=title_color)
ax.text(3, -0.5, 'Work Activities', fontname='serif', fontsize=20, color="#4a4a4a")

plt.xticks(rotation=30, ha='right', fontsize=10)
plt.yticks(fontsize=10)
plt.show()



#### First of all, we can see how similar the roles are in Egypt, where for example *Business Analysts* and *Data Analysts* are practically the same. Also, *Data Engineers* and *Data Scientists* seem to be extremely similar.

### How does this compare to the US?

In [None]:
# Heatmap of work activities (Q24*) by profession (Q5) for the USA subset.
# Keep the profession column plus every Q24* column, reshape to long format,
# drop unanswered options, then count rows per (profession, activity).
tmp = (
    us_df
    .loc[:, us_df.columns.str.startswith('Q24') | (us_df.columns == 'Q5')]
    .melt(id_vars=['Q5'])
    .dropna()
    .pivot_table(index=['Q5'], columns=['value'], aggfunc='size', fill_value=0)
)

# Turn each profession's counts into shares of that profession's total, then
# flip so activities are rows and professions are columns
tmp = tmp.div(tmp.sum(axis=1), axis=0).T

fig, ax = plt.subplots(1, 1, figsize=(10, 4), dpi=140);

heatmap_style = dict(
    xticklabels=True,
    yticklabels=True,
    cmap='Blues',
    annot=True,
    linewidths=0.005,
    linecolor='#4a4a4a',
    annot_kws={"fontsize":12},
    fmt='.3f',
    cbar=False,
)
sns.heatmap(tmp, ax=ax, **heatmap_style)

# Axis captions: rows are activities, columns are professions
ax.set_ylabel('Activities', fontname='serif', fontsize=15, color=subtitle_color)
ax.set_xlabel('Profession', fontname='serif', fontsize=15, color=subtitle_color)

# Title and subtitle, hand-positioned in data coordinates above the grid
ax.text(1, -1.5, 'Who is doing What in The USA?', fontname='serif', fontsize=30, color=title_color)
ax.text(4.8, -0.5, 'Work Activities', fontname='serif', fontsize=20, color="#4a4a4a")

plt.xticks(rotation=30, ha='right', fontsize=10)
plt.yticks(fontsize=10)
plt.show()

#### So we can see the same thing with *Business Analysts* and *Data Analysts*. However, there is a greater distinction between *Data Engineers* and *Data Scientists*: *Data Engineers* here are much more focused on **Building data infrastructure** than *Data Scientists* are, compared to Egypt. So, judging by the comparison between the same two roles in the US, being a *Data Scientist* or a *Data Engineer* doesn't seem to matter that much in Egypt.

#### Also, ML Engineers have more focus on **Experimentation** and **Building ML models** than other roles, compared to Egypt.

### Salary in USD

In [None]:
# Horizontal bar chart of yearly compensation brackets (Q25) among Egyptian
# data scientists / ML engineers.
fig, ax = plt.subplots(1, 1, figsize=(10, 4), dpi=140);

# value_counts() sorts descending by count (not by bracket); reversed so the
# most common answer ends up at the top.
tmp = egy_ds_ml_df['Q25'].value_counts()[::-1]

ax.barh(tmp.index,
        tmp,
        color=bar_color,
        edgecolor='#000000',
        height=0.8);

# Only the label font size changes; tick_params avoids re-setting labels on an
# axis without fixed tick locations (set_yticklabels is discouraged there and
# can emit a FixedFormatter/FixedLocator warning).
ax.tick_params(axis='y', labelsize=9);
ax.set_xticks([]);

# Print each bar's count just past its right-hand end, vertically centred
for bar in ax.patches:
    ax.annotate(text=bar.get_width(),
                xy=(bar.get_width() + 0.9, bar.get_y() + bar.get_height() / 2),
                ha='center',
                va='center',
                size=10,
                color='#000000')

# Remove the frame and the tick marks
for spine in ['top', 'bottom', 'right', 'left']:
    ax.spines[spine].set_visible(False)
ax.tick_params(left=False, bottom=False);

# Title and subtitle, hand-positioned in data coordinates
ax.text(-4, 15.5, 'What is your current yearly compensation (approximate $USD)?',
        fontsize=15, fontfamily='serif', color=title_color, fontweight='bold');
ax.text(11, 14.5, 'Egyptian Data Scientists Answered',
        fontsize=13, fontfamily='serif', color='#4a4a4a');

### Cloud Computing Platforms

In [None]:
# Horizontal bar chart of cloud computing platforms used (Q27) among Egyptian
# data scientists / ML engineers.
fig, ax = plt.subplots(1, 1, figsize=(10, 4), dpi=140);

# melt_columns_starting_with is defined earlier in the notebook; presumably it
# stacks every Q27* answer column into one 'value' column -- confirm there.
# Reversed so the most common answer is drawn at the top.
tmp = melt_columns_starting_with(egy_ds_ml_df, 'Q27')['value'].value_counts()[::-1]

ax.barh(tmp.index,
        tmp,
        color=bar_color,
        edgecolor='#000000',
        height=0.8);

# Only the label font size changes; tick_params avoids re-setting labels on an
# axis without fixed tick locations (set_yticklabels is discouraged there and
# can emit a FixedFormatter/FixedLocator warning).
ax.tick_params(axis='y', labelsize=9);
ax.set_xticks([]);

# Print each bar's count just past its right-hand end, vertically centred
for bar in ax.patches:
    ax.annotate(text=bar.get_width(),
                xy=(bar.get_width() + 0.8, bar.get_y() + bar.get_height() / 2),
                ha='center',
                va='center',
                size=10,
                color='#000000')

# Remove the frame and the tick marks
for spine in ['top', 'bottom', 'right', 'left']:
    ax.spines[spine].set_visible(False)
ax.tick_params(left=False, bottom=False);

# Title and subtitle, hand-positioned in data coordinates.
# Fix: the title used to repeat the "best developer experience" question,
# which belongs to the Q28 chart; this chart shows which platforms are used.
# NOTE(review): the x offset was tuned for the longer text -- re-check layout.
ax.text(-20, 13.4, 'Which of the following cloud computing platforms do you use on a regular basis?',
        fontsize=15, fontfamily='serif', color=title_color, fontweight='bold');
ax.text(7.4, 12.4, 'Egyptian Data Scientists Answered',
        fontsize=13, fontfamily='serif', color='#4a4a4a');

### Most Enjoyable Cloud Computing Platform

In [None]:
# Horizontal bar chart of the cloud platform with the best developer
# experience (Q28) among Egyptian data scientists / ML engineers.
fig, ax = plt.subplots(1, 1, figsize=(10, 4), dpi=140);

# melt_columns_starting_with is defined earlier in the notebook; presumably it
# stacks every Q28* answer column into one 'value' column -- confirm there.
# Reversed so the most common answer is drawn at the top.
tmp = melt_columns_starting_with(egy_ds_ml_df, 'Q28')['value'].value_counts()[::-1]

ax.barh(tmp.index,
        tmp,
        color=bar_color,
        edgecolor='#000000',
        height=0.8);

# Only the label font size changes; tick_params avoids re-setting labels on an
# axis without fixed tick locations (set_yticklabels is discouraged there and
# can emit a FixedFormatter/FixedLocator warning).
ax.tick_params(axis='y', labelsize=9);
ax.set_xticks([]);

# Print each bar's count just past its right-hand end, vertically centred
for bar in ax.patches:
    ax.annotate(text=bar.get_width(),
                xy=(bar.get_width() + 0.1, bar.get_y() + bar.get_height() / 2),
                ha='center',
                va='center',
                size=10,
                color='#000000')

# Remove the frame and the tick marks
for spine in ['top', 'bottom', 'right', 'left']:
    ax.spines[spine].set_visible(False)
ax.tick_params(left=False, bottom=False);

# Title and subtitle, hand-positioned in data coordinates
ax.text(-1.7, 6.5, 'Of the cloud platforms that you are familiar with, which has the best developer experience?',
        fontsize=15, fontfamily='serif', color=title_color, fontweight='bold');
ax.text(0.7, 6, 'Egyptian Data Scientists Answered',
        fontsize=13, fontfamily='serif', color='#4a4a4a');

### Data Storage Products

In [None]:
# Horizontal bar chart of data storage products used (Q30) among Egyptian
# data scientists / ML engineers.
fig, ax = plt.subplots(1, 1, figsize=(10, 4), dpi=140);

# melt_columns_starting_with is defined earlier in the notebook; presumably it
# stacks every Q30* answer column into one 'value' column -- confirm there.
# Reversed so the most common answer is drawn at the top.
tmp = melt_columns_starting_with(egy_ds_ml_df, 'Q30')['value'].value_counts()[::-1]

ax.barh(tmp.index,
        tmp,
        color=bar_color,
        edgecolor='#000000',
        height=0.8);

# Only the label font size changes; tick_params avoids re-setting labels on an
# axis without fixed tick locations (set_yticklabels is discouraged there and
# can emit a FixedFormatter/FixedLocator warning).
ax.tick_params(axis='y', labelsize=9);
ax.set_xticks([]);

# Print each bar's count just past its right-hand end, vertically centred
for bar in ax.patches:
    ax.annotate(text=bar.get_width(),
                xy=(bar.get_width() + 0.1, bar.get_y() + bar.get_height() / 2),
                ha='center',
                va='center',
                size=10,
                color='#000000')

# Remove the frame and the tick marks
for spine in ['top', 'bottom', 'right', 'left']:
    ax.spines[spine].set_visible(False)
ax.tick_params(left=False, bottom=False);

# Title and subtitle, hand-positioned in data coordinates
ax.text(-1.5, 8.7, 'Do you use any of the following data storage products on a regular basis?',
        fontsize=15, fontfamily='serif', color=title_color, fontweight='bold');
ax.text(1, 8.1, 'Egyptian Data Scientists Answered',
        fontsize=13, fontfamily='serif', color='#4a4a4a');

### Managed Machine Learning Products

In [None]:
# Horizontal bar chart of the Q31 answers (managed machine learning products
# used on a regular basis) for the respondents in egy_ds_ml_df.
fig, ax = plt.subplots(1, 1, figsize=(10, 4), dpi=140)

# Count how often each answer occurs. value_counts() sorts descending, so the
# result is reversed to make barh draw the most frequent answer at the top.
# NOTE(review): melt_columns_starting_with presumably stacks every column whose
# name starts with 'Q31' into a single 'value' column - confirm in its definition.
q31_counts = melt_columns_starting_with(egy_ds_ml_df, 'Q31')['value'].value_counts()[::-1]

bars = ax.barh(q31_counts.index,
               q31_counts,
               color=bar_color,
               edgecolor='#000000',
               height=0.8)

# barh already labels the y ticks with the answer strings, so only the font
# size is adjusted here. Calling set_yticklabels without first fixing the tick
# positions is discouraged by matplotlib and can trigger a FixedFormatter warning.
ax.tick_params(axis='y', labelsize=9)

# The counts are printed next to the bars, so the x axis carries no information.
ax.set_xticks([])

# Annotate each bar with its count, slightly past the end of the bar.
for bar in bars:
    ax.annotate(text=bar.get_width(),
                xy=(bar.get_width() + 0.6, bar.get_y() + bar.get_height() / 2),
                ha='center',
                va='center',
                size=10,
                color='#000000')

# Hide all spines and the remaining tick marks for a cleaner look.
for spine in ax.spines.values():
    spine.set_visible(False)
ax.tick_params(left=False, bottom=False)

# Title and subtitle; the positions are hand-tuned in data coordinates.
ax.text(-9, 11.9, 'Do you use any of the following managed machine learning products on a regular basis?',
        fontsize=15, fontfamily='serif', color=title_color, fontweight='bold')
ax.text(6, 11.1, 'Egyptian Data Scientists Answered',
        fontsize=13, fontfamily='serif', color='#4a4a4a');

### Portfolio 

In [None]:
# Horizontal bar chart of the Q39 answers (where respondents publicly share
# their work) for all respondents in egy_df.
fig, ax = plt.subplots(1, 1, figsize=(10, 4), dpi=140)

# Count how often each answer occurs. value_counts() sorts descending, so the
# result is reversed to make barh draw the most frequent answer at the top.
# NOTE(review): melt_columns_starting_with presumably stacks every column whose
# name starts with 'Q39' into a single 'value' column - confirm in its definition.
q39_counts = melt_columns_starting_with(egy_df, 'Q39')['value'].value_counts()[::-1]

bars = ax.barh(q39_counts.index,
               q39_counts,
               color=bar_color,
               edgecolor='#000000',
               height=0.8)

# barh already labels the y ticks with the answer strings, so only the font
# size is adjusted here. Calling set_yticklabels without first fixing the tick
# positions is discouraged by matplotlib and can trigger a FixedFormatter warning.
ax.tick_params(axis='y', labelsize=9)

# The counts are printed next to the bars, so the x axis carries no information.
ax.set_xticks([])

# Annotate each bar with its count, slightly past the end of the bar.
for bar in bars:
    ax.annotate(text=bar.get_width(),
                xy=(bar.get_width() + 1.5, bar.get_y() + bar.get_height() / 2),
                ha='center',
                va='center',
                size=10,
                color='#000000')

# Hide all spines and the remaining tick marks for a cleaner look.
for spine in ax.spines.values():
    spine.set_visible(False)
ax.tick_params(left=False, bottom=False)

# Title and subtitle; the positions are hand-tuned in data coordinates and may
# need a small nudge to stay centred over the figure.
# The title previously repeated the managed-ML-products question, which does not
# match this chart.
# NOTE(review): wording follows Q39 of the 2021 Kaggle survey as remembered -
# confirm against the question row of df_2021.
ax.text(-19, 8.5, 'Where do you publicly share your data analysis or machine learning applications?',
        fontsize=15, fontfamily='serif', color=title_color, fontweight='bold')
ax.text(20, 7.8, 'Egyptian Data Enthusiasts Answered',
        fontsize=13, fontfamily='serif', color='#4a4a4a');

Every blog post I've read about building a good Data Science portfolio recommends sharing your work through blog posts, but it seems that Egyptians don't follow that advice.


# Conclusion

What can we get from this humble analysis? Well, Egypt still has time to get more mature in Data Science, and that's probably good news for all the Egyptian enthusiasts, both those here on Kaggle and those who aren't.

If the government keeps pushing Data Science, and different sectors start adding Data Science to their vocabularies, I think that we might witness some change in the market over the next few years.

Anyway, I hope that you have enjoyed this notebook. I know it only skimmed the surface and was not comprehensive at all, but I tried to focus on the things that I was curious about.

I'll try to make another notebook that offers a more statistical analysis of the differences between countries, namely Egypt and the USA.

### Thanks for reading

In [None]:
# Scratch lookup: show the first row of df_2021 for every column from position
# 200 onwards, as a {column name: value} dict.
# NOTE(review): row 0 presumably holds the survey question text - confirm.
first_row_tail = df_2021.iloc[0].iloc[200:]
first_row_tail.to_dict()

## Work in progress...