In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
import plotly.graph_objects as go
import warnings

warnings.filterwarnings("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Have the countries of the Middle East been able to progress with the global trend of data science?

Hi, I'm Sina, I'm a Data Science Researcher living in the Middle East right now.  

**I tried to keep this analysis short, because I think a short story is more likely to be read than a long one.**

My main goal in producing this notebook was to answer a question:
**Have the countries of the Middle East been able to progress with the global trend of data science?**

In my opinion, the two main factors preventing these countries from growing in line with
developed countries are the **lack of economic security and the existence of a lot of oil** in this region.

The lack of economic security and the existence of oil-dependent economies in these countries have prevented new jobs and sciences from growing.

In this notebook I try to answer the following question:
**What is the status of data professionals, as part of the advancement of new technologies, in these countries?**

**So let's go!**

In [None]:
# Load the 2020 Kaggle survey responses. The first data row is split off and kept
# as `Question` (presumably the question wording for each column -- confirm
# against the CSV); the remaining rows are the responses.
survey_raw = pd.read_csv(r"../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv")
Question = survey_raw[0:1]
df = survey_raw[1:]

# Shorten the long country labels so they read well on chart axes and legends.
df = df.replace({"Iran, Islamic Republic of...": "Iran",
                 "United Arab Emirates": "UAE",
                 "United States of America": "USA"})

# Give the columns used throughout the notebook readable names.
df = df.rename(columns={"Time from Start to Finish (seconds)": "Time",
                        "Q1": "Age", "Q2": "Gender", "Q2_OTHER_TEXT": "Gender_Text", "Q3": "Country",
                        "Q4": "Education", "Q5": "Job Title", "Q6": "Coding experience"})

# Countries kept for the analysis: the Middle East countries under study plus
# Germany and the USA, which the notebook uses as the developed-country comparison.
countries_of_interest = ["Iran", "Saudi Arabia", "Israel", "Germany",
                         "Egypt", "UAE", "USA", "Turkey"]

# Explicit copy: the full-width frame is reused by the later heatmap cells and
# must not be a view onto `df`.
df_MiddleEast_Full = df[df["Country"].isin(countries_of_interest)].copy()

# Keep only the first seven columns and add a constant `Count` column so that a
# group-wise sum yields the number of respondents. `assign` builds a new frame,
# which avoids writing a column onto a slice of another frame.
df_MiddleEast = df_MiddleEast_Full.iloc[:, 0:7].assign(Count=1)

> In this analysis, I will compare the **United States and Germany as developed countries** with the countries under study so that we can better understand the situation in the Middle East.

In [None]:
# Number of respondents per country, smallest first so the bars rise from left to right.
# Only `Count` is aggregated: summing every column would also "sum" the text
# columns (string concatenation on recent pandas versions).
df_MiddleEast_Count = (
    df_MiddleEast.groupby("Country")[["Count"]]
    .sum()
    .sort_values("Count")
)

fig = px.bar(df_MiddleEast_Count, x=df_MiddleEast_Count.index, y="Count")
fig.update_layout(title='Count of kagglers')

# Render explicitly, like the other plotting cells in this notebook.
fig.show()

**As expected** — and as we probably all guessed — there are more Kagglers in the developed countries than in any other country.

In [None]:
# Respondents per (country, gender). Only `Count` is aggregated; the other
# columns are text and must not be summed.
df_MiddleEast_Gender = (
    df_MiddleEast.groupby(["Country", "Gender"], as_index=False)["Count"].sum()
)

# Turn the counts into each gender's share within its own country.
# `transform("sum")` broadcasts the per-country total back onto every row, so no
# drop/concat loop is needed and the row order stays unchanged.
respondents_per_country = df_MiddleEast_Gender.groupby("Country")["Count"].transform("sum")
df_MiddleEast_Gender["Count"] = df_MiddleEast_Gender["Count"] / respondents_per_country

# Keep the women's rows, sorted so the bars rise from left to right.
women_column = "Percentage of women in total"
df_MiddleEast_Gender_Womans = (
    df_MiddleEast_Gender[df_MiddleEast_Gender["Gender"] == "Woman"]
    .sort_values(by="Count")
    .rename(columns={"Count": women_column})
)
# The column is labelled as a percentage, so scale the 0-1 share to 0-100.
df_MiddleEast_Gender_Womans[women_column] = df_MiddleEast_Gender_Womans[women_column] * 100

fig = px.bar(df_MiddleEast_Gender_Womans, x="Country", y=women_column)
fig.update_layout(title='Percentage of women in total in each country')
fig.show()

**Oops! It's amazing!**                                                                                 
**Iran has the highest percentage of women compared to other genders in this field!**                                                       
I really did not expect the countries of the Middle East to do even better than the developed countries in this area!

In [None]:
# Respondents per (country, age group), then converted to the share of each age
# group within its own country so that countries of different sizes are comparable.
df_MiddleEast_Age = (
    df_MiddleEast.groupby(["Country", "Age"], as_index=False)["Count"].sum()
)
df_MiddleEast_Age["Count"] = (
    df_MiddleEast_Age["Count"]
    / df_MiddleEast_Age.groupby("Country")["Count"].transform("sum")
)

# Line colour per country; also read by the later plotting cells.
Highlight = {"Germany": "#D59E00", "USA": "darkred", "Egypt": "#73731D",
             "UAE": "#9800D5", "Iran": "#FF1313",
             "Saudi Arabia": "#46D200", "Israel": "#33DDFF", "Turkey": "#006A8C"}

# The two comparison countries are drawn thick and opaque; all others thin and faded.
emphasised_countries = ("Germany", "USA")

fig = go.Figure()

# sort=False keeps the countries in their order of first appearance in the frame.
for country, country_rows in df_MiddleEast_Age.groupby("Country", sort=False):
    is_emphasised = country in emphasised_countries
    fig.add_trace(go.Scatter(
        x=country_rows["Age"],
        y=country_rows["Count"],
        name=country,
        opacity=1 if is_emphasised else 0.4,
        line=dict(color=Highlight.get(country, 'grey'), width=4 if is_emphasised else 1),
    ))

fig.update_layout(plot_bgcolor='white')
fig.update_xaxes(showgrid=True, gridwidth=0.2, gridcolor='#EEE1FA')
fig.update_yaxes(showgrid=True, gridwidth=0.2, gridcolor='#EEE1FA')
fig.update_layout(title='Age?',
                  xaxis_title='Age group',
                  yaxis_title='Share of respondents in the country')
fig.show()

There is no definitive conclusion to draw from this plot, but I feel that Germany and the United States have gone through a **more continuous process** than other countries.

This chart might seem a bit crowded! You can make the plot less cluttered with the method shown in the GIF below.  
**(I made the GIF myself; I hope it is useful.)**

<img src="https://media.giphy.com/media/L5FlRwGKEi4iNWQkrr/giphy.gif">


In [None]:
# Answer options for coding experience, from least to most experienced; the
# ordered categorical makes grouping and the x-axis follow this order instead of
# alphabetical order.
coding_levels = ['I have never written code', '< 1 years', '1-2 years', '3-5 years',
                 '5-10 years', '10-20 years', '20+ years']
coding_dtype = pd.api.types.CategoricalDtype(categories=coding_levels, ordered=True)

# The conversion is applied to a derived frame rather than written back into
# `df_MiddleEast`, so the shared frame keeps plain text columns for later cells.
coding_answers = df_MiddleEast.assign(
    **{"Coding experience": df_MiddleEast["Coding experience"].astype(coding_dtype)}
)

# Respondents per (country, experience level). observed=False keeps every
# experience level for every country (count 0 when nobody chose it), so each
# line spans the whole x-axis.
df_MiddleEast_Coding = (
    coding_answers
    .groupby(["Country", "Coding experience"], observed=False)["Count"]
    .sum()
    .reset_index()
)

# Convert the counts to the share of each experience level within its country.
df_MiddleEast_Coding["Count"] = (
    df_MiddleEast_Coding["Count"]
    / df_MiddleEast_Coding.groupby("Country")["Count"].transform("sum")
)

# The two comparison countries are drawn thick and opaque; all others thin and faded.
emphasised_countries = ("Germany", "USA")

fig = go.Figure()

# sort=False keeps the countries in their order of first appearance in the frame.
for country, country_rows in df_MiddleEast_Coding.groupby("Country", sort=False):
    is_emphasised = country in emphasised_countries
    fig.add_trace(go.Scatter(
        x=country_rows["Coding experience"],
        y=country_rows["Count"],
        name=country,
        opacity=1 if is_emphasised else 0.4,
        line=dict(color=Highlight.get(country, 'grey'), width=4 if is_emphasised else 1),
    ))

fig.update_layout(plot_bgcolor='white')
fig.update_xaxes(showgrid=True, gridwidth=0.2, gridcolor='#EEE1FA')
fig.update_yaxes(showgrid=True, gridwidth=0.2, gridcolor='#EEE1FA')

fig.update_layout(title='Coding experience?',
                  xaxis_title='Coding experience',
                  yaxis_title='Share of respondents in the country')

fig.show()

**Pay attention to the right side of the chart!**

I feel this shows significant information!  
Highly experienced developers are less common in developing countries than in developed countries (except for Israel, which we know is different from other countries in the region!)

In [None]:
# Respondents who answered every profile question and reported a specific,
# current job title.
df_MiddleEast1 = df_MiddleEast.dropna()
df_MiddleEast1 = df_MiddleEast1[
    ~df_MiddleEast1["Job Title"].isin(["Other", "Currently not employed"])
]

# Merge closely related titles so the radar chart has fewer axes.
# A mapping with `Series.replace` is used instead of chained assignment
# (`frame[col][mask] = value`), which pandas does not guarantee to write through.
merged_job_titles = {
    "Product/Project Manager": "Product/Project Manager or BA",
    "Business Analyst": "Product/Project Manager or BA",
    "Research Scientist": "Statistician or Research Scientist",
    "Statistician": "Statistician or Research Scientist",
    "DBA/Database Engineer": "Data Engineer or DBA",
    "Data Engineer": "Data Engineer or DBA",
}
df_MiddleEast1 = df_MiddleEast1.assign(
    **{"Job Title": df_MiddleEast1["Job Title"].replace(merged_job_titles)}
)

# Respondents per (country, job title); only `Count` is aggregated.
df_MiddleEast_JobTitle = (
    df_MiddleEast1.groupby(["Country", "Job Title"], as_index=False)["Count"].sum()
)

# The two comparison countries are drawn stronger than the rest.
emphasised_countries = ("Germany", "USA")

figure = go.Figure()

# sort=False keeps the countries in their order of first appearance in the frame.
for country, country_rows in df_MiddleEast_JobTitle.groupby("Country", sort=False):
    is_emphasised = country in emphasised_countries

    # Share of each job title within the country, in percent.
    titles = country_rows["Job Title"].tolist()
    shares = (country_rows["Count"] / country_rows["Count"].sum() * 100).tolist()

    # Repeat the first point at the end so the line is drawn as a closed loop.
    titles.append(titles[0])
    shares.append(shares[0])

    figure.add_trace(go.Scatterpolar(
        r=shares, theta=titles, mode='lines', line_color=Highlight.get(country),
        showlegend=True, name=country,
        hovertemplate='%{r:0.0f}%',
        opacity=0.8 if is_emphasised else 0.3,
        line_shape='spline', line_smoothing=0.6,
        line_width=2 if is_emphasised else .7))

figure.update_layout(polar_bgcolor='white',  polar_radialaxis_visible=True,  polar_radialaxis_showticklabels=True,
    polar_radialaxis_tickfont_color='darkgrey',  polar_angularaxis_color='grey',
    polar_angularaxis_showline=False, polar_radialaxis_showline=False, 
    polar_radialaxis_layer='below traces',polar_radialaxis_gridcolor='#F2F2F2',
    polar_radialaxis_range=(0,37), polar_radialaxis_tickvals=[20, 30], 
    polar_radialaxis_ticktext=['20%', '30%'],polar_radialaxis_tickmode='array',title='Job Title?'
)

figure.show()

**In this chart it is quite clear that the abundance of expertise is much better distributed in developed countries.**

This imbalance in the abundance of specialties can be a negative factor for the advancement of data science in developing countries!

**In this plot, you can also make changes to get a better view. 
Again, I made a gif that I shared with you below.**

<img src="https://media.giphy.com/media/KI8yUQt1cL2A0BLMTi/giphy.gif">


In [None]:
# Respondents who answered every profile question and disclosed their education.
df_MiddleEast2 = df_MiddleEast.dropna()
df_MiddleEast2 = df_MiddleEast2[df_MiddleEast2["Education"] != 'I prefer not to answer']

# Fold the less common education answers into a single "Other" level.
# `Series.mask` returns a new column; chained assignment
# (`frame[col][mask] = value`) is not guaranteed by pandas to write through.
folded_into_other = ['Professional degree',
                     'Some college/university study without earning a bachelor’s degree',
                     'No formal education past high school']
education = df_MiddleEast2["Education"]
df_MiddleEast2 = df_MiddleEast2.assign(
    Education=education.mask(education.isin(folded_into_other), "Other")
)

# Respondents per (country, education level); only `Count` is aggregated.
df_MiddleEast_education = (
    df_MiddleEast2.groupby(["Country", "Education"], as_index=False)["Count"].sum()
)

# Order the levels from lowest to highest so the radar axes follow that order.
education_levels = ['Other', 'Bachelor’s degree', 'Master’s degree', 'Doctoral degree']
education_dtype = pd.api.types.CategoricalDtype(categories=education_levels, ordered=True)
df_MiddleEast_education["Education"] = df_MiddleEast_education["Education"].astype(education_dtype)
# Sorting on both keys makes the trace (legend) order deterministic as well.
df_MiddleEast_education = df_MiddleEast_education.sort_values(["Country", "Education"])

# The two comparison countries are drawn stronger than the rest.
emphasised_countries = ("Germany", "USA")

figure = go.Figure()

# sort=False keeps the countries in the order established by the sort above.
for country, country_rows in df_MiddleEast_education.groupby("Country", sort=False):
    is_emphasised = country in emphasised_countries

    # Share of each education level within the country, in percent.
    levels = country_rows["Education"].tolist()
    shares = (country_rows["Count"] / country_rows["Count"].sum() * 100).tolist()

    # Repeat the first point at the end so the line is drawn as a closed loop.
    levels.append(levels[0])
    shares.append(shares[0])

    figure.add_trace(go.Scatterpolar(
        r=shares, theta=levels, mode='lines', line_color=Highlight.get(country),
        showlegend=True, name=country,
        hovertemplate='%{r:0.0f}%',
        opacity=0.8 if is_emphasised else 0.3,
        line_shape='spline', line_smoothing=0.6,
        line_width=2 if is_emphasised else .7))

figure.update_layout(polar_bgcolor='white',  polar_radialaxis_visible=True,  polar_radialaxis_showticklabels=True,
    polar_radialaxis_tickfont_color='darkgrey',  polar_angularaxis_color='grey',
    polar_angularaxis_showline=False, polar_radialaxis_showline=False, 
    polar_radialaxis_layer='below traces',polar_radialaxis_gridcolor='#F2F2F2',
    polar_radialaxis_range=(0,60), polar_radialaxis_tickvals=[20, 40], 
    polar_radialaxis_ticktext=['20%', '40%'],polar_radialaxis_tickmode='array',title='level of formal education?'
)

figure.show()

As you can see in the chart above, all countries (Except for Egypt and to some extent Turkey) are shown similarly.

But in Egypt, most of the people interested in the data field hold a bachelor's degree.

In [None]:
# Columns 7-17 of the full frame: the programming-language multiple-choice
# answers (selected by position -- TODO confirm against the survey schema).
Language = df_MiddleEast_Full.iloc[:, 7:18]

# Label each column with its first non-missing answer text.
# (assumes every column holds a single distinct answer text -- TODO confirm)
# A column nobody answered keeps its original name instead of raising IndexError.
colname = {}
for column in Language.columns:
    answers = Language[column].dropna().unique()
    if len(answers) > 0:
        colname[column] = answers[0]

# 1 where the respondent gave an answer in that column, 0 otherwise. Real
# integers (rather than 1/NaN in object columns) make the group sums well defined.
Language = Language.notna().astype(int).rename(columns=colname)
Language = Language.join(df_MiddleEast_Full["Country"])

# Number of respondents per country that selected each language.
Language_group = Language.groupby("Country").sum()

# Normalise each country's row to sum to 1, i.e. each language's share of all
# language selections made in that country.
res = Language_group.div(Language_group.sum(axis=1), axis=0)

fig = go.Figure(data=go.Heatmap(
        z=res.values,
        x=res.columns,
        y=res.index,
        colorscale=["black","darkred", "yellow", "green"]))

fig.update_layout(
    title='which programming languages?')

fig.show()


**Yeah! It could have been predicted!**

C and C++ are more widely used in developing countries, whereas Bash is more widely used in developed countries!

In [None]:
# Columns 231-241 of the full frame: the course-platform multiple-choice
# answers (selected by position -- TODO confirm against the survey schema).
Courses = df_MiddleEast_Full.iloc[:, 231:242]

# Label each column with its first non-missing answer text.
# (assumes every column holds a single distinct answer text -- TODO confirm)
# A column nobody answered keeps its original name instead of raising IndexError.
colname2 = {}
for column in Courses.columns:
    answers = Courses[column].dropna().unique()
    if len(answers) > 0:
        colname2[column] = answers[0]

# 1 where the respondent gave an answer in that column, 0 otherwise. Real
# integers (rather than 1/NaN in object columns) make the group sums well defined.
Courses = Courses.notna().astype(int).rename(columns=colname2)
Courses = Courses.join(df_MiddleEast_Full["Country"])

# Number of respondents per country that selected each platform.
Courses_group = Courses.groupby("Country").sum()

# Normalise each country's row to sum to 1, i.e. each platform's share of all
# platform selections made in that country.
Res = Courses_group.div(Courses_group.sum(axis=1), axis=0)

fig = go.Figure(data=go.Heatmap(
        z=Res.values,
        x=Res.columns,
        y=Res.index,
        colorscale=["black","darkred", "yellow", "green"]))

fig.update_layout( autosize=False,
    width=950,
    height=600,
    title='On which platforms?')
fig.show()


**Udacity and Udemy seem to be popular in the Middle East!**                      
In Turkey, Udemy is the most popular platform!

**Well, that was my first analysis of this survey**

It was clear from this analysis that Middle Eastern countries had **very good growth in some areas**. Certainly the people of these countries (and not necessarily governments) have realized that the **world will need a new oil called data** in the coming years, but there are still challenges such as **insufficient programming experience**, **lack of updates in some areas** and **a lack of expertise** in some countries.

**Hope for the progress of the whole world and peace!**


**Update 1: Turkey added (I apologize to my Turkish friends who were not present in the analysis of the first version)**

**The notebooks that I read and got ideas from while making this notebook (you should read them too):**  
[How to Create Award Winning Data Visualizations](https://www.kaggle.com/andresionek/how-to-create-award-winning-data-visualizations)                                           
[⚜️ The Hitchhiker's Guide to the Kaggle](https://www.kaggle.com/subinium/the-hitchhiker-s-guide-to-the-kaggle)