In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_root = '/kaggle/input'
for folder, _, names in os.walk(input_root):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the survey responses. Row 0 of the file holds the question text rather than a
# response (later cells drop it), so columns mix free text with answer values.
file = "/kaggle/input/kaggle-survey-2021/kaggle_survey_2021_responses.csv"
# low_memory=False parses each column in one pass, so a column's dtype is inferred from
# all of its rows instead of chunk by chunk (which can yield mixed types and a DtypeWarning).
data = pd.read_csv(file, low_memory=False)
data.head()

In [None]:
# plotly.express choropleths locate countries by their 3-letter ISO alpha code, which the
# survey does not provide, so build a country -> code lookup and attach it to `data`.

# Country labels from Q3 in order of first appearance (row 0 is the question text).
# "Other" and undisclosed locations cannot be placed on a map, so they are left out.
not_mappable = ["Other", "I do not wish to disclose my location"]
q3_answers = data["Q3"].drop(0)
unique_countries = pd.DataFrame(
    {"countries": q3_answers[~q3_answers.isin(not_mappable)].unique()}
)

# Hand-written ISO codes, listed in the same order as the rows of `unique_countries`.
# NOTE(review): the pairing is purely positional - re-check this list whenever the input
# file (and therefore the order of first appearance) changes.
isocode = pd.DataFrame(data=["IND","IDN","PAK","MEX","RUS","TUR","AUS","NGA","GRC","BEL","JPN","EGY","SGP","BRA","POL","CHN","IRN","USA","ITA","VNM","ISR",
           "PER","ZAF","ESP","BGD","GBR","FRA","CHE","DZA","TUN","ARG","SWE","COL","CAN","CHL","NLD","UKR","SAU","ROU","MAR","AUT",
           "TWN","KEN","BLR","IRL","PRT","HKG","DNK","DEU","KOR","PHL","LKA","ARE","UGA","GHA","MYS","THA","NPL","KAZ","ETH","IRQ","ECU","NOR","CZE"], columns=["isocode"])

# Without this check a length mismatch would be aligned on the index and silently
# padded with NaN, leaving some countries without a code (or codes without a country).
if len(unique_countries) != len(isocode):
    raise ValueError(
        f"Found {len(unique_countries)} countries in Q3 but {len(isocode)} ISO codes were supplied"
    )

# Named `country_iso` rather than `map` so the builtin map() stays usable in later cells.
country_iso = pd.concat([unique_countries, isocode], axis=1)
print(country_iso)

# Left join so every respondent row is kept. Lookup columns left over from a previous
# run of this cell are dropped first; otherwise re-running would produce
# countries_x/countries_y columns and break the cells below.
data = pd.merge(
    data.drop(columns=["countries", "isocode"], errors="ignore"),
    country_iso,
    left_on="Q3",
    right_on="countries",
    how="left",
)

# Plot all countries involved in the survey: one row per country is enough for the map.
plot_data = data[["countries", "isocode"]].dropna().drop_duplicates()
fig = px.choropleth(plot_data, locations="isocode",
                    hover_name="countries", # column to add to hover information
                    title="Countries represented in the survey",
                    color_continuous_scale=px.colors.sequential.Plasma)
fig.show()

In [None]:
# Share of women among respondents who answered "Man" or "Woman", per country.
# Row 0 holds the question text rather than a response, so it is removed first.
gender_rows = data[["isocode", "countries", "Q2"]].drop(index=0)
gender_rows = gender_rows[gender_rows["Q2"].isin(["Man", "Woman"])]

# One row per country, one column per gender. fill_value=0 records "no respondents of
# this gender" as 0 instead of NaN, which would make the integer cast below fail.
gender_prop = (
    gender_rows
    .groupby(["isocode", "countries", "Q2"])
    .size()
    .unstack("Q2", fill_value=0)
)

# Percentage of women, rounded to a whole number.
gender_prop["f_prop"] = (
    (gender_prop["Woman"] / (gender_prop["Man"] + gender_prop["Woman"]) * 100)
    .round()
    .astype(int)
)

# Turn the (isocode, countries) index back into ordinary columns for plotting.
gender_prop = gender_prop.reset_index()
gender_prop[gender_prop["countries"].isin(["Tunisia", "Saudi Arabia", "Malaysia", "Sri Lanka", "Peru", "Ethiopia"])]

In [None]:
# World map coloured by the percentage of women among man/woman respondents per country.
fig2 = px.choropleth(gender_prop, locations="isocode",
                    color="f_prop",
                    hover_name="countries", # column to add to hover information
                    labels={"f_prop": "Women (%)"}, # readable colour-bar / hover label instead of the raw column name
                    title="Female share of survey respondents by country",
                    color_continuous_scale=px.colors.sequential.Plasma)
fig2.show()

No matter how large or small the country, the number of female respondents as a proportion of total respondents per country is still very low, between 10-25%. Countries with only around 100 respondents or fewer likely give skewed results because of the very low numbers; one example is Tunisia, which had 39% female respondents (raw counts of 67 men + 42 women).

In [None]:
# Quick structural overview of the merged survey frame.
print(data.columns)
print(data.values)
print(data.shape)  # (number of rows, number of columns)
print(data.index)  # row labels
# unique() must be called; the bare attribute only displays the bound method, not the values
data['Q3'].unique()

What's the number of men and women? What proportions do we see across different countries?

In [None]:
# Keep only gender (Q2), country (Q3) and the ISO code column for every row of `data`.
gender_country = data.loc[:, ["Q2", "Q3", "isocode"]]
display(gender_country)


In [None]:
# Number of men and women per country of residence, plus the share of women.
# Keeping only "Man"/"Woman" also removes the question-text row.
gender_country = gender_country[gender_country["Q2"].isin(["Man", "Woman"])]

# crosstab counts respondents directly. The previous pivot_table(aggfunc="count") counted
# non-null values of the leftover `isocode` column instead, so countries without an ISO
# code showed 0, and it produced MultiIndex columns that made gc["Woman"] unusable.
gc = pd.crosstab(index=gender_country["Q3"], columns=gender_country["Q2"])

# New column showing the proportion of women among man/woman respondents.
gc["proportion_women"] = gc["Woman"] / (gc["Man"] + gc["Woman"])
display(gc)
gc.columns

In [None]:
# Respondents per country (Q3), drawn as an annotated bar chart.
gender_country = data[["Q2", "Q3"]]

# value_counts() is a Series method; it returns one count per distinct country label.
q3_question = "In which country do you currently reside?"
g1 = gender_country["Q3"].value_counts()
g1 = g1.drop(labels=q3_question, axis=0)  # the question text is not a country
print(type(g1))  # still a Series at this point

# Move the country labels out of the index into a column, then give both columns fixed names.
g1 = g1.reset_index()
g1df = pd.DataFrame(g1)
g1df.columns = ["Index", "Count"]

# Bar chart with the count written just above each bar.
plt.figure(figsize=(20, 4), dpi=80)
graph = sns.barplot(data=g1df, x="Index", y="Count")
graph.set_title(label="Number of respondents by country")
for bar in graph.patches:  # one patch per bar
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    graph.annotate(
        f"{bar_top:.0f}",  # whole number, no decimals
        (bar_centre, bar_top),  # anchor the text at the top centre of the bar
        ha='center', va='center',
        xytext=(0, 9), size=7,
        textcoords='offset points',
    )
plt.xticks(rotation=90)

plt.show()


From Australia, there were 264 respondents to the Kaggle survey. I'm interested to see what the state of data science is in Australia. We definitely don't have as many Kaggle respondents as some other countries.

In [None]:
# Number of respondents per highest level of formal education (Q4), most common first.
q4_question = 'What is the highest level of formal education that you have attained or plan to attain within the next 2 years?'
high_degrees = data.groupby("Q4")["Q4"].count()
high_degrees = high_degrees.drop(labels=q4_question)  # the question text is not an answer
high_degrees.sort_values(ascending=False).plot.bar(title="Highest level of formal education attained or will attain in next 2 years")


The majority of Kaggle respondents have obtained, or plan to obtain, either a Master's or a Bachelor's degree.

In [None]:
# Highest degree (Q4) by gender (Q2): worldwide on the left, Australia only on the right.

# Worldwide rows, without the question-text row 0.
high_degree_gender_1 = data[["Q2", "Q4"]].drop(index=0)

# Australia-only view: restrict to Australian respondents who answered Man or Woman,
# then count answers per (gender, degree).
aus_rows = data[["Q2", "Q3", "Q4"]].drop(index=0)
in_australia = aus_rows["Q3"] == "Australia"
man_or_woman = aus_rows["Q2"].isin(["Man", "Woman"])
high_degree_gender_aus = (
    aus_rows[in_australia & man_or_woman]
    .groupby(["Q2", "Q4"])[["Q4"]]
    .count()
)
# Rename the count column and unstack the degree level so each degree becomes its own bar series.
high_degree_gender_aus_unstacked = (
    high_degree_gender_aus
    .rename(columns={"Q4": "count"})
    .unstack()
)

# Worldwide counts per (gender, degree), largest first.
high_degree_gender = (
    high_degree_gender_1
    .groupby(["Q2", "Q4"])[["Q4"]]
    .count()
    .rename(columns={"Q4": "count"})
    .sort_values(by="count", ascending=False)
)
# Keep only the Man and Woman rows, then unstack for a grouped bar plot.
high_degree_gender_unstacked = high_degree_gender.loc[["Man", "Woman"], :].unstack()

# Two side-by-side panels sharing the same legend placement.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 4))
panels = [
    (high_degree_gender_unstacked, "Similar trends in education preference between men and women worldwide"),
    (high_degree_gender_aus_unstacked, "Trends in education preference between men and women in Australia only"),
]
for axis, (counts, panel_title) in zip(ax, panels):
    counts.plot(kind="bar", title=panel_title, ax=axis)
    axis.legend(bbox_to_anchor=(1, -0.3), fontsize=10)

plt.show()

On the worldwide scale, the data shows that Bachelor's and Master's degrees are held (or planned within the next 2 years) in fairly equal numbers. In Australia, however, Master's degrees seem to be slightly more common than Bachelor's degrees. This may reflect a slower push or emphasis on computer science or data during high school, where students may base their selection of a Bachelor's degree on their school knowledge/training. It can also simply reflect that more people are upskilling or making career changes into data science through a Master's degree, and at an older age.

In [None]:
# What is the age range of those holding Bachelor's vs Master's degrees in Australia?
# Q1 = age group, Q2 = gender, Q3 = country, Q4 = highest degree.
age_world = data[["Q1", "Q2", "Q3", "Q4"]]
age_world = age_world.drop(0, axis=0)  # row 0 is the question text
# filter for Australia only
age_world_aus = age_world[age_world["Q3"].isin(["Australia"])]
# divide into 2 datasets for man and woman
age_world_aus_man = age_world_aus[age_world_aus["Q2"].isin(["Man"])]
age_world_aus_woman = age_world_aus[age_world_aus["Q2"].isin(["Woman"])]
# counts per (age group, degree); unstack so each degree becomes its own bar series
age_world_aus_man_unstacked = age_world_aus_man.groupby(["Q1", "Q4"])[["Q4"]].count().unstack()
age_world_aus_woman_unstacked = age_world_aus_woman.groupby(["Q1", "Q4"])[["Q4"]].count().unstack()

# Bar plots place bars by position, and sharex=True makes both panels use one set of
# tick labels. If an age group (or degree) is missing for one gender, the bars of one
# panel would sit under the wrong labels / colours. Give both frames the same rows and
# columns; combinations without respondents stay NaN and are drawn as empty slots.
age_groups = age_world_aus_man_unstacked.index.union(age_world_aus_woman_unstacked.index)
degree_columns = age_world_aus_man_unstacked.columns.union(age_world_aus_woman_unstacked.columns)
age_world_aus_man_unstacked = age_world_aus_man_unstacked.reindex(index=age_groups, columns=degree_columns)
age_world_aus_woman_unstacked = age_world_aus_woman_unstacked.reindex(index=age_groups, columns=degree_columns)

# prepare to plot: two panels sharing both axes so the counts are directly comparable
fig, ax = plt.subplots(1,2, figsize=(25, 4), sharey=True, sharex=True)

age_world_aus_man_unstacked.plot(kind="bar", title="Education according to age amongst Australian men", ax=ax[0])
ax[0].legend(bbox_to_anchor=(1, -0.3), fontsize=10)
age_world_aus_woman_unstacked.plot(kind="bar", title="Education according to age amongst Australian women", ax=ax[1])
ax[1].legend(bbox_to_anchor=(1, -0.3), fontsize=10)


plt.show()

What stands out from this data is that, across all age groups, young and old, a proportion of Australian men hold a Bachelor's degree as their highest qualification. Australian women over 35 generally have a degree higher than a Bachelor's, perhaps suggesting that women feel they need a higher qualification to work in this field, or that their original Bachelor's degree was not in data/computer science.

In [None]:
# Current role (Q5) by age group (Q1) for Australian men and women.
# Q1 = age, Q2 = gender, Q3 = country, Q4 = degree, Q5 = current role, Q6 = coding experience.
role_exp = data[["Q1", "Q2", "Q3", "Q4", "Q5", "Q6"]].drop(index=0)  # row 0 is the question text

# Australia only
role_exp_aus = role_exp[role_exp["Q3"] == "Australia"]

# Per gender: count respondents for each (age group, role) pair and name the column "count".
role_exp_aus_man = (
    role_exp_aus[role_exp_aus["Q2"] == "Man"]
    .groupby(["Q1", "Q5"])[["Q5"]]
    .count()
    .rename(columns={"Q5": "count"})
)
role_exp_aus_woman = (
    role_exp_aus[role_exp_aus["Q2"] == "Woman"]
    .groupby(["Q1", "Q5"])[["Q5"]]
    .count()
    .rename(columns={"Q5": "count"})
)

# Unstack the role level so each role becomes its own bar series.
role_exp_aus_man_unstacked = role_exp_aus_man.unstack()
role_exp_aus_woman_unstacked = role_exp_aus_woman.unstack()

# Two stacked panels with a shared y axis; the x axes are independent.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(25, 10), sharey=True, sharex=False)

role_exp_aus_man_unstacked.plot(
    kind="bar",
    title="Current jobs by age group amongst Australian men",
    width=1,
    ax=ax[0],
)
ax[0].legend(bbox_to_anchor=(1,0), loc="center left", fontsize=10)

role_exp_aus_woman_unstacked.plot(
    kind="bar",
    title="Current jobs by age group amongst Australian women",
    width=1,
    ax=ax[1],
)
ax[1].legend(bbox_to_anchor=(1, 1), fontsize=10)

plt.show()

What is visually striking from these graphs is that there are fewer colours (i.e. less job diversity) for Australian female Kagglers compared to men.

In [None]:
# Which industries (Q20) do Australian respondents work in, split by gender (Q2)?
aus_industry = data[["Q2", "Q3", "Q20"]].drop(0, axis=0)  # row 0 is the question text
aus_industry = aus_industry[aus_industry["Q3"].isin(["Australia"])]
# the country column is constant after filtering, so drop it
aus_industry = aus_industry.drop(["Q3"], axis=1)
# select out men and women
aus_industry_men = aus_industry[aus_industry["Q2"].isin(["Man"])]
aus_industry_women = aus_industry[aus_industry["Q2"].isin(["Woman"])]

# respondents per industry (a Series indexed by industry), largest first
aus_industry_men_unstacked = aus_industry_men.groupby(["Q20"])["Q20"].count().sort_values(ascending=False)
aus_industry_women_unstacked = aus_industry_women.groupby(["Q20"])["Q20"].count().sort_values(ascending=False)

# Two panels with a shared y axis so the counts are directly comparable.
fig, ax = plt.subplots(1,2, figsize=(20,4), sharey=True)
panels = [
    (aus_industry_men_unstacked, "Industries where male Australian kagglers work"),
    (aus_industry_women_unstacked, "Industries where female Australian kagglers work"),
]
for axis, (industry_counts, panel_title) in zip(ax, panels):
    industry_counts.plot(kind="bar", title=panel_title, ax=axis)
    # Each panel shows a single series, so a legend adds nothing useful;
    # label the axes instead.
    axis.set_xlabel("Industry")
    axis.set_ylabel("Number of respondents")
plt.show()


We see that the top 4 industries for data scientists/Kagglers working in Australia are Academics/Education, Computers/Technology, Accounting/Finance and Government/Public Service, along with "Other". This is similar between men and women, and reflects where data science and machine learning are most utilised in Australia.


What sorts of jobs do people get with:
    <1 year of coding?
    <2 years of coding?

Where do those who have used/completed courses on DataCamp end up? What jobs do they do? What salary do they earn?

In [None]:
# List the column names that belong to question 40 (not displayed: it is not the cell's last expression).
data.columns[data.columns.str.contains("Q40")]

# Columns of interest, without the question-text row 0; missing answers become the string "none".
datacamp_columns = ["Q2", "Q40_Part_4", "countries", "isocode", "Q4", "Q5"]
datacamp = (
    data[datacamp_columns]
    .drop(index=0)
    .replace(np.nan, "none")
)

# Count rows per (degree, gender, Q40_Part_4 answer).
# NOTE(review): Q40_Part_4 is presumably the DataCamp option - confirm against the survey schema.
datacamp = datacamp.groupby(["Q4", "Q2", "Q40_Part_4"])["Q40_Part_4"].count()
datacamp.head(n=10)

# What does the workforce for data science look like in australia?
# look at the size of workforce
# the types of technologies, are they slow on ML?
# who gets to deal with the most advanced stuff?

In [None]:
# Is there a gender preference for particular job positions worldwide?
# Gender (Q2) and current role (Q5), without the question-text row 0.
gender_job = data.loc[:, ["Q2", "Q5"]].drop(index=0)
gender_job.head()


An EDA looking at which countries do the best on the gender metrics.
1. You can plot a world map as a choropleth of women/men as a proportion of total respondents from that country. Which country had the smallest gap between male and female Kaggle respondents?
2. Wages
3. skill level. where in the world are women more skilled in ...?

can we eventually rank countries on how well they do?
then draw a conclusion to the best?

In [None]:
# Number of respondents per (gender, current role) pair, as a one-column DataFrame.
gender_job = gender_job.groupby(["Q2", "Q5"])["Q5"].count().to_frame()
# Display the counts themselves; a bare `.loc` only shows the indexer object.
gender_job

The preference for Bachelor's and Master's degrees does not differ between genders. Everyone seems to prefer a Bachelor's or a Master's degree.
Given that a very similar number of people take a Bachelor's or a Master's degree, is there anything that differentiates people's careers based on which one they took? Is there any reason to suspect a difference of some sort?
Let's do a comparison between people whose highest degree is a Bachelor's and those whose highest degree is a Master's.

In [None]:
#any job differences between those studying bachelor vs masters
#Q5 - their current job positions

# differences in age groups for when they study bachelor vs masters

In [None]:
# Peek at the first five rows of the gender/country frame.
# (Open idea: pivot wider so that "Man" and "Woman" become columns of counts per country.)
gender_country.head(n=5)

In [None]:
# Is there a country with a smaller gender gap than the rest?
# Q2 = gender, Q3 = country: count respondents for every (gender, country) pair.
print(gender_country.head(n=5))
gender_country.groupby(by=["Q2", "Q3"])["Q3"].count()

In [None]:
# Per-country counts for women and for men, as input for a grouped bar chart
# (a grouped bar chart is just several bar plots drawn side by side).
is_woman = gender_country["Q2"] == "Woman"
female_country = gender_country.loc[is_woman]
fem_count = female_country.groupby("Q3").count()
print(fem_count)

is_man = gender_country["Q2"] == "Man"
male_country = gender_country.loc[is_man]
male_count = male_country.groupby("Q3").count()
print(male_count)

In [None]:
# Worldwide number of survey responses per gender answer (Q2).
# Equivalent in spirit to gender_country["Q2"].value_counts().
q2_question = 'What is your gender? - Selected Choice'
allgender = (
    gender_country
    .groupby("Q2")
    .count()
    .sort_values(by=["Q3"], axis=0, ascending=False)
    .drop(q2_question, axis=0)  # the question text is not an answer
)
allgenderdf = pd.DataFrame(allgender)

# One bar per gender answer, height = number of responses.
plt.bar(x=allgender.index.values, height=allgender["Q3"])
plt.xticks(rotation=90)
plt.title('Worldwide survey responses by gender')
plt.show()

type(allgender)

In [None]:
# I am interested in R users, the minority compared to python, in AUSTRALIA
# who are those people who recommend R as a first language? - this subgroup of people
# any gender preference for R/python
# how has R usage changed over time?
# what differences do we see in those self identified as data scientist vs analyst?
# what about research scientists? 
# who uses both R and python? - this sub group of people, that use both, who recommends R, vs python? whats different about those people
# what about australian females - what do they do? australian female research scientists, vs those in the world..