## Survey Methodology:

The 2021 Kaggle DS & ML Survey received 25,973 usable responses from participants in 171
different countries and territories.

You can find the full list of questions and answer choices in the file
"kaggle_survey_2021_answer_choices.pdf".

Responses to multiple choice questions (only a single choice can be selected) were recorded in
individual columns. Responses to multiple selection questions (multiple choices can be selected)
were split into multiple columns (with one column per answer choice).

To ensure response quality, we excluded respondents that were flagged by our survey system as
“Spam” or “Duplicate”. We also dropped responses from respondents that spent less than 2
minutes completing the survey, as well as responses from respondents that selected fewer than
15 answer choices in total.

To protect the respondents’ privacy, free-form text responses were not included in the public
survey dataset, and the order of the rows was shuffled (responses are not displayed in
chronological order). Likewise, if a country or territory had fewer than 50 respondents, we
grouped them into a group named “Other” for the sake of anonymity.

An invitation to participate in the survey was sent to the entire Kaggle community (anyone
opted-in to the Kaggle Email List). The survey was also promoted on the Kaggle website (via
both banners and popups) as well as on the Kaggle Twitter channel.

The survey was live from 09/01/2021 to 10/04/2021. We allowed respondents to complete the
survey at any time during that window.

The survey data was released under a CC 2.0 license:

# 1. Loading Libraries

In [None]:
import pandas as pd # For Data Analysis
import numpy as np # For numerical operations
import matplotlib.pyplot as plt # For Visualization
import seaborn as sns # For Visualization
import folium # For Visualization with Map
import math
import json

plt.style.use("fivethirtyeight")

In [None]:
# Raise pandas' display limits so long/wide frames are printed without truncation.
for option_name, option_value in (
    ("display.max_rows", 500),
    ("display.max_columns", 500),
    ("display.width", 1000),
):
    pd.set_option(option_name, option_value)

# 2. Loading Datasets

In [None]:
# Load the survey responses CSV into the working DataFrame `df`
# (relative path; presumably the Kaggle input mount — confirm when running elsewhere).
df = pd.read_csv("../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv")

In [None]:
df.head()

# 3.0 Analysis of data

## Column Name Change

In [None]:
# Converting the columns to Lower Case and removing space and replacing symbols
df.columns

In [None]:
# Normalise the column labels: lower-case them, turn spaces into underscores
# and drop parentheses (single translate pass instead of chained replaces).
_label_cleanup = str.maketrans({" ": "_", "(": None, ")": None})
df.columns = [label.lower().translate(_label_cleanup) for label in df.columns]

In [None]:
df.head()

In [None]:
# Shape of Dataframe
df.shape

In [None]:
df.head()

## Trying to reduce the memory usage

In [None]:
df.iloc[0]

#### Remove the first row, since it's a description

In [None]:
# Drop the row labelled 0 in place.
df.drop(index=0, inplace=True)

In [None]:
df.head()

In [None]:
def reduce_memory_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Integer columns are cast to the narrowest of int8/int16/int32/int64 whose
    range contains the column's min and max (bounds inclusive). Float columns
    are cast to float16 or float32 when min and max lie strictly inside that
    type's finite range, otherwise to float64. Non-numeric columns are left
    untouched.

    NOTE: the float downcast only checks the value range, not precision, so
    float16/float32 columns may lose significant digits.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the final memory usage and the reduction.

    Returns:
        The same ``df`` object, after downcasting.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == "int":
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column holding exactly 127 (or -128)
                # still fits in int8 and must not be widened.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN (all comparisons False).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard against a zero baseline so the report can never divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

In [None]:
memory_usage_train = df.memory_usage(deep=True) / 1024 ** 2

In [None]:
memory_usage_train

In [None]:
memory_usage_train.sum()

In [None]:
df = reduce_memory_usage(df=df)

In [None]:
memory_usage_train = df.memory_usage(deep=True) / 1024 ** 2

In [None]:
memory_usage_train.sum()

In [None]:
df.dtypes

In [None]:
# Cast the completion-time column to int32 (item access instead of attribute access).
_seconds_col = "time_from_start_to_finish_seconds"
df[_seconds_col] = df[_seconds_col].astype("int32")

In [None]:
memory_usage_train = df.memory_usage(deep=True) / 1024 ** 2
memory_usage_train.sum()

# 3.0 Finding Missing Values

In [None]:
df.isnull().sum()

In [None]:
plt.figure(figsize=(15, 15))
sns.heatmap(df.isnull(), cbar=False)

#### Inference: 
There are many null values present in the columns. 
Since we are only doing exploratory analysis, we can ignore them for now. 

# 3.1 Time From Start to Finish

In [None]:
total_time_invested = df["time_from_start_to_finish_seconds"]

In [None]:
total_time_invested

In [None]:
times = total_time_invested.values

In [None]:
def convert_sec_to_days(x):
    """Return the duration ``x`` (given in seconds) expressed in fractional days."""
    seconds_per_day = 60.0 * 60.0 * 24.0
    return x / seconds_per_day

In [None]:
# Overwrite the column with its value converted by convert_sec_to_days.
# NOTE: the column keeps its "_seconds" name although it now holds days;
# the following cells read it as days.
df["time_from_start_to_finish_seconds"] = df["time_from_start_to_finish_seconds"].apply(convert_sec_to_days)

In [None]:
min(df["time_from_start_to_finish_seconds"])

In [None]:
max(df["time_from_start_to_finish_seconds"])

We can create a new column, time_taken_by_days, to make use of this valuable data as well. 

In [None]:
# Round each (fractional-day) duration up to a whole number of days.
df["time_taken_by_days"] = df["time_from_start_to_finish_seconds"].apply(math.ceil)

In [None]:
time_taken_data = df.time_taken_by_days.value_counts().sort_index()

In [None]:
print(time_taken_data)

In [None]:
plt.figure(figsize=(12,5))
plt.plot(time_taken_data.index, time_taken_data.values, 
         color='#8f3a84', linestyle='solid', 
         label="Time Taken")
plt.xlabel("Days")
plt.ylabel("No. of People")
plt.legend()
plt.xticks(rotation=45)
plt.title("How Fast the Survey is completed by each ?")
plt.show()

In [None]:
time_taken_data = df.time_taken_by_days.value_counts(normalize=True).sort_index()

In [None]:
# Bar chart of the share of respondents per completion time (whole days).
# time_taken_data holds normalised value counts here (fractions, not head counts).
plt.figure(figsize=(12,5))
plt.bar(
    x=time_taken_data.index,
    height=time_taken_data.values,
    label="Share of respondents",  # gives plt.legend() a labelled artist to show
)
# Write the percentage next to every bar.
for day, share in time_taken_data.items():
    plt.text(x=day, y=share, s=f"{round(share*100, 2)} %", fontdict=dict(fontsize=9), rotation=90)
plt.xlabel("Days")
# The heights are fractions of all respondents, not absolute counts.
plt.ylabel("Share of People")
plt.legend()
plt.xticks(rotation=90)
plt.title("How Fast the Survey is completed by each ?")
plt.show()

#### Inference: 
From the above plot it's clear that 98.35% of the respondents took one day or less. 

# 3.2 Q1 - What is your age (# years)?

In [None]:
df.head()

In [None]:
df.q1.isnull().sum()

In [None]:
# Total no of people by category wise. 
df.q1.value_counts().sort_index()

In [None]:
df.sort_values(by=["q1"], inplace=True)

In [None]:
df.q1.value_counts().plot(kind='pie')

In [None]:
df.q1.value_counts(normalize=True).sort_index().plot(kind='bar')

In [None]:
# Count plot of respondents per age category (q1).
plt.figure(figsize=(12,5))
# The column must be passed as x=...: current seaborn releases only accept
# `data` positionally, so the bare "q1" argument fails there.
sns.countplot(x="q1", data=df, palette="hls")
plt.ylabel("Count", fontsize=18)
plt.xlabel("Age Category", fontsize=18)
plt.title("Age Category Distribution of Survey", fontsize=20)

In [None]:
df.groupby('time_taken_by_days').q1.value_counts()[1].sort_index()

In [None]:
df.groupby('time_taken_by_days').q1.value_counts()[1].sort_index().plot(kind='bar')

#### Inference: 
From the above plot, we can see that the age distribution of people who completed the survey within a day is the same as that of the whole survey. 

In [None]:
df[df['time_taken_by_days']>1].q1.value_counts().sort_index().plot(kind='bar')
plt.xlabel("Age Category")
plt.ylabel("People Count")

#### Inference: 
From the above plot, we can see that it is people in the age categories 18-21, 22-24 and 25-29 who have taken more than one day to complete the survey.

# 3.3.0 Q2 - What is your gender?

What is your gender?

- Man
- Woman
- Nonbinary
- Prefer not to say
- Prefer to self-describe

In [None]:
# Finding Null Values
df.q2.isnull().sum()

#### Distribution of Gender. 

In [None]:
data = df.q2.value_counts()
print(data)

In [None]:
values = data.values
labels = data.index

title = plt.title('Distribution of Gender')
title.set_ha("left")
plt.gca().axis("equal")
pie = plt.pie(values, startangle=0, autopct="%.1f%%")
plt.legend(pie[0],labels, bbox_to_anchor=(1,0.5), loc="center right", fontsize=10, 
           bbox_transform=plt.gcf().transFigure)
plt.subplots_adjust(left=0.0, bottom=0.1, right=0.45)


In [None]:
# Count plot of respondents per gender (q2); every bar is labelled with its
# share of all rows in df.
sns.set(style="whitegrid")
plt.figure(figsize=(12,5))
total = float(df.shape[0])
ax = sns.countplot(x="q2", data=df)
plt.title('Gender Distribution', fontsize=20)
for p in ax.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height()/total)
    # Anchor at the bar's horizontal centre so ha='center' puts the label over the bar
    # (the right edge was used before, pushing the text off the bar).
    x = p.get_x() + p.get_width() / 2
    y = p.get_height()
    ax.annotate(percentage, (x, y),ha='center')
plt.xlabel("Gender")
plt.ylabel("Count")
plt.show()

### Both Man and Woman

In [None]:
# Count plot of age category (q1) split by gender (q2), restricted to Man/Woman;
# every bar is labelled with its share of the Man+Woman rows.
sns.set(style="whitegrid")
plt.figure(figsize=(12,5))

# For Man & Woman
data = df[df["q2"].isin(["Man","Woman"])]

total = float(data.shape[0])
ax = sns.countplot(x="q1", data=data, hue='q2')
plt.title('Gender Distribution (Man/Woman)', fontsize=20)
for p in ax.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height()/total)
    # Anchor at the bar's horizontal centre so ha='center' puts the label over the bar.
    x = p.get_x() + p.get_width() / 2
    y = p.get_height()
    ax.annotate(percentage, (x, y),ha='center')
# The x axis shows q1 (age category); gender is the hue.
plt.xlabel("Age Category")
plt.ylabel("Count")
plt.show()

#### From the above graph, we can see there are more male participants than all the other genders combined. 

# 3.3.1 Comparison of gender with age_category  (considering the major category - Man/Woman )

### For Man

In [None]:
# Age-category (q1) breakdown of the respondents whose gender (q2) is "Man";
# every bar is labelled with its share of all "Man" rows.
sns.set(style="whitegrid")
plt.figure(figsize=(12,5))

# For Men
data = df[df["q2"]=="Man"]

total = float(data.shape[0])
ax = sns.countplot(x="q2", data=data, hue="q1")
plt.title('Gender Distribution (Man)', fontsize=20)
for p in ax.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height()/total)
    # Anchor at the bar's horizontal centre so ha='center' puts the label over the bar.
    x = p.get_x() + p.get_width() / 2
    y = p.get_height()
    ax.annotate(percentage, (x, y),ha='center')
plt.xlabel("Gender")
plt.ylabel("Count")
plt.show()

### For Woman

In [None]:
# Age-category (q1) breakdown of the respondents whose gender (q2) is "Woman";
# every bar is labelled with its share of all "Woman" rows.
sns.set(style="whitegrid")
plt.figure(figsize=(12,5))


# For Women
data = df[df["q2"]=="Woman"]
total = float(data.shape[0])

ax = sns.countplot(x="q2", data=data, hue="q1")
plt.title('Gender Distribution (Woman)', fontsize=20)
for p in ax.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height()/total)
    # Anchor at the bar's horizontal centre so ha='center' puts the label over the bar.
    x = p.get_x() + p.get_width() / 2
    y = p.get_height()
    ax.annotate(percentage, (x, y),ha='center')
plt.xlabel("Gender")
plt.ylabel("Count")
plt.show()

#### Inference: 
By comparing the above two plots, we can say that <b>a larger share of the women than of the men active in ML and DS fall in the age categories [18-21, 22-24, 25-29].</b>

# 3.3.2 Comparison of gender with time taken for the survey completion

In [None]:
# Completion time in days (1-9 only) split by gender (Man/Woman);
# every bar is labelled with its share of the rows kept by the filter.
sns.set(style="whitegrid")
plt.figure(figsize=(12,5))

# For Man & Woman
data = df[df["q2"].isin(["Man","Woman"]) & df["time_taken_by_days"].isin(list(range(1,10)))]

total = float(data.shape[0])
ax = sns.countplot(x="time_taken_by_days", data=data, hue='q2')
plt.title('Gender with time taken', fontsize=20)
for p in ax.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height()/total)
    # Anchor at the bar's horizontal centre so ha='center' puts the label over the bar.
    x = p.get_x() + p.get_width() / 2
    y = p.get_height()
    ax.annotate(percentage, (x, y),ha='center')
plt.xlabel("Days")
plt.ylabel("Count")
plt.show()

# 3.4.0 Q3. In which country do you currently reside? 

In which country do you currently reside?
[List of Countries]

In [None]:
df.head(5)

#### Countries count

In [None]:
df.q3.value_counts().sort_values(ascending=False)

In [None]:
df.q3.value_counts().sort_values(ascending=False)[:5].plot(kind='bar')
plt.xlabel("Countries")
plt.ylabel("Count")
plt.legend(["Count"])

From the above plot we can see that the most participants in the Kaggle Survey are from <b>India</b>.  

In [None]:
_c = df[df.q3!='Other'].q3.value_counts().sort_values(ascending=False)[:6]
_c.index, _c.values

In [None]:
# Bar plot of the respondent counts in _c (top countries, "Other" excluded);
# every bar is labelled with its share of all survey rows.
sns.set(style="whitegrid")
plt.figure(figsize=(10,5))
# Use df as the denominator: `data` is a leftover filtered frame from an earlier
# cell and would make the percentages depend on that unrelated filter.
total = float(df.shape[0])

ax = sns.barplot(x=_c.index, y=_c.values)
plt.title('Country Distribution', fontsize=20)
for p in ax.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height()/total)
    # Anchor at the bar's horizontal centre so ha='center' puts the label over the bar.
    x = p.get_x() + p.get_width() / 2
    y = p.get_height()
    ax.annotate(percentage, (x, y),ha='center')
plt.xlabel("Countries")
plt.ylabel("Count")
plt.show()

In [None]:
# Respondents per country (the countries in _c) split by gender (Man/Woman);
# every bar is labelled with its share of the plotted rows.
_gender_categories = ["Man", "Woman"]
_countries_list = _c.index

filtered_data = df[ (df["q2"].isin(_gender_categories)) & (df["q3"].isin(_countries_list)) ]

# Denominator is the frame actually plotted; `data` is a leftover from an
# earlier cell and gave percentages relative to an unrelated filter.
total = float(filtered_data.shape[0])

sns.set(style="whitegrid")
plt.figure(figsize=(10,5))

ax = sns.countplot(x="q3", data=filtered_data, hue="q2")
plt.title('Gender Country Distribution', fontsize=20)
for p in ax.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height()/total)
    # Anchor at the bar's horizontal centre so ha='center' puts the label over the bar.
    x = p.get_x() + p.get_width() / 2
    y = p.get_height()
    ax.annotate(percentage, (x, y),ha='center')
plt.xlabel("Countries")
plt.ylabel("Count")
plt.show()

In [None]:
# Age category (q1) split by gender (Man/Woman) for the first country in _c;
# every bar is labelled with its share of the plotted rows.
# NOTE(review): the title/x-label hard-code "India"; this assumes India is the
# first entry of _c — confirm against the data.
_gender_categories = ["Man", "Woman"]
_countries_list = _c.index[:1]

filtered_data = df[ (df["q2"].isin(_gender_categories)) & (df["q3"].isin(_countries_list)) ]

# Denominator is the frame actually plotted; `data` is a leftover from an
# earlier cell and gave percentages relative to an unrelated filter.
total = float(filtered_data.shape[0])

sns.set(style="whitegrid")
plt.figure(figsize=(14,5))

ax = sns.countplot(x="q1", data=filtered_data, hue="q2")
plt.title('India Age Category/Gender Distribution', fontsize=20)
for p in ax.patches:
    percentage = '{:.3f}%'.format(100 * p.get_height()/total)
    # Anchor at the bar's horizontal centre so ha='center' puts the label over the bar.
    x = p.get_x() + p.get_width() / 2
    y = p.get_height()
    ax.annotate(percentage, (x, y),ha='center')
plt.xlabel("India")
plt.ylabel("Count")
plt.show()

#### From the above graph of India, we can see that the number of people in the age category 18-21 is greater than in the other categories. 
More young people in India are gaining knowledge in the field of Data Science and Machine Learning.

# 3.4.1 Visualizing in openstreet Map

In [None]:
# Reading the Geojson file -> consists of Polygon geometries of the world.
# The context manager closes the file handle as soon as parsing is done
# (it was left open before); GeoJSON is UTF-8 by specification.
with open("../input/world-polygon-geometry/countries.geo.json", "r", encoding="utf-8") as json_file:
    countries_geojson = json.load(json_file)

In [None]:
# Collect the "name" property of every GeoJSON feature (None when absent) so the
# survey's country names can be compared against it.
countries_from_geojson = [
    feature.get("properties", {}).get("name")
    for feature in countries_geojson["features"]
]

In [None]:
# Exploring the geojson features
countries_geojson["features"][0]

In [None]:
print(sorted(countries_from_geojson))

In [None]:
# Print every distinct q3 value that has no matching name in the GeoJSON.
_missing_from_geojson = [
    country_name
    for country_name in df.q3.unique()
    if country_name not in countries_from_geojson
]
for country_name in _missing_from_geojson:
    print(country_name)

Here we can see 'Viet Nam' is spelled differently in the data than in the GeoJSON, so we can change that. (We can ignore the others for the time being - will update in future once I get the geometry data.)

In [None]:
# Replacing "Viet Nam" with "Vietnam" so the name matches the GeoJSON.
# Assign the result back to the column: an in-place replace on `df.q3` acts on
# an intermediate Series and is not guaranteed to update df (it does nothing
# under pandas Copy-on-Write).
df["q3"] = df["q3"].replace({"Viet Nam":"Vietnam"})

In [None]:
country_df = df.q3.value_counts().rename_axis('Name').reset_index(name='Count')
country_df.head()

In [None]:
# Store a tooltip string ("Name: ... | Total Count: ...") on every GeoJSON
# feature, using the per-country respondent counts (0 when the country has none).
country_to_count = df.q3.value_counts()

for feature in countries_geojson["features"]:
    country_name = feature.get("properties", {}).get("name")
    respondents = country_to_count.get(country_name, 0)
    feature['properties']['content'] = f"Name: {country_name} | Total Count: {respondents}"

In [None]:
# Build a world choropleth of respondent counts per country with a hover tooltip.
df_map = folium.Map(
    location=[0,0],
    zoom_start=3
)

# Add several base-map layers; LayerControl below lets the viewer switch between them.
# NOTE(review): recent folium releases reportedly dropped the built-in Stamen
# tile names — confirm these still resolve in the target environment.
tiles = ['stamenwatercolor', 'cartodbpositron', 'openstreetmap', 'stamenterrain']
for tile in tiles:
    folium.TileLayer(tile).add_to(df_map)


# Colour each GeoJSON feature by country_df's 'Count', joined on the feature's name property.
choropleth = folium.Choropleth(
    geo_data=countries_geojson,
    key_on='feature.properties.name',
    data=country_df, 
    columns=['Name', 'Count'], 
    fill_color='OrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    highlight=True,
    legend_name="Kaggle Survey"
).add_to(df_map)

folium.LayerControl().add_to(df_map)

# Tooltip shows the 'content' property stored on each feature.
choropleth.geojson.add_child(
    folium.features.GeoJsonTooltip(['content'], labels=False,
                                   style=('background-color: grey; color: white;'))
)

# Last expression of the cell: renders the map in the notebook.
df_map

#### Inference : 
From the above map, we can see the dark areas, where there is no contribution towards the survey. 
India contributed the most to the survey. 

Areas like Kazakhstan, Ethiopia and Algeria were the ones that contributed in smaller numbers. 


# 3.5.0 Q4. What is the highest level of formal education that you have attained or plan to attain within the next 2 years?

- No formal education past high school
- Some college/university study without earning a bachelor’s degree
- Bachelor’s degree
- Master’s degree
- Doctoral degree
- Professional doctorate
- I prefer not to answer

In [None]:
df.q4.value_counts()

In [None]:
# Replace the long label "Some college/university study without earning a
# bachelor’s degree" with the short word "dropouts" (keeps axis labels readable).
# Assign the result back to the column: an in-place replace on `df.q4` acts on
# an intermediate Series and is not guaranteed to update df (it does nothing
# under pandas Copy-on-Write).
df["q4"] = df["q4"].replace({
    "Some college/university study without earning a bachelor’s degree":"dropouts"
})

In [None]:
df.q4.value_counts().sort_values(ascending=False).plot(kind='bar')
plt.xlabel("Education Qualifications")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.legend(["Count"])

In [None]:
# Count plot of education level (q4), most frequent first; every bar is
# labelled with its share of all rows in df.
sns.set(style="whitegrid")
plt.figure(figsize=(10,5))
total = float(df.shape[0])

ax = sns.countplot(x="q4", data=df, order=df.q4.value_counts().sort_values(ascending=False).index)
plt.title('Education Qualifications', fontsize=20)
for p in ax.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height()/total)
    # Anchor at the bar's horizontal centre so ha='center' puts the label over the bar.
    x = p.get_x() + p.get_width() / 2
    y = p.get_height()
    ax.annotate(percentage, (x, y),ha='center')
plt.xlabel("Education Categories")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.show()

#### Inference:
From the above plot, we can see that people who completed or are pursuing a <b>Master's degree or Bachelor's degree</b> participated the most in the Kaggle survey. 

In [None]:
# Education level (q4) split by gender (Man/Woman), most frequent level first;
# every bar is labelled with its share of the plotted rows.
_gender_categories = ["Man", "Woman"]

filtered_data = df[df["q2"].isin(_gender_categories)]

# Denominator is the frame actually plotted; `data` is a leftover from an
# earlier cell and gave percentages relative to an unrelated filter.
total = float(filtered_data.shape[0])

sns.set(style="whitegrid")
plt.figure(figsize=(10,5))

ax = sns.countplot(x="q4", data=filtered_data, hue="q2", order=df.q4.value_counts().sort_values(ascending=False).index)
plt.title('Gender Qualification Distribution', fontsize=20)
for p in ax.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height()/total)
    # Anchor at the bar's horizontal centre so ha='center' puts the label over the bar.
    x = p.get_x() + p.get_width() / 2
    y = p.get_height()
    ax.annotate(percentage, (x, y),ha='center')
plt.xlabel("Qualifications")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.show()

#### Inference:
From the above plot, we can see that the female count is between 1/5 and 1/4 of the male count. 

In [None]:
df.head(2)

In [None]:
df[df.q4=="Bachelor’s degree"].q1.value_counts()

In [None]:
# Age category (q1) split by gender (Man/Woman) for respondents whose q4 is
# "Bachelor’s degree"; every bar is labelled with its share of the plotted rows.
_gender_categories = ["Man", "Woman"]

filtered_data = df[(df.q4=="Bachelor’s degree") & (df["q2"].isin(_gender_categories))]

# Denominator is the frame actually plotted; `data` is a leftover from an
# earlier cell and gave percentages relative to an unrelated filter.
total = float(filtered_data.shape[0])

sns.set(style="whitegrid")
plt.figure(figsize=(10,5))

ax = sns.countplot(x="q1", data=filtered_data, hue="q2", 
                   order=df[df.q4=="Bachelor’s degree"].q1.value_counts().sort_values(ascending=False).index,
                  palette=['#F5B041',"#73C6B6"])
plt.title('Bachelors Degree by Gender/Age Distribution', fontsize=20)
for p in ax.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height()/total)
    # Anchor at the bar's horizontal centre so ha='center' puts the label over the bar.
    x = p.get_x() + p.get_width() / 2
    y = p.get_height()
    ax.annotate(percentage, (x, y),ha='center')
plt.xlabel("Age Category")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.show()

# Maximum Qualification in Each Country

In [None]:
df.groupby(df.q3).q4.value_counts().sort_values(ascending=False)

In [None]:
# For every country (q3) find the qualification (q4) with the highest count.
# country_max maps country -> best count seen so far,
# country_qual maps country -> the qualification holding that count.
country_max = {}
country_qual = {}
d = df.groupby(df.q3).q4.value_counts().sort_values(ascending=False)
for key, val in d.items():
    # key is a (country, qualification) pair from the grouped value counts.
    _country = key[0]
    _qualification = key[1]
    # Record the first entry for a country, or a strictly larger count.
    # (The update used to rebind country_max itself to a scalar, which would
    # break every later lookup; it now updates the per-country entry.)
    if _country not in country_max or val > country_max[_country]:
        country_max[_country] = val
        country_qual[_country] = _qualification

In [None]:
country_qual

In [None]:
max_by_qual = pd.DataFrame(data={"Name":country_qual.keys(), "Max":country_qual.values()})
max_by_qual.head()

In [None]:
# Store each country's entry from country_qual on its GeoJSON feature as
# 'max_qualification' ('N/A' when the country is not in country_qual).

for feature in countries_geojson["features"]:
    country_name = feature.get("properties", {}).get("name")
    feature['properties']['max_qualification'] = str(country_qual.get(country_name, 'N/A'))

In [None]:
# Build a world map whose tooltip shows each country's 'max_qualification' property.
df_map = folium.Map(
    location=[0,0],
    zoom_start=3
)

# Add several base-map layers; LayerControl below lets the viewer switch between them.
# NOTE(review): recent folium releases reportedly dropped the built-in Stamen
# tile names — confirm these still resolve in the target environment.
tiles = ['stamenwatercolor', 'cartodbpositron', 'openstreetmap', 'stamenterrain']
for tile in tiles:
    folium.TileLayer(tile).add_to(df_map)


# The fill colour still encodes country_df's 'Count' column; only the tooltip
# carries the qualification.
choropleth = folium.Choropleth(
    geo_data=countries_geojson,
    key_on='feature.properties.name',
    data=country_df, 
    columns=['Name', 'Count'], 
    fill_color='BuPu', fill_opacity=0.7, line_opacity=0.5,
    highlight=True,
    legend_name="Kaggle Survey"
).add_to(df_map)

folium.LayerControl().add_to(df_map)

# Tooltip shows the 'max_qualification' property stored on each feature.
choropleth.geojson.add_child(
    folium.features.GeoJsonTooltip(['max_qualification'], labels=False,
                                   style=('background-color: grey; color: white;'))
)

# Last expression of the cell: renders the map in the notebook.
df_map

# 3.6.0 Q5 Select the title most similar to your current role (or most recent title if retired):

- Business Analyst
- Data Analyst
- Data Engineer
- Data Scientist
- DBA/Database Engineer
- Machine Learning Engineer
- Product Manager
- Program/Project Manager
- Research Scientist
- Software Engineer
- Statistician
- Student
- Currently not employed
- Other

In [None]:
df.q5.value_counts()

## Distribution of Professions

In [None]:
df.q5.value_counts().plot(kind='bar')
plt.xlabel("Profession")
plt.ylabel("Count")
plt.legend(["Count"])

In [None]:
# Count plot of profession (q5); every bar is labelled with its share of all
# rows in df.
sns.set(style="whitegrid")
plt.figure(figsize=(10,5))
# Use df as the denominator: `data` is a leftover filtered frame from an earlier
# cell and would make the percentages depend on that unrelated filter.
total = float(df.shape[0])

ax = sns.countplot(x=df.q5, data=df)
plt.title('Profession Distribution', fontsize=20)
for p in ax.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height()/total)
    # Anchor at the bar's horizontal centre so ha='center' puts the label over the bar.
    x = p.get_x() + p.get_width() / 2
    y = p.get_height()
    ax.annotate(percentage, (x, y),ha='center')
plt.xlabel("Professions")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Student                         6804
# Data Scientist                  3616
# Software Engineer               2449
# Other                           2393
# Data Analyst                    2301
# Currently not employed          1986
# Research Scientist              1538
# Machine Learning Engineer       1499

## Distribution of profession by Gender ( Man, Woman)

In [None]:
# Selected professions (q5) split by gender (Man/Woman); every bar is labelled
# with its share of the plotted rows.
sns.set(style="whitegrid")
sns.set_palette("flare")
plt.figure(figsize=(10,5))

_selected_professions = ["Student", "Data Scientist", "Software Engineer", "Data Analyst", "Research Scientist",
                        "Machine Learning Engineer"]

data = df[ (df.q2.isin(["Man", "Woman"])) & (df.q5.isin(_selected_professions)) ]

# Compute the denominator only after `data` has been rebuilt for this plot;
# it used to be taken from whatever `data` the previous cell left behind.
total = float(data.shape[0])

ax = sns.countplot(x=data.q5, data=data, hue=data.q2)
plt.title('Profession Distribution by Gender', fontsize=20)
for p in ax.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height()/total)
    # Anchor at the bar's horizontal centre so ha='center' puts the label over the bar.
    x = p.get_x() + p.get_width() / 2
    y = p.get_height()
    ax.annotate(percentage, (x, y),ha='center')
plt.xlabel("Professions")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.show()

## Exploring Not Employed Category

### For Man

In [None]:
# filtering out by top 10 not employed areas
df[(df.q2=="Man") & (df.q5 == "Currently not employed")].q3.value_counts()[:11]

In [None]:
# "Currently not employed" men per country (hand-picked country list); every
# bar is labelled with its share of the plotted rows.
sns.set(style="whitegrid")
sns.set_palette("flare")
plt.figure(figsize=(10,5))

_selected_professions = ["Currently not employed"]
_selected_gender = ["Man"]
# "Egypt" was misspelled "Eqypt", so that country could never match the filter.
_selected_countries = ["India", "United States of America", "Nigeria", "Japan", "Russia", 
                       "United Kingdom of Great Britain and Northern Ireland", "Indonesia",
                      "Pakistan", "Brazil", "Egypt"]

data = df[ (df.q2.isin(_selected_gender)) & (df.q5.isin(_selected_professions)) & (df.q3.isin(_selected_countries)) ]

# Compute the denominator only after `data` has been rebuilt for this plot;
# it used to be taken from whatever `data` the previous cell left behind.
total = float(data.shape[0])

ax = sns.countplot(x=data.q5, data=data, hue=data.q3)
plt.title('Not Currently Employed - Man', fontsize=20)
for p in ax.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height()/total)
    # Anchor at the bar's horizontal centre so ha='center' puts the label over the bar.
    x = p.get_x() + p.get_width() / 2
    y = p.get_height()
    ax.annotate(percentage, (x, y),ha='center')
# The axis carries row counts; the percentages are only the bar annotations.
plt.ylabel("Count")
# plt.xticks(rotation=90)
plt.show()

#### inference:
From the above plot, we can see that most of the people (Not Employed) are from India (getting their hands-on experience with Kaggle to reach other professions).

## Woman

In [None]:
# filtering out by top 10 not employed areas
df[(df.q2=="Woman") & (df.q5 == "Currently not employed")].q3.value_counts()[:11]

In [None]:
# "Currently not employed" women per country (hand-picked country list); every
# bar is labelled with its share of the plotted rows.
sns.set(style="whitegrid")
sns.set_palette("flare")
plt.figure(figsize=(10,5))

_selected_professions = ["Currently not employed"]
_selected_gender = ["Woman"]
_selected_countries = ["India", "United States of America", "Egypt","Nigeria", "Russia", 
                       "Bangladesh", "Canada","Singapore","Kenya","Spain"]

data = df[ (df.q2.isin(_selected_gender)) & (df.q5.isin(_selected_professions)) & (df.q3.isin(_selected_countries)) ]

# Compute the denominator only after `data` has been rebuilt for this plot;
# it used to be taken from whatever `data` the previous cell left behind.
total = float(data.shape[0])

ax = sns.countplot(x=data.q5, data=data, hue=data.q3)
plt.title('Not Currently Employed - Woman', fontsize=20)
for p in ax.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height()/total)
    # Anchor at the bar's horizontal centre so ha='center' puts the label over the bar.
    x = p.get_x() + p.get_width() / 2
    y = p.get_height()
    ax.annotate(percentage, (x, y),ha='center')
# The axis carries row counts; the percentages are only the bar annotations.
plt.ylabel("Count")
# plt.xticks(rotation=90)
plt.show()

#### inference:
From the above plot, we can see that most of the people (Not Employed) are from India (getting their hands-on experience with Kaggle to reach other professions). 

The first 2 places of the Unemployed category (India, USA) are the same for both Man and Woman. From the third place onward they differ. 

## Major Occupation in each Country

In [None]:
df.groupby(df.q3).q5.value_counts().sort_values(ascending=False)

In [None]:
# For every country (q3) find the profession (q5) with the highest count.
# country_max maps country -> best count seen so far,
# country_prof maps country -> the profession holding that count.
country_max = {}
country_prof = {}
d = df.groupby(df.q3).q5.value_counts().sort_values(ascending=False)
for key, val in d.items():
    # key is a (country, profession) pair from the grouped value counts.
    _country = key[0]
    _profession = key[1]
    # Record the first entry for a country, or a strictly larger count.
    # (The update used to rebind country_max itself to a scalar, which would
    # break every later lookup; it now updates the per-country entry.)
    if _country not in country_max or val > country_max[_country]:
        country_max[_country] = val
        country_prof[_country] = _profession

In [None]:
country_prof.keys()

In [None]:
max_by_prof = pd.DataFrame(data={"Name":country_prof.keys(), "Max":country_prof.values()})
max_by_prof.head()

In [None]:
# Store each country's entry from country_prof on its GeoJSON feature as
# 'max_profession' (empty string when the country is not in country_prof).

for feature in countries_geojson["features"]:
    country_name = feature.get("properties", {}).get("name")
    feature['properties']['max_profession'] = str(country_prof.get(country_name, ''))

In [None]:
countries_geojson["features"][4]

In [None]:
max_by_prof.isnull().sum()

In [None]:
# Build a world map whose tooltip shows each country's 'max_profession' property.
df_map = folium.Map(
    location=[0,0],
    zoom_start=3
)

# Add several base-map layers; LayerControl below lets the viewer switch between them.
# NOTE(review): recent folium releases reportedly dropped the built-in Stamen
# tile names — confirm these still resolve in the target environment.
tiles = ['stamenwatercolor', 'cartodbpositron', 'openstreetmap', 'stamenterrain']
for tile in tiles:
    folium.TileLayer(tile).add_to(df_map)


# The fill colour still encodes country_df's 'Count' column; only the tooltip
# carries the profession.
choropleth = folium.Choropleth(
    geo_data=countries_geojson,
    key_on='feature.properties.name',
    data=country_df, 
    columns=['Name', 'Count'], 
    fill_color='OrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    highlight=True,
    legend_name="Kaggle Survey"
).add_to(df_map)

folium.LayerControl().add_to(df_map)

# Tooltip shows the 'max_profession' property stored on each feature.
choropleth.geojson.add_child(
    folium.features.GeoJsonTooltip(['max_profession'], labels=False,
                                   style=('background-color: grey; color: white;'))
)

# Last expression of the cell: renders the map in the notebook.
df_map

# 3.7.0 Q6 For how many years have you been writing code and/or programming?

- I have never written code
- < 1 years
- 1-2 years
- 3-5 years
- 5-10 years
- 10-20 years
- 20+ years

In [None]:
df.q6.value_counts()

In [None]:
df.q6.value_counts().sort_values(ascending=False).plot(kind='bar')
plt.xlabel("Coding Experience")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.legend(["Count"])

In [None]:
# Coding experience (q6) split by gender (Man/Woman), most frequent bucket
# first; every bar is labelled with its share of the plotted rows.
_gender_categories = ["Man", "Woman"]

filtered_data = df[df["q2"].isin(_gender_categories)]
total = float(filtered_data.shape[0])

sns.set(style="whitegrid")
plt.figure(figsize=(10,5))

ax = sns.countplot(x="q6", data=filtered_data, hue="q2", order=df.q6.value_counts().sort_values(ascending=False).index)
plt.title('Experience Distribution', fontsize=20)
for p in ax.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height()/total)
    # Anchor at the bar's horizontal centre so ha='center' puts the label over the bar.
    x = p.get_x() + p.get_width() / 2
    y = p.get_height()
    ax.annotate(percentage, (x, y),ha='center')
# The x axis shows q6 (years of coding), not qualifications.
plt.xlabel("Coding Experience")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.show()

Did you like it? Press the <b>Upvote</b> button. 