In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import json
import pycountry
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Introduction

This workbook contains a comprehensive analysis of the Kaggle Survey 2020 data, focusing on the behaviour of its participants. Each section holds a detailed overview and some key takeaways that can be used in the future to change ongoing procedures or to reach more people.

## Data Input

1. Main Dataset for this analysis is 'kaggle_survey_2020_responses.csv' which has been provided inside the Kaggle Competition.
2. To analyze further, additional dataset for country region has been used, which is a public dataset: 'https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv'

In [None]:
# Raw survey responses shipped with the Kaggle competition.
survey_csv_path = '/kaggle/input/kaggle-survey-2020/kaggle_survey_2020_responses.csv'
kaggle_survey = pd.read_csv(survey_csv_path)
kaggle_survey.head()

In [None]:

# Country-to-region lookup table used to enrich the survey later on.
country_csv_path = '/kaggle/input/country-regioncsv/country_region.csv'
country_data = pd.read_csv(country_csv_path)
country_data.head(5)

In [None]:
# Remove the columns at positions 1, 2 and 3 of the country table.
columns_1_to_3 = country_data.columns[1:4]
country_data_1 = country_data.drop(columns=columns_1_to_3)
country_data_1

In [None]:
# Remove the columns now sitting at positions 3, 4 and 5 of the trimmed table.
# Positions are relative to the already-trimmed frame, so this cell must run exactly once.
columns_3_to_5 = country_data_1.columns[3:6]
country_data_1 = country_data_1.drop(columns=columns_3_to_5)
country_data_1

# Data Cleaning

First and foremost, I will clean the survey data provided. There are around 355 columns, which I will reduce to 76 new columns by aggregating the answers provided. Too many columns can take the focus away from the analysis, and there is always a chance of missing some valuable information. The complete data cleaning process includes the following:

- Aggregate the per-option columns of each multiple-choice question into a single comma-separated string column, to reduce the number of columns while keeping all the information.
- If an aggregated column is empty, replace it with "None".
- Add the count of chosen options for each aggregation. E.g. one column will state "regular preferred IDEs" and another column will state "count of regular preferred IDEs".
- If the aggregated value is "None", set the count to 0.
- Rename the remaining columns that are not aggregated.
- If a remaining (non-aggregated) column is null, replace the null with "None".
- Add the additional columns needed.

In [None]:
# The first data row holds the full question text: use it as the column names,
# then remove that row from the data.
question_text_row = kaggle_survey.iloc[0]
first_row_label = kaggle_survey.index[0]
kaggle_survey = kaggle_survey.rename(columns=question_text_row).drop(first_row_label)
kaggle_survey.head()

In [None]:
# Store the survey duration as integers; it is divided numerically further down.
kaggle_survey["Duration (in seconds)"] = kaggle_survey["Duration (in seconds)"].astype(int)

In [None]:
# Select the object-dtype columns and strip trailing whitespace from every value in them.
kaggle_survey_obj = kaggle_survey.select_dtypes(['object'])
kaggle_survey[kaggle_survey_obj.columns] = kaggle_survey_obj.apply(lambda x: x.str.rstrip())

In [None]:
# Re-number the rows from 0 after the header row was dropped. The length is taken
# from the frame itself, so the cell keeps working if the row count is not 20036.
kaggle_survey.index = pd.RangeIndex(start=0, stop=len(kaggle_survey), step=1)

In [None]:
# Multi-select questions are stored as one column per option. Each entry names the
# aggregated output column and the positional slice (start inclusive, stop
# exclusive) of the option columns that feed it.
# NOTE(review): the slice bounds are carried over unchanged from the original
# cells. Confirm against kaggle_survey.columns that every slice starts on the
# question's first option, and that the final stop of 356 does not reach past the
# raw columns (the text above says there are around 355) into the first
# aggregated column.
MULTI_SELECT_SLICES = [
    ("regular_programming_language", 8, 20),
    ("regular_ide", 22, 33),
    ("regular_notebook", 34, 47),
    ("specialized_hardware", 49, 52),
    ("regular_data_visualization_library", 54, 65),
    ("regular_ml_framework", 67, 82),
    ("regular_ml_algorithm", 83, 94),
    ("regular_category_of_computer_vision_method", 95, 101),
    ("regular_nlp_method", 102, 107),
    ("important_activities_at_work", 111, 118),
    ("regular_cloud_computing_platforms", 121, 132),
    ("regular_cloud_computing_products", 133, 144),
    ("regular_ml_products", 145, 155),
    ("regular_big_data_products", 156, 173),
    ("regular_bi_tools", 175, 189),
    ("is_regular_automated_ml_tools", 191, 198),
    ("regular_automated_ml_tools", 199, 210),
    ("tools_to_manage_ml_experiments", 211, 221),
    ("publicly_share_da_or_ml_app", 222, 231),
    ("platform_begun_ds_course", 232, 243),
    ("favorite_media_for_ds_topic", 245, 256),
    ("computing_platform_to_learn_next_2_years", 257, 268),
    ("computing_product_to_learn_next_2_years", 269, 280),
    ("ml_product_to_learn_next_2_years", 281, 291),
    ("big_data_product_to_learn_next_2_years", 292, 309),
    ("bi_tools_to_learn_next_2_years", 310, 324),
    ("category_automated_ml_tools_to_learn_next_2_years", 325, 332),
    ("specific_automated_ml_tools_to_learn_next_2_years", 333, 344),
    ("mlexp_management_tool_to_learn_next_2_years", 345, 356),
]

# Aggregated as a joined string only, exactly as before: empty answers stay ""
# and no "_count" column is created for these.
JOIN_ONLY_COLUMNS = {"is_regular_automated_ml_tools"}


def aggregate_multi_select(frame, start, stop):
    """Collapse the option columns ``frame.iloc[:, start:stop]`` into one answer per row.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding one column per selectable option.
    start, stop : int
        Positional column slice (start inclusive, stop exclusive).

    Returns
    -------
    (joined, n_selected) : tuple of pandas.Series
        ``joined`` is the row's non-null option values, sorted and joined with
        ","; it is "" when nothing was selected. ``n_selected`` is the number of
        non-null option cells in the row.
    """
    options = frame.iloc[:, start:stop]
    joined = options.apply(lambda row: ','.join(sorted(row.dropna())), axis=1)
    n_selected = options.notna().sum(axis=1)
    return joined, n_selected


for column_name, start, stop in MULTI_SELECT_SLICES:
    joined, n_selected = aggregate_multi_select(kaggle_survey, start, stop)
    if column_name in JOIN_ONLY_COLUMNS:
        kaggle_survey[column_name] = joined
        continue

    # Rows without any selected option are labelled "None".
    joined = joined.replace("", "None")
    kaggle_survey[column_name] = joined
    # Count the selected cells directly instead of counting commas in the joined
    # text, which over-counts whenever an option label itself contains a comma.
    # A value of "None" (nothing selected, or only the "None" option) counts as 0.
    kaggle_survey[column_name + "_count"] = n_selected.where(joined != "None", 0)

In [None]:
# Map the full question text of each single-choice column to a short snake_case name.
# Keys must match the column headers exactly; rename() silently skips keys that do not match.
# The big-data-products key was previously split across two physical lines inside
# the string literal (a syntax error); it is written on one line here.
COLUMN_RENAMES = {
    "Duration (in seconds)": 'duration_seconds',
    "What is your age (# years)?": "age_group",
    "What is your gender? - Selected Choice": "gender",
    "In which country do you currently reside?": "residence",
    "What is the highest level of formal education that you have attained or plan to attain within the next 2 years?": "highest_formal_education",
    "Select the title most similar to your current role (or most recent title if retired): - Selected Choice": "most_recent_job_role",
    "For how many years have you been writing code and/or programming?": "years_of_writing_code",
    "What programming language would you recommend an aspiring data scientist to learn first? - Selected Choice": "programing_language_recommendation",
    "What type of computing platform do you use most often for your data science projects? - Selected Choice": "type_of_computing_platform_for_ds_projects",
    "Approximately how many times have you used a TPU (tensor processing unit)?": "times_of_tpu_usage",
    "For how many years have you used machine learning methods?": "years_of_using_ml_methods",
    "What is the size of the company where you are employed?": "size_of_company_employment",
    "Approximately how many individuals are responsible for data science workloads at your place of business?": "no_of_people_for_ds_workloads",
    "Does your current employer incorporate machine learning methods into their business?": "incorporation_of_ml_by_employer",
    "What is your current yearly compensation (approximate $USD)?": "current_yearly_compensation_usd",
    "Approximately how much money have you (or your team) spent on machine learning and/or cloud computing services at home (or at work) in the past 5 years (approximate $USD)?": "money_spent_at_home_for_ml_usd",
    "Which of the following big data products (relational database, data warehouse, data lake, or similar) do you use most often? - Selected Choice": "big_data_product_used_often",
    "Which of the following business intelligence tools do you use most often? - Selected Choice": "bi_tools_used_most_often",
    "What is the primary tool that you use at work or school to analyze data? (Include text response) - Selected Choice": "primary_tools_used_to_analyze",
}
kaggle_survey_1 = kaggle_survey.rename(columns=COLUMN_RENAMES)

In [None]:
# Unanswered single-choice questions are labelled "None" so they show up as
# their own category instead of being missing values.
single_choice_columns = [
    "highest_formal_education",
    "most_recent_job_role",
    "years_of_writing_code",
    "programing_language_recommendation",
    "type_of_computing_platform_for_ds_projects",
    "times_of_tpu_usage",
    "years_of_using_ml_methods",
    "size_of_company_employment",
    "no_of_people_for_ds_workloads",
    "incorporation_of_ml_by_employer",
    "current_yearly_compensation_usd",
    "money_spent_at_home_for_ml_usd",
    "big_data_product_used_often",
    "bi_tools_used_most_often",
    "primary_tools_used_to_analyze",
]
for answer_column in single_choice_columns:
    kaggle_survey_1[answer_column] = kaggle_survey_1[answer_column].fillna("None")

In [None]:
# NOTE(review): seconds / 60 yields minutes, although the column is called
# "duration_hours". The name is kept because the bucketing function below reads it.
kaggle_survey_1["duration_hours"] = (kaggle_survey_1["duration_seconds"] / 60).round(decimals=2)

In [None]:
def new(x):
    """Return the duration bucket label for one survey row.

    Parameters
    ----------
    x : mapping or pandas.Series
        Row exposing a numeric ``"duration_hours"`` entry.

    Returns
    -------
    str or None
        One of "0-5.99", "6-10.99", "11-15.99", "16-20.99", "21-25.99",
        "26-30.99", "31-35.99", "36-40.99" or ">40.99"; ``None`` when the
        value is negative or NaN.
    """
    duration = x["duration_hours"]

    # Negative values and NaN (for which every comparison is False) get no bucket.
    if not duration >= 0:
        return None

    # Inclusive upper edge of each bucket, in ascending order. Checking only the
    # upper edge leaves no gap between neighbouring buckets; the former
    # `> 6`, `> 11`, `> 16`, `> 21` lower checks dropped exact values such as
    # 6.0 or 11.0, and `> 0` dropped 0.
    upper_edges = [
        (5.99, "0-5.99"),
        (10.99, "6-10.99"),
        (15.99, "11-15.99"),
        (20.99, "16-20.99"),
        (25.99, "21-25.99"),
        (30.99, "26-30.99"),
        (35.99, "31-35.99"),
        (40.99, "36-40.99"),
    ]
    for upper, label in upper_edges:
        if duration <= upper:
            return label
    return ">40.99"
# Apply `new` to every row and store the resulting bucket label as "duration_hours_group".
kaggle_survey_1 = kaggle_survey_1.assign(duration_hours_group=kaggle_survey_1.apply(new, axis=1))

In [None]:
# Show the columns kept in the trimmed country table.
country_data_1.columns

In [None]:
# Attach the country table columns to each respondent by matching on 'residence'.
# pd.merge defaults to an inner join, so respondents whose 'residence' value has no
# match in country_data_1 are dropped from kaggle_survey_1.
# NOTE(review): confirm every survey country appears in the country table (or use how='left').
kaggle_survey_1 = pd.merge(kaggle_survey_1, country_data_1, on= 'residence')

In [None]:
# Preview the merged frame.
kaggle_survey_1.head()

# Data Analysis

## 1. General Behaviours

### i. Overview

From this sankey diagram a basic overview of the participants can be deduced. 

- The majority of the participants in this survey are male, followed by female. The smallest groups are respondents who are non-binary or did not disclose their gender.
- Among male and female participants, more than half are between 18 and 29 years old.
- A noticeable number of participants did not mention their job or their education in the survey.
- 60% of the participants are Students, Data Scientists, Software Engineers or hold other jobs, and have a Bachelor's degree, a Master's degree, or both.
- Again, a noticeable number of the people who took the survey are not currently employed.
- Participants who have Bachelor's and Master's degrees have around 0-5 years of programming experience.


In [None]:
def generateSankey(df,cat_cols=[],value_cols='',title='Sankey Diagram'):
    """Build a Plotly Sankey figure (as a plain dict) from a long-format frame.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per combination of categories, with a numeric weight column.
    cat_cols : list of str
        Ordered category columns. Each consecutive pair (col[i] -> col[i+1])
        contributes one layer of links. At least two columns are required.
    value_cols : str
        Name of the numeric column that is summed to obtain link widths.
    title : str
        Figure title.

    Returns
    -------
    dict
        ``{"data": [sankey_trace], "layout": layout}``, suitable for ``iplot``.

    Raises
    ------
    ValueError
        If fewer than two category columns are given (no link can be built).
    """
    colorPalette = ["#5858CA","#0C5246","#A11C5E","#B05C03","#9619DF","#C13636","#666666","#CE9B00","#7DB4C4", "#134F5C", "#A011E5"]
    if len(cat_cols) < 2:
        raise ValueError("generateSankey needs at least two columns in cat_cols")

    # One node per distinct label. Labels and colours are collected together so
    # they stay aligned even when the same label (e.g. "None") occurs in several
    # columns: such a label is a single node and keeps the colour of the first
    # column it appears in. `unique()` keeps first-appearance order, which makes
    # the node order reproducible between runs.
    nodeColor = {}
    for idx, catCol in enumerate(cat_cols):
        # Wrap around the palette instead of failing on more than 11 columns.
        levelColor = colorPalette[idx % len(colorPalette)]
        for label in df[catCol].unique():
            nodeColor.setdefault(label, levelColor)
    labelList = list(nodeColor)
    colorList = [nodeColor[label] for label in labelList]
    labelIndex = {label: pos for pos, label in enumerate(labelList)}

    # Stack the (start, end, count) triples of every adjacent column pair, then
    # aggregate once so duplicate start/end pairs collapse into a single link.
    linkFrames = []
    for startCol, endCol in zip(cat_cols[:-1], cat_cols[1:]):
        pairDf = df[[startCol, endCol, value_cols]].copy()
        pairDf.columns = ['start','end','count']
        linkFrames.append(pairDf)
    startendDf = pd.concat(linkFrames).groupby(['start','end']).agg({'count':'sum'}).reset_index()

    # Translate labels into node positions (dict lookup instead of list.index).
    startendDf['startID'] = startendDf['start'].map(labelIndex)
    startendDf['endID'] = startendDf['end'].map(labelIndex)

    data = dict(
        type='sankey',
        orientation = "h",
        valueformat = ".0f",
        node = dict(
          pad = 10,
          thickness = 10,
          line = dict(
            color = "black",
            width = 1
          ),
          label = labelList,
          color = colorList
        ),
        link = dict(
          source = startendDf['startID'],
          target = startendDf['endID'],
          value = startendDf['count']
        )
      )

    layout =  dict(
        title = title,
        height = 1200,
        width = 1000,
        font = dict(
          size = 10
        )
    )

    fig = dict(data=[data], layout=layout)
    return fig

In [None]:
# Respondent count for every observed combination of the five profile
# attributes, with display-friendly column names for the Sankey diagram.
sankey = (
    kaggle_survey_1
    .groupby(["age_group","gender","highest_formal_education","most_recent_job_role","years_of_writing_code"])
    .size()
    .reset_index(name="Count of Survey Taker")
    .rename(columns={
        "age_group": "Age Group",
        "gender": "Gender",
        "highest_formal_education": "Highest Education",
        "most_recent_job_role": "Most Recent Job Role",
        "years_of_writing_code": "Years of Writing Code",
    })
)

In [None]:
# Flow order of the diagram: gender -> age -> job role -> education -> coding years.
fig = generateSankey(
    sankey,
    cat_cols=["Gender","Age Group","Most Recent Job Role", "Highest Education","Years of Writing Code"],
    value_cols="Count of Survey Taker",
    title='General Behaviour of Survey Participators',
)
iplot(fig, validate=False)

### ii. Duration of Survey Completion


- Measured in hours, the survey took the majority of the participants between 0 and 10.99 hours to complete.
- However, the fourth-largest group (11%, highlighted in the pie chart) took more than 40.99 hours to complete the survey.
- Since male participation was the highest, around 70% of the respondents in every duration group are male.

In [None]:
# Share of respondents per duration bucket. `normalize=True` means the
# "Count of Survey Taker" column holds fractions, not raw counts.
hours_group = (
    kaggle_survey_1["duration_hours_group"]
    .value_counts(normalize=True)
    .rename_axis("Duration in Hours")
    .reset_index(name="Count of Survey Taker")
)
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(14, 10))
sns.barplot(data=hours_group, x="Duration in Hours", y="Count of Survey Taker", palette="Blues_d", ax=ax)
plt.show()


In [None]:
# Pie chart of the duration buckets; the fourth slice is pulled out.
# `explode` must have exactly one entry per slice of `hours_group`.
explode = (0,0,0,0.1,0,0,0,0,0)
colors = [
    '#001CF0', '#0071C6', '#2373E6', '#A00019', '#0D95E9',
    '#008DB8', '#00AAAA', '#00C69C', '#00E28E', '#00FF80',
]
fig, ax = plt.subplots(figsize=(10, 8))
ax.pie(
    hours_group["Count of Survey Taker"],
    explode=explode,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
ax.set_title('Percentage of Users for Each Duration in Hours')
ax.axis('equal')
plt.tight_layout()
ax.legend(labels=hours_group["Duration in Hours"], loc="best")
plt.show()

In [None]:
# Gender mix inside every duration bucket, drawn as 100%-stacked bars.
hr_gen_group = kaggle_survey_1.groupby(["duration_hours_group","gender"]).size().rename_axis(["Duration in Hours", "Gender"]).reset_index(name="Count of Survey Taker")
a = hr_gen_group.groupby("Duration in Hours")["Count of Survey Taker"].transform('sum')
# Rank the buckets by respondent count *before* normalising: afterwards every
# bucket sums to 1.0, so sorting on those totals would only rank rounding noise.
sort_list = hr_gen_group.groupby("Duration in Hours")["Count of Survey Taker"].sum().sort_values(ascending=False).index
hr_gen_group["Count of Survey Taker"] = hr_gen_group["Count of Survey Taker"].div(a)
colors = ["#C90909","#666666","#388B15","#F0940B",
          "#0405BB", ]
(hr_gen_group.pivot(index= "Gender", columns= "Duration in Hours", values= "Count of Survey Taker")
   .reindex(sort_list, axis='columns')
   .T
   .plot(kind='bar',stacked=True, figsize=(14,10),color=colors)
)

### iii. Age Group of Survey Participators

- As we have seen in the overview section, participation of 18 to 29 year olds was the highest in the 2020 survey.
- The age range goes up to 70+, although participation is lowest in that age group.
- From the pie chart it can be seen that the highlighted section is around 13% in total, which covers participants aged 45+. The survey reached the younger generation more than the older generation.
- Female participation is lower overall, but in terms of age, female participation was highest among 18 to 39 year olds and lowest among those aged 70+.

In [None]:
# Share of respondents per age group (fractions, because of normalize=True),
# drawn as horizontal bars.
age_group = (
    kaggle_survey_1["age_group"]
    .value_counts(normalize=True)
    .rename_axis("Age Group")
    .reset_index(name="Count of Survey Taker")
)
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(14, 10))
sns.barplot(data=age_group, x="Count of Survey Taker", y="Age Group", palette="Greens_d", ax=ax)
plt.show()

In [None]:
# Pie chart of the age groups; the five smallest slices (positions 7-11 of the
# frequency-sorted `age_group`) are pulled out by decreasing offsets.
# `explode` must have exactly one entry per slice.
explode = (0,0,0,0,0,0,0.25,0.2,0.15,0.1,0.05)
colors = [
    '#398717', '#479F21', '#3AAE6C', '#57C786', '#57C7A4', '#00AAAA',
    '#D28792', '#C1737F', '#B6626F', '#B84355', '#A00019',
]
fig, ax = plt.subplots(figsize=(10, 8))
ax.pie(
    age_group["Count of Survey Taker"],
    explode=explode,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
ax.set_title('Percentage of Users from Each Age Group')
ax.axis('equal')
plt.tight_layout()
ax.legend(labels=age_group["Age Group"], loc="best")
plt.show()

In [None]:
# Gender mix inside every age group, drawn as 100%-stacked horizontal bars.
age_gen_group = kaggle_survey_1.groupby(["age_group","gender"]).size().rename_axis(["Age Group", "Gender"]).reset_index(name="Count of Survey Taker")
a = age_gen_group.groupby("Age Group")["Count of Survey Taker"].transform('sum')
# Rank the age groups by respondent count *before* normalising: afterwards every
# group sums to 1.0, so sorting on those totals would only rank rounding noise.
# Ascending order puts the largest group at the top of a horizontal bar chart.
sort_list = age_gen_group.groupby("Age Group")["Count of Survey Taker"].sum().sort_values(ascending=True).index
age_gen_group["Count of Survey Taker"] = age_gen_group["Count of Survey Taker"].div(a)
colors = ["#0405BB","#666666","#388B15","#F0940B",
          "#C90909", ]
(age_gen_group.pivot(index= "Gender", columns= "Age Group", values= "Count of Survey Taker")
   .reindex(sort_list, axis='columns')
   .T
   .plot(kind='barh',stacked=True, figsize=(20,10),color=colors)
)



### iv. Residence of Participants and Region


- 50% of the participants in this survey come from the Asia and Americas regions.
- Among the top 10 countries, India provided the most responses, followed by the USA, Brazil and Japan.
- Among the last 10 countries, Ghana provided the fewest responses, followed by Ireland, Belarus and the UAE. These countries either had the least access to this survey, or data science, ML and AI are not yet familiar topics there.
- Across regions, participants in Oceania, the Americas and Asia tend to respond faster, within 0 to 5.99 hrs.
- Most responses to this survey came within the 6 to 10.99 hrs window after the survey release.
- Young people between 18 and 29 years old are more likely to respond from Asia and Africa, compared to 25 to 44 year olds from the rest of the regions.

In [None]:
# Share of respondents per region (fractions, because of normalize=True).
region = (
    kaggle_survey_1["region"]
    .value_counts(normalize=True)
    .rename_axis("Region")
    .reset_index(name="Count of Survey Taker")
)
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(20, 14))
sns.barplot(data=region, x="Region", y="Count of Survey Taker", palette="Reds_d", ax=ax)
plt.xticks(rotation=30, ha="right")
plt.show()

In [None]:
# World map of the respondent share per country of residence.
region = kaggle_survey_1["residence"].value_counts(normalize=True).rename_axis("Residence").reset_index(name="Count of Survey Taker")
list_countries = region['Residence'].unique().tolist()
d_country_code = {}  # survey country name -> ISO 3166-1 alpha-3 code
for country in list_countries:
    try:
        # Use a cell-local name here: rebinding `country_data` would clobber the
        # country/region DataFrame loaded at the top of the notebook.
        matches = pycountry.countries.search_fuzzy(country)
        d_country_code[country] = matches[0].alpha_3
    except LookupError:
        # search_fuzzy raises LookupError when nothing matches; keep a blank
        # code so the country simply stays uncoloured on the map.
        print('could not add ISO 3 code for ->', country)
        d_country_code[country] = ' '
# Every residence is a key of the dict, so one vectorised lookup fills the column.
region['iso_alpha'] = region['Residence'].map(d_country_code)
fig = px.choropleth(data_frame = region,
                    locations= "iso_alpha",
                    color= "Count of Survey Taker",
                    hover_name= "Residence",
                    color_continuous_scale='RdBu',
                   )

fig.show()



In [None]:
# The 20 most frequent countries of residence, as shares of all respondents.
residence = (
    kaggle_survey_1["residence"]
    .value_counts(normalize=True)
    .head(20)
    .rename_axis("Top Residence")
    .reset_index(name="Count of Survey Taker")
)
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(20, 14))
sns.barplot(data=residence, x="Top Residence", y="Count of Survey Taker", palette="Reds_d", ax=ax)
plt.xticks(rotation=30, ha="right")
plt.show()

In [None]:
# Every country of residence after the 20 most frequent ones, as shares of all
# respondents.
residence = (
    kaggle_survey_1["residence"]
    .value_counts(normalize=True)
    .iloc[20:]
    .rename_axis("Last Residence")
    .reset_index(name="Count of Survey Taker")
)
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(20, 14))
sns.barplot(data=residence, x="Last Residence", y="Count of Survey Taker", palette="Reds_d", ax=ax)
plt.xticks(rotation=30, ha="right")
plt.show()

In [None]:
# Duration-bucket mix inside every region, drawn as 100%-stacked bars.
reg_hrs_group = kaggle_survey_1.groupby(["region","duration_hours_group"]).size().rename_axis(["Region", "Duration in Hours"]).reset_index(name="Count of Survey Taker")
a = reg_hrs_group.groupby("Region")["Count of Survey Taker"].transform('sum')
# Rank the regions by respondent count *before* normalising: afterwards every
# region sums to 1.0, so sorting on those totals would only rank rounding noise.
sort_list = reg_hrs_group.groupby("Region")["Count of Survey Taker"].sum().sort_values(ascending=False).index
reg_hrs_group["Count of Survey Taker"] = reg_hrs_group["Count of Survey Taker"].div(a)
colors = ["#0C5246","#A11C5E","#B05C03","#5858CA",
          "#9619DF","#C13636","#666666","#CE9B00","#7DB4C4", ]
(reg_hrs_group.pivot(index= "Duration in Hours", columns= "Region", values= "Count of Survey Taker")
   .reindex(sort_list, axis='columns')
   .T
   .plot(kind='bar',stacked=True, figsize=(20,10),color=colors)
)

In [None]:
# Age-group mix inside every region, drawn as 100%-stacked bars.
reg_age_group = kaggle_survey_1.groupby(["region","age_group"]).size().rename_axis(["Region", "Age Group"]).reset_index(name="Count of Survey Taker")
a = reg_age_group.groupby("Region")["Count of Survey Taker"].transform('sum')
# Rank the regions by respondent count *before* normalising: afterwards every
# region sums to 1.0, so sorting on those totals would only rank rounding noise.
sort_list = reg_age_group.groupby("Region")["Count of Survey Taker"].sum().sort_values(ascending=False).index
reg_age_group["Count of Survey Taker"] = reg_age_group["Count of Survey Taker"].div(a)
colors = ["#0C5246","#A11C5E","#B05C03","#5858CA",
          "#9619DF","#C13636","#666666","#CE9B00","#7DB4C4", "#134F5C", "#A011E5"]
(reg_age_group.pivot(index= "Age Group", columns= "Region", values= "Count of Survey Taker")
   .reindex(sort_list, axis='columns')
   .T
   .plot(kind='bar',stacked=True, figsize=(20,10),color=colors)
)

### v. Gender

- The least represented genders in this study were women, non-binary people, and people who preferred not to state their gender or preferred to self-describe.
- Women's representation, relative to total participation per region, was higher in the Americas, Africa and Asia.
- There is scope to improve this study in future by reaching more women from Oceania and Europe.
- The same can be done for participants who are non-binary, who come only from the Oceania region.

In [None]:
# Pie chart of the gender shares; every slice except the largest is pulled out.
# `explode` must have exactly one entry per slice of `gender`.
gender = (
    kaggle_survey_1["gender"]
    .value_counts(normalize=True)
    .rename_axis("Gender")
    .reset_index(name="Count of Survey Taker")
)
explode = (0,0.25,0.2,0.15,0.1)
colors = ['#2373E6', '#C1737F', '#B6626F', '#B84355', '#A00019']
fig, ax = plt.subplots(figsize=(10, 8))
ax.pie(
    gender["Count of Survey Taker"],
    explode=explode,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
ax.set_title('Percentage of Users for Each Gender')
ax.axis('equal')
plt.tight_layout()
ax.legend(labels=gender["Gender"], loc="best")
plt.show()

In [None]:
# Gender mix inside every region, drawn as 100%-stacked bars.
reg_gen_group = kaggle_survey_1.groupby(["region","gender"]).size().rename_axis(["Region", "Gender"]).reset_index(name="Count of Survey Taker")
a = reg_gen_group.groupby("Region")["Count of Survey Taker"].transform('sum')
# Rank the regions by respondent count *before* normalising: afterwards every
# region sums to 1.0, so sorting on those totals would only rank rounding noise.
sort_list = reg_gen_group.groupby("Region")["Count of Survey Taker"].sum().sort_values(ascending=False).index
reg_gen_group["Count of Survey Taker"] = reg_gen_group["Count of Survey Taker"].div(a)
colors = ["#0450BB","#C90909","#388B15","#F0940B",
          "#666666", ]
(reg_gen_group.pivot(index= "Gender", columns= "Region", values= "Count of Survey Taker")
   .reindex(sort_list, axis='columns')
   .T
   .plot(kind='bar',stacked=True, figsize=(20,10),color=colors)
)




### vi. Education

- Master's degree and Bachelor's degree holders make up most of the participants.
- However, we also see a small number of participants who have not received any formal education past high school.
- In every education category offered in the survey, there were more male than female respondents. However, the female-to-male ratio was slightly higher among holders of a Master's or Doctoral degree. At the same time, some female participants chose not to answer.
- Asia and Africa had the most respondents with a Bachelor's degree background. On the other hand, Oceania, Europe and the Americas had the most respondents with a Master's degree background.
- The results suggest either that the survey reached more Bachelor's degree holders in Asia and Africa, or that participants in these regions tend to delay a Master's degree or not pursue one at all, compared to participants from Oceania, the Americas and Europe.
- From the age groups it can be seen that Bachelor's degree holders are mostly 18 to 29 years old, while a large part of the Master's and Doctoral degree holders are 25 to 39 years old.

In [None]:
# Share of respondents per highest formal education level. Note that this
# rebinds `hours_group`, which earlier cells use for the duration buckets.
hours_group = (
    kaggle_survey_1["highest_formal_education"]
    .value_counts(normalize=True)
    .rename_axis("Highest Formal Education")
    .reset_index(name="Count of Survey Taker")
)
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(14, 10))
sns.barplot(data=hours_group, x="Highest Formal Education", y="Count of Survey Taker", palette="Greens_d", ax=ax)
plt.xticks(rotation=30, ha="right")
plt.show()

In [None]:
# Gender mix inside every education level, drawn as 100%-stacked horizontal bars.
edu_gen_group = kaggle_survey_1.groupby(["highest_formal_education","gender"]).size().rename_axis(["Highest Formal Education", "Gender"]).reset_index(name="Count of Survey Taker")
a = edu_gen_group.groupby("Highest Formal Education")["Count of Survey Taker"].transform('sum')
# Rank the education levels by respondent count *before* normalising: afterwards
# every level sums to 1.0, so sorting on those totals would only rank rounding
# noise. Ascending order puts the largest level at the top of a horizontal chart.
sort_list = edu_gen_group.groupby("Highest Formal Education")["Count of Survey Taker"].sum().sort_values(ascending=True).index
edu_gen_group["Count of Survey Taker"] = edu_gen_group["Count of Survey Taker"].div(a)
colors = ["#0450BB","#C90909","#388B15","#F0940B",
          "#666666", ]
(edu_gen_group.pivot(index= "Gender", columns= "Highest Formal Education", values= "Count of Survey Taker")
   .reindex(sort_list, axis='columns')
   .T
   .plot(kind='barh',stacked=True, figsize=(20,10),color=colors)
)

In [None]:
# Education-level mix inside every region, drawn as 100%-stacked bars.
reg_edu_group = kaggle_survey_1.groupby(["region","highest_formal_education"]).size().rename_axis(["Region", "Highest Formal Education"]).reset_index(name="Count of Survey Taker")
a = reg_edu_group.groupby("Region")["Count of Survey Taker"].transform('sum')
# Rank the regions by respondent count *before* normalising: afterwards every
# region sums to 1.0, so sorting on those totals would only rank rounding noise.
sort_list = reg_edu_group.groupby("Region")["Count of Survey Taker"].sum().sort_values(ascending=False).index
reg_edu_group["Count of Survey Taker"] = reg_edu_group["Count of Survey Taker"].div(a)
colors = ["#5858CA","#0C5246","#A11C5E","#B05C03",
          "#9619DF","#C13636","#666666","#CE9B00","#7DB4C4", "#134F5C", "#A011E5"]
(reg_edu_group.pivot(index= "Highest Formal Education", columns= "Region", values= "Count of Survey Taker")
   .reindex(sort_list, axis='columns')
   .T
   .plot(kind='bar',stacked=True, figsize=(20,10),color=colors)
)

In [None]:
# Age-group mix inside every education level, drawn as 100%-stacked bars.
edu_age_group = kaggle_survey_1.groupby(["highest_formal_education","age_group"]).size().rename_axis(["Highest Formal Education", "Age Group"]).reset_index(name="Count of Survey Taker")
a = edu_age_group.groupby("Highest Formal Education")["Count of Survey Taker"].transform('sum')
# Rank the education levels by respondent count *before* normalising: afterwards
# every level sums to 1.0, so sorting on those totals would only rank rounding noise.
sort_list = edu_age_group.groupby("Highest Formal Education")["Count of Survey Taker"].sum().sort_values(ascending=False).index
edu_age_group["Count of Survey Taker"] = edu_age_group["Count of Survey Taker"].div(a)
colors = ["#0C5246","#A11C5E","#B05C03","#5858CA",
          "#9619DF","#C13636","#666666","#CE9B00","#7DB4C4", "#134F5C", "#A011E5"]
(edu_age_group.pivot(index= "Age Group", columns= "Highest Formal Education", values= "Count of Survey Taker")
   .reindex(sort_list, axis='columns')
   .T
   .plot(kind='bar',stacked=True, figsize=(20,10),color=colors)
)

### vii. Job

- 42.4% of the participants, coming from the Student, Other, None and Project Manager categories, are aspiring Data Scientists/Engineers/Administrators.
- A notable share of respondents are currently unemployed; they are mostly 22 to 39 years old.
- Europe, Oceania and the Americas have a higher share of Data Scientist, Data Engineer and Data Analyst respondents in this survey, whereas Asia and Africa have more aspiring participants who want to work in the data field full time.
- In terms of gender, female data analysts and statisticians responded more than women in other roles.

In [None]:
# Pie chart of the most recent job roles (shares of all respondents).
job = kaggle_survey_1["most_recent_job_role"].value_counts(normalize=True).rename_axis("Most Recent Job Role").reset_index(name="Count of Survey Taker")
# One entry per slice of `job`; non-zero entries pull the highlighted roles out.
explode = (0.05,0,0,0.1,0,0,0,0,0,0.15,0.2,0,0.05,0.1)
colors = ['#A00019','#398717','#479F21','#B84355','#3AAE6C', '#57C786', '#57C7A4', '#00AAAA', '#3BBFBF','#D28792',
          '#C1737F','#66C3C3','#3CCACA','#67D8D8']
fig, ax = plt.subplots()
fig.set_size_inches(10, 8)
plt.pie(job["Count of Survey Taker"],explode=explode,colors=colors,autopct='%1.1f%%',shadow=True, startangle=90)
# Title corrected from "Job Tile".
plt.title('Percentage of Users for Each Job Title')
plt.axis('equal')
plt.tight_layout()
plt.legend(labels= job["Most Recent Job Role"], loc="best")
plt.show()

In [None]:
# Age-group mix inside every job role, drawn as 100%-stacked bars.
job_age_group = kaggle_survey_1.groupby(["most_recent_job_role","age_group"]).size().rename_axis(["Most Recent Job Role", "Age Group"]).reset_index(name="Count of Survey Taker")
a = job_age_group.groupby("Most Recent Job Role")["Count of Survey Taker"].transform('sum')
# Rank the job roles by respondent count *before* normalising: afterwards every
# role sums to 1.0, so sorting on those totals would only rank rounding noise.
sort_list = job_age_group.groupby("Most Recent Job Role")["Count of Survey Taker"].sum().sort_values(ascending=False).index
job_age_group["Count of Survey Taker"] = job_age_group["Count of Survey Taker"].div(a)
colors = ["#0C5246","#A11C5E","#B05C03","#5858CA",
          "#9619DF","#C13636","#666666","#CE9B00","#7DB4C4", "#134F5C", "#A011E5"]
(job_age_group.pivot(index= "Age Group", columns= "Most Recent Job Role", values= "Count of Survey Taker")
   .reindex(sort_list, axis='columns')
   .T
   .plot(kind='bar',stacked=True, figsize=(20,10),color=colors)
)

In [None]:
# Job-role mix inside every region, drawn as 100%-stacked bars.
reg_job_group = kaggle_survey_1.groupby(["region","most_recent_job_role"]).size().rename_axis(["Region", "Most Recent Job Role"]).reset_index(name="Count of Survey Taker")
a = reg_job_group.groupby("Region")["Count of Survey Taker"].transform('sum')
# Rank the regions by respondent count *before* normalising: afterwards every
# region sums to 1.0, so sorting on those totals would only rank rounding noise.
sort_list = reg_job_group.groupby("Region")["Count of Survey Taker"].sum().sort_values(ascending=False).index
reg_job_group["Count of Survey Taker"] = reg_job_group["Count of Survey Taker"].div(a)
colors = ["#5858CA","#0C5246","#A11C5E","#B05C03",
          "#9619DF","#C13636","#666666","#CE9B00","#7DB4C4", "#134F5C", "#A011E5"]
(reg_job_group.pivot(index= "Most Recent Job Role", columns= "Region", values= "Count of Survey Taker")
   .reindex(sort_list, axis='columns')
   .T
   .plot(kind='bar',stacked=True, figsize=(20,10),color=colors)
)

In [None]:
# Gender mix inside every job role, drawn as 100%-stacked horizontal bars.
job_gen_group = kaggle_survey_1.groupby(["most_recent_job_role","gender"]).size().rename_axis(["Most Recent Job Role", "Gender"]).reset_index(name="Count of Survey Taker")
a = job_gen_group.groupby("Most Recent Job Role")["Count of Survey Taker"].transform('sum')
# Rank the job roles by respondent count *before* normalising: afterwards every
# role sums to 1.0, so sorting on those totals would only rank rounding noise.
sort_list = job_gen_group.groupby("Most Recent Job Role")["Count of Survey Taker"].sum().sort_values(ascending=False).index
job_gen_group["Count of Survey Taker"] = job_gen_group["Count of Survey Taker"].div(a)
colors = ["#C90909","#666666","#388B15","#F0940B",
          "#0405BB", ]
(job_gen_group.pivot(index= "Gender", columns= "Most Recent Job Role", values= "Count of Survey Taker")
   .reindex(sort_list, axis='columns')
   .T
   .plot(kind='barh',stacked=True, figsize=(14,10),color=colors)
)

### viii. Programming

- Some student participants have 1 to 5 years of programming experience according to the survey.
- Most data scientists have between 5 and 20 years of programming experience.
- Respondents with the most programming experience, 10 to 20+ years, are mostly Research Scientists and Software Engineers.
- In terms of gender, female participation is noticeably higher in the group with no programming experience. This may reflect the audience of this survey, or it may mean that coding education needs to be increased for women in and out of the tech sector.

In [None]:
# Job-role mix inside every coding-experience bracket, drawn as 100%-stacked bars.
prog_job_group = kaggle_survey_1.groupby(["years_of_writing_code","most_recent_job_role"]).size().rename_axis(["Years of Writing Code", "Most Recent Job Role"]).reset_index(name="Count of Survey Taker")
a = prog_job_group.groupby("Years of Writing Code")["Count of Survey Taker"].transform('sum')
# Rank the brackets by respondent count *before* normalising: afterwards every
# bracket sums to 1.0, so sorting on those totals would only rank rounding noise.
sort_list = prog_job_group.groupby("Years of Writing Code")["Count of Survey Taker"].sum().sort_values(ascending=False).index
prog_job_group["Count of Survey Taker"] = prog_job_group["Count of Survey Taker"].div(a)
colors = ["#5858CA","#0C5246","#A11C5E","#B05C03",
          "#9619DF","#C13636","#666666","#CE9B00","#7DB4C4", "#134F5C", "#A011E5"]
(prog_job_group.pivot(index= "Most Recent Job Role", columns= "Years of Writing Code", values= "Count of Survey Taker")
   .reindex(sort_list, axis='columns')
   .T
   .plot(kind='bar',stacked=True, figsize=(20,10),color=colors)
)

In [None]:
# Gender mix inside every coding-experience bracket, drawn as 100%-stacked bars.
prog_gen_group = kaggle_survey_1.groupby(["years_of_writing_code","gender"]).size().rename_axis(["Years of Writing Code", "Gender"]).reset_index(name="Count of Survey Taker")
a = prog_gen_group.groupby("Years of Writing Code")["Count of Survey Taker"].transform('sum')
# Rank the brackets by respondent count *before* normalising: afterwards every
# bracket sums to 1.0, so sorting on those totals would only rank rounding noise.
sort_list = prog_gen_group.groupby("Years of Writing Code")["Count of Survey Taker"].sum().sort_values(ascending=False).index
prog_gen_group["Count of Survey Taker"] = prog_gen_group["Count of Survey Taker"].div(a)
colors = ["#5858CA","#0C5246","#A11C5E","#B05C03",
          "#9619DF","#C13636","#666666","#CE9B00","#7DB4C4", "#134F5C", "#A011E5"]
(prog_gen_group.pivot(index= "Gender", columns= "Years of Writing Code", values= "Count of Survey Taker")
   .reindex(sort_list, axis='columns')
   .T
   .plot(kind='bar',stacked=True, figsize=(20,10),color=colors)
)

## 2. Survey Q&A Responses

### i. Regular Programming Language
- The top regularly used programming language is SQL
- Most of the participants work with up to 3 programming languages regularly

In [None]:
# The 20 most frequent values of "regular_programming_language", as shares of
# all respondents.
regular_prog = (
    kaggle_survey_1["regular_programming_language"]
    .value_counts(normalize=True)
    .head(20)
    .rename_axis("Top Regular Programming Language")
    .reset_index(name="Count of Survey Taker")
)
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(14, 10))
sns.barplot(data=regular_prog, x="Top Regular Programming Language", y="Count of Survey Taker", palette="Oranges_d", ax=ax)
plt.xticks(rotation=30, ha="right")
plt.show()

In [None]:
# Distribution of the "regular_programming_language_count" column.
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(20, 14))
sns.distplot(
    kaggle_survey_1["regular_programming_language_count"],
    bins=20,
    color="Purple",
    ax=ax,
)


### ii. Programming Language Recommendation
- Most of the participants recommended Python, R and SQL as the top 3 programming languages

In [None]:
# Share of respondents per recommended programming language, as horizontal bars.
prog_recom = (
    kaggle_survey_1["programing_language_recommendation"]
    .value_counts(normalize=True)
    .rename_axis("Programming Language Recommendation")
    .reset_index(name="Count of Survey Taker")
)
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(14, 10))
sns.barplot(data=prog_recom, x="Count of Survey Taker", y="Programming Language Recommendation", palette="RdBu", ax=ax)
plt.xticks(rotation=30, ha="right")
plt.show()

### iii. Regular IDE
- The top regularly used IDE is VS Code
- Most of the participants work with up to 2 IDEs regularly

In [None]:
# The 20 most frequent values of "regular_ide", as shares of all respondents.
regular_ide = (
    kaggle_survey_1["regular_ide"]
    .value_counts(normalize=True)
    .head(20)
    .rename_axis("Top Regular Used IDE")
    .reset_index(name="Count of Survey Taker")
)
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(20, 20))
sns.barplot(data=regular_ide, x="Top Regular Used IDE", y="Count of Survey Taker", palette="Blues_d", ax=ax)
plt.xticks(rotation=30, ha="right")
plt.show()

In [None]:
# Distribution of the "regular_ide_count" column.
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(20, 14))
sns.distplot(
    kaggle_survey_1["regular_ide_count"],
    bins=20,
    color="lightcoral",
    ax=ax,
)

### iv. Regular Used Notebook
- The top regularly used notebook is Colab Notebooks
- Most of the participants work with up to 1 notebook regularly

In [None]:
# The 20 most frequent values of "regular_notebook", as shares of all respondents.
regular_notebook = (
    kaggle_survey_1["regular_notebook"]
    .value_counts(normalize=True)
    .head(20)
    .rename_axis("Top Regular Used Notebook")
    .reset_index(name="Count of Survey Taker")
)
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(20, 20))
sns.barplot(data=regular_notebook, x="Top Regular Used Notebook", y="Count of Survey Taker", palette="Purples_d", ax=ax)
plt.xticks(rotation=30, ha="right")
plt.show()

In [None]:
# Distribution of the "regular_notebook_count" column.
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(20, 14))
sns.distplot(
    kaggle_survey_1["regular_notebook_count"],
    bins=20,
    color="#B66F49",
    ax=ax,
)

### v. Type of Computing Platform for DS Projects
- Most of the participants like to work with Computing Platform "Personal Laptop"

In [None]:
# Share of respondents per computing-platform type, as horizontal bars.
type_comp_plat = (
    kaggle_survey_1["type_of_computing_platform_for_ds_projects"]
    .value_counts(normalize=True)
    .rename_axis("Type of Computing Platform for DS Projects")
    .reset_index(name="Count of Survey Taker")
)
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(14, 14))
sns.barplot(data=type_comp_plat, x="Count of Survey Taker", y="Type of Computing Platform for DS Projects", palette="GnBu", ax=ax)
plt.xticks(rotation=30, ha="right")
plt.show()

### vi. Specialized Hardware
- Top specialized hardware is TPUs

In [None]:
# Share of respondents per value of "specialized_hardware".
spec_hardw = (
    kaggle_survey_1["specialized_hardware"]
    .value_counts(normalize=True)
    .rename_axis("Top Regular Specialized Hardware")
    .reset_index(name="Count of Survey Taker")
)
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(20, 20))
sns.barplot(data=spec_hardw, x="Top Regular Specialized Hardware", y="Count of Survey Taker", palette="PiYG", ax=ax)
plt.xticks(rotation=30, ha="right")
plt.show()

### vii. Times of TPU Usage
- Most of the participants do not like to work with TPUs.

In [None]:
# Share of respondents per TPU-usage bracket, as horizontal bars.
tpu_usage = (
    kaggle_survey_1["times_of_tpu_usage"]
    .value_counts(normalize=True)
    .rename_axis("Times of TPU Usage")
    .reset_index(name="Count of Survey Taker")
)
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(14, 14))
sns.barplot(data=tpu_usage, x="Count of Survey Taker", y="Times of TPU Usage", palette="RdGy", ax=ax)
plt.xticks(rotation=30, ha="right")
plt.show()

### viii. Regular Data Visualization Library
- Top Data Viz Library is Seaborn.
- Most of the participants like to work with 2 visualization libraries.

In [None]:
# The 20 most frequent values of "regular_data_visualization_library", as shares
# of all respondents.
reg_data_vis_lib = (
    kaggle_survey_1["regular_data_visualization_library"]
    .value_counts(normalize=True)
    .head(20)
    .rename_axis("Top Regular Data Visualization Library")
    .reset_index(name="Count of Survey Taker")
)
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(20, 20))
sns.barplot(data=reg_data_vis_lib, x="Top Regular Data Visualization Library", y="Count of Survey Taker", palette="BrBG", ax=ax)
plt.xticks(rotation=30, ha="right")
plt.show()

In [None]:
# Distribution of the "regular_data_visualization_library_count" column.
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(20, 14))
sns.distplot(
    kaggle_survey_1["regular_data_visualization_library_count"],
    bins=20,
    color="#764A19",
    ax=ax,
)

### ix. Years of Using ML Methods
- Most of the participants have used ML methods for under 1 year

In [None]:
# Share of respondents per bracket of years using ML methods, as horizontal bars.
yr_ml_lr = (
    kaggle_survey_1["years_of_using_ml_methods"]
    .value_counts(normalize=True)
    .rename_axis("Years of Using Machine Learning")
    .reset_index(name="Count of Survey Taker")
)
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(14, 14))
sns.barplot(data=yr_ml_lr, x="Count of Survey Taker", y="Years of Using Machine Learning", palette="RdBu", ax=ax)
plt.xticks(rotation=30, ha="right")
plt.show()

### x. Regular Used ML Framework
- The top used ML framework is TensorFlow or Keras.
- Most of the participants like to work with 2 ML frameworks.

In [None]:
# The 20 most frequent values of "regular_ml_framework", as shares of all
# respondents.
reg_ml_fw = (
    kaggle_survey_1["regular_ml_framework"]
    .value_counts(normalize=True)
    .head(20)
    .rename_axis("Top Regular Used ML Framework")
    .reset_index(name="Count of Survey Taker")
)
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(20, 20))
sns.barplot(data=reg_ml_fw, x="Top Regular Used ML Framework", y="Count of Survey Taker", palette="PRGn", ax=ax)
plt.xticks(rotation=30, ha="right")
plt.show()

In [None]:
# Distribution of the "regular_ml_framework_count" column.
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(20, 14))
sns.distplot(
    kaggle_survey_1["regular_ml_framework_count"],
    bins=20,
    color="#674EA7",
    ax=ax,
)

### xi. Regular Used ML Algorithm
- The top used ML algorithm is Decision Tree or Random Forest.
- Most of the participants like to work with 2 ML algorithms.

In [None]:
# The 20 most frequent values of "regular_ml_algorithm", as shares of all
# respondents.
reg_ml_al = (
    kaggle_survey_1["regular_ml_algorithm"]
    .value_counts(normalize=True)
    .head(20)
    .rename_axis("Top Regular Used ML Algorithm")
    .reset_index(name="Count of Survey Taker")
)
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(20, 20))
sns.barplot(data=reg_ml_al, x="Top Regular Used ML Algorithm", y="Count of Survey Taker", palette="RdYlBu", ax=ax)
plt.xticks(rotation=30, ha="right")
plt.show()

In [None]:
# Distribution of the "regular_ml_algorithm_count" column.
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(20, 14))
sns.distplot(
    kaggle_survey_1["regular_ml_algorithm_count"],
    bins=20,
    color="#CC0000",
    ax=ax,
)

### xii. Size of Company Employment
- Most of the participants work at a company with 0-49 employees

In [None]:
# Share of respondents per company-size bracket, as horizontal bars.
siz_emp_com = (
    kaggle_survey_1["size_of_company_employment"]
    .value_counts(normalize=True)
    .rename_axis("Employee Number of the Company")
    .reset_index(name="Count of Survey Taker")
)
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(14, 14))
sns.barplot(data=siz_emp_com, x="Count of Survey Taker", y="Employee Number of the Company", palette="PuOr", ax=ax)
plt.xticks(rotation=30, ha="right")
plt.show()

### xiii. Employee Number for DS Workload
- Most of the participants share their workload with 1 to 2 people

In [None]:
# Share of respondents per bracket of people handling DS workloads, as
# horizontal bars.
workl_emp_com = (
    kaggle_survey_1["no_of_people_for_ds_workloads"]
    .value_counts(normalize=True)
    .rename_axis("Employee Number for DS Workload")
    .reset_index(name="Count of Survey Taker")
)
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(14, 14))
sns.barplot(data=workl_emp_com, x="Count of Survey Taker", y="Employee Number for DS Workload", palette="Set2", ax=ax)
plt.xticks(rotation=30, ha="right")
plt.show()

### xiv. ML Used by the Company

In [None]:
# Treemap of how employers incorporate ML; tile size and colour both encode the
# share of respondents.
ml_by_com = (
    kaggle_survey_1["incorporation_of_ml_by_employer"]
    .value_counts(normalize=True)
    .rename_axis("ML Used By the Company")
    .reset_index(name="Count of Survey Taker")
)

fig = px.treemap(
    ml_by_com,
    path=["ML Used By the Company"],
    values="Count of Survey Taker",
    color="Count of Survey Taker",
    color_continuous_scale='RdBu',
)
fig.update_layout(margin={"t": 5, "b": 5, "r": 1, "l": 1})
fig.show()

### xv. Important Activities at Work

In [None]:
# Share of respondents per "important activities at work" answer
# (normalize=True -> fractions).
# The frame gets its own name: the original cell rebound `ml_by_com`, which
# silently replaced the ML-adoption frame built in the previous section.
imp_act_work = (
    kaggle_survey_1["important_activities_at_work"]
    .value_counts(normalize=True)
    .rename_axis("Important Activities at the Work")
    .reset_index(name="Count of Survey Taker")
)

# One treemap tile per answer; tile area and colour both encode the share.
fig = px.treemap(
    imp_act_work,
    path=["Important Activities at the Work"],
    values="Count of Survey Taker",
    color="Count of Survey Taker",
    color_continuous_scale="PRGn",
)
fig.update_layout(margin=dict(t=5, b=5, r=1, l=1))
fig.show()

### xvi. Current Yearly Compensation USD
- Most of the participants get 0 to 999 USD as annual compensation

In [None]:
# Share of respondents per yearly-compensation bucket
# (normalize=True -> fractions).
curr_year_compen_usd = (
    kaggle_survey_1["current_yearly_compensation_usd"]
    .value_counts(normalize=True)
    .rename_axis("Current Yearly Compensation USD")
    .reset_index(name="Count of Survey Taker")
)

sns.set_style("ticks")
fig, ax = plt.subplots(figsize=(16, 20))
# Horizontal bars: buckets on y, shares on x.
sns.barplot(
    data=curr_year_compen_usd,
    x="Count of Survey Taker",
    y="Current Yearly Compensation USD",
    palette="RdYlBu",
    ax=ax,
)
plt.xticks(rotation=30, horizontalalignment="right")
plt.show()

### xvii. Money Spent at Home for ML USD
- Most of the participants do not spend money for ML

In [None]:
# Share of respondents per spending bucket (normalize=True -> fractions).
money_spent_ml_usd = (
    kaggle_survey_1["money_spent_at_home_for_ml_usd"]
    .value_counts(normalize=True)
    .rename_axis("Money Spent At Home for ML USD")
    .reset_index(name="Count of Survey Taker")
)

sns.set_style("ticks")
fig, ax = plt.subplots(figsize=(14, 14))
sns.barplot(
    data=money_spent_ml_usd,
    x="Money Spent At Home for ML USD",
    y="Count of Survey Taker",
    palette="BrBG",
    ax=ax,
)
# Bucket labels sit on the x axis; rotate them so they do not overlap.
plt.xticks(rotation=30, horizontalalignment="right")
plt.show()


### xviii. Regular Cloud Computing Platforms

In [None]:
# Share of respondents per cloud-platform answer (normalize=True -> fractions).
reg_cl_comp_ = (
    kaggle_survey_1["regular_cloud_computing_platforms"]
    .value_counts(normalize=True)
    .rename_axis("Regular Cloud Computing Platform")
    .reset_index(name="Count of Survey Taker")
)

# One treemap tile per answer; tile area and colour both encode the share.
fig = px.treemap(
    reg_cl_comp_,
    path=["Regular Cloud Computing Platform"],
    values="Count of Survey Taker",
    color="Count of Survey Taker",
    color_continuous_scale="RdGy",
)
fig.update_layout(margin={"t": 5, "b": 5, "r": 1, "l": 1})
fig.show()

### xix. Regular Cloud Computing Products
- Most of the participants like to use Google Compute Engine followed by AWS Lambda

In [None]:
# Share of respondents for the 20 most frequent cloud-product answers
# (normalize=True -> fractions).
reg_cl_comp_prd = (
    kaggle_survey_1["regular_cloud_computing_products"]
    .value_counts(normalize=True)
    .head(20)
    .rename_axis("Top Regular Cloud Computing Platform")
    .reset_index(name="Count of Survey Taker")
)

sns.set_style("ticks")
fig, ax = plt.subplots(figsize=(20, 20))
# Horizontal bars: answers on y, shares on x.
sns.barplot(
    data=reg_cl_comp_prd,
    x="Count of Survey Taker",
    y="Top Regular Cloud Computing Platform",
    palette="PRGn",
    ax=ax,
)
plt.xticks(rotation=30, horizontalalignment="right")
plt.show()

### xx. Regular ML Products
- Top used ML product is Google Cloud ML

In [None]:
# Share of respondents for the 20 most frequent ML-product answers
# (normalize=True -> fractions).
reg_ml_prd = (
    kaggle_survey_1["regular_ml_products"]
    .value_counts(normalize=True)
    .head(20)
    .rename_axis("Top Regular ML Products")
    .reset_index(name="Count of Survey Taker")
)

sns.set_style("ticks")
fig, ax = plt.subplots(figsize=(20, 20))
# Horizontal bars: answers on y, shares on x.
sns.barplot(
    data=reg_ml_prd,
    x="Count of Survey Taker",
    y="Top Regular ML Products",
    palette="Spectral",
    ax=ax,
)
plt.xticks(rotation=30, horizontalalignment="right")
plt.show()

### xxi. Regular Big Data Products
- Top used Big Data Products is Microsoft SQL Server
- Most of the participants like to work with 2 big data products

In [None]:
# Share of respondents for the 20 most frequent big-data-product answers
# (normalize=True -> fractions).
reg_big_data_prd = (
    kaggle_survey_1["regular_big_data_products"]
    .value_counts(normalize=True)
    .head(20)
    .rename_axis("Top Regular Big Data Products")
    .reset_index(name="Count of Survey Taker")
)

sns.set_style("ticks")
fig, ax = plt.subplots(figsize=(20, 20))
# Horizontal bars: answers on y, shares on x.
sns.barplot(
    data=reg_big_data_prd,
    x="Count of Survey Taker",
    y="Top Regular Big Data Products",
    palette="Greens_d",
    ax=ax,
)
plt.xticks(rotation=30, horizontalalignment="right")
plt.show()

In [None]:
# Bar per distinct value of "regular_big_data_products_count", showing how
# many rows carry that value.
sns.set_style("ticks")
fig, ax = plt.subplots(figsize=(20, 14))
# Pass the column as x=: recent seaborn releases take `data` as the first
# positional parameter of countplot, so a bare positional Series is no longer
# interpreted as the x variable. The keyword form works on old and new versions.
sns.countplot(
    x=kaggle_survey_1["regular_big_data_products_count"],
    color="#717C18",
    ax=ax,
)

### xxii. Big Data Products Used Often
- Most of the participants like to work with MySQL for big data


In [None]:
# Share of respondents per most-often-used big data product
# (normalize=True -> fractions).
prod_of = (
    kaggle_survey_1["big_data_product_used_often"]
    .value_counts(normalize=True)
    .rename_axis("Big Data Product Used Often")
    .reset_index(name="Count of Survey Taker")
)

sns.set_style("ticks")
fig, ax = plt.subplots(figsize=(20, 20))
# Horizontal bars: products on y, shares on x.
sns.barplot(
    data=prod_of,
    x="Count of Survey Taker",
    y="Big Data Product Used Often",
    palette="PiYG",
    ax=ax,
)
plt.xticks(rotation=30, horizontalalignment="right")
plt.show()

### xxiii. Regular BI Tools
- Top used BI Tool is Tableau.
- Most of the participants like to work with 2 BI Tools.

In [None]:
# Share of respondents for the 20 most frequent BI-tool answers
# (normalize=True -> fractions).
reg_bi_tools = (
    kaggle_survey_1["regular_bi_tools"]
    .value_counts(normalize=True)
    .head(20)
    .rename_axis("Top Regular BI Tools")
    .reset_index(name="Count of Survey Taker")
)

sns.set_style("ticks")
fig, ax = plt.subplots(figsize=(20, 20))
sns.barplot(
    data=reg_bi_tools,
    x="Top Regular BI Tools",
    y="Count of Survey Taker",
    palette="Blues_d",
    ax=ax,
)
# Answer labels sit on the x axis; rotate them so they do not overlap.
plt.xticks(rotation=30, horizontalalignment="right")
plt.show()

In [None]:
# Bar per distinct value of "regular_bi_tools_count", showing how many rows
# carry that value.
sns.set_style("ticks")
fig, ax = plt.subplots(figsize=(20, 14))
# Pass the column as x=: recent seaborn releases take `data` as the first
# positional parameter of countplot, so a bare positional Series is no longer
# interpreted as the x variable. The keyword form works on old and new versions.
sns.countplot(
    x=kaggle_survey_1["regular_bi_tools_count"],
    color="#630B85",
    ax=ax,
)

### xxiv. BI Products Used Often
- Top used BI product is Tableau

In [None]:
# Share of respondents per most-often-used BI tool
# (normalize=True -> fractions).
bi_prod_of = (
    kaggle_survey_1["bi_tools_used_most_often"]
    .value_counts(normalize=True)
    .rename_axis("BI Product Used Often")
    .reset_index(name="Count of Survey Taker")
)

sns.set_style("ticks")
fig, ax = plt.subplots(figsize=(20, 20))
# Horizontal bars: tools on y, shares on x.
sns.barplot(
    data=bi_prod_of,
    x="Count of Survey Taker",
    y="BI Product Used Often",
    palette="PuOr",
    ax=ax,
)
plt.xticks(rotation=30, horizontalalignment="right")
plt.show()

### xxv. Top Regular Automated ML Tools
- Top used automated ML tool is Auto-Sklearn
- Most of the participants like to work with 2 automated ML tools

In [None]:
# Share of respondents for the 20 most frequent automated-ML-tool answers
# (normalize=True -> fractions).
reg_auto_ml_tools = (
    kaggle_survey_1["regular_automated_ml_tools"]
    .value_counts(normalize=True)
    .head(20)
    .rename_axis("Top Regular Automated ML Tools")
    .reset_index(name="Count of Survey Taker")
)

# One treemap tile per answer; tile area and colour both encode the share.
fig = px.treemap(
    reg_auto_ml_tools,
    path=["Top Regular Automated ML Tools"],
    values="Count of Survey Taker",
    color="Count of Survey Taker",
    color_continuous_scale="Spectral",
)
fig.update_layout(margin={"t": 5, "b": 5, "r": 1, "l": 1})
fig.show()

In [None]:
# Bar per distinct value of "regular_automated_ml_tools_count", showing how
# many rows carry that value.
sns.set_style("ticks")
fig, ax = plt.subplots(figsize=(20, 14))
# Pass the column as x=: recent seaborn releases take `data` as the first
# positional parameter of countplot, so a bare positional Series is no longer
# interpreted as the x variable. The keyword form works on old and new versions.
sns.countplot(
    x=kaggle_survey_1["regular_automated_ml_tools_count"],
    color="#612523",
    ax=ax,
)

### xxvi. Top Tools to Manage ML Experiments
- Top used tool to manage ML experiments is TensorBoard
- Most of the participants like to work with 2 tools to manage ML experiments

In [None]:
# Share of respondents for the 20 most frequent experiment-management-tool
# answers (normalize=True -> fractions).
# The frame gets its own name: the original cell rebound `reg_auto_ml_tools`,
# which silently replaced the automated-ML-tools frame from the previous
# section. The label is also corrected from "Experience" to "Experiments" to
# match the column being plotted.
tools_manage_ml_exp = (
    kaggle_survey_1["tools_to_manage_ml_experiments"]
    .value_counts(normalize=True)[:20]
    .rename_axis("Top Tools To Manage ML Experiments")
    .reset_index(name="Count of Survey Taker")
)

# One treemap tile per answer; tile area and colour both encode the share.
fig = px.treemap(
    tools_manage_ml_exp,
    path=["Top Tools To Manage ML Experiments"],
    values="Count of Survey Taker",
    color="Count of Survey Taker",
    color_continuous_scale="PuOr",
)
fig.update_layout(margin=dict(t=5, b=5, r=1, l=1))
fig.show()

In [None]:
# Histogram + density of the "tools_to_manage_ml_experiments_count" column
# (presumably the number of tools picked per respondent - confirm against
# the cleaning step).
sns.set_style("ticks")
fig, ax = plt.subplots(figsize=(20, 14))
sns.distplot(
    kaggle_survey_1["tools_to_manage_ml_experiments_count"],
    bins=20,
    color="#302061",
    ax=ax,
)

### xxvii. Top Publicly Share DA or ML App
- Most of the participants do not like to share work publicly

In [None]:
# Share of respondents for the 20 most frequent public-sharing answers
# (normalize=True -> fractions).
pub_da_ml = (
    kaggle_survey_1["publicly_share_da_or_ml_app"]
    .value_counts(normalize=True)
    .head(20)
    .rename_axis("Top Publicly Share DA or ML App")
    .reset_index(name="Count of Survey Taker")
)

sns.set_style("ticks")
fig, ax = plt.subplots(figsize=(20, 20))
sns.barplot(
    data=pub_da_ml,
    x="Top Publicly Share DA or ML App",
    y="Count of Survey Taker",
    palette="RdYlBu",
    ax=ax,
)
# Answer labels sit on the x axis; rotate them so they do not overlap.
plt.xticks(rotation=30, horizontalalignment="right")
plt.show()

In [None]:
# Histogram + density of the "publicly_share_da_or_ml_app_count" column
# (presumably the number of sharing platforms picked per respondent -
# confirm against the cleaning step).
sns.set_style("ticks")
fig, ax = plt.subplots(figsize=(20, 14))
sns.distplot(
    kaggle_survey_1["publicly_share_da_or_ml_app_count"],
    bins=20,
    color="#660000",
    ax=ax,
)

### xxviii. Top Platform Begun DS Course
- The top platform on which participants have begun DS courses is University Courses
- Most of the participants like to work with 2 or more platforms to begin DS courses

In [None]:
# Share of respondents for the 20 most frequent course-platform answers
# (normalize=True -> fractions).
pl_beg_ds_cr = (
    kaggle_survey_1["platform_begun_ds_course"]
    .value_counts(normalize=True)
    .head(20)
    .rename_axis("Top Platform Begun DS Course")
    .reset_index(name="Count of Survey Taker")
)

sns.set_style("ticks")
fig, ax = plt.subplots(figsize=(20, 20))
sns.barplot(
    data=pl_beg_ds_cr,
    x="Top Platform Begun DS Course",
    y="Count of Survey Taker",
    palette="BrBG",
    ax=ax,
)
# Answer labels sit on the x axis; rotate them so they do not overlap.
plt.xticks(rotation=30, horizontalalignment="right")
plt.show()

In [None]:
# Histogram + density of the "platform_begun_ds_course_count" column
# (presumably the number of course platforms picked per respondent -
# confirm against the cleaning step).
sns.set_style("ticks")
fig, ax = plt.subplots(figsize=(20, 14))
sns.distplot(
    kaggle_survey_1["platform_begun_ds_course_count"],
    bins=20,
    color="#148530",
    ax=ax,
)

### xxix. Primary Tool Use for Analysis
- Most of the participants like to work with local development environments

In [None]:
# Share of respondents per primary analysis tool
# (normalize=True -> fractions).
prim_tool = (
    kaggle_survey_1["primary_tools_used_to_analyze"]
    .value_counts(normalize=True)
    .rename_axis("Primary Tool to Analyze")
    .reset_index(name="Count of Survey Taker")
)

sns.set_style("ticks")
fig, ax = plt.subplots(figsize=(20, 20))
# Horizontal bars: tools on y, shares on x.
sns.barplot(
    data=prim_tool,
    x="Count of Survey Taker",
    y="Primary Tool to Analyze",
    palette="RdYlGn",
    ax=ax,
)
plt.xticks(rotation=30, horizontalalignment="right")
plt.show()

### xxx. Top Favourite Media for DS Topic
- Top used favourite media for DS Topic is Kaggle
- Most of the participants like to work with 12 or more platforms

In [None]:
# Share of respondents for the 20 most frequent favourite-media answers
# (normalize=True -> fractions).
fav_media_for_ds = (
    kaggle_survey_1["favorite_media_for_ds_topic"]
    .value_counts(normalize=True)
    .head(20)
    .rename_axis("Top Favourite Media for DS Topic")
    .reset_index(name="Count of Survey Taker")
)

# One treemap tile per answer; tile area and colour both encode the share.
fig = px.treemap(
    fav_media_for_ds,
    path=["Top Favourite Media for DS Topic"],
    values="Count of Survey Taker",
    color="Count of Survey Taker",
    color_continuous_scale="BrBG",
)
fig.update_layout(margin={"t": 5, "b": 5, "r": 1, "l": 1})
fig.show()

In [None]:
# Histogram + density of the "favorite_media_for_ds_topic_count" column
# (presumably the number of media sources picked per respondent - confirm
# against the cleaning step).
sns.set_style("ticks")
fig, ax = plt.subplots(figsize=(20, 14))
sns.distplot(
    kaggle_survey_1["favorite_media_for_ds_topic_count"],
    bins=20,
    color="#444444",
    ax=ax,
)

### xxxi. Computing Platform to Learn Next 2 Years
- Top computing platform to learn is GCP
- Most of the participants like to learn 12 or more platforms.

In [None]:
# Share of respondents for the 20 most frequent "platform to learn" answers
# (normalize=True -> fractions).
com_plat_2_yrs = (
    kaggle_survey_1["computing_platform_to_learn_next_2_years"]
    .value_counts(normalize=True)
    .head(20)
    .rename_axis("Top Cloud Platform to Learn Next 2 Years")
    .reset_index(name="Count of Survey Taker")
)

sns.set_style("ticks")
fig, ax = plt.subplots(figsize=(20, 20))
# Horizontal bars: answers on y, shares on x.
sns.barplot(
    data=com_plat_2_yrs,
    x="Count of Survey Taker",
    y="Top Cloud Platform to Learn Next 2 Years",
    palette="Reds_d",
    ax=ax,
)
plt.xticks(rotation=30, horizontalalignment="right")
plt.show()

In [None]:
# Histogram + density of the count column for this section's question.
# The original cell re-plotted "favorite_media_for_ds_topic_count", the same
# column already shown in the favourite-media section, which looks like a
# copy-paste slip.
# NOTE(review): assumes the cleaning step created
# "computing_platform_to_learn_next_2_years_count" in the same way as the
# other "*_count" columns - confirm before merging.
sns.set_style("ticks")
fig, ax = plt.subplots(figsize=(20, 14))
sns.distplot(
    kaggle_survey_1["computing_platform_to_learn_next_2_years_count"],
    bins=20,
    color="#8E7CC3",
    ax=ax,
)

### xxxii. Computing Product to Learn Next 2 Years

In [None]:
# Share of respondents for the 20 most frequent "computing product to learn"
# answers (normalize=True -> fractions).
com_prod_2_yrs = (
    kaggle_survey_1["computing_product_to_learn_next_2_years"]
    .value_counts(normalize=True)
    .head(20)
    .rename_axis("Top Computing Product to Learn Next 2 Years")
    .reset_index(name="Count of Survey Taker")
)

# One treemap tile per answer; tile area and colour both encode the share.
fig = px.treemap(
    com_prod_2_yrs,
    path=["Top Computing Product to Learn Next 2 Years"],
    values="Count of Survey Taker",
    color="Count of Survey Taker",
    color_continuous_scale="RdGy",
)
fig.update_layout(margin={"t": 5, "b": 5, "r": 1, "l": 1})
fig.show()

In [None]:
# Share of respondents for the 20 most frequent "ML product to learn"
# answers (normalize=True -> fractions).
ml_product_2_yrs = (
    kaggle_survey_1["ml_product_to_learn_next_2_years"]
    .value_counts(normalize=True)
    .head(20)
    .rename_axis("Top ML Products to Learn Next 2 Years")
    .reset_index(name="Count of Survey Taker")
)

# One treemap tile per answer; tile area and colour both encode the share.
fig = px.treemap(
    ml_product_2_yrs,
    path=["Top ML Products to Learn Next 2 Years"],
    values="Count of Survey Taker",
    color="Count of Survey Taker",
    color_continuous_scale="RdBu",
)
fig.update_layout(margin={"t": 5, "b": 5, "r": 1, "l": 1})
fig.show()

### xxxiii. Big Data Product to Learn Next 2 Years

In [None]:
# Share of respondents for the 20 most frequent "big data product to learn"
# answers (normalize=True -> fractions).
big_data_product_2_yrs = (
    kaggle_survey_1["big_data_product_to_learn_next_2_years"]
    .value_counts(normalize=True)
    .head(20)
    .rename_axis("Top Big Data Products to Learn Next 2 Years")
    .reset_index(name="Count of Survey Taker")
)

# One treemap tile per answer; tile area and colour both encode the share.
fig = px.treemap(
    big_data_product_2_yrs,
    path=["Top Big Data Products to Learn Next 2 Years"],
    values="Count of Survey Taker",
    color="Count of Survey Taker",
    color_continuous_scale="PuOR",
)
fig.update_layout(margin={"t": 5, "b": 5, "r": 1, "l": 1})
fig.show()

### xxxiv. BI Tools to Learn Next 2 Years

In [None]:
# Share of respondents for the 20 most frequent "BI tool to learn" answers
# (normalize=True -> fractions).
bi_product_2_yrs = (
    kaggle_survey_1["bi_tools_to_learn_next_2_years"]
    .value_counts(normalize=True)
    .head(20)
    .rename_axis("Top BI Products to Learn Next 2 Years")
    .reset_index(name="Count of Survey Taker")
)

# One treemap tile per answer; tile area and colour both encode the share.
fig = px.treemap(
    bi_product_2_yrs,
    path=["Top BI Products to Learn Next 2 Years"],
    values="Count of Survey Taker",
    color="Count of Survey Taker",
    color_continuous_scale="RdYlBu",
)
fig.update_layout(margin={"t": 5, "b": 5, "r": 1, "l": 1})
fig.show()

# Conclusion

In conclusion, the data field is constantly evolving with newer tools, products, and platforms. At the same time, there are countries whose people are not fortunate enough to have access to all this information and these platforms, and there is also a gender gap in the data field. The survey gives clear context on what people want to do, and the behavioural data can help us determine which areas we need to focus on in the future to expand the field. In terms of platform popularity, GCP products can be seen gaining popularity in the answers provided in the survey. As for programming languages, Python, SQL and R are still holding their popularity in the data field.