In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib
import matplotlib.pyplot as plt

from plotly.subplots import make_subplots
import plotly.graph_objects as go



def country_normalization (x):
    """Map a survey country label to the short name used across survey years.

    Labels that are not listed in the alias table (including missing values)
    are returned unchanged.
    """
    # (label as it appears in a survey file, label used by this notebook)
    aliases = (
        ('Iran, Islamic Republic of...', 'Iran'),
        ('United Kingdom of Great Britain and Northern Ireland', "United Kingdom"),
        ('United States of America', "USA"),
        ('United States', "USA"),
        ('Vietnam', "Viet Nam"),
        ('Korea South', 'South Korea'),
        ('Taiwan (2020 IFM Estimates)', 'Taiwan'),
        ('Hong Kong (S.A.R.)', 'Hong Kong'),
    )
    for survey_label, short_label in aliases:
        if x == survey_label:
            return short_label
    return x

def create_age_2017 (x):
    """Convert a numeric age into an age-bracket label.

    Returns one of '18-21' ... '60-69', '70+' for ages of 70 and above, and
    'other' for anything that falls in no bracket (ages under 18, values that
    sit between two brackets such as 21.5, and NaN).
    """
    # Inclusive (lowest age, highest age, label) brackets.
    brackets = (
        (18, 21, '18-21'),
        (22, 24, '22-24'),
        (25, 29, '25-29'),
        (30, 34, '30-34'),
        (35, 39, '35-39'),
        (40, 44, '40-44'),
        (45, 49, '45-49'),
        (50, 54, '50-54'),
        (55, 59, '55-59'),
        (60, 69, '60-69'),
    )
    for lowest, highest, label in brackets:
        if lowest <= x <= highest:
            return label
    if 70 <= x:
        return '70+'
    # NaN fails every comparison above and therefore ends up here as well.
    return 'other'

def count_then_return_percent(dataframe,column_name):
    """Return the share of rows per answer in `column_name`, as percentages.

    Parameters:
        dataframe: pandas DataFrame holding the survey responses.
        column_name: name of the column to tabulate.

    Returns:
        A pandas Series indexed by answer (missing answers are kept as their
        own NaN entry), sorted by index, with values rounded to one decimal.

    The denominator is the total of the tabulated counts, i.e. every row
    including missing answers. Dividing by the non-missing count instead would
    make the shares add up to more than 100% whenever missing answers are
    present, because those answers are still part of the numerators.
    """
    counts = dataframe[column_name].value_counts(dropna=False).sort_index()
    total_rows = counts.sum()

    return (counts * 100 / total_rows).round(1)

def count(dataframe,column_name):
    """Return the number of rows per answer in `column_name`.

    Missing answers are tallied as their own entry and the resulting Series
    is ordered by its index rather than by frequency.
    """
    answers = dataframe[column_name]
    return answers.value_counts(dropna=False).sort_index()

def plotly_compare_bar_charts( response_counts, text, title, subplot_titles, orientation, x_range ): 
    """Draw one bar chart per entry of `response_counts`, side by side.

    Parameters:
        response_counts: sequence of pandas Series; each Series' index is
            plotted on the y axis and its values on the x axis.
        text: sequence of pandas Series, parallel to `response_counts`; the
            values are written on the bars.
        title: title of the whole figure.
        subplot_titles: one title per subplot, passed to `make_subplots`.
        orientation: bar orientation passed to `go.Bar` ('h' or 'v').
        x_range: range applied to the x axis of every subplot.

    The figure is displayed with `fig.show()`; nothing is returned.
    """
    panel_count = len(response_counts)

    fig = make_subplots(
        rows=1, cols=panel_count,
        specs=[[{"type": "bar"}] * panel_count],
        shared_yaxes=True,
        shared_xaxes=True,
        subplot_titles=subplot_titles
    )

    for position, bar_lengths in enumerate(response_counts):
        fig.add_trace(
            go.Bar(
                y=bar_lengths.index,
                x=bar_lengths.values,
                text=text[position].values,
                textposition='auto',
                orientation=orientation),
            row=1, col=position + 1)  # subplot columns are 1-based

    # Use the `title` argument: reading a notebook-level variable here would
    # make the chart depend on whatever an earlier cell happened to define.
    fig.update_layout(height=500, showlegend=False, title=title)
    fig.update_xaxes(range=x_range)
    fig.show()

def plot_custom_question_distribution (survey, title_for_chart,  question_name, countries, x_range = [0, 36], country_column = "Q3" ):
    """Plot the answer distribution of one question, overall and per country.

    The first subplot covers every row of `survey`; one more subplot is added
    for each entry of `countries`. Bars show percentages, bar labels show the
    raw counts, and each subplot title carries the number of rows it covers.

    Parameters:
        survey: pandas DataFrame with one row per respondent.
        title_for_chart: title of the whole figure.
        question_name: column whose answers are tabulated.
        countries: iterable of country labels to compare.
        x_range: x-axis range shared by all subplots (never mutated here).
        country_column: column holding the country label; defaults to "Q3".

    The figure is displayed as a side effect; nothing is returned.
    """
    def summarize(frame, label):
        # Percentages for the bars, counts for the labels, and the row total
        # for the subplot title. Both helpers return index-sorted Series.
        counts = count(frame, question_name)
        percentages = count_then_return_percent(frame, question_name)
        return percentages, counts, f"{label} [#{counts.sum():,}]"

    panels = [summarize(survey, "All")]
    for country in countries:
        # Filter once per country and reuse the subset for both tabulations.
        country_rows = survey[survey[country_column] == country]
        panels.append(summarize(country_rows, country))

    plotly_compare_bar_charts(response_counts = [panel[0] for panel in panels],
                              text = [panel[1] for panel in panels],
                              title = title_for_chart,
                              subplot_titles = [panel[2] for panel in panels],
                              orientation = 'h',
                              x_range = x_range)
    
def load_surveys (input_dir = "/kaggle/input"):
    """Read the 2017-2021 survey response files into a dict keyed by year.

    Parameters:
        input_dir: directory that contains the `kaggle-survey-<year>` dataset
            folders. All five files are resolved against it.

    Returns:
        dict mapping year (int) to a pandas DataFrame. In every frame the
        country label is normalized with `country_normalization` and stored in
        column "Q3". For 2018 the age brackets '70-79' and '80+' in "Q2" are
        merged into '70+'. For 2017 an age-bracket column "Q1" is derived from
        "Age" and "Q3" is derived from "Country".
    """
    def read_responses(relative_path, **read_csv_kwargs):
        return pd.read_csv(f"{input_dir}/{relative_path}", low_memory=False, **read_csv_kwargs)

    def without_first_row(frame):
        # Row 0 is dropped for the 2018-2021 files. An independent copy is
        # returned so that later column assignments do not operate on a slice
        # of the frame that was read from disk.
        return frame.iloc[1:].copy()

    surveys = {}

    same_layout_files = (
        (2021, "kaggle-survey-2021/kaggle_survey_2021_responses.csv"),
        (2020, "kaggle-survey-2020/kaggle_survey_2020_responses.csv"),
        (2019, "kaggle-survey-2019/multiple_choice_responses.csv"),
    )
    for year, relative_path in same_layout_files:
        responses = read_responses(relative_path)
        responses["Q3"] = responses["Q3"].map(country_normalization)
        surveys[year] = without_first_row(responses)

    survey_2018 = read_responses("kaggle-survey-2018/multipleChoiceResponses.csv")
    survey_2018["Q2"] = survey_2018["Q2"].map (lambda x: '70+' if x in ('70-79', '80+') else x)
    survey_2018["Q3"] = survey_2018["Q3"].map(country_normalization)
    surveys[2018] = without_first_row(survey_2018)

    survey_2017 = read_responses("kaggle-survey-2017/multipleChoiceResponses.csv", encoding = "ISO-8859-1")
    survey_2017["Q1"] = survey_2017["Age"].map(create_age_2017)
    survey_2017["Q3"] = survey_2017["Country"].map(country_normalization)
    # The 2017 frame is kept whole. create_age_2017 compares every "Age" value
    # with integers, which would raise on a question-text row, so row 0 of this
    # file has to be a real response; dropping it would lose a respondent.
    # NOTE(review): confirm against the 2017 CSV.
    surveys[2017] = survey_2017

    return surveys
    
    

In [None]:
# Load every survey year once; the cells below look frames up by year.
surveys = load_surveys()
# Countries that get their own subplot next to the overall distribution.
countries = [ "India", "USA", "Italy"]


## Age Distribution

In [None]:
# Column that holds the age bracket in each survey year plotted here.
age_question_by_year = {2021: 'Q1', 2020: 'Q1', 2019: 'Q1', 2018: 'Q2'}

for year, question_name in age_question_by_year.items():
    title_for_chart = f'Age Distribution - Kaggle Survey {year}'
    survey = surveys[year]
    plot_custom_question_distribution (survey, title_for_chart, question_name, countries)
    

##  Highest level of formal education 

In [None]:
# Long answer text -> shorter label shown on the chart.
answer_abbrvs = {
    'Some college/university study without earning a bachelor’s degree': 'Some college/university study',
}

# The ordered answer scale is identical for all plotted years except for the
# wording of the "professional" option, so each wording is listed with the
# years it is applied to.
for professional_answer, years in [('Professional doctorate', [2021]), ('Professional degree', [2020, 2019])]:
    cat_order = pd.CategoricalDtype(
        [
            'I prefer not to answer',
            'No formal education past high school',
            answer_abbrvs['Some college/university study without earning a bachelor’s degree'],
            professional_answer,
            "Bachelor’s degree",
            "Master’s degree",
            'Doctoral degree',
        ],
        ordered=True,
    )

    for year in years:
        title_for_chart = f'Highest level of formal education - Kaggle Survey {year}'
        survey = surveys[year]
        question_name = 'Q4'
        # Missing answers are counted as "I prefer not to answer", the long
        # label is shortened, then the column takes the ordered scale so the
        # bars follow that order.
        survey[question_name] = (
            survey[question_name]
            .fillna("I prefer not to answer")
            .map(lambda answer: answer_abbrvs.get(answer, answer))
            .astype(cat_order)
        )
        plot_custom_question_distribution (survey, title_for_chart, question_name, countries, x_range=[0,55])


## Developer seniority

In [None]:
# One entry per survey year: the column holding the coding-experience answer
# and the label used for the third step of that year's scale.
seniority_questions = [
    (2021, 'Q6', '1-3 years'),
    (2020, 'Q6', '1-2 years'),
    (2019, 'Q15', '1-2 years'),
]

for year, question_name, third_step in seniority_questions:
    cat_order = pd.CategoricalDtype(
        ['I have never written code', '< 1 years', third_step, '3-5 years', '5-10 years', '10-20 years', '20+ years'],
        ordered=True,
    )
    title_for_chart = f'Developer seniority - Kaggle Survey {year}'
    survey = surveys[year]
    # Converting to the ordered scale makes the bars follow the scale order.
    survey[question_name] = survey[question_name].astype(cat_order)
    plot_custom_question_distribution (survey, title_for_chart, question_name, countries)