# 2018-21 Kaggle ML/DS Survey

The code is adapted from [andresionek/one-chart-many-answers-kaggle-surveys-in-slopes](https://www.kaggle.com/andresionek/one-chart-many-answers-kaggle-surveys-in-slop) as well as [tkubacka/a-story-told-through-a-heatmap](https://www.kaggle.com/tkubacka/a-story-told-through-a-heatmap).

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# pd.set_option('display.max_rows', 100)
# pd.set_option('display.max_columns', 400)

In [None]:
from abc import ABC, abstractmethod
from enum import Enum
from typing import List, Type

import numpy as np
import pandas as pd


class Mapping(Enum):
    """
    Contains dicts mapping values found in the surveys to values we want to replace with.

    Each member's ``value`` is a plain dict whose keys are raw answers as they
    appear in one or more survey years and whose values are the harmonised
    class used across all years. Answers that are missing from a dict become
    NaN in ``BaseSurvey.do_mapping`` and "Other" in ``BaseSurvey.do_list_mapping``.
    """
    # Compensation brackets. The first group uses the "1,000-1,999" style, the
    # second group the "0-10,000" style; both collapse to "<low>-<high>k"
    # labels so a numeric midpoint can later be parsed out of the label.
    COMPENSATION = {
        '$0-999': '0-10k',
        '1,000-1,999': '0-10k',
        '2,000-2,999': '0-10k',
        '3,000-3,999': '0-10k',
        '4,000-4,999': '0-10k',
        '5,000-7,499': '0-10k',
        '7,500-9,999': '0-10k',
        '10,000-14,999': '10-20k',
        '15,000-19,999': '10-20k',
        '20,000-24,999': '20-30k',
        '25,000-29,999': '20-30k',
        '30,000-39,999': '30-40k',
        '40,000-49,999': '40-50k',
        '50,000-59,999': '50-60k',
        '60,000-69,999': '60-70k',
        '70,000-79,999': '70-80k',
        '80,000-89,999': '80-90k',
        '90,000-99,999': '90-100k',
        '100,000-124,999': '100-125k',
        '125,000-149,999': '125-150k',
        '150,000-199,999': '150-200k',
        '200,000-249,999': '200-250k',
        # Previously missing: without it this bracket mapped to NaN.
        '250,000-299,999': '250-300k',
        '300,000-500,000': '300-500k',
        # Open-ended top bracket: low == high so the midpoint is 500.
        '> $500,000': '500-500k',
        '0-10,000': '0-10k',
        '10-20,000': '10-20k',
        '20-30,000': '20-30k',
        '30-40,000': '30-40k',
        '40-50,000': '40-50k',
        '50-60,000': '50-60k',
        '60-70,000': '60-70k',
        '70-80,000': '70-80k',
        '80-90,000': '80-90k',
        '90-100,000': '90-100k',
        '100-125,000': '100-125k',
        '125-150,000': '125-150k',
        # Was '150-200,00' (missing zero), which could never match.
        '150-200,000': '150-200k',
        '200-250,000': '200-250k',
        # Previously missing: without it this bracket mapped to NaN.
        '250-300,000': '250-300k',
        '300-400,000': '300-500k',
        '400-500,000': '300-500k',
        '500,000+': '500-500k',
    }
    JOB_TITLE = {
        'Data Scientist': 'Data Scientist',
        'Software Engineer': 'Software Engineer',
        'Data Analyst': 'Data Analyst',
        'Other': 'Other',
        'Research Scientist': 'Research Scientist/Statistician',
        'Research Assistant': 'Research Scientist/Statistician',
        'Principal Investigator': 'Research Scientist/Statistician',
        'Business Analyst': 'Business Analyst',
        'Marketing Analyst': 'Business Analyst',
        # All manager variants share one label so the years stay comparable
        # (the plotting categories only know "Product/Project Manager").
        'Product/Project Manager': 'Product/Project Manager',
        'Program/Project Manager': 'Product/Project Manager',
        'Product Manager': 'Product/Project Manager',
        'Data Engineer': 'Data Engineer/DBA',
        'Machine Learning Engineer': 'Machine Learning Engineer',
        'Statistician': 'Research Scientist/Statistician',
        'Manager': 'Manager/C-level',
        'DBA/Database Engineer': 'Data Engineer/DBA',
        'Chief Officer': 'Manager/C-level',
        'Consultant': 'Other',
        'Salesperson': 'Other',
        'Data Journalist': 'Other',
        'Developer Advocate': 'Other',
        'Developer Relations/Advocacy': 'Other',
        'Not employed': 'Currently not employed',
        'Currently not employed': 'Currently not employed',
        'Student': 'Student',
    }
    # Only the two listed genders are mapped; every other answer becomes NaN.
    GENDER = {
        'Male': 'Male',
        'Female': 'Female',
        'Man': 'Male',
        'Woman': 'Female',
    }
    AGE = {
        '18-21': '18-21',
        '22-24': '22-24',
        '25-29': '25-29',
        '30-34': '30-34',
        '35-39': '35-39',
        '40-44': '40-44',
        '45-49': '45-49',
        '50-54': '50-54',
        '55-59': '55-59',
        '60-69': '60-69',
        '70+': '70+',
        '70-79': '70+',
        '80+': '70+',
    }
    EDUCATION = {
        'Master’s degree': 'Master’s',
        'Bachelor’s degree': 'Bachelor’s',
        'Some college/university study without earning a bachelor’s degree': 'College & below',
        'Doctoral degree': 'Doctoral',
        'Professional degree': 'Professional',
        'Professional doctorate': 'Professional',
        'No formal education past high school': 'College & below',
    }
    YEARS_WRITING_CODE = {
        '< 1 years': '0-1 years',
        '< 1 year': '0-1 years',
        '1-2 years': '1-3 years',
        '1-3 years': '1-3 years',
        '3-5 years': '3-5 years',
        '5-10 years': '5-10 years',
        '10-20 years': '10+ years',
        '20+ years': '10+ years',
        '20-30 years': '10+ years',
        '30-40 years': '10+ years',
        '40+ years': '10+ years',
    }
    RECOMMENDED_LANGUAGE = {
        'Python': 'Python',
        'R': 'R',
        'SQL': 'SQL',
        'C++': 'C++',
        'MATLAB': 'MATLAB',
        'Other': 'Other',
        'Java': 'Java',
        'C': 'C',
        'None': 'None',
        'Javascript': 'Javascript',
        'Julia': 'Julia',
        'Scala': 'Other',
        'SAS': 'Other',
        'Bash': 'Bash',
        'VBA': 'Other',
        'Go': 'Other',
        'Swift': 'Swift',
        'TypeScript': 'Other',
    }
    LANGUAGES = {
        'SQL': 'SQL',
        'R': 'R',
        'Java': 'Java',
        'MATLAB': 'MATLAB',
        'Python': 'Python',
        'Javascript/Typescript': 'Javascript/Typescript',
        'Bash': 'Bash',
        'Visual Basic/VBA': 'VBA',
        'Scala': 'Scala',
        'PHP': 'Other',
        'C/C++': 'C/C++',
        'Other': 'Other',
        'C#/.NET': 'Other',
        'Go': 'Other',
        'SAS/STATA': 'Other',
        'Ruby': 'Other',
        'Julia': 'Julia',
        'Javascript': 'Javascript/Typescript',
        'TypeScript': 'Javascript/Typescript',
        'C': 'C/C++',
        'C++': 'C/C++',
        'Swift': 'Swift',
    }
    YEARS_USING_ML = {
        '< 1 year': '0-1 years',
        '< 1 years': '0-1 years',
        'Under 1 year': '0-1 years',
        '1-2 years': '1-3 years',
        '2-3 years': '1-3 years',
        '3-4 years': '3-5 years',
        '4-5 years': '3-5 years',
        '5-10 years': '5-10 years',
        '10-15 years': '10+ years',
        '10-20 years': '10+ years',
        '20+ years': '10+ years',
        '20 or more years': '10+ years',
    }
    PRIMARY_TOOL = {
        'Local development environments (RStudio, JupyterLab, etc.)': 'Local or hosted development environments',
        'Basic statistical software (Microsoft Excel, Google Sheets, etc.)': 'Basic statistical software',
        'Local or hosted development environments (RStudio, JupyterLab, etc.)': 'Local or hosted development environments',
        'Cloud-based data software & APIs (AWS, GCP, Azure, etc.)': 'Cloud-based data software & APIs',
        'Other': 'Other',
        'Advanced statistical software (SPSS, SAS, etc.)': 'Advanced statistical software',
        'Business intelligence software (Salesforce, Tableau, Spotfire, etc.)': 'Business intelligence software',
        # Unanswered questions are kept as an explicit "None" class.
        np.nan: 'None',
    }
    # Countries not listed here are filled with "Other" by the caller.
    COUNTRY = {
        'Other': 'Other',
        'India': 'India',
        'United States of America': 'United States',
        'Brazil': 'Brazil',
        'Japan': 'Japan',
        'Russia': 'Russia',
        'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
        'Germany': 'Germany',
        'China': 'China',
        'Spain': 'Spain',
        'France': 'France',
    }
    IDE = {
        'MATLAB': 'MATLAB',
        'RStudio': 'RStudio',
        'Jupyter/IPython': 'Jupyter/IPython',
        'PyCharm': 'PyCharm',
        'Atom': 'Vim/Emacs/Atom',
        'Visual Studio': 'Visual Studio',
        'Notepad++': 'Notepad++/Sublime',
        'Sublime Text': 'Notepad++/Sublime',
        'IntelliJ': 'PyCharm',
        'Spyder': 'Spyder',
        'Visual Studio Code': 'Visual Studio',
        'Vim': 'Vim/Emacs/Atom',
        'Other': 'Other',
        'nteract': 'Other',
        'Vim / Emacs': 'Vim/Emacs/Atom',
        'Jupyter (JupyterLab, Jupyter Notebooks, etc)': 'Jupyter/IPython',
        'Jupyter Notebook': 'Jupyter/IPython',
        'Visual Studio / Visual Studio Code': 'Visual Studio',
        'Visual Studio Code (VSCode)': 'Visual Studio',
    }
    CLOUD = {
        'I have not used any cloud providers': "None",
        'Microsoft Azure': 'Azure',
        'Google Cloud Platform (GCP)': 'GCP',
        'Amazon Web Services (AWS)': 'AWS',
        'IBM Cloud': 'IBM/Red Hat',
        'Other': 'Other',
        'Alibaba Cloud': 'Alibaba',
        'Salesforce Cloud': 'Other',
        'Red Hat Cloud': 'IBM/Red Hat',
        'VMware Cloud': 'Other',
        'SAP Cloud': 'Other',
        'Oracle Cloud': 'Other',
        'IBM Cloud / Red Hat': 'IBM/Red Hat',
        'Tencent Cloud': 'Other',
    }
    ML_FRAMEWORKS = {
        'Prophet': 'Prophet',
        'Scikit-Learn': 'Scikit-learn',
        'Keras': 'Keras',
        'TensorFlow': 'TensorFlow',
        'Spark MLlib': 'Spark MLlib',
        'Spark MLib': 'Spark MLlib',
        'Xgboost': 'Xgboost',
        'randomForest': 'Other',
        'lightgbm': 'LightGBM',
        'Caret': 'Caret',
        'mlr': 'Other',
        'PyTorch': 'PyTorch',
        'Mxnet': 'Other',
        'CNTK': 'Other',
        'Caffe': 'Other',
        # Key is spelled with a zero on purpose; it is a raw survey answer.
        'H20': 'H2O',
        'catboost': 'CatBoost',
        'Fastai': 'Fast.ai',
        'Other': 'Other',
        'Scikit-learn': 'Scikit-learn',
        'RandomForest': 'Other',
        'LightGBM': 'LightGBM',
        'Fast.ai': 'Fast.ai',
        'Tidymodels': 'Other',
        'CatBoost': 'CatBoost',
        'JAX': 'Other',
        'H2O 3': 'H2O',
        'MXNet': 'Other',
        'PyTorch Lightning': 'PyTorch Lightning',
        'Huggingface': 'Huggingface',
    }
    ML_STATUS = {
        'No (we do not use ML methods)': 'Do not use ML / Do not know',
        'I do not know': 'Do not use ML / Do not know',
        'We recently started using ML methods (i.e., models in production for less than 2 years)': 'Recently started using ML',
        'We have well established ML methods (i.e., models in production for more than 2 years)': 'Well established ML',
        'We are exploring ML methods (and may one day put a model into production)': 'Exploring ML',
        'We use ML methods for generating insights (but do not put working models into production)': 'Use ML for generating insights',
        # Unanswered questions are grouped with "do not know".
        np.nan: 'Do not use ML / Do not know',
    }
    COURSES = {
        "Coursera": "Coursera",
        "Udemy": "Udemy",
        "DataCamp": "DataCamp",
        "Kaggle Learn Courses": "Kaggle Learn",
        "Kaggle Learn": "Kaggle Learn",
        "Kaggle Courses (i.e. Kaggle Learn)": "Kaggle Learn",
        "edX": "edX",
        "University Courses (resulting in a university degree)": "University",
        "Udacity": "Udacity",
        "LinkedIn Learning": "Other",
        "Fast.ai": "Other",
        "Cloud-certification programs (direct from AWS, Azure, GCP, or similar)": "Other",
        "Online University Courses": "Other",
        "Fast.AI": "Other",
        "DataQuest": "Other",
        "developers.google.com": "Other",
        "TheSchool.AI": "Other",
        "Other": "Other",
    }
    VIZ = {
        'Matplotlib': 'Matplotlib',
        'Seaborn': 'Seaborn',
        'Ggplot / ggplot2': 'ggplot2',
        'ggplot2': 'ggplot2',
        'Plotly / Plotly Express': 'Plotly',
        'Plotly': 'Plotly',
        'Shiny': 'Shiny',
        'Bokeh': 'Bokeh',
        'D3': 'D3',
        'D3 js': 'D3',
        'D3.js': 'D3',
        'Geoplotlib': 'Other',
        'Leaflet / Folium': 'Other',
        'Leaflet': 'Other',
        'Altair': 'Other',
        'Lattice': 'Other',
        'Other': 'Other',
    }
    # Keys carry no leading/trailing whitespace: multi-select answers are
    # stripped before they are looked up, so padded keys could never match.
    MEDIA = {
        "Kaggle (notebooks, forums, etc)": "Kaggle",
        "Kaggle (forums, blog, social media, etc)": "Kaggle",
        "Kaggle forums": "Kaggle",
        "YouTube (Kaggle YouTube, Cloud AI Adventures, etc)": "YouTube",
        "YouTube (Cloud AI Adventures, Siraj Raval, etc)": "YouTube",
        "Siraj Raval YouTube Channel": "YouTube",
        "Cloud AI Adventures (YouTube)": "YouTube",
        "Blogs (Towards Data Science, Analytics Vidhya, etc)": "Blogs",
        "Blogs (Towards Data Science, Medium, Analytics Vidhya, KDnuggets etc)": "Blogs",
        "Medium Blog Posts": "Blogs",
        "KDnuggets Blog": "Blogs",
        "FastML Blog": "Blogs",
        "Journal Publications (peer-reviewed journals, conference proceedings, etc)": "Journals",
        "Journal Publications (traditional publications, preprint journals, etc)": "Journals",
        "Journal Publications": "Journals",
        "ArXiv & Preprints": "Journals",
        "Email newsletters (Data Elixir, O'Reilly Data & AI, etc)": "Newsletters",
        "O'Reilly Data Newsletter": "Newsletters",
        "Slack Communities (ods.ai, kagglenoobs, etc)": "Slack",
        "Course Forums (forums.fast.ai, Coursera forums, etc)": "Course Forums",
        "Course Forums (forums.fast.ai, etc)": "Course Forums",
        "Fastai forums": "Course Forums",
        "Reddit (r/machinelearning, etc)": "Reddit",
        "Reddit (r/machinelearning, r/datascience, etc)": "Reddit",
        "Podcasts (Chai Time Data Science, O’Reilly Data Show, etc)": "Podcasts",
        "Podcasts (Chai Time Data Science, Linear Digressions, etc)": "Podcasts",
        "The Data Skeptic Podcast": "Podcasts",
        "Linear Digressions Podcast": "Podcasts",
        "Partially Derivative Podcast": "Podcasts",
        "None/I do not know": "None",
    }


class BaseSurvey(ABC):
    """
    Base class to handle cleaning and transformation of datasets from different years.

    Concrete subclasses populate ``df`` with the raw responses, set
    ``survey_year``, ``cols_to_rename`` and ``questions_to_combine``, and
    implement ``filter_question_columns``. ``transform`` then runs the whole
    cleaning pipeline and returns the harmonised dataframe.
    """
    def __init__(self) -> None:
        # Raw responses; replaced by the cleaned frame as the pipeline runs.
        self.df = None
        # "Job Title" values (after renaming) that mark a non-professional.
        self.non_professionals = ["Student", "Currently not employed", np.nan]
        # Raw question id -> harmonised column name; its keys also define
        # which columns survive ``select_questions``.
        self.cols_to_rename = {}
        # Multi-select question ids whose answer columns are merged into lists.
        self.questions_to_combine = []
        self.survey_year = None

    def rename_columns(self) -> None:
        """
        Renames columns using mapping
        """
        self.df.rename(columns=self.cols_to_rename, inplace=True)

    @property
    def questions_to_keep(self) -> List[str]:
        """
        Select which questions we should keep in the dataframe using the mapping keys
        """
        return list(self.cols_to_rename)

    def select_questions(self) -> pd.DataFrame:
        """
        Selects only the relevant questions from each survey year
        """
        # Explicit copy: the frame is mutated in place by later steps, which
        # would otherwise operate on a slice of the raw data.
        self.df = self.df[self.questions_to_keep].copy()
        return self.df

    @abstractmethod
    def filter_question_columns(self, question: str) -> List[str]:
        """
        Return the names of the raw columns that hold the answers to the
        multi-select ``question``. Must be implemented per survey year.
        """
        raise NotImplementedError

    def remove_non_professionals(self) -> pd.DataFrame:
        """
        Drops respondents whose "Job Title" is listed in ``non_professionals``
        (students, unemployed and NaNs).
        """
        is_non_professional = self.df["Job Title"].isin(self.non_professionals)
        # Copy so that later in-place operations do not act on a slice.
        self.df = self.df[~is_non_professional].copy()
        return self.df

    def combine_answers_into_list(self, question: str) -> pd.DataFrame:
        """
        This function will create a new column in the dataframe adding
        all answers to a list and removing nans and None.

        The new column is named after ``question``; the individual answer
        columns are whitespace-stripped in place first.
        """
        filtered_columns = self.filter_question_columns(question)
        for c in filtered_columns:
            self.df[c] = self.df[c].str.strip()
        self.df[question] = self.df[filtered_columns].values.tolist()
        self.df[question] = self.df[question].apply(
            lambda lst: [x for x in lst if pd.notnull(x) and x != "None"])
        return self.df

    def batch_combine_answers_into_list(self) -> pd.DataFrame:
        """
        Applies combine_answers_into_list to multiple columns
        """
        for question in self.questions_to_combine:
            self.combine_answers_into_list(question)
        return self.df

    def do_mapping(self, column: str, mapping: "Mapping", fill: str = None) -> pd.DataFrame:
        """
        Maps values to have same classes across all years

        Values absent from ``mapping.value`` become NaN, unless ``fill`` is
        given, in which case every NaN in the mapped column is replaced by it.
        """
        self.df[column] = self.df[column].map(mapping.value)
        if fill is not None:
            self.df[column] = self.df[column].fillna(fill)
        return self.df

    def do_list_mapping(self, column: str, mapping: "Mapping") -> pd.DataFrame:
        """
        Maps values to have same classes across all years
        for columns that are list type

        Unknown answers become "Other", answers mapped to "None" are dropped,
        and duplicates are removed. The resulting list is sorted so the output
        does not depend on set iteration order.
        """
        lookup = mapping.value

        def _clean_list(lst):
            cleaned = set()
            for x in lst:
                mapped = lookup.get(x)
                if not mapped:
                    cleaned.add("Other")
                elif mapped != "None":
                    cleaned.add(mapped)
            return sorted(cleaned)

        self.df[column] = self.df[column].apply(_clean_list)
        return self.df

    def add_numeric_average_compensation(self) -> pd.DataFrame:
        """
        Create a numeric value for compensation in thousand of dollars,
        taking the average between the max and min values for each class

        The "Compensation" labels look like "<low>-<high>k"; everything except
        digits and the dash is removed, the two bounds are summed and divided
        by 2. Missing compensation stays missing.
        """
        def _midpoint(bounds):
            # Missing values pass through ``str.split`` unchanged (not a list).
            if not isinstance(bounds, list):
                return bounds
            return (int(bounds[0]) + int(bounds[1])) / 2

        bounds = (
            self.df["Compensation"]
            .str.replace(r"[^\d\-]", "", regex=True)
            .str.split("-")
        )
        self.df["Compensation Value"] = bounds.apply(_midpoint)
        return self.df

    def add_survey_year_column(self) -> pd.DataFrame:
        """
        Adds the year the survey was taken
        """
        self.df["Survey Year"] = self.survey_year
        return self.df

    @staticmethod
    def _get_profile(values: tuple) -> str:
        """
        Classify a (years writing code, years using ML) pair into a profile
        name; pairs matching no rule are "Others".
        """
        years_code, years_ml = values
        junior = ("0-1 years", "1-3 years")
        mid = ("3-5 years", "5-10 years")
        senior = "10+ years"

        if years_code in junior and years_ml in junior:
            return "Beginners"
        if years_code in mid and years_ml in mid:
            return "Modern DS"
        if years_code == senior and years_ml in junior:
            return "Coders"
        if years_code == senior and years_ml == senior:
            return "ML Veterans"
        return "Others"

    def create_profiles(self) -> None:
        """
        This function creates a new column with profiles for professionals
        adapted from the work developed by Teresa Kubacka on last years survey
        https://www.kaggle.com/tkubacka/a-story-told-through-a-heatmap
        """
        experience = self.df[["Years Writing Code", "Years Using ML"]]
        self.df["Profile"] = experience.apply(self._get_profile, axis=1)

    def transform(self) -> pd.DataFrame:
        """
        Process and clean the dataset

        Runs, in order: removal of the row labelled 0, list-combination of
        multi-select questions, column selection and renaming, removal of
        non-professionals, value harmonisation, and creation of the derived
        "Profile", "Compensation Value" and "Survey Year" columns.
        """
        # The row labelled 0 is not a response (presumably the question text
        # that follows the header in the raw CSV).
        self.df.drop(0, axis=0, inplace=True)

        self.batch_combine_answers_into_list()
        self.select_questions()
        self.rename_columns()
        self.remove_non_professionals()

        # Respondents lacking any of these three mapped values are discarded.
        self.do_mapping(column="Compensation", mapping=Mapping.COMPENSATION)
        self.do_mapping(column="Gender", mapping=Mapping.GENDER)
        self.do_mapping(column="Education", mapping=Mapping.EDUCATION)
        self.df.dropna(subset=["Compensation", "Gender", "Education"], inplace=True)
        self.df.reset_index(drop=True, inplace=True)

        self.do_mapping(column="Age", mapping=Mapping.AGE)
        self.do_mapping(column="Country", mapping=Mapping.COUNTRY, fill="Other")
        self.do_mapping(column="Job Title", mapping=Mapping.JOB_TITLE)
        self.do_mapping(column="Years Writing Code", mapping=Mapping.YEARS_WRITING_CODE, fill="None")
        self.do_mapping(column="Recommended Language", mapping=Mapping.RECOMMENDED_LANGUAGE, fill="None")
        self.do_mapping(column="Years Using ML", mapping=Mapping.YEARS_USING_ML, fill="None")
        self.do_mapping(column="Primary Tool", mapping=Mapping.PRIMARY_TOOL)
        self.do_mapping(column="ML Status in Company", mapping=Mapping.ML_STATUS)

        self.do_list_mapping(column="Languages", mapping=Mapping.LANGUAGES)
        self.do_list_mapping(column="IDEs", mapping=Mapping.IDE)
        self.do_list_mapping(column="Data Visualization Libraries", mapping=Mapping.VIZ)
        self.do_list_mapping(column="ML Frameworks", mapping=Mapping.ML_FRAMEWORKS)
        self.do_list_mapping(column="Cloud Computing Platforms", mapping=Mapping.CLOUD)
        self.do_list_mapping(column="Data Science Courses", mapping=Mapping.COURSES)
        self.do_list_mapping(column="Media Sources", mapping=Mapping.MEDIA)

        self.create_profiles()
        self.add_numeric_average_compensation()
        self.add_survey_year_column()
        return self.df


class Survey2021(BaseSurvey):
    """
    Loader and cleaner for the 2021 survey responses.
    """
    def __init__(self) -> None:
        super().__init__()
        self.survey_year = 2021
        self.df = pd.read_csv(
            "/kaggle/input/kaggle-survey-2021/kaggle_survey_2021_responses.csv",
            low_memory=False,
        )
        # 2021 question id -> harmonised column name.
        self.cols_to_rename = {
            "Q1": "Age",
            "Q2": "Gender",
            "Q3": "Country",
            "Q4": "Education",
            "Q5": "Job Title",
            "Q6": "Years Writing Code",
            "Q7": "Languages",
            "Q8": "Recommended Language",
            "Q9": "IDEs",
            "Q10": "Hosted Notebooks",
            "Q14": "Data Visualization Libraries",
            "Q15": "Years Using ML",
            "Q16": "ML Frameworks",
            "Q23": "ML Status in Company",
            "Q24": "Daily activities",
            "Q25": "Compensation",
            "Q27_A": "Cloud Computing Platforms",
            "Q28_A": "Cloud Computing Products",
            "Q31_A": "ML Products",
            "Q32_A": "Big Data Products",
            "Q40": "Data Science Courses",
            "Q41": "Primary Tool",
            "Q42": "Media Sources",
        }
        # Multi-select questions whose answer columns get merged into a list.
        self.questions_to_combine = [
            "Q7", "Q9", "Q10", "Q14", "Q16", "Q24",
            "Q27_A", "Q28_A", "Q31_A", "Q32_A", "Q40", "Q42",
        ]

    def filter_question_columns(self, question: str) -> List[str]:
        """
        Return the columns named ``<question>_P...`` that do not end in
        ``_OTHER``, in dataframe column order.
        """
        prefix = f"{question}_P"
        selected = []
        for name in self.df.columns:
            if name.startswith(prefix) and not name.endswith("_OTHER"):
                selected.append(name)
        return selected


class Survey2020(BaseSurvey):
    """
    Loader and cleaner for the 2020 survey responses.
    """
    def __init__(self) -> None:
        super().__init__()
        self.survey_year = 2020
        self.df = pd.read_csv(
            "/kaggle/input/kaggle-survey-2020/kaggle_survey_2020_responses.csv",
            low_memory=False,
        )
        # 2020 question id -> harmonised column name.
        self.cols_to_rename = {
            "Q1": "Age",
            "Q2": "Gender",
            "Q3": "Country",
            "Q4": "Education",
            "Q5": "Job Title",
            "Q6": "Years Writing Code",
            "Q7": "Languages",
            "Q8": "Recommended Language",
            "Q9": "IDEs",
            "Q10": "Hosted Notebooks",
            "Q14": "Data Visualization Libraries",
            "Q15": "Years Using ML",
            "Q16": "ML Frameworks",
            "Q22": "ML Status in Company",
            "Q23": "Daily activities",
            "Q24": "Compensation",
            "Q26_A": "Cloud Computing Platforms",
            "Q27_A": "Cloud Computing Products",
            "Q28_A": "ML Products",
            "Q29_A": "Big Data Products",
            "Q37": "Data Science Courses",
            "Q38": "Primary Tool",
            "Q39": "Media Sources",
        }
        # Multi-select questions whose answer columns get merged into a list.
        self.questions_to_combine = [
            "Q7", "Q9", "Q10", "Q14", "Q16", "Q23",
            "Q26_A", "Q27_A", "Q28_A", "Q29_A", "Q37", "Q39",
        ]

    def filter_question_columns(self, question: str) -> List[str]:
        """
        Return the columns named ``<question>_P...`` that do not end in
        ``_OTHER``, in dataframe column order.
        """
        prefix = f"{question}_P"
        selected = []
        for name in self.df.columns:
            if name.startswith(prefix) and not name.endswith("_OTHER"):
                selected.append(name)
        return selected


class Survey2019(BaseSurvey):
    """
    Loader and cleaner for the 2019 survey responses.
    """
    def __init__(self) -> None:
        super().__init__()
        self.survey_year = 2019
        self.df = pd.read_csv(
            "/kaggle/input/kaggle-survey-2019/multiple_choice_responses.csv",
            low_memory=False,
        )
        # 2019 question id -> harmonised column name.
        self.cols_to_rename = {
            "Q1": "Age",
            "Q2": "Gender",
            "Q3": "Country",
            "Q4": "Education",
            "Q5": "Job Title",
            "Q15": "Years Writing Code",
            "Q18": "Languages",
            "Q19": "Recommended Language",
            "Q16": "IDEs",
            "Q17": "Hosted Notebooks",
            "Q20": "Data Visualization Libraries",
            "Q23": "Years Using ML",
            "Q28": "ML Frameworks",
            "Q8": "ML Status in Company",
            "Q9": "Daily activities",
            "Q10": "Compensation",
            "Q29": "Cloud Computing Platforms",
            "Q30": "Cloud Computing Products",
            "Q32": "ML Products",
            "Q31": "Big Data Products",
            "Q13": "Data Science Courses",
            "Q14": "Primary Tool",
            "Q12": "Media Sources",
        }
        # Multi-select questions whose answer columns get merged into a list.
        self.questions_to_combine = [
            "Q18", "Q16", "Q17", "Q20", "Q28", "Q9",
            "Q29", "Q30", "Q32", "Q31", "Q13", "Q12",
        ]

    def filter_question_columns(self, question: str) -> List[str]:
        """
        Return the columns named ``<question>_P...`` that do not end in
        ``_OTHER_TEXT``, in dataframe column order.
        """
        prefix = f"{question}_P"
        selected = []
        for name in self.df.columns:
            if name.startswith(prefix) and not name.endswith("_OTHER_TEXT"):
                selected.append(name)
        return selected


class Survey2018(BaseSurvey):
    """
    Loader and cleaner for the 2018 survey responses.
    """
    def __init__(self) -> None:
        super().__init__()
        self.survey_year = 2018
        self.df = pd.read_csv(
            "/kaggle/input/kaggle-survey-2018/multipleChoiceResponses.csv",
            low_memory=False,
        )
        # 2018 question id -> harmonised column name.
        self.cols_to_rename = {
            "Q2": "Age",
            "Q1": "Gender",
            "Q3": "Country",
            "Q4": "Education",
            "Q6": "Job Title",
            "Q24": "Years Writing Code",
            "Q16": "Languages",
            "Q18": "Recommended Language",
            "Q13": "IDEs",
            "Q14": "Hosted Notebooks",
            "Q21": "Data Visualization Libraries",
            "Q25": "Years Using ML",
            "Q19": "ML Frameworks",
            "Q10": "ML Status in Company",
            "Q11": "Daily activities",
            "Q9": "Compensation",
            "Q15": "Cloud Computing Platforms",
            "Q27": "Cloud Computing Products",
            "Q28": "ML Products",
            "Q30": "Big Data Products",
            "Q36": "Data Science Courses",
            "Q12_MULTIPLE_CHOICE": "Primary Tool",
            "Q38": "Media Sources",
        }
        # Multi-select questions whose answer columns get merged into a list.
        self.questions_to_combine = [
            "Q16", "Q13", "Q14", "Q21", "Q19", "Q11",
            "Q15", "Q27", "Q28", "Q30", "Q36", "Q38",
        ]

    def filter_question_columns(self, question: str) -> List[str]:
        """
        Return the columns named ``<question>_P...`` that do not end in
        ``_OTHER_TEXT``, in dataframe column order.
        """
        prefix = f"{question}_P"
        selected = []
        for name in self.df.columns:
            if name.startswith(prefix) and not name.endswith("_OTHER_TEXT"):
                selected.append(name)
        return selected


class CombinedSurvey:
    """
    This class combines surveys from multiple years into a concatenated dataframe.
    """
    def __init__(self, surveys=None) -> None:
        """
        :param surveys: iterable of survey classes; each is instantiated with
            no arguments and must expose ``transform()`` returning a dataframe.
            Defaults to the 2018-2021 surveys.
        """
        # Resolve the default here instead of using a mutable default
        # argument that would be shared by every instance.
        if surveys is None:
            surveys = [Survey2018, Survey2019, Survey2020, Survey2021]
        self.surveys = list(surveys)
        self._cached_df = None

    @property
    def df(self) -> pd.DataFrame:
        """
        If df was already processed get it from cache, otherwise process it and saves to cache.
        """
        if self._cached_df is None:
            self._cached_df = self._concat_surveys()
        return self._cached_df

    def _concat_surveys(self) -> pd.DataFrame:
        """
        Instantiates and transforms each survey, then concatenates the
        resulting dataframes into a single one with a fresh index.
        """
        frames = [survey().transform() for survey in self.surveys]
        return pd.concat(frames, ignore_index=True)

In [None]:
from typing import List

import pandas as pd
import plotly.graph_objects as go
import plotly.offline as pyo
from plotly.subplots import make_subplots
from collections import namedtuple

# Ordered experience buckets shared by both "years of ..." columns.
_EXPERIENCE_BUCKETS = (
    "None", "0-1 years", "1-3 years", "3-5 years", "5-10 years", "10+ years",
)

# Column name -> ordered list of the values that column may take once it is
# converted to an ordered categorical.
CATEGORIES = {
    "Survey Year": [2018, 2019, 2020, 2021],
    "Job Title": [
        "Other",
        "Manager/C-level",
        "Product/Project Manager",
        "Business Analyst",
        "Data Analyst",
        "Research Scientist/Statistician",
        "Data Scientist",
        "Machine Learning Engineer",
        "Data Engineer/DBA",
        "Software Engineer",
    ],
    "Gender": ["Male", "Female"],
    "Age": [
        "18-21", "22-24", "25-29", "30-34", "35-39", "40-44",
        "45-49", "50-54", "55-59", "60-69", "70+",
    ],
    # Independent list copies, so the two entries never alias each other.
    "Years Writing Code": list(_EXPERIENCE_BUCKETS),
    "Years Using ML": list(_EXPERIENCE_BUCKETS),
    "Education": [
        "College & below", "Professional", "Bachelor’s", "Master’s", "Doctoral",
    ],
    "Profile": ["Others", "Beginners", "Modern DS", "Coders", "ML Veterans"],
}


# Colour codes that are reused by several labels, named once so the reuse is
# explicit in the table below.
_BLUE = "#1f77b4"
_SALMON = "#ff9896"
_OLIVE = "#bcbd22"
_GREEN = "#2ca02c"
_AMBER = "#F79500"
_EMERALD = "#1AA746"

# Label -> colour (hex code or CSS colour name). Labels are the harmonised
# answer classes; presumably looked up per trace by the plotting code.
COLORS = {
    # Generic series
    "Average": _BLUE,
    "None": "darkblue",
    # Gender
    "Female": _SALMON,
    "Male": _BLUE,
    # Age
    "22-24": "#FE9933",
    "40-44": "#179B3A",
    # Job titles
    "Data Scientist": "#13A4B4",
    "Product/Project Manager": "#D70947",
    "Software Engineer": "#E8743B",
    "Data Analyst": "#BF399E",
    "Data Engineer/DBA": "#144B7F",
    # Years of experience
    "0-1 years": "#98df8a",
    "10+ years": _GREEN,
    # Languages
    "Python": "#17becf",
    "SQL": _SALMON,
    "R": _OLIVE,
    # ML frameworks
    "Scikit-learn": _OLIVE,
    "TensorFlow": "#d62728",
    "Keras": _SALMON,
    "PyTorch": _BLUE,
    # Visualisation libraries
    "Matplotlib": _OLIVE,
    "Seaborn": _GREEN,
    # Cloud platforms
    "AWS": _AMBER,
    "GCP": _EMERALD,
    "Azure": "#3278B1",
    # IDEs
    "Jupyter/IPython": "#EC7426",
    "Visual Studio": "#349FED",
    "RStudio": _SALMON,
    # Courses
    "Coursera": _AMBER,
    "Kaggle Learn": _EMERALD,
    # Media sources
    "Journals": _AMBER,
    "Kaggle": _EMERALD,
    # Primary tools
    "Basic statistical software": "#0D7036",
    "Local or hosted development environments": "#36B5E2",
    # ML status in company
    "Well established ML": "dodgerblue",
    "Do not use ML / Do not know": "slategrey",
    "Exploring ML": "lightseagreen",
    "Recently started using ML": "forestgreen",
}


# Lightweight record describing one trace of a subplot: which subplot it
# belongs to (name, column, row), the trace's own name and x/y values, and
# the traces to highlight.
MetricData = namedtuple(
    "MetricData",
    "subplot_name trace_name y_values x_values "
    "subplot_col subplot_row highlighted_traces",
)

_STYLE = {
    "Change": {
        "ytitle": "Change in Avg Annual Compensation",
        "yticks": [-0.5, -0.25, 0, 0.25, 0.5],
        "yticks_template": "{:.0%}",
        "hover_template": "%{y:.1%}",
        "annotation_template": "{:.1%}",
        "annotation_mode": 0,
    },
    "Compensation Value": {
        "ytitle": "Avg Annual Compensation USD",
        "yticks": [20, 40, 60],
        "yticks_template": "${}K",
        "hover_template": "$%{y:.1f}K",
        "annotation_template": "${:.1f}K",
        "annotation_mode": 1,
    },
    "Prop": {
        "ytitle": "% Respondents",
        "yticks": [0, 0.2, 0.4, 0.6, 0.8],
        "yticks_template": "{:.0%}",
        "hover_template": "%{y:.1%}",
        "annotation_template": "{:.1%}",
        "annotation_mode": 2,
    },
}


def compute_prop(df, column, color, items):
    """
    Computes, per survey year and ``column`` group, the share of rows selecting each item.

    ``df[color]`` is expected to hold a collection of selected items per row.
    For every entry of ``items`` the rows whose collection contains it are
    counted; the special entry ``"None"`` counts the rows whose collection
    is empty instead.

    Returns a long DataFrame with the columns ``Survey Year``, ``column``,
    ``Count``, ``color`` (the item name), ``Total`` (group size) and
    ``Prop`` (``Count / Total``).
    """
    keys = ['Survey Year', column]

    per_item = []
    for item in items:
        if item == "None":
            flags = df[color].apply(lambda selected: not selected)
        else:
            flags = df[color].apply(lambda selected: item in selected)

        # 0/1 indicator next to the grouping keys, summed to a count per group.
        frame = df[keys].copy()
        frame[item] = flags.astype(int)
        counts = frame.groupby(keys, as_index=False)[item].sum()
        counts = counts.rename(columns={item: "Count"})
        counts[color] = item
        per_item.append(counts)

    source = pd.concat(per_item, ignore_index=True)

    # Group sizes are the denominator of the proportion.
    totals = df.groupby(keys, as_index=False).size()
    totals = totals.rename(columns={"size": "Total"})

    source = pd.merge(source, totals, on=keys)
    source["Prop"] = source["Count"] / source["Total"]
    return source


def _to_categorical(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """
    Converts ``df[col]`` in place to an ordered categorical and returns ``df``.

    Values not listed in ``CATEGORIES[col]`` are replaced by ``"Other"`` first;
    the category order is the order of ``CATEGORIES[col]``.
    """
    allowed = CATEGORIES[col]
    ordered_dtype = pd.api.types.CategoricalDtype(categories=allowed, ordered=True)

    def _known_or_other(value):
        return value if value in allowed else "Other"

    df[col] = df[col].apply(_known_or_other)
    df[col] = df[col].astype(ordered_dtype)
    return df


class LinePlot:
    """
    Creates a plotly small-multiples line plot

    One subplot is drawn per category of ``column`` (all in a single row) and,
    inside each subplot, one line per distinct value of ``color``. Lines whose
    ``color`` value is listed in ``highlighted_traces`` are coloured and
    annotated; every other line is drawn thin and grey.
    """
    def __init__(
        self,
        df: pd.DataFrame,
        x: str,
        y: str,
        column: str,
        color: str,
        title: str,
        ytitle: str = None,
        yticks: List[float] = None,
        yticks_template: str = None,
        hover_template: str = None,
        annotation_template: str = None,
        annotation_mode: int = 0,
        x_nticks: int = 3,
        shared_yaxes: bool = True,
        highlighted_traces: list = None,
    ) -> None:
        """
        Prepares data and style; call ``show`` to build and render the figure.

        :param df: long-format data holding the ``x``, ``y``, ``column`` and
            ``color`` columns. A copy is taken, the caller's frame is not modified.
        :param x: column plotted on the x axis (converted to an ordered categorical)
        :param y: column plotted on the y axis; also the key used to look up
            default style settings in ``_STYLE``
        :param column: column whose categories define the subplots
        :param color: column whose values define the individual lines
        :param title: figure title
        :param ytitle, yticks, yticks_template, hover_template, annotation_template:
            style overrides; a falsy value falls back to ``_STYLE[y]``
        :param annotation_mode: labelling mode used by ``add_annotations``; a
            falsy value (including an explicit 0) falls back to ``_STYLE[y]``
        :param x_nticks: stored on the instance; not read by any method of this class
        :param shared_yaxes: whether the subplots share one y axis
        :param highlighted_traces: ``color`` values to highlight (default: none)
        """
        pyo.init_notebook_mode()

        # Work on a copy: the categorical conversion below and the helper
        # columns added in get_traces would otherwise leak into the caller's frame.
        df = df.copy()
        df = _to_categorical(df, x)
        df = _to_categorical(df, column)

        self.df = df
        self.x = x
        self.y = y
        self.column = column
        self.color = color
        self.title = title

        self.ytitle = ytitle or _STYLE[y]["ytitle"]
        self.yticks = yticks or _STYLE[y]["yticks"]
        self.yticks_template = yticks_template or _STYLE[y]["yticks_template"]
        self.hover_template = hover_template or _STYLE[y]["hover_template"]
        self.annotation_template = annotation_template or _STYLE[y]["annotation_template"]
        self.annotation_mode = annotation_mode or _STYLE[y]["annotation_mode"]

        self.x_nticks = x_nticks
        self.shared_yaxes = shared_yaxes
        # A fresh list per instance instead of a shared mutable default.
        self.highlighted_traces = (
            [] if highlighted_traces is None else highlighted_traces
        )

        self.subplots_names = CATEGORIES[self.column]
        self.n_subplots = len(self.subplots_names)
        self.traces = self.get_traces()

        self.figure = go.Figure()
        self.range = (0, 0)

    def get_traces(self):
        """
        Builds one MetricData per distinct (column, color) pair found in the data
        """
        self.df["subplot_row"] = 1
        self.df["subplot_col"] = self.df[self.column].cat.codes + 1

        # One trace per (subplot, color) pair, in order of first appearance.
        # Looping over every row instead would emit the same trace once per
        # x value, stacking identical lines and annotations on top of each other.
        pairs = self.df.drop_duplicates(subset=[self.column, self.color])

        lst = []
        for _, row in pairs.iterrows():
            filtered_df = self.df[self.df[self.column] == row[self.column]]
            filtered_df = filtered_df[filtered_df[self.color] == row[self.color]]

            metric_data = MetricData(
                subplot_name=row[self.column],
                trace_name=row[self.color],
                y_values=filtered_df[self.y].values,
                x_values=filtered_df[self.x].values,
                subplot_row=row["subplot_row"],
                subplot_col=row["subplot_col"],
                highlighted_traces=row[self.color] in self.highlighted_traces
            )
            lst.append(metric_data)
        return lst

    def make_subplots(self) -> None:
        """
        Creates subplots in the figure and add titles
        """
        self.figure = make_subplots(
            cols=self.n_subplots,
            rows=1,
            subplot_titles=self.subplots_names,
            specs=[[{"type": "scatter"}] * self.n_subplots],
            shared_yaxes=self.shared_yaxes,
            shared_xaxes=True
        )

        # Subplot titles are layout annotations: left-align each one with the
        # start of its own x axis.
        for idx, subplot_title in enumerate(self.figure["layout"]["annotations"]):
            subplot_title["font"] = dict(size=14, color="grey")
            subplot_title["align"] = "left"
            subplot_title["xanchor"] = "left"
            subplot_title["x"] = 0
            subplot_title["xref"] = "x" if idx == 0 else f"x{idx + 1}"

    def update_common_layout(self) -> None:
        """
        Updates general layout characteristics
        """
        self.figure.update_layout(
            showlegend=False,
            plot_bgcolor="white",
            title_text=self.title,
            title_font_size=15,
            margin_t=200,
            height=600,
            width=800,
            yaxis_range=self.range
        )

    def highlight_color(self, trace: MetricData) -> str:
        """
        Returns the highlight color registered in COLORS for the trace name
        """
        return COLORS[trace.trace_name]

    def add_data(self) -> None:
        """
        Adds every trace to the figure and widens the tracked y range accordingly
        """
        for trace in self.traces:
            self.add_trace(trace=trace)
            self.update_range(data=trace.y_values)

    def add_trace(self, trace: MetricData) -> None:
        """
        Adds a new trace to a figure

        Highlighted traces get their own colour, a thicker line and higher
        opacity; the rest are drawn thin and grey.
        """
        self.figure.add_trace(
            go.Scatter(
                x=trace.x_values,
                y=trace.y_values,
                mode="lines",
                name=trace.trace_name,
                hoverinfo="name+text+y",
                hovertemplate=self.hover_template,
                text=trace.x_values,
                line_color=(
                    self.highlight_color(trace=trace) if trace.highlighted_traces
                    else "lightslategrey"
                ),
                showlegend=False,
                opacity=0.8 if trace.highlighted_traces else 0.25,
                line_shape="linear",
                line_width=1.6 if trace.highlighted_traces else 0.6,
                connectgaps=True
            ),
            trace.subplot_row,
            trace.subplot_col
        )

    def update_range(self, data: List[float]) -> None:
        """
        Widens the tracked range so it covers the data with a 20% margin

        The margin is applied away from zero, so negative values are padded
        outwards as well. Empty data leaves the range unchanged.
        """
        # len() rather than truthiness: data is usually a numpy array.
        if len(data) == 0:
            return

        lowest, highest = min(data), max(data)
        # Scaling a negative bound by 0.8 (or a negative top by 1.2) would move
        # it towards the data instead of away from it.
        min_range = lowest * 1.2 if lowest < 0 else lowest * 0.8
        max_range = highest * 1.2 if highest > 0 else highest * 0.8
        self.range = (min(min_range, self.range[0]), max(max_range, self.range[1]))

    def get_annotation_text(self, trace: MetricData, idx: int) -> str:
        """
        Calculates the annotation text to be added to the plot

        The trace name is prepended to the formatted value for the first point
        of traces in the first subplot, and for every annotation in mode 3.
        """
        if (
            (trace.subplot_col == 1 and idx == 0)
            or self.annotation_mode == 3
        ):
            template = "{}<br>" + f"{self.annotation_template}"
            return template.format(trace.trace_name, trace.y_values[idx])
        else:
            return self.annotation_template.format(trace.y_values[idx])

    def _add_annotation(
        self, trace: MetricData, idx: int,
        xshift=None, xanchor=None, yshift=None, yanchor=None,
        text=None,
    ) -> None:
        """
        Adds annotation to the plot

        The annotation is placed at point ``idx`` of the trace, in the axes of
        the trace's subplot. ``text`` overrides the default value label.
        """
        self.figure.add_annotation(
            xref=f"x{trace.subplot_col}",
            yref=f"y{trace.subplot_col}",
            font=dict(
                size=11,
                color=self.highlight_color(trace=trace)
            ),
            opacity=0.8,
            align="right",
            xshift=xshift,
            xanchor=xanchor,
            yshift=yshift,
            yanchor=yanchor,
            # x values are categorical; the annotation is positioned by category code.
            x=trace.x_values.codes[idx],
            y=trace.y_values[idx],
            text=text if text is not None else self.get_annotation_text(trace=trace, idx=idx),
            showarrow=False
        )

    def add_annotations(self) -> None:
        """
        Adds annotations to the highlighted traces according to annotation_mode

        Mode 0 labels the last point, mode 1 the first and last points, mode 2
        writes the trace name at the first point of the first subplot only, and
        mode 3 labels both ends outside the line. Any other mode raises
        NotImplementedError.
        """
        if self.annotation_mode == 0:
            for trace in self.traces:
                if trace.highlighted_traces:
                    self._add_annotation(trace, -1, xshift=3, yshift=0, xanchor="right", yanchor="bottom")
        elif self.annotation_mode == 1:
            for trace in self.traces:
                if trace.highlighted_traces:
                    self._add_annotation(
                        trace, 0, xshift=0, yshift=8, xanchor="left", yanchor="bottom")
                    self._add_annotation(
                        trace, -1, xshift=3, yshift=-8, xanchor="right", yanchor="top")
        elif self.annotation_mode == 2:
            for trace in self.traces:
                if trace.highlighted_traces:
                    if trace.subplot_col == 1:
                        self._add_annotation(
                            trace, 0, xshift=0, yshift=5, xanchor="left", yanchor="bottom",
                            text=trace.trace_name)
        elif self.annotation_mode == 3:
            for trace in self.traces:
                if trace.highlighted_traces:
                    self._add_annotation(trace, 0, xshift=-3, yshift=0, xanchor="right")
                    self._add_annotation(trace, -1, xshift=3, yshift=0, xanchor="left")
        else:
            raise NotImplementedError

    def add_subplot_axis_annotation(self) -> None:
        """
        Adds the name of the faceting column above the first subplot
        """
        self.figure.add_annotation(
            xref="x",
            yref="paper",
            font=dict(
                size=14,
            ),
            align="left",
            x=0,
            xanchor="left",
            y=1.05,
            yanchor="bottom",
            text=f"{self.column}",
            showarrow=False
        )

    def update_subplots_layout(self) -> None:
        """
        Updates scatter subplots layout characteristics
        """
        for subplot_idx in range(self.n_subplots):
            self.figure.update_xaxes(
                type="category",
                color="lightgrey",
                showgrid=False,
                visible=subplot_idx == 0,  # Visible only for first subplot
                row=1,
                dtick=3,
                col=subplot_idx + 1,  # 1-based
            )
            self.figure.update_yaxes(
                showgrid=False,
                visible=subplot_idx == 0 or not self.shared_yaxes,
                title=self.ytitle if subplot_idx == 0 else None,
                row=1,
                col=subplot_idx + 1,
                tickvals=self.yticks,
                ticktext=[self.yticks_template.format(i) for i in self.yticks],
                tickmode="array",
                tickfont_color="lightgrey",
                autorange=True
            )

    def show(self) -> None:
        """
        Renders and shows the plot
        """
        self.make_subplots()
        self.update_common_layout()
        self.add_data()
        self.add_annotations()
        self.add_subplot_axis_annotation()
        self.update_subplots_layout()
        self.figure.show()


In [None]:
# Load the combined survey responses (CombinedSurvey is defined outside this section).
combined_survey = CombinedSurvey()
df = combined_survey.df
print(df.shape)

# Shared by the chart cells below: the year that changes are measured
# against, and the column plotted on the x axis.
base_year = 2018
x = "Survey Year"

## Five charts on impact of pandemic on compensation

In [None]:
column = "Gender"
color = "Country"
value = "Compensation Value"

# Mean compensation per (year, gender, country), plus a pooled "Average"
# series per (year, gender) computed across all countries.
df1 = df.groupby([x, column, color], as_index=False)[value].mean()
df2 = df.groupby([x, column], as_index=False)[value].mean()
df2[color] = "Average"
df1 = pd.concat([df1, df2], ignore_index=True)

# Attach each group's base-year mean and express every year relative to it.
# The merge is an inner join, so groups without a base-year row are dropped.
_df1 = df1[df1[x] == base_year][[column, color, value]].copy()
base_col = f"{value} {base_year}"
_df1.rename(columns={value: base_col}, inplace=True)
source = pd.merge(df1, _df1, on=[column, color])
source["Change"] = source[value] / source[base_col] - 1

In [None]:
# Change in mean compensation relative to the base year: one subplot per
# gender, one line per country, with only the pooled "Average" line highlighted.
LinePlot(
    df=source,
    x=x,
    y="Change",
    column=column,
    color=color,
    highlighted_traces=["Average"],
    title=(
        """
        <b>Average annual compensation decreases across the board in 2021 during the pandemic.</b>
        <br>The drop seems to be more severe for women.
        <br><span style="font-size:14px;color:grey"><i>Base Year=2018</i></span>
        """
    ),
).show()

In [None]:
column = "Gender"
color = "Age"
value = "Compensation Value"

# Mean compensation per (year, gender, age group).
df1 = df.groupby([x, column, color], as_index=False)[value].mean()

# Attach each group's base-year mean and express every year relative to it.
# The merge is an inner join, so groups without a base-year row are dropped.
_df1 = df1[df1[x] == base_year][[column, color, value]].copy()
base_col = f"{value} {base_year}"
_df1.rename(columns={value: base_col}, inplace=True)
source = pd.merge(df1, _df1, on=[column, color])
source["Change"] = source[value] / source[base_col] - 1

In [None]:
# Change in mean compensation relative to the base year: one subplot per
# gender, one line per age group, highlighting the 22-24 and 40-44 groups.
LinePlot(
    df=source,
    x=x,
    y="Change",
    column=column,
    color=color,
    highlighted_traces=["22-24", "40-44"],
    title=(
        """
        <b>The younger age group suffers a bigger reduction in pay regardless of gender.</b>
        <br><span style="font-size:14px;color:grey"><i>Base Year=2018</i></span>
        """
    ),
).show()

In [None]:
column = "Education"
color = "Gender"
value = "Compensation Value"

# Mean compensation per (year, education level, gender).
df1 = df.groupby([x, column, color], as_index=False)[value].mean()

# Attach each group's base-year mean and express every year relative to it.
# The merge is an inner join, so groups without a base-year row are dropped.
_df1 = df1[df1[x] == base_year][[column, color, value]].copy()
base_col = f"{value} {base_year}"
_df1.rename(columns={value: base_col}, inplace=True)
source = pd.merge(df1, _df1, on=[column, color])
source["Change"] = source[value] / source[base_col] - 1

In [None]:
# Change in mean compensation relative to the base year: one subplot per
# education level, one line per gender, highlighting Female and Male.
LinePlot(
    df=source,
    x=x,
    y="Change",
    column=column,
    color=color,
    highlighted_traces=["Female", "Male"],
    title=(
        """
        <b>Most women saw bigger reduction in pay during the pandemic across education groups.</b>
        <br><span style="font-size:14px;color:grey"><i>Base Year=2018</i></span>
        """
    ),
).show()

### Definition of Profile

Inspired by [tkubacka/a-story-told-through-a-heatmap](https://www.kaggle.com/tkubacka/a-story-told-through-a-heatmap), we use the values of `Years Writing Code` and `Years Using ML` to create a new feature called `Profile`:

- `Beginners`: Years Writing Code between 0-3 years, Years Using ML between 0-3 years
- `Modern DS`: Years Writing Code between 3-10 years, Years Using ML between 3-10 years
- `Coders`: Years Writing Code for 10+ years, Years Using ML between 0-3 years
- `ML Veterans`: Years Writing Code for 10+ years, Years Using ML for 10+ years

In [None]:
column = "Profile"
color = "Gender"
value = "Compensation Value"

# Mean compensation per (year, profile, gender).
df1 = df.groupby([x, column, color], as_index=False)[value].mean()

# Attach each group's base-year mean and express every year relative to it.
# The merge is an inner join, so groups without a base-year row are dropped.
_df1 = df1[df1[x] == base_year][[column, color, value]].copy()
base_col = f"{value} {base_year}"
_df1.rename(columns={value: base_col}, inplace=True)
source = pd.merge(df1, _df1, on=[column, color])
source["Change"] = source[value] / source[base_col] - 1

In [None]:
# Change in mean compensation relative to the base year: one subplot per
# profile, one line per gender, highlighting Female and Male.
LinePlot(
    df=source,
    x=x,
    y="Change",
    column=column,
    color=color,
    highlighted_traces=["Female", "Male"],
    title=(
        """
        <b>But women who are `Beginners` and `Others` saw bigger reduction in pay.</b>
        <br><span style="font-size:14px;color:grey"><i>Base Year=2018</i></span>
        """
    ),
).show()

In [None]:
column = "Profile"
color = "Job Title"
value = "Compensation Value"

# Mean compensation per (year, profile, job title).
df1 = df.groupby([x, column, color], as_index=False)[value].mean()

# Attach each group's base-year mean and express every year relative to it.
# The merge is an inner join, so groups without a base-year row are dropped.
_df1 = df1[df1[x] == base_year][[column, color, value]].copy()
base_col = f"{value} {base_year}"
_df1.rename(columns={value: base_col}, inplace=True)
source = pd.merge(df1, _df1, on=[column, color])
source["Change"] = source[value] / source[base_col] - 1

In [None]:
# Change in mean compensation relative to the base year: one subplot per
# profile, one line per job title, highlighting the two engineering roles.
LinePlot(
    df=source,
    x=x,
    y="Change",
    column=column,
    color=color,
    highlighted_traces=["Software Engineer", "Data Engineer/DBA"],
    title=(
        """
        <b>Reduction in pay seems to be worse for software engineers.</b>
        <br><span style="font-size:14px;color:grey"><i>Base Year=2018</i></span>
        """
    ),
).show()

## Eight charts on commonly used tools

In [None]:
column = "Profile"
color = "Languages"

# Share of respondents per (year, profile) selecting each distinct mapped
# language, plus a "None" series for respondents with an empty selection.
source = compute_prop(
    df, column, color, list(set(Mapping.LANGUAGES.value.values())) + ["None"]
)

In [None]:
# Share of respondents using each language: one subplot per profile, one
# line per language, highlighting Python, SQL and R.
# NOTE(review): "Prop" is a per-year share and is not computed relative to a
# base year; the "Base Year=2018" subtitle looks carried over from the
# compensation-change charts - confirm whether it should stay.
LinePlot(
    df=source,
    x=x,
    y="Prop",
    column=column,
    color=color,
    highlighted_traces=["Python", "SQL", "R"],
    title=(
        """
        <b>Python remains the most widely used tool across the years while R has seen a drop.</b>
        <br><span style="font-size:14px;color:grey"><i>Base Year=2018</i></span>
        """
    ),
).show()

In [None]:
column = "Profile"
color = "IDEs"

# Share of respondents per (year, profile) selecting each distinct mapped
# IDE, plus a "None" series for respondents with an empty selection.
source = compute_prop(
    df, column, color, list(set(Mapping.IDE.value.values())) + ["None"]
)

In [None]:
# Share of respondents using each IDE: one subplot per profile, one line
# per IDE, highlighting Jupyter/IPython, Visual Studio and RStudio.
# NOTE(review): "Prop" is a per-year share and is not computed relative to a
# base year; the "Base Year=2018" subtitle looks carried over from the
# compensation-change charts - confirm whether it should stay.
LinePlot(
    df=source,
    x=x,
    y="Prop",
    column=column,
    color=color,
    highlighted_traces=["Jupyter/IPython", "Visual Studio", "RStudio"],
    title=(
        """
        <b>Similarly, Jupyter/IPython is still the most popular IDE while RStudio has dropped.</b>
        <br><span style="font-size:14px;color:grey"><i>Base Year=2018</i></span>
        """
    ),
).show()

In [None]:
column = "Profile"
color = "ML Frameworks"

# Share of respondents per (year, profile) selecting each distinct mapped
# ML framework, plus a "None" series for respondents with an empty selection.
source = compute_prop(
    df, column, color, list(set(Mapping.ML_FRAMEWORKS.value.values())) + ["None"]
)

In [None]:
# Share of respondents using each ML framework: one subplot per profile,
# one line per framework, highlighting the four named below.
# NOTE(review): "Prop" is a per-year share and is not computed relative to a
# base year; the "Base Year=2018" subtitle looks carried over from the
# compensation-change charts - confirm whether it should stay.
LinePlot(
    df=source,
    x=x,
    y="Prop",
    column=column,
    color=color,
    highlighted_traces=["Scikit-learn", "TensorFlow", "PyTorch", "Keras"],
    title=(
        """
        <b>Scikit-learn remains the most widely used tool while PyTorch has been gaining new ground.</b>
        <br><span style="font-size:14px;color:grey"><i>Base Year=2018</i></span>
        """
    ),
).show()

In [None]:
column = "Profile"
color = "Data Visualization Libraries"

# Share of respondents per (year, profile) selecting each distinct mapped
# visualization library, plus a "None" series for respondents with an empty selection.
source = compute_prop(
    df, column, color, list(set(Mapping.VIZ.value.values())) + ["None"]
)

In [None]:
# Share of respondents using each visualization library: one subplot per
# profile, one line per library, highlighting Matplotlib and Seaborn.
# NOTE(review): "Prop" is a per-year share and is not computed relative to a
# base year; the "Base Year=2018" subtitle looks carried over from the
# compensation-change charts - confirm whether it should stay.
LinePlot(
    df=source,
    x=x,
    y="Prop",
    column=column,
    color=color,
    highlighted_traces=["Matplotlib", "Seaborn"],
    title=(
        """
        <b>Matplotlib and Seaborn are the favourite data visualization libraries.</b>
        <br><span style="font-size:14px;color:grey"><i>Base Year=2018</i></span>
        """
    ),
).show()

In [None]:
column = "Profile"
color = "Cloud Computing Platforms"

# Share of respondents per (year, profile) selecting each distinct mapped
# cloud platform, plus a "None" series for respondents with an empty selection.
source = compute_prop(
    df, column, color, list(set(Mapping.CLOUD.value.values())) + ["None"]
)

In [None]:
# Share of respondents using each cloud platform: one subplot per profile,
# one line per platform, highlighting AWS, Azure and GCP.
# NOTE(review): "Prop" is a per-year share and is not computed relative to a
# base year; the "Base Year=2018" subtitle looks carried over from the
# compensation-change charts - confirm whether it should stay.
LinePlot(
    df=source,
    x=x,
    y="Prop",
    column=column,
    color=color,
    highlighted_traces=["AWS", "Azure", "GCP"],
    title=(
        """
        <b>AWS is still the most popular cloud computing platform but many are not using any.</b>
        <br><span style="font-size:14px;color:grey"><i>Base Year=2018</i></span>
        """
    ),
).show()

In [None]:
column = "Profile"
color = "Data Science Courses"

# Share of respondents per (year, profile) selecting each distinct mapped
# course platform, plus a "None" series for respondents with an empty selection.
source = compute_prop(
    df, column, color, list(set(Mapping.COURSES.value.values())) + ["None"]
)

In [None]:
# Share of respondents using each course platform: one subplot per profile,
# one line per platform, highlighting Coursera and Kaggle Learn.
# NOTE(review): "Prop" is a per-year share and is not computed relative to a
# base year; the "Base Year=2018" subtitle looks carried over from the
# compensation-change charts - confirm whether it should stay.
LinePlot(
    df=source,
    x=x,
    y="Prop",
    column=column,
    color=color,
    highlighted_traces=["Coursera", "Kaggle Learn"],
    title=(
        """
        <b>Coursera has been a favourite place for data science courses but Kaggle Learn has been gaining popularity.</b>
        <br><span style="font-size:14px;color:grey"><i>Base Year=2018</i></span>
        """
    ),
).show()

In [None]:
column = "Profile"
color = "Media Sources"

# Share of respondents per (year, profile) selecting each distinct mapped
# media source, plus a "None" series for respondents with an empty selection.
source = compute_prop(
    df, column, color, list(set(Mapping.MEDIA.value.values())) + ["None"]
)

In [None]:
# Share of respondents using each media source: one subplot per profile,
# one line per source, highlighting Journals and Kaggle.
# NOTE(review): "Prop" is a per-year share and is not computed relative to a
# base year; the "Base Year=2018" subtitle looks carried over from the
# compensation-change charts - confirm whether it should stay.
LinePlot(
    df=source,
    x=x,
    y="Prop",
    column=column,
    color=color,
    highlighted_traces=["Journals", "Kaggle"],
    title=(
        """
        <b>Kaggle has been a favourite information source but Journals are also important for ML Veterans.</b>
        <br><span style="font-size:14px;color:grey"><i>Base Year=2018</i></span>
        """
    ),
).show()