In [None]:
!pip install icecream
!pip install kaleido

# **<font color='orange'> Stack Overflow </font>  Developer Survey** 
---

In [None]:
import pandas as pd
import numpy as np
import missingno as msno
from icecream import ic
import os
import re
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.figure_factory as ff
from kaleido.scopes.plotly import PlotlyScope
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest
import warnings
warnings.filterwarnings(action='ignore')
pd.options.display.max_columns = 100


# plotly settings and functions
scope = PlotlyScope(plotlyjs="https://cdn.plot.ly/plotly-latest.min.js")
pio.templates.default = 'plotly_white'

In [None]:
def paste_px_format(figure, **kwargs):
    """Apply the notebook's shared layout settings to a figure.

    Parameters
    ----------
    figure : object exposing ``update_layout(**kwargs)`` (a plotly figure)
    **kwargs : extra layout properties (e.g. ``title``, ``height``).
        They take precedence over the defaults below, so a caller may
        override ``font``, ``width`` or ``margin`` without triggering a
        duplicate-keyword ``TypeError``.

    Returns
    -------
    Whatever ``figure.update_layout`` returns.
    """
    layout = {
        'font': {'color': 'Gray', 'size': 10},
        'width': 780,
        'margin': {'pad': 10},
    }
    # caller-supplied values win over the shared defaults
    layout.update(kwargs)
    return figure.update_layout(**layout)


def save_figure(fig, name):
    """Render ``fig`` to PNG and write it to ``/kaggle/working/<name>``.

    Parameters
    ----------
    fig : figure accepted by ``scope.transform``
    name : str, file name (including extension) inside ``/kaggle/working/``
    """
    # Render first so that a rendering failure does not leave an empty file.
    png_bytes = scope.transform(fig, format="png")
    with open('/kaggle/working/' + name, "wb") as f:
        # the rendered bytes must actually be written to the file
        f.write(png_bytes)
        
        
def add_bubble(fig, **kwargs):
    """Add a semi-transparent orange circle shape on top of the figure.

    Parameters
    ----------
    fig : object exposing ``add_shape(**kwargs)`` (a plotly figure)
    **kwargs : extra or overriding shape properties forwarded to
        ``add_shape`` (for example ``x1`` / ``y1``).

    Returns
    -------
    Whatever ``fig.add_shape`` returns.
    """
    # NOTE(review): only x0/y0 are set by default; the opposite corner
    # (x1/y1) is left to plotly's defaults -- confirm the bubble renders at
    # the intended size, or pass x1/y1 explicitly.
    shape = dict(
        type="circle",
        line_color="white",
        fillcolor="orange",
        opacity=0.6,
        xref='paper', yref='paper',
        x0=0.5, y0=0.6)
    shape.update(kwargs)
    return fig.add_shape(**shape)


def prepare_data_salary(df, col)-> pd.DataFrame:
    """
    Returns a cross tab of ``col`` against salary quintiles
        as a pandas dataframe.

    Parameters
    ----------
    df : pd.DataFrame containing ``col``. When it also contains a
        ``CompTotal`` column that column is used; otherwise the salary is
        taken from the notebook-level ``survey`` frame and aligned on index.
    col : column label to tabulate against the salary category.

    Returns
    -------
    pd.DataFrame with one row per distinct value of ``col`` and one column
    per salary label; cells are row counts.
    """
    # Prefer the caller's own salary column over the hidden global.
    if 'CompTotal' in df.columns:
        compensation = df['CompTotal']
    else:
        compensation = survey['CompTotal']

    # keep only rows where both the feature and the salary are present
    df_concat = pd.concat([df[col], compensation], axis=1).dropna()

    # five equal-frequency bins; the labels are fixed display strings
    df_concat['salary_cat'] = pd.qcut(
        df_concat['CompTotal'], 5, labels=[
        "Low(<10,000)", "Low-Med(10k-49k)",
        "Medium(49k-85k)", "High(85k-150k)",
        "Very High(150<)"])

    return pd.crosstab(df_concat[col], df_concat['salary_cat'])


def get_files():
    """Return the path of the first file found under ``/kaggle/input``.

    Directories are visited in ``os.walk`` order; ``None`` is returned when
    no file is found.
    """
    found_paths = (
        os.path.join(folder, file_name)
        for folder, _subfolders, file_names in os.walk('/kaggle/input')
        for file_name in file_names
    )
    return next(found_paths, None)

# Read Survey Data

---

In [None]:
# load schema and survey data
# (both files are read from the Kaggle input directory of the 2020 survey dataset)
schema = pd.read_csv('/kaggle/input/stack-overflow-developer-survey-2020/survey_results_schema.csv')
survey = pd.read_csv('/kaggle/input/stack-overflow-developer-survey-2020/survey_results_public.csv')

# Reshape the schema into a lookup table.
# Step 1: transpose and discard the old header labels, leaving rows 0, 1, ...
schema = schema.T.reset_index().drop(columns='index')
# Step 2: promote the values of row 0 to column labels ...
schema.columns = schema.iloc[0].to_list()
# ... and drop that row now that it serves as the header.
schema = schema.drop(index=[0])
# Step 3: transpose back, so the index holds the labels taken from row 0 and
# the remaining row (label 1) becomes a column renamed to 'description'.
# NOTE(review): assumes the schema CSV has exactly two columns
# (column name, question text) -- confirm against the file.
schema = schema.T.rename(columns={1: 'description'})

## The Schema 

---

In [None]:
def find_question(question, schema_df=None)-> list:
    """Search for Question in Schema Data.

    Parameters
    ----------
    question : iterable of str (a single str is treated as one question)
        Question texts to look up; each must match a ``description``
        value exactly.
    schema_df : pd.DataFrame, optional
        Frame with a ``description`` column whose index holds the survey
        column names. Defaults to the notebook-level ``schema``.

    Returns
    -------
    list
        Index labels of every matching row, in the order the questions
        were given. Questions without a match contribute nothing.
    """
    lookup = schema if schema_df is None else schema_df
    questions = [question] if isinstance(question, str) else list(question)

    descriptions = lookup['description']
    matches = []
    for text in questions:
        # Plain equality instead of a string-built ``query`` expression, so
        # question texts containing quotes cannot break the lookup.
        matches.extend(descriptions.index[descriptions == text].tolist())

    return matches



In [None]:
schema.T

## The Data

---

In [None]:
survey.head(3)

# **Survey <font color='orange'>Missing Data</font>**

---

In [None]:
def missing_bar()-> go.Figure:
    """Plots Missing Data for Whole Dataset.

    Builds one horizontal bar per ``survey`` column showing its number of
    missing values, sorted from most to least missing. The ten columns with
    the most missing values are drawn in orange and the rest in gray. The
    overall percentage of missing cells is added as an annotation.

    Returns
    -------
    go.Figure
    """
    title = 'Survey <b>Missing</b> Data by Features'

    # counts missing data
    missing_data = survey.isna().sum()
    missing_data = missing_data.to_frame().reset_index().rename(
        columns={'index': 'data_cols', 0: 'counts'})
    missing_data = missing_data.sort_values(by='counts', ascending=False)
    missing_perc = np.round(
        (survey.isna().sum().sum() / survey.size) * 100, 2)

    # figure colors: one entry per bar, top ten highlighted.
    # (A slice assignment such as ``colors[:10] = ['Orange']`` would shrink
    # the list instead of recolouring ten entries.)
    n_bars = len(missing_data)
    n_highlight = min(10, n_bars)
    colors = ['Orange'] * n_highlight + ['Gray'] * (n_bars - n_highlight)

    # create figure; every bar is its own trace, so the colour has to be set
    # per trace rather than broadcasting the whole list to every trace
    fig = go.Figure()
    for labels, values, color in zip(
            missing_data.data_cols.to_list(), missing_data.counts, colors):

        fig.add_trace(go.Bar(
            y=[labels],
            x=[values],
            name=labels,
            marker_color=color,
            orientation='h'))

    # tweak layout
    fig.update_xaxes(title='Missing Counts')
    fig.update_yaxes(title='Features', tickmode='linear')

    fig.add_annotation(xref='paper', yref='paper',
        x=0.71, y=0.70, text=f"\n            {missing_perc}%",
        font={'size': 20, 'color': 'White'},
        showarrow=False)

    fig.add_annotation(xref='paper', yref='paper',
        x=0.68, y=0.67, text="Missing",
        font={'size': 15, 'color': 'Gray'},
        showarrow=False)

    add_bubble(fig)

    return paste_px_format(
        fig, title=title, height=1000, showlegend=False)

In [None]:
missing_bar()

In [None]:
save_figure(missing_bar(), 'missing_data.png')

# **StackOverflow <font color='orange'>Demographics</font>**

---

In [None]:
# reference question from pdf file
# Question texts copied verbatim from the survey PDF; they are matched
# against the schema descriptions to find the demographic columns.
demo_question = [
    "What is your age (in years)? If you prefer not to answer, you may leave this question blank.",
    "Which of the following describe you, if any? Please check all that apply. If you prefer not to answer, you may leave this question blank.",
    "Are you transgender?"]

# look the columns up with the custom helper and slice the survey in one go
demographics = survey[find_question(demo_question)]

## Gender

In [None]:
# create frequency of gender
# Kept as a Series and given an explicit column name below, so the result
# does not depend on the column name that ``value_counts().to_frame()``
# chooses in a given pandas version.
gender_counts = demographics['Gender'].value_counts()

# create lgbtq index: every answer after the two most frequent ones is
# summed into a single 'LGBTQ+' row
lgbtq = pd.DataFrame(
    {'Gender': [gender_counts.iloc[2:].sum()]},
    index=['LGBTQ+'])

# merge data: one row ('Gender') and three columns
# (the two most frequent answers followed by 'LGBTQ+')
gender = pd.concat([gender_counts.iloc[:2].to_frame('Gender'), lgbtq])
gender = gender.T

In [None]:
def plot_gender()-> go.Figure:
    """Plots one bar per column of ``gender``.

    The first bar is orange and the remaining ones gray; each bar carries
    its count as an outside text label.
    """
    title = '<b>Demographic</b> | Gender'

    # colors: highlight the first group only
    palette = ['Orange', 'Gray', 'Gray']

    # create figure
    fig = go.Figure()
    for position, group in enumerate(gender):
        series = gender[group]
        count = series.values[0]
        bar = go.Bar(
            x=[series.name],
            y=[count],
            name=series.name,
            textposition='outside',
            text=str(count),
            marker_color=palette[position])
        fig.add_trace(bar)

    return paste_px_format(fig, title=title)

In [None]:
plot_gender()

## Age

---

In [None]:
# Trim age outliers: keep respondents whose age lies between the 1st and
# the 99th percentile (both bounds inclusive; missing ages are dropped).
max_quant = survey['Age'].quantile(0.99)
min_quant = survey['Age'].quantile(0.01)
survey_age = survey[
    (survey['Age'] <= max_quant) & (survey['Age'] >= min_quant)]

In [None]:
def plot_age()-> go.Figure:
    """Plots an orange histogram of ``survey_age['Age']``.

    The title reports the number of rows in ``survey_age``.
    """
    title = f'<b>Demographics</b> | Age<br> n = {len(survey_age)}'

    histogram = px.histogram(survey_age['Age'])
    show = paste_px_format(histogram)
    show = show.update_layout(title=title)
    show = show.update_traces(marker_color='orange')

    return show

plot_age()

## Sexuality

---

In [None]:
# sexuality values to dataframe
sexuality = survey['Sexuality'].value_counts().to_frame().T 

# wrangle data
sexuality_hetero = sexuality['Straight / Heterosexual'].to_frame()
sexuality_lgbtq = sexuality.iloc[:, 2:]
sexuality_lgbtq = sexuality_lgbtq.sum(axis=1)\
    .to_frame()\
    .rename(columns={0: 'LGBTQ+'})

# prepare data for plot
sexuality = pd.concat([sexuality_hetero, sexuality_lgbtq], axis=1)

In [None]:
def plot_donut()-> go.Figure:
    """Plots the columns of ``sexuality`` as a donut chart.

    Column labels become slice labels and the flattened cell values the
    slice sizes.
    """
    title="<b>Demographics</b> | Sexuality"

    # a pie trace with a hole is rendered as a donut
    donut = go.Pie(
        labels=list(sexuality.columns),
        values=sexuality.values.ravel().tolist(),
        hole=0.2)
    fig = go.Figure(data=[donut])

    # slice colours and a thin white separator line between slices
    fig.update_traces(
        marker_colors=['orange', 'lightgray'],
        marker=dict(line=dict(color='white', width=1)))

    return paste_px_format(fig, title=title)

In [None]:
plot_donut()

## Ethnicity

In [None]:
# hex colour palette; no visible cell reads this module-level list
# (the plotting functions define their own local ``colors``)
colors = ["#efc69b","#f4b9c8","#473144","#ccb69b","#df9b6d"]
# one-row frame: a column per distinct Ethnicity answer holding its count
# (value_counts orders the answers from most to least frequent)
ethnicity =demographics['Ethnicity'].value_counts().to_frame().T
# the distinct answers as a plain list
ethnicity_groups = ethnicity.columns.tolist()
# split every answer on ';' into separate columns
# (presumably multi-select answers are ';'-joined -- TODO confirm)
e_groups = pd.Series(ethnicity_groups).str.split(';', expand=True)

In [None]:
def plot_treemap()->go.Figure:
    """Plots Treemap for Ethnicity.

    Uses the first ten columns of ``ethnicity`` (the ten most frequent
    answers). Every answer is placed directly under a single 'Ethnicity'
    root node, with its count as the tile size.
    """
    title = '<b>Demographic</b> | Ethnicity'
    ethnicity_majority = ethnicity.iloc[:, :10]

    # Labels and counts in matching order, smallest group first. The counts
    # come out of value_counts in descending order, so reversing both lists
    # keeps each label paired with its own count.
    group_labels = ethnicity_majority.columns.tolist()[::-1]
    group_values = np.ravel(ethnicity_majority.T.values).tolist()[::-1]

    # prepare for treemap: the root carries no value of its own, and
    # ``parents`` needs exactly one entry per label -- '' for the root and
    # the root's name for every group, giving a flat (non-nested) layout
    labels = ['Ethnicity'] + group_labels
    values = [0] + group_values
    parents = [''] + ['Ethnicity'] * len(group_labels)

    # fig
    fig = go.Figure(go.Treemap(
        labels = labels,
        values = values,
        parents=parents,
        root_color = 'lightgray'
    ))

    return paste_px_format(fig, title=title,
        treemapcolorway=['orange'])

In [None]:
plot_treemap()

In [None]:
save_figure(plot_treemap(), 'demographics_ethnicity.png')
save_figure(plot_donut(), 'demographics_sexuality.png')
save_figure(plot_gender(), 'demographics_gender.png')

# **Developers'<font color='orange'> Education </font> and Career**

---

In [None]:
 education_career_questions = ["What was your primary field of study?",
    "How important is a formal education, such as a university degree in computer science, to your career?",
    "If you could go back and change your educational path (but end up in the same career), what would you change?",
    "At what age did you write your first line of code or program? (e.g., webpage, Hello World, Scratch project)",
    "Including any education, how many years have you been coding in total?",
    "NOT including education, how many years have you coded professionally (as a part of your work)?",
    "Which of the following describe you? Please select all that apply.",
    "What industry or industries do you work in? This information will be kept private.",
    "How satisfied are you with your current job? (If you work multiple jobs, answer for the one you spend the most hours on.)",
    "Approximately how many people are employed by the company or organization you currently work for?",
    "* Which currency do you use day-to-day? If your answer is complicated, please pick the one you're most comfortable estimating in",
    "What is your current total compensation (salary, bonuses, and perks, before taxes and deductions), in ${q://QID50/ChoiceGroup/SelectedChoicesTextEntry}? Please enter a whole number in the box below, without any punctuation. If you are paid hourly, please estimate an equivalent weekly, monthly, or yearly salary. If you prefer not to answer, please leave the box empty.",
    "Is that compensation weekly, monthly, or yearly?",
    "On average, how many hours per week do you work? Please enter a whole number in the box.",
    "How often do you work overtime or beyond the formal time expectation of your job?",
    "Do you think your company has a good onboarding process? (By onboarding, we mean the structured process of getting you settled in to your new role at a company",
    "How could onboarding at your company be improved?",
    "Does your company have a dedicated DevOps person? ",
    "How important is the practice of DevOps to scaling software development?",
    "Which of the following best describes your current job-seeking status? *",
    # the comma after the next entry is required: without it Python joins
    # this string with the following one and neither question can match
    "In general, what drives you to look for a new job? Select all that apply",
    "When job searching, how do you learn more about a company? Select all that apply",
    "Imagine that you are deciding between two job offers with the same compensation, benefits, and location. Of the following factors, which 3 are MOST important to you?"]

In [None]:
# look up the education/career columns and slice the survey
edu_career = survey[find_question(education_career_questions)]
# the compensation column is located by a partial match on its description
salary = schema[schema.description.str.contains("What is your current total compensation")].index[0]
edu_career = pd.concat([edu_career, survey[salary]], axis=1)

# degree vs. salary: complete rows only, salary as integer, no negative pay
edu_salary = edu_career[['UndergradMajor', 'CompTotal']]\
    .dropna()\
    .astype({'CompTotal': 'int'})
edu_salary = edu_salary[edu_salary['CompTotal'] >= 0]

# salary quintiles: the raw interval edges, then the labelled category column
intervals = pd.qcut(edu_salary.CompTotal, 5).unique().tolist()
salary_labels = [
    "Low(<10,000)", "Low-Med(10k-49k)",
    "Medium(49k-85k)", "High(85k-150k)", "Very High(150<)"]
edu_salary['salary_cat'] = pd.qcut(
    edu_salary['CompTotal'], 5, labels=salary_labels)

## Salary

---

In [None]:
# continuous color map, ordered from the lightest gray to the strongest orange
cmap = np.array(["#f0f0f0","#e6e6e6","#ffca85","#ffbf69","#ff9f1c"])


def plot_heatmap(z,y,x, title, **kwargs)-> go.Figure:
    """Plots Annotated Plotly Heatmap.

    ``z`` holds the cell values, ``x`` / ``y`` the axis labels. Cells are
    separated by a one-pixel gap, and ``title`` plus any extra keyword
    arguments are passed on to ``paste_px_format``.
    """
    heatmap = ff.create_annotated_heatmap(
        z=z, x=x, y=y, colorscale=cmap)
    heatmap.update_traces(xgap=1, ygap=1)
    return paste_px_format(heatmap, title=title, **kwargs)

In [None]:
# counts of respondents per (degree, salary category)
edu_salary_cat = pd.crosstab(edu_salary['UndergradMajor'], edu_salary['salary_cat'])
zvals = edu_salary_cat.values
ylabel = edu_salary_cat.index.tolist()
xlabel = edu_salary_cat.columns.tolist()

# drop parenthesised explanations from the degree labels to keep them short
ylabel_clean = [re.sub(r"\([^()]*\)", "", degrees) for degrees in ylabel]

# z is passed in the crosstab's own orientation: flipping it while leaving
# the x / y labels untouched would attach the counts to the wrong
# degree and salary category
heatmap_degree_salary = plot_heatmap(
    z=zvals,
    y=ylabel_clean,
    x=xlabel, title='<b>Degree</b> | Salary')\
        .update_layout(margin=dict(t=100))

heatmap_degree_salary

## Belief in Formal Education and Salary

---

In [None]:
# show the full question text behind the NEWEdImpt column (label-based
# lookup; integer position on a label-indexed Series is deprecated in pandas)
print(schema.loc['NEWEdImpt', 'description'])

In [None]:
# complete (belief, salary) pairs with the salary quintile attached
belief_salary = edu_career[['NEWEdImpt', 'CompTotal']].dropna()
belief_salary = belief_salary.assign(
    salary_cat=pd.qcut(
        belief_salary['CompTotal'], 5, labels=[
            "Low(<10,000)", "Low-Med(10k-49k)",
            "Medium(49k-85k)", "High(85k-150k)", "Very High(150<)"]))

# counts per (belief, salary category) feed the heatmap
belief_salary_cat = pd.crosstab(belief_salary['NEWEdImpt'], belief_salary['salary_cat'])
x = belief_salary_cat.columns.tolist()
y = belief_salary_cat.index.tolist()
z = belief_salary_cat.values
heatmap_belief_education = plot_heatmap(
    z=z, x=x, y=y,
    title='<b>Belief in Education</b> | Salary')
# extra top margin; update_layout returns the same figure object
heatmap_belief_education = heatmap_belief_education.update_layout(
    margin=dict(t=200))

heatmap_belief_education

In [None]:
# drop null values
age_first_code = edu_career[['Age1stCode', 'CompTotal']].dropna().copy()

# create bins for salary
age_first_code['salary_cat'] = pd.qcut(age_first_code\
    .CompTotal, 5, labels=[
        "Low(<10,000)", "Low-Med(10k-49k)",
        "Medium(49k-85k)", "High(85k-150k)", "Very High(150<)"])

# use pandas cross tab (on the raw, uncleaned age answers)
age_first_code_cat = pd.crosstab(
    age_first_code['Age1stCode'],
    age_first_code['salary_cat'])

# clean strings and convert to int
# Answers such as 'Younger than 5 years' or 'Older than 85' are reduced to
# their number. The digits are extracted explicitly because ``str.strip``
# removes a *set of characters* from both ends, not a phrase.
age_first_code['Age1stCode'] = age_first_code['Age1stCode']\
    .astype(str)\
    .str.extract(r'(\d+)', expand=False)\
    .astype('int')

# create bins for age
age_first_code['age_first_code'] = pd.qcut(age_first_code.Age1stCode, 5,
        labels=['<5-12', '12-14', '14-16', '16-18', '18-85'])

# create crosstab for age and salary
age_first_cat = pd.crosstab(
    age_first_code['age_first_code'],
    age_first_code['salary_cat'])


# prepare data for plot
y = age_first_cat.index.tolist()
x = age_first_cat.columns.tolist()
z = age_first_cat.values
age_start_code_heatmap = plot_heatmap(z=z, x=x, y=y,
    title='<b>Age Started Coding</b> | Salary') \
    .update_layout(margin=dict(t=200))

age_start_code_heatmap

In [None]:
# get and clean years code and salary features
years_pro_salary = survey[['YearsCodePro', 'CompTotal']].dropna().copy()

# convert string feature to int
# Answers such as 'Less than 1 year' or 'More than 50 years' are reduced to
# their number. The digits are extracted explicitly because ``str.strip``
# removes a *set of characters* from both ends, not a phrase.
years_pro_salary['YearsCodePro'] = years_pro_salary['YearsCodePro']\
    .astype(str)\
    .str.extract(r'(\d+)', expand=False)\
    .astype('int')

# prepare data for heatmap
years_pro_salary_crosstab = prepare_data_salary(years_pro_salary, 'YearsCodePro')
x = years_pro_salary_crosstab.columns.tolist()
y = years_pro_salary_crosstab.index.tolist()
z = years_pro_salary_crosstab.values

years_pro_salary_heatmap = plot_heatmap(
    x=x, y=y, z=z,
    title='<b> Years Pro </b> | Salary',
    height=800)

years_pro_salary_heatmap

# **Job <font color='orange'>Satisfaction</font>**

In [None]:
# wrangle data: every survey column whose question text mentions 'job'
job_sat_list = schema[schema.description.str.contains('job')].index.tolist()
job_sat = survey[job_sat_list]
# the crosstab gets its own name so that ``job_sat_salary`` only ever
# refers to the finished figure
job_sat_crosstab = prepare_data_salary(job_sat, 'JobSat')

# prepare values for heatmap
x = job_sat_crosstab.columns.tolist()
y = job_sat_crosstab.index.tolist()
z = np.asarray(job_sat_crosstab)
# the bold tag in the title is closed properly (</b>)
job_sat_salary = plot_heatmap(
    x=x, y=y,
    z=z, title='<b> Job Satisfaction</b> | Salary') \
    .update_layout(margin=dict(t=200))

job_sat_salary

In [None]:
# save figures
save_figure(heatmap_belief_education, 'belief_education.png')
save_figure(heatmap_degree_salary, 'degree_salary.png')
save_figure(age_start_code_heatmap, 'age_start_code.png')
save_figure(years_pro_salary_heatmap, 'years_pro_salary.png')
save_figure(job_sat_salary, 'job_sat_salary.png')

# Job Satisfaction Feature Scores

In [None]:
features_question = [
    "Do you code as a hobby?*",
    "Which of the following best describes your current employment status?",
    "Which of the following describe you? Please select all that apply.",
    "Approximately how many people are employed by the company or organization you currently work for?",
    "Is that compensation weekly, monthly, or yearly?",
    "What is your age (in years)? If you prefer not to answer, you may leave this question blank."]

# get features; JobSat is appended last on purpose, it is the target and is
# split off by position further down
job_sat_features = find_question(features_question)
job_sat_features += ['CompTotal', 'Gender', 'JobSat']
job_df = survey[job_sat_features]

emp_status = ['Independent contractor, freelancer, or self-employed',
    'Employed full-time', 'Employed part-time']

# use only from emp_status list (one filtered frame per status)
df_emp_status = [
    job_df[job_df['Employment'] == status] for status in emp_status]

# concat
job_df = pd.concat(df_emp_status)

# clean age: keep ages between the 1st and 99th percentile.
# The lower bound is applied to the frame already trimmed at the upper
# bound, so both filters take effect.
max_quant = job_df['Age'].quantile(0.99)
min_quant = job_df['Age'].quantile(0.01)
job_df_age = job_df[job_df['Age'] <= max_quant]
X = job_df_age[job_df_age['Age'] >= min_quant]

# get features for training
y = X['JobSat']
y = y.fillna('Unknown')
# explicit copy: the columns are modified below
X = X[X.columns[:-1].tolist()].copy()

# select numerical and categorical cols
cat = [*X.select_dtypes('object').columns]
X[cat] = X[cat].fillna('Unknown')

# fill median for compensation feature
num = [*X.select_dtypes('number').columns]
salary_median = X['CompTotal'].median()
X['CompTotal'] = X.CompTotal.fillna(salary_median)

# use one hot encoder
# Fitting on a plain array keeps the generated names in the 'x<i>_<value>'
# form that the later clean-up cell expects; ``get_feature_names_out`` is
# used when available because ``get_feature_names`` was removed from
# recent scikit-learn releases.
encoder = OneHotEncoder()
encoded_df = pd.DataFrame(encoder.fit_transform(X[cat].to_numpy()).toarray())
feature_names_getter = getattr(encoder, 'get_feature_names_out', None) \
    or encoder.get_feature_names
encoded_df.columns = feature_names_getter()

# use standard scaler
scale = StandardScaler()
scale_df = pd.DataFrame(scale.fit_transform(X[num]))
scale_df.columns = X[num].columns

# concat numerical and categorical
# (both frames carry a fresh 0..n-1 index, so they align row by row)
X = pd.concat([encoded_df, scale_df], axis=1)

# feature selector scoring every column against the target
kbset = SelectKBest(k=20)

# get feature scores (``transformed_df`` is the fitted selector)
transformed_df = kbset.fit(X, y)
# NOTE(review): ``[::1]`` is a no-op, so this is the scores in ascending
# order; ``idx`` is not read by any visible cell -- confirm whether a
# descending order (``[::-1]``) was intended.
idx = np.sort(transformed_df.scores_)[::1]
feature_names = [*X.columns]
score = pd.DataFrame(transformed_df.scores_).T
score.columns = feature_names

In [None]:
# twenty highest-scoring features, one row per feature
score_df = score.T.sort_values(by=0, ascending=False)[:20]

# discard the placeholder categories created by filling missing values
kept_features = [
    name for name in score_df.index if not name.endswith('Unknown')]
score_df_T = score_df.loc[kept_features].T

# strip the encoder's 'x<i>_' prefix; matching the prefix with a pattern
# (instead of cutting a fixed three characters) also handles 'x10_' and up
new_names = [re.sub(r'^x\d+_', '', name) for name in kept_features]
score_df_T.columns = new_names

In [None]:
def plot_best()-> go.Figure:
    """Plot best features in predicting Job satisfaction.

    Draws one horizontal bar per column of ``score_df_T`` (feature name on
    the y axis, score on the x axis), coloured with a stepped palette built
    from ``cmap``.

    Returns
    -------
    go.Figure
    """
    title = """
    <b>Top Features in Predicting Job Satisfaction</b><br>
    Method, SelectKBest(k=20)"""

    features = list(score_df_T.columns)

    # color scheme: every cmap colour repeated in a run, truncated and
    # reversed. With up to 17 features this is the 17-entry palette of four
    # repeats per colour; with more features the palette grows so that
    # every bar still gets a colour instead of raising IndexError.
    n_colors = max(17, len(features))
    repeats = max(4, -(-n_colors // len(cmap)))  # ceiling division
    cmap_bar = np.flip(np.repeat(cmap, repeats)[:n_colors])

    # create figure
    fig = go.Figure()

    # add features
    for bar_color, feature in zip(cmap_bar, features):
        fig.add_trace(go.Bar(
            x=score_df_T[feature],
            y=[feature],
            name=feature,
            marker_color=bar_color,
            orientation='h'))

    return paste_px_format(fig)\
        .update_layout(title=title,showlegend=False)

In [None]:
plot_best()