In [1]:
!pip install -U pandas
!pip install -U altair
!pip install -U datasette
!pip install -U sqlite-utils
!pip install -U sqlalchemy



In [2]:
import json
import os
import re

import altair as alt
import pandas as pd
import sqlite_utils
from sqlalchemy import create_engine, text

In [3]:
# refresh the db if it exists ...
# Only a missing file is an expected outcome here; any other failure
# (e.g. the file cannot be removed) is allowed to surface instead of
# being silently ignored.
try:
    os.remove('cincy-tech-survey.db')
except FileNotFoundError:
    pass

# create an empty database file for the engine to open
os.close(os.open('cincy-tech-survey.db', os.O_CREAT))

engine = create_engine("sqlite:///cincy-tech-survey.db")

In [4]:
# load the 2017 survey responses into a dataframe
survey_2017_csv = 'Cincinnati Developer Survey 2017 (Responses) - Form Responses 1.csv'
df = pd.read_csv(survey_2017_csv, parse_dates=True)

In [5]:
# inspect the raw 2017 column headers before they are renamed
df.columns

Index(['Timestamp', 'Years of Experience', 'Size of company', 'Current Salary',
       'How happy are you at your company?', 'Gender (Optional)',
       'Position or Title', 'Languages / Skills',
       'Do you work remotely or for a company in town?', 'Level of education',
       'If you have a degree, what is it in?'],
      dtype='object')

In [6]:
# swap the verbose survey questions for short snake_case column names
# (assigned positionally, so the order must follow the CSV header order)
df.columns = [
    'timestamp',
    'years_exp',
    'company_size',
    'current_salary',
    'happy_score',
    'gender',
    'title',
    'skills',
    'remote',
    'education',
    'degree',
]

In [7]:
# normalize the salary into a numeric value
# assume 2087 average work hours in a calendar year for hourly rates
HOURS_PER_YEAR = 2087

# matches answers that quote an hourly rate, e.g. "$25/hr" or "30 per hour"
re_per_hour = re.compile(r"^.*(\/hr|\/hour|per\shour|\sper\shr|\s\/\shr).*$")
# matches a run of digits and dots
re_eval_num = re.compile(r"[0-9\.]{1,}")


def _parse_number(digits):
    """Convert a string made only of digits and dots into an int or a float.

    Returns None when the string is not a valid number ("", "." or "1.2.3").
    """
    try:
        return float(digits) if '.' in digits else int(digits)
    except ValueError:
        return None


def salary_to_int(salary):
    """Turn a free-text salary answer into a yearly numeric amount.

    Parameters
    ----------
    salary : str or number or null
        Raw survey answer. Non-string values are converted with ``str()``
        before parsing.

    Returns
    -------
    int, float or False
        * ``False`` when the answer is missing or contains no usable number
          (``False`` is the sentinel this notebook already used for a
          missing salary).
        * hourly answers: the first number found times ``HOURS_PER_YEAR``.
        * everything else: the number formed by all digits and dots in the
          answer, so thousands separators and currency signs are dropped.
    """
    if pd.isnull(salary):
        return False

    # a column that is entirely numeric is parsed by pandas as numbers,
    # which the regexes below cannot match against
    if not isinstance(salary, str):
        salary = str(salary)

    if re_per_hour.match(salary):
        # only the first numeric run is treated as the hourly rate
        found = re_eval_num.search(salary)
        rate = _parse_number(found.group(0)) if found else None
        return False if rate is None else rate * HOURS_PER_YEAR

    # yearly amount: keep only the digit / dot characters
    digits = ''.join(char for char in salary if re_eval_num.match(char))
    amount = _parse_number(digits)
    return False if amount is None else amount

In [8]:
# derive the numeric salary column from the free-text 2017 answers
df['salary_norm'] = df['current_salary'].apply(salary_to_int)

In [9]:
# store the 2017 responses in the results_2017 table
df.to_sql(name='results_2017', con=engine)

In [10]:
# clean up the blank rows
# - text() is required because SQLAlchemy 2.x no longer executes plain SQL strings
# - engine.begin() commits the DELETE when the block exits; under SQLAlchemy 2.x
#   a bare engine.connect() would roll it back on close
with engine.begin() as con:
    con.execute(text('''\
        DELETE FROM
        results_2017
        WHERE
        timestamp is NULL
    '''))

In [11]:
# load the 2019 survey responses into a dataframe
# (same questions as the 2017 form; the 2019 export additionally carries a
# 'Race and Ethnicity' column)
survey_2019_csv = 'Cincinnati Developer Survey 2019 (Responses) - Form Responses 1.csv'
df = pd.read_csv(survey_2019_csv, parse_dates=True)

In [12]:
# inspect the raw 2019 column headers before they are renamed
df.columns

Index(['Timestamp', 'Years of Experience', 'Size of company', 'Current Salary',
       'How happy are you at your company?', 'Gender (Optional)',
       'Position or Title', 'Languages / Skills',
       'Do you work remotely or for a company in town?', 'Level of education',
       'If you have a degree, what is it in?', 'Race and Ethnicity'],
      dtype='object')

In [13]:
# swap the verbose survey questions for short snake_case column names
# (assigned positionally, so the order must follow the CSV header order)
df.columns = [
    'timestamp',
    'years_exp',
    'company_size',
    'current_salary',
    'happy_score',
    'gender',
    'title',
    'skills',
    'remote',
    'education',
    'degree',
    'ethnicity',
]

In [14]:
# derive the numeric salary column from the free-text 2019 answers
df['salary_norm'] = df['current_salary'].apply(salary_to_int)

In [15]:
# store the 2019 responses in the results_2019 table
df.to_sql(name='results_2019', con=engine)

In [16]:
# clean up the blank rows
# - text() is required because SQLAlchemy 2.x no longer executes plain SQL strings
# - engine.begin() commits the DELETE when the block exits; under SQLAlchemy 2.x
#   a bare engine.connect() would roll it back on close
with engine.begin() as con:
    con.execute(text('''\
        DELETE FROM
        results_2019
        WHERE
        timestamp is NULL
    '''))

In [17]:
# load the 2021 survey responses into a dataframe
survey_2021_csv = 'Cincinnati Developer Survey 2021 (Responses) - Form Responses 1.csv'
df = pd.read_csv(survey_2021_csv, parse_dates=True)

In [18]:
# inspect the raw 2021 column headers before they are renamed
df.columns

Index(['Timestamp', 'Years of Experience', 'What do you do in tech?',
       'Size of company', 'Company Name (100% Optional)',
       'Current Salary (If hourly calculate as yearly)',
       'How happy are you at your company?', 'Gender (Optional)',
       'Position or Title', 'Languages / Skills',
       'Do you work remotely or for a company in town?', 'Level of education',
       'If you have a degree, what is it in?'],
      dtype='object')

In [19]:
# swap the verbose survey questions for short snake_case column names
# (assigned positionally, so the order must follow the CSV header order)
df.columns = [
    'timestamp',
    'years_exp',
    'tech_interest',
    'company_size',
    'company_name',
    'current_salary',
    'happy_score',
    'gender',
    'title',
    'skills',
    'remote',
    'education',
    'degree',
]

In [20]:
# derive the numeric salary column from the free-text 2021 answers
df['salary_norm'] = df['current_salary'].apply(salary_to_int)

In [21]:
# store the 2021 responses in the results_2021 table
df.to_sql(name='results_2021', con=engine)

In [22]:
# clean up the blank rows
# - text() is required because SQLAlchemy 2.x no longer executes plain SQL strings
# - engine.begin() commits the DELETE when the block exits; under SQLAlchemy 2.x
#   a bare engine.connect() would roll it back on close
with engine.begin() as con:
    con.execute(text('''\
        DELETE FROM
        results_2021
        WHERE
        timestamp is NULL
    '''))

In [23]:
# create the view for all results
# The view stacks the columns shared by the three yearly tables and tags
# every row with the survey year it came from.
sql = """\
create view all_years_results_view as

with all_years as (
  select
    years_exp,
    company_size,
    happy_score,
    gender,
    skills,
    salary_norm,
    remote,
    2017 as results_year
  from
    results_2017
  union all
  select
    years_exp,
    company_size,
    happy_score,
    gender,
    skills,
    salary_norm,
    remote,
    2019 as results_year
  from
    results_2019
  union all
  select
    years_exp,
    company_size,
    happy_score,
    gender,
    skills,
    salary_norm,
    remote,
    2021 as results_year
  from
    results_2021
)
select
  years_exp,
    company_size,
    happy_score,
    gender,
    skills,
    salary_norm,
    remote,
    results_year
from
  all_years
"""

# - text() is required because SQLAlchemy 2.x no longer executes plain SQL strings
# - engine.begin() commits the statement when the block exits
with engine.begin() as con:
    con.execute(text(sql))

In [24]:
# create some full text searches for title / skills on the tables
utils_db = sqlite_utils.Database('cincy-tech-survey.db')

# note: enable_fts() is supported on tables but not on views, so
# all_years_results_view is not included here
for table_name in ('results_2021', 'results_2019', 'results_2017'):
    survey_table = utils_db[table_name]
    survey_table.enable_fts(['title', 'skills'])
    # just to double check if the table now has fts enabled ...
    print(survey_table.detect_fts())

results_2021_fts
results_2019_fts
results_2017_fts


In [25]:
# produce some settings for Datasette
# Each trailing backslash joins the next line, so the string holds a single
# command line. allow_facet is passed once (it used to be given twice, as
# "on" and again as "true").
startup = '''\
datasette \
    cincy-tech-survey.db \
    --setting allow_facet on \
    --setting suggest_facets on \
    --metadata=metadata.json \
    --static static:static/ \
    --port 8010
'''


In [26]:
# HTML shown on the Datasette landing page (used as 'description_html' in
# the metadata): links to the yearly result tables and to the static charts.
# The coloured <span> around the year links is now closed; it used to be
# left open.
description_html = """\
<p>
    Results from Cincy Tech Slack Developer Survey Years: 
    <span style="color:#54AC8E;">
    <a href="./cincy-tech-survey/results_2021">2021</a>, 
    <a href="./cincy-tech-survey/results_2019">2019</a>, 
    <a href="./cincy-tech-survey/results_2017">2017</a>
    </span>
</p>

<p>
    Check out some cool data visualizations generated from this dataset:
    <ul>
        <li>
            <a href="./static/CincyTechSurveyHappinessbyRemoteStatus.html">Happiness By Remote Status</a>
        </li>
        <li>
            <a href="./static/CincyTechSurveyHappinessbyYearsExperience.html">Happiness By Years Experience</a>
        </li>
        <li>
            <a href="./static/CincyTechSurveySalaryRangeByYearsExp.html">Salary Range By Years Experience</a>
        </li>
    </ul>
</p>
<br />


"""

# Canned query published through the Datasette metadata: the shared columns
# of the three yearly tables stacked together, each row tagged with its
# survey year. `remote` is selected as well, so the query returns the same
# columns as all_years_results_view.
all_years_results_sql = """\
with all_years as (
  select
    years_exp,
    company_size,
    happy_score,
    gender,
    skills,
    salary_norm,
    remote,
    2017 as results_year
  from
    results_2017
  union all
  select
    years_exp,
    company_size,
    happy_score,
    gender,
    skills,
    salary_norm,
    remote,
    2019 as results_year
  from
    results_2019
  union all
  select
    years_exp,
    company_size,
    happy_score,
    gender,
    skills,
    salary_norm,
    remote,
    2021 as results_year
  from
    results_2021
)
select
  *
from
  all_years
"""

# assemble the Datasette metadata: site title and description, extra CSS
# and one canned query registered for the survey database
canned_queries = {
    'all_years_results': {
        'sql': all_years_results_sql,
        'title': 'Use this query to get results of all years',
    },
}

json_metadata = {
    'title': 'Cincy Tech Slack - Dev Survey',
    'description_html': description_html,
    'extra_css_urls': ['/static/my.css'],
    'databases': {
        'cincy-tech-survey': {'queries': canned_queries},
    },
}

# write the metadata out as metadata.json
with open('metadata.json', 'w') as metadata_file:
    json.dump(json_metadata, metadata_file)