In [4]:
import pandas as pd
import great_expectations as gx
from zipfile import ZipFile
import requests
from pathlib import Path

In [5]:
# !curl -Ls "https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/stack-overflow-developer-survey-2022.zip" -o stack-overflow-developer-survey-2022.zip
# https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-2017.zip

In [6]:
from functools import wraps
from time import time

def timing(f):
    """Decorate ``f`` so that each call reports how long it took.

    The wrapper forwards every positional and keyword argument to ``f``,
    prints the function name, the arguments and the elapsed seconds
    (difference between two ``time()`` readings), and hands back whatever
    ``f`` returned. Nothing is printed when ``f`` raises, because the
    report is only reached after a successful call.
    """
    @wraps(f)
    def wrap(*args, **kw):
        started = time()
        outcome = f(*args, **kw)
        elapsed = time() - started
        report = 'Function :%r with args:[%r, %r] took: %2.4f sec' % (
            f.__name__, args, kw, elapsed)
        print(report)
        return outcome
    return wrap

@timing
def extract(year, timeout=60):
    """Download the Stack Overflow survey archive for ``year``.

    The zip is fetched from ``survey.stackoverflow.co`` and written to
    ``./stack_data/<year>/stack-overflow-developer-survey-<year>.zip``;
    missing directories are created on demand.

    Args:
        year: Survey year, interpolated into both the download URL and the
            local path.
        timeout: Seconds handed to ``requests.get`` so a stalled connection
            cannot block the run forever.

    Returns:
        ``Path`` of the zip file that was written.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Any other transport failure, including
            the timeout expiring.
    """
    print(f'Extractiing data for the {year} year.. ')
    archive_name = f'stack-overflow-developer-survey-{year}.zip'
    resp = requests.get(
        f"https://survey.stackoverflow.co/datasets/{archive_name}",
        timeout=timeout,
    )
    # Fail loudly here rather than saving an HTML error page under a .zip
    # name and letting the unzip step choke on it later.
    resp.raise_for_status()
    data_dir = Path(f'./stack_data/{year}')
    data_dir.mkdir(exist_ok=True, parents=True)
    zip_path = data_dir / archive_name
    zip_path.write_bytes(resp.content)
    return zip_path

@timing
def transform(file_path: Path):
    """Extract the first CSV member of a survey zip next to the archive.

    Args:
        file_path: Path of the downloaded ``.zip`` file. The CSV is
            extracted into ``file_path.parent``.

    Returns:
        ``Path`` of the extracted CSV as reported by ``ZipFile.extract``,
        or ``None`` when no archive member has a ``.csv`` suffix.
    """
    print(f'Unzipping data for stackoverflow survey at {file_path}.. ')
    # The context manager closes the archive handle on every exit path;
    # previously the ZipFile object was left open.
    with ZipFile(file_path) as archive:
        for member in archive.infolist():
            if Path(member.filename).suffix == '.csv':
                # extract() returns the path it actually created, which is
                # safer than rebuilding it from the raw member name.
                extracted = archive.extract(member, path=file_path.parent)
                return Path(extracted)
    # No CSV in the archive: make the historical implicit None explicit.
    return None

def dq_checks(file_path):
    """Read the CSV at ``file_path`` with ``gx.read_csv`` and print the result.

    NOTE: a later cell in this notebook defines ``dq_checks`` again; once
    that cell has run, this version is shadowed.
    """
    print(gx.read_csv(file_path))

# def load():

@timing
def orchestrate(start_year, end_year):
    """Run extract and transform once per survey year.

    Both ``start_year`` and ``end_year`` are inclusive. For every year the
    archive is downloaded via ``extract``, unpacked via ``transform``, and
    the resolved path of the unpacked file is printed followed by a blank
    separator.
    """
    print(f'Starting stackoverflow survey ETL from {start_year} till {end_year}..')
    survey_years = range(start_year, end_year + 1)
    for survey_year in survey_years:
        csv_path = transform(extract(survey_year))
        print(csv_path.resolve())
        # Data-quality checks are currently switched off:
        # dq_checks(csv_path.resolve())
        print('\n')

# Kick off the ETL for survey years 2011 through 2015; orchestrate treats
# both bounds as inclusive (it iterates range(start_year, end_year + 1)).
if __name__ == '__main__':
    orchestrate(2011, 2015)


Starting stackoverflow survey ETL from 2011 till 2015..
Extractiing data for the 2011 year.. 
Function :'extract' with args:[(2011,), {}] took: 0.0788 sec
Unzipping data for stackoverflow survey at stack_data/2011/stack-overflow-developer-survey-2011.zip.. 
Function :'transform' with args:[(PosixPath('stack_data/2011/stack-overflow-developer-survey-2011.zip'),), {}] took: 0.0044 sec
/workspaces/stack/airflow/data/stack_data/2011/2011 Stack Overflow Survey Results.csv


Extractiing data for the 2012 year.. 
Function :'extract' with args:[(2012,), {}] took: 0.0411 sec
Unzipping data for stackoverflow survey at stack_data/2012/stack-overflow-developer-survey-2012.zip.. 
Function :'transform' with args:[(PosixPath('stack_data/2012/stack-overflow-developer-survey-2012.zip'),), {}] took: 0.0121 sec
/workspaces/stack/airflow/data/stack_data/2012/2012 Stack Overflow Survey Results.csv


Extractiing data for the 2013 year.. 
Function :'extract' with args:[(2013,), {}] took: 0.0580 sec
Unzipping

In [None]:
# NOTE(review): this cell has no execution count (In [None]) and calls the
# expectation without any arguments — presumably leftover API exploration;
# confirm and remove before sharing the notebook.
gx.expectations.core.expect_table_columns_to_match_set()

In [23]:
def dq_checks(file_path):
    """Check a survey CSV's column set and persist the expectation suite.

    The CSV is read with pandas using ``latin-1`` encoding and its column
    names are printed. The frame is wrapped with ``gx.from_pandas`` and its
    columns are compared against a fixed list through
    ``expect_table_columns_to_match_set``; the result is printed. Finally
    the expectation suite (failed expectations are kept) is serialized to
    ``stackoverflow_survey_data_expectations_suite.json`` in the current
    working directory.
    """
    import json

    # Column names the expectation is evaluated against.
    expected_columns = [
        'What Country or Region do you live in?',
        'Which US State or Territory do you live in?',
        'How old are you?',
        'How many years of IT/Programming experience do you have?',
        'How would you best describe the industry you work in?',
        'Which best describes the size of your company?',
        'Which of the following best describes your occupation?',
        'How likely is it that a recommendation you make will be acted upon?',
        'What is your involvement in purchasing? You can choose more than 1.',
        'Unnamed: 9',
        'Unnamed: 10',
        'Unnamed: 11',
        'Unnamed: 12',
        'Unnamed: 13',
        'Unnamed: 14',
        'What types of purchases are you involved in?',
        'Unnamed: 16',
        'Unnamed: 17',
        'Unnamed: 18',
        'Unnamed: 19',
        'Unnamed: 20',
        'What is your budget for outside expenditures (hardware, software, consulting, etc) for 2011?',
        'Unnamed: 22',
        'Unnamed: 23',
        'Unnamed: 24',
        'Unnamed: 25',
        'Unnamed: 26',
        'Unnamed: 27',
        'Unnamed: 28',
        'What type of project are you developing?',
        'Which languages are you proficient in?',
        'Unnamed: 31',
        'Unnamed: 32',
        'Unnamed: 33',
        'Unnamed: 34',
        'Unnamed: 35',
        'Unnamed: 36',
        'Unnamed: 37',
        'Unnamed: 38',
        'Unnamed: 39',
        'Unnamed: 40',
        'Unnamed: 41',
        'Unnamed: 42',
        'What operating system do you use the most?',
        'Please rate your job/career satisfaction',
        'Including bonus, what is your annual compensation in USD?',
        'Which technology products do you own? (You can choose more than one)',
        'Unnamed: 47',
        'Unnamed: 48',
        'Unnamed: 49',
        'Unnamed: 50',
        'Unnamed: 51',
        'Unnamed: 52',
        'Unnamed: 53',
        'Unnamed: 54',
        'Unnamed: 55',
        'Unnamed: 56',
        'Unnamed: 57',
        'Unnamed: 58',
        'Unnamed: 59',
        'Unnamed: 60',
        'Unnamed: 61',
        'Unnamed: 62',
        'In the last 12 months, how much money have you spent on personal technology-related purchases? ',
        'Which of our sites do you frequent most?',
    ]
    # Free-form metadata attached to the expectation.
    expectation_meta = {
        "notes": {
            "content": ["Based on our domain expertise, these columns should exist always"],
            "format": "markdown",
            "source": "https://survey.stackoverflow.co/",
        }
    }

    survey_frame = pd.read_csv(file_path, encoding='latin-1')
    print(survey_frame.columns.to_list())
    gx_frame = gx.from_pandas(survey_frame)

    outcome = gx_frame.expect_table_columns_to_match_set(expected_columns, meta=expectation_meta)
    print(outcome)

    suite = gx_frame.get_expectation_suite(discard_failed_expectations=False)
    with open("stackoverflow_survey_data_expectations_suite.json", "w") as suite_file:
        payload = json.dumps(suite.to_json_dict(), sort_keys=True, indent=4)
        suite_file.write(payload)

# Run the column checks on the 2011 survey CSV.
# NOTE(review): the absolute path is specific to this workspace; consider
# deriving it from the path returned by transform() instead.
dq_checks('/workspaces/stack/airflow/data/stack_data/2011/2011 Stack Overflow Survey Results.csv')

['What Country or Region do you live in?', 'Which US State or Territory do you live in?', 'How old are you?', 'How many years of IT/Programming experience do you have?', 'How would you best describe the industry you work in?', 'Which best describes the size of your company?', 'Which of the following best describes your occupation?', 'How likely is it that a recommendation you make will be acted upon?', 'What is your involvement in purchasing? You can choose more than 1.', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'What types of purchases are you involved in?', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'What is your budget for outside expenditures (hardware, software, consulting, etc) for 2011?', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'What type of project are you developing?', 'Which languages are you proficient in?', 'Unnamed: 31', 'Unnamed:

In [51]:
# Same column-set check as dq_checks, but driven through a GX data context
# and validator instead of gx.from_pandas.
# NOTE(review): the CSV path below is absolute and workspace-specific.
context = gx.get_context()
validator = context.sources.pandas_default.read_csv('/workspaces/stack/airflow/data/stack_data/2011/2011 Stack Overflow Survey Results.csv', encoding='latin-1')
# Metadata passed along with the expectation via meta=.
notes = {
        "notes": {
           "content": ["Based on our domain expertise, these columns should exist always" ],
           "format": "markdown",
           "source": "https://survey.stackoverflow.co/"
        }
    }

# The expected column list is the same literal used in dq_checks.
result = validator.expect_table_columns_to_match_set(['What Country or Region do you live in?', 'Which US State or Territory do you live in?', 'How old are you?', 'How many years of IT/Programming experience do you have?', 'How would you best describe the industry you work in?', 'Which best describes the size of your company?', 'Which of the following best describes your occupation?', 'How likely is it that a recommendation you make will be acted upon?', 'What is your involvement in purchasing? You can choose more than 1.', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'What types of purchases are you involved in?', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'What is your budget for outside expenditures (hardware, software, consulting, etc) for 2011?', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'What type of project are you developing?', 'Which languages are you proficient in?', 'Unnamed: 31', 'Unnamed: 32', 'Unnamed: 33', 'Unnamed: 34', 'Unnamed: 35', 'Unnamed: 36', 'Unnamed: 37', 'Unnamed: 38', 'Unnamed: 39', 'Unnamed: 40', 'Unnamed: 41', 'Unnamed: 42', 'What operating system do you use the most?', 'Please rate your job/career satisfaction', 'Including bonus, what is your annual compensation in USD?', 'Which technology products do you own? (You can choose more than one)', 'Unnamed: 47', 'Unnamed: 48', 'Unnamed: 49', 'Unnamed: 50', 'Unnamed: 51', 'Unnamed: 52', 'Unnamed: 53', 'Unnamed: 54', 'Unnamed: 55', 'Unnamed: 56', 'Unnamed: 57', 'Unnamed: 58', 'Unnamed: 59', 'Unnamed: 60', 'Unnamed: 61', 'Unnamed: 62', 'In the last 12 months, how much money have you spent on personal technology-related purchases? ', 'Which of our sites do you frequent most?'], meta=notes)
# Persist the suite, keeping expectations that failed.
validator.save_expectation_suite(filepath='gx_stackoverflow_survey_data_expectations_suite.json', discard_failed_expectations=False)
# print(result)
# NOTE(review): `result` is the return value of the expectation call above,
# not an identifier object — confirm that build_data_docs accepts it in
# resource_identifiers (the recorded output shows a site was still built).
context.build_data_docs(resource_identifiers=[result])

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

{'local_site': 'file:///tmp/tmp_gqehyx8/index.html'}

In [26]:
# Validate the 2015 survey CSV against the suite JSON that dq_checks writes
# (same file name: stackoverflow_survey_data_expectations_suite.json).
# NOTE(review): this cell's execution count (26) is lower than that of the
# cell above it (51), so the notebook was run out of order; it also needs
# dq_checks to have been executed first so the suite file exists.
data = pd.read_csv('/workspaces/stack/airflow/data/stack_data/2015/2015 Stack Overflow Developer Survey Responses.csv', encoding='latin-1')
print(data.columns.to_list())
gx_data = gx.from_pandas(data)
gx_data.validate(expectation_suite='stackoverflow_survey_data_expectations_suite.json')

['Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Select all that apply', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29', 'Unnamed: 30', 'Unnamed: 31', 'Unnamed: 32', 'Unnamed: 33', 'Unnamed: 34', 'Unnamed: 35', 'Unnamed: 36', 'Unnamed: 37', 'Unnamed: 38', 'Unnamed: 39', 'Unnamed: 40', 'Unnamed: 41', 'Unnamed: 42', 'Unnamed: 43', 'Unnamed: 44', 'Unnamed: 45', 'Unnamed: 46', 'Unnamed: 47', 'Unnamed: 48', 'Unnamed: 49', 'Unnamed: 50', 'Select all that apply.1', 'Unnamed: 52', 'Unnamed: 53', 'Unnamed: 54', 'Unnamed: 55', 'Unnamed: 56', 'Unnamed: 57', 'Unnamed: 58', 'Unnamed: 59', 'Unnamed: 60', 'Unnamed: 61', 'Unnamed: 62', 'Unnamed: 63', 'Unnamed: 64', 'Unnamed: 6

{
  "success": false,
  "results": [
    {
      "success": false,
      "expectation_config": {
        "expectation_type": "expect_table_columns_to_match_set",
        "kwargs": {
          "column_set": [
            "What Country or Region do you live in?",
            "Which US State or Territory do you live in?",
            "How old are you?",
            "How many years of IT/Programming experience do you have?",
            "How would you best describe the industry you work in?",
            "Which best describes the size of your company?",
            "Which of the following best describes your occupation?",
            "How likely is it that a recommendation you make will be acted upon?",
            "What is your involvement in purchasing? You can choose more than 1.",
            "Unnamed: 9",
            "Unnamed: 10",
            "Unnamed: 11",
            "Unnamed: 12",
            "Unnamed: 13",
            "Unnamed: 14",
            "What types of purchases are you in

In [53]:
# Dump the generated data-docs index page.
# NOTE(review): the temp directory name is copied from the build_data_docs
# output of an earlier cell and presumably changes on every run — TODO confirm.
!cat /tmp/tmp_gqehyx8/index.html

<!DOCTYPE html>
<html>
  <head>
    <title>Data Docs created by Great Expectations</title>
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <meta charset="UTF-8">
    <title></title>

    
    
    <link rel="stylesheet" href="https://unpkg.com/bootstrap-table@1.19.1/dist/bootstrap-table.min.css">
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css"/>

    <link rel="stylesheet" type="text/css" href="https://unpkg.com/bootstrap-table@1.19.0/dist/extensions/filter-control/bootstrap-table-filter-control.min.css">
    <link rel="stylesheet" type="text/css" href="https://cdnjs.cloudflare.com/ajax/libs/bootstrap-datepicker/1.9.0/css/bootstrap-datepicker.min.css">
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@forevolve/bootstrap-dark@1.1.0/dist/css/bootstrap-prefers-dark.css" />

    <style>
  

body {
  position: relative;
}

.container {
  padding-top: 50px;
}

.sticky {
  position: -webkit-