In [4]:
import pandas as pd
import great_expectations as gx
from zipfile import ZipFile
import requests
from pathlib import Path

In [2]:
# Fetch the 2022 survey archive with curl: -L follows redirects, -s silences
# progress output, and -o sets the name of the local file that is written.
!curl -Ls "https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/stack-overflow-developer-survey-2022.zip" -o stack-overflow-developer-survey-2022.zip


# https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-2017.zip

^C


In [47]:

from functools import wraps
from time import time

def timing(f):
    """Decorator that prints how long each call to ``f`` took.

    After every successful call it prints the wrapped function's name, the
    positional and keyword arguments it received, and the elapsed time in
    seconds. The return value of ``f`` is passed through unchanged. If ``f``
    raises, the exception propagates and nothing is printed.

    Args:
        f: The callable to instrument.

    Returns:
        A wrapper that carries the metadata of ``f`` (via ``functools.wraps``).
    """
    # perf_counter is a monotonic, high-resolution clock. time.time() follows
    # the system wall clock, which can jump (even backwards) when adjusted and
    # would then report wrong or negative durations.
    from time import perf_counter

    @wraps(f)
    def wrap(*args, **kw):
        started = perf_counter()
        result = f(*args, **kw)
        elapsed = perf_counter() - started
        print('Function :%r with args:[%r, %r] took: %2.4f sec'
              % (f.__name__, args, kw, elapsed))
        return result
    return wrap

@timing
def extract(year):
    """Download the Stack Overflow developer-survey archive for one year.

    The zip is requested from survey.stackoverflow.co and written to
    ``./stack_data/<year>/stack-overflow-developer-survey-<year>.zip``.
    The target directory is created when it does not exist yet.

    Args:
        year: Survey year; used to build both the download URL and the
            local path.

    Returns:
        Path of the zip file written to disk.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    print(f'Extractiing data for the {year} year.. ')
    archive_name = f'stack-overflow-developer-survey-{year}.zip'
    # The timeout keeps a stalled connection from hanging the whole run.
    resp = requests.get(
        f"https://survey.stackoverflow.co/datasets/{archive_name}",
        timeout=60,
    )
    # Fail here instead of saving an HTTP error page under a .zip name,
    # which would only surface later as a confusing BadZipFile error.
    resp.raise_for_status()
    data_dir = Path(f'./stack_data/{year}')
    data_dir.mkdir(exist_ok=True, parents=True)
    zip_path = data_dir / archive_name
    zip_path.write_bytes(resp.content)
    return zip_path

@timing
def transform(file_path: Path):
    """Extract every ``.csv`` member of a survey archive next to the zip.

    Members are extracted into the directory that contains ``file_path``,
    keeping whatever relative path they have inside the archive. Members
    whose suffix is not exactly ``.csv`` are skipped.

    Args:
        file_path: Location of the downloaded zip archive.

    Returns:
        None.

    Raises:
        zipfile.BadZipFile: If ``file_path`` is not a valid zip archive.
    """
    print(f'Unzipping data for stackoverflow survey at {file_path}.. ')
    # The context manager closes the archive handle even when extraction
    # fails; naming it `archive` also avoids shadowing the built-in `zip`.
    with ZipFile(file_path) as archive:
        for member in archive.infolist():
            if Path(member.filename).suffix == '.csv':
                archive.extract(member, path=file_path.parent)
# TODO: dq_checks() -- data-quality checks step, not implemented yet.
# TODO: load() -- load step, not implemented yet.

@timing
def orchestrate(start_year, end_year):
    """Run the download and unzip steps for each year in a range.

    For every year from ``start_year`` through ``end_year`` (inclusive) the
    archive is fetched with ``extract`` and then handed to ``transform``.
    A blank separator is printed after each year.

    Args:
        start_year: First survey year to process.
        end_year: Last survey year to process (inclusive).
    """
    print(f'Starting stackoverflow survey ETL from {start_year} till {end_year}..')
    survey_years = range(start_year, end_year + 1)
    for survey_year in survey_years:
        # Download first, then unpack the archive that was just saved.
        archive_path = extract(survey_year)
        transform(archive_path)
        print('\n')

# Entry point: run the pipeline for survey years 2011 through 2015 (inclusive).
if __name__ == '__main__':
    orchestrate(2011, 2015)

Starting stackoverflow survey ETL from 2011 till 2015..
Extractiing data for the 2011 year.. 
Function :'extract' with args:[(2011,), {}] took: 0.0311 sec
Unzipping data for stackoverflow survey at stack_data/2011/stack-overflow-developer-survey-2011.zip.. 
Function :'transform' with args:[(PosixPath('stack_data/2011/stack-overflow-developer-survey-2011.zip'),), {}] took: 0.0066 sec


Extractiing data for the 2012 year.. 
Function :'extract' with args:[(2012,), {}] took: 0.0441 sec
Unzipping data for stackoverflow survey at stack_data/2012/stack-overflow-developer-survey-2012.zip.. 
Function :'transform' with args:[(PosixPath('stack_data/2012/stack-overflow-developer-survey-2012.zip'),), {}] took: 0.0131 sec


Extractiing data for the 2013 year.. 
Function :'extract' with args:[(2013,), {}] took: 0.0578 sec
Unzipping data for stackoverflow survey at stack_data/2013/stack-overflow-developer-survey-2013.zip.. 
Function :'transform' with args:[(PosixPath('stack_data/2013/stack-overflow-de