In [1]:
from urllib.parse import quote_plus

import boto3
import geopandas as gpd
import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

from config import settings

In [2]:
# S3 configuration: one bucket name, shared by every handle created below.
BUCKET_NAME = "ds-interview-sandbox"

# Resource-level handle and the Bucket object used for listing/downloading.
s3 = boto3.resource("s3")
bucket = s3.Bucket(BUCKET_NAME)

# Low-level client, kept alongside the resource API.
client = boto3.client("s3")

In [3]:
# Read the region boundaries GeoJSON from the working directory.
# NOTE(review): a later cell downloads a file with this same name from S3, so on
# a fresh checkout this read presumably fails until that download has been run —
# confirm the intended cell order.
gdf = gpd.read_file('./uga_regions.geojson')

In [4]:
# Normalise every column label to lower case.
gdf.columns = list(map(str.lower, gdf.columns))

In [5]:
# Display the frame to check the lower-cased column labels.
gdf

Unnamed: 0,name,iso_code,iso2_code,area_type,geometry
0,Eastern,UG-E,UGA,Regions,"MULTIPOLYGON (((33.51939 -1.00000, 33.52066 -0..."
1,Western,UG-W,UGA,Regions,"MULTIPOLYGON (((29.65173 -0.64149, 29.65195 -0..."
2,Central,UG-C,UGA,Regions,"MULTIPOLYGON (((32.97509 0.90589, 32.97594 0.9..."
3,Northern,UG-N,UGA,Regions,"MULTIPOLYGON (((32.19980 3.50716, 32.20084 3.5..."


In [6]:
# List the shared 'ug*' objects in the sandbox bucket.
# `bucket` is the handle built from BUCKET_NAME in the configuration cell, so the
# bucket name lives in one place instead of being repeated here as a literal.
# `bucket.objects.filter` handles pagination and yields ObjectSummary items,
# which do not contain the body; call `.get()` on one to fetch its content.
for obj in bucket.objects.filter(Prefix='shared/ug'):
    print(obj.key)

shared/ug_clusters.csv
shared/uga_dhs_2016.csv
shared/uga_regions.geojson


In [7]:
# Download every object under the 'shared/ug' prefix into the working directory,
# named after the last path component of its key.
for obj in bucket.objects.filter(Prefix='shared/ug'):
    key = obj.key
    filename = key.rsplit('/', 1)[-1]
    if not filename:
        # A key ending in "/" is a folder placeholder: it has no file name, and
        # downloading it to "./" would fail.
        continue
    # Reuse the existing `bucket` handle rather than rebuilding it per object.
    bucket.download_file(key, f"./{filename}")

(696, 4)
(23842, 10)
(4, 5)


In [8]:
# Build the PostgreSQL connection URL from the project settings.
# The username and password are percent-encoded so that characters such as
# "@", ":" or "/" inside them cannot be misread as URL delimiters.
DATABASE_URL = (
    "postgresql+psycopg2://"
    f"{quote_plus(str(settings.database_username))}"
    f":{quote_plus(str(settings.database_password))}"
    f"@{settings.database_host}:{settings.database_port}/{settings.database_name}"
)

# Shared engine plus a session factory bound to it (explicit commit/flush).
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)


In [9]:

def load_db(path: str, table: str, conn=None, if_exists: str = "fail"):
    """Load a local GeoJSON or CSV file into a database table.

    Column labels are lower-cased before writing. Files whose name ends in
    ``.geojson`` (case-insensitive) are read with geopandas and written with
    ``to_postgis``; everything else is read as CSV and written with ``to_sql``.

    Args:
        path: Path of the file to load.
        table: Name of the destination table.
        conn: Engine or connection to write to. When omitted, the module-level
            ``engine`` is looked up at call time, so an engine re-created after
            this function was defined is still picked up.
        if_exists: Forwarded to ``to_postgis`` / ``to_sql``. The default
            ``"fail"`` keeps the previous behaviour (error if the table already
            exists); pass ``"replace"`` or ``"append"`` to make re-runs possible.
    """
    if conn is None:
        conn = engine

    if path.lower().endswith(".geojson"):
        gdf = gpd.read_file(path)
        gdf.columns = [x.lower() for x in gdf.columns]
        gdf.to_postgis(table, conn, if_exists=if_exists)
    else:
        df = pd.read_csv(path)
        df.columns = [x.lower() for x in df.columns]
        df.to_sql(table, conn, if_exists=if_exists)

In [10]:
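# One-off table load, left commented out (presumably so that re-running the
# notebook does not try to re-create an existing table — confirm before
# re-enabling).
#load_db("./ug_clusters.csv", 'ug_clusters')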

In [11]:
# Scratch draft of a join between ug_dhs and ug_clusters on v001. This is a bare
# string expression: it is not assigned or executed, so the cell only echoes the
# text back as its output.
"""SELECT a.v001, a.v751, a.sex, b.LATNUM, b.LONGNUM

FROM ug_dhs a
INNER JOIN ug_clusters b
ON a.v001 = b.v001"""

'SELECT a.v001, a.v751, a.sex, b.LATNUM, b.LONGNUM\n\nFROM ug_dhs a\nINNER JOIN ug_clusters b\nON a.v001 = b.v001'

In [12]:
# Rows of ug_dhs with sex = 'male', joined to ug_clusters on v001 (for the
# latnum/longnum columns) and to the geometry of the 'Northern' row of ug_regions.
# NOTE(review): the ug_regions join condition is a constant filter on the region
# name and references neither ug_dhs nor ug_clusters, so every selected row is
# paired with the Northern geometry regardless of its latnum/longnum. If the
# intent is "respondents located in the Northern region", a spatial predicate
# on the coordinates is presumably needed — confirm.
query = text("""SELECT ug_dhs.v751, ug_dhs.v001, ug_clusters.latnum, ug_clusters.longnum, ug_dhs.sex, ug_regions.geometry  
FROM ug_dhs 
JOIN ug_clusters ON ug_dhs.v001 = ug_clusters.v001
JOIN ug_regions ON ug_regions.name = 'Northern'
WHERE ug_dhs.sex = 'male'""")

In [13]:
# Reference template for the SQL CASE expression syntax; a bare string that is
# never assigned or executed.
# NOTE(review): the opening delimiter has four quote characters, so the literal
# itself begins with a stray `"` (visible in the echoed cell output).
""""
CASE
    WHEN condition1 THEN result1
    WHEN condition2 THEN result2
    WHEN conditionN THEN resultN
    ELSE result
END;
"""

'"\nCASE\n    WHEN condition1 THEN result1\n    WHEN condition2 THEN result2\n    WHEN conditionN THEN resultN\n    ELSE result\nEND;\n'

In [15]:
# Fetch the first row of the query as a sanity check.
# Engine.execute() was deprecated in SQLAlchemy 1.4 and removed in 2.0; running
# the statement on an explicit connection works on both and guarantees the
# connection is returned to the pool when the block exits.
with engine.connect() as connection:
    first_row = connection.execute(query).fetchone()

first_row