My Solution to Exercises
- Deduplicate
select * from
table t1
join
(
select id, max(date_last_modified) as max_date
table
group by id
) t2
on (t1.id = t2.id and t1.date_last_modified = t2.max_date)
- Rank
select * from
(select *, rank() over (partition by id order by date_last_modified desc) as rank from table) g
where rank = 1;
select location_id, category_id, time, seller_id, category_revenue, item_cnt, revenue_share, item_share, rk
from
(select *, rank() over (PARTITION BY item_id order by market_share desc, revenue_share desc ) as rk
from item_seller_share
where location_id = ${location_id} and item_share >${item_share}
) A
WHERE rk <=${top_n};
- Regular Expressions
regexp_extract(var, 'patter') as var
regexp_extract(page_url,'.*/item/([0-9\\.]+)/',1) as item_id
regexp_extract(column, '\\;Price=([0-9\\.]+)',1) as Price
substr(TAG,1,7) = 'RECO_EXP'
- Time Zone
- Set Variable
set hivevar:this_year = 2018;
select sum(case when year(year)=${this_year} then spend else 0.0 end) as spend_this_year
set date = 20180826;
select * from table
where dt > ${hiveconf:date};
-- With
with q1 as (
),
q2 as (
)
select
from q2
- Create a Histogram
- Calculate a Joint Distribution
- Calculate Percentiles