# Project - Hive

Analysis of large datasets is being performed at
an unprecedented frequency. Several technologies have been
developed to do so, offering a variety of solutions and drawbacks
related to the processing of different data types and
data processing requirements. 

This notebook uses Hive to answer a series of questions about a data set on air pollution in the USA.
In the accompanying report, we compared the performance
of five different technologies – MapReduce, Spark RDD,
Spark DF, Spark SQL and Hive.

EPA table

In [None]:
%%file epa.hql

-- Rebuilds the `summary` table and loads the EPA daily-summary CSV into it.
--
-- IF EXISTS keeps the script re-runnable on a fresh database even when
-- hive.exec.drop.ignorenonexistent is disabled (a bare DROP then errors out).
--
-- NOTE(review): CHAR(n) columns cut off values longer than n characters;
-- confirm 28/48 is enough for parameter_name, method_name and address.
-- NOTE(review): the delimited text format splits on every comma and does not
-- understand quoting; confirm the CSV has no quoted fields with embedded commas.
-- NOTE(review): no header-skipping table property is set, so a header line in
-- the CSV (if there is one) is loaded as a data row; confirm.
DROP TABLE IF EXISTS summary;

CREATE TABLE summary(
state_code             int,
county_code            int,
site_num               int,
parameter_code         int,
poc                    int,
latitude               float,
longitude              float,
datum                  CHAR(28),
parameter_name         CHAR(28),
sample_duration        CHAR(28),
pollutant_standard     float,
date_local             CHAR(28),
units_of_measure       CHAR(28),
event_type             CHAR(28),
observation_count      int,
observation_percent    float,
arithmetic_mean        float,
first_max_value        float,
first_max_hour         int,
aqi                    float,
method_code            int,
method_name            CHAR(28),
local_site_name        CHAR(28),
address                CHAR(48),
state_name             CHAR(28),
county_name            CHAR(28),
city_name              CHAR(28),
cbsa_name              CHAR(28),
date_of_last_change    CHAR(28) )
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';

-- The path is resolved on the client machine (LOCAL), not in HDFS.
LOAD DATA LOCAL INPATH 'epa_hap_daily_summary-small.csv' INTO TABLE summary;

States table

In [None]:
%%file states.hql

-- Rebuilds the `states` lookup table (state abbreviation, name and the
-- latitude/longitude bounding box of each state) from a local CSV.
--
-- IF EXISTS keeps the script re-runnable on a fresh database even when
-- hive.exec.drop.ignorenonexistent is disabled (a bare DROP then errors out).
--
-- NOTE(review): no header-skipping table property is set, so a header line in
-- the CSV (if there is one) is loaded as a data row; confirm.
DROP TABLE IF EXISTS states;

CREATE TABLE states(
	state                char(2),
	name                 char(28),
	minlat               float,
	maxlat               float,
	minlon               float,
	maxlon               float )
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';

-- The path is resolved on the client machine (LOCAL), not in HDFS.
LOAD DATA LOCAL INPATH 'usa_states.csv' INTO TABLE states;


In [10]:
%%file q1.hql

-- Q1: number of distinct monitor locations per state, ordered by state name.
--
-- A monitor location is the (latitude, longitude) pair. The two columns are
-- passed to COUNT(DISTINCT ...) as separate expressions: adding them together
-- would merge different locations whose coordinates happen to have the same
-- sum and undercount the monitors.
-- Rows where either coordinate is NULL are not counted.
SELECT
    state_code,
    collect_set(state_name) AS StateName,
    COUNT(DISTINCT latitude, longitude) AS NumberOfMonitors
FROM summary
GROUP BY state_code
ORDER BY StateName ASC;

Overwriting q1.hql


In [11]:
%%file q2.hql

-- Q2: mean of arithmetic_mean per county name, highest mean first.
-- NULL measurements are ignored by the aggregate.
--
-- NOTE(review): grouping is by county_name only, so counties that share a
-- name in different states are pooled into one row; confirm this is intended.
SELECT
    county_name,
    AVG(arithmetic_mean) AS Value
FROM summary
GROUP BY county_name
ORDER BY Value DESC;

Overwriting q2.hql


In [12]:
%%file q3.hql

-- Q3: the 100 highest mean pollutant levels per (state, year).
--
-- The grouping key is the same year expression that is selected. Grouping by
-- the full date_local value would produce one row per state and day, each
-- labelled with a year, instead of one yearly average per state.
-- Assumes date_local starts with a four-digit year (e.g. YYYY-MM-DD); TODO confirm.
SELECT
    state_code,
    collect_set(state_name) AS state_name,
    AVG(arithmetic_mean) AS value,
    substr(date_local, 1, 4) AS year
FROM summary
GROUP BY state_code, substr(date_local, 1, 4)
ORDER BY value DESC
LIMIT 100;

Overwriting q3.hql


In [14]:
%%file q4.hql

-- Q4: per state, how far the monitors are from the centre of the state's
-- bounding box.
--
-- Step 1 (`aux`): one row per measurement joined to its state's bounding box,
-- with the absolute latitude/longitude offset (in degrees) from the box centre.
-- IF EXISTS keeps the script re-runnable when `aux` has not been created yet.
DROP TABLE IF EXISTS aux;

CREATE TABLE aux AS
SELECT
    l.state_name,
    l.address,
    s.minlat,
    l.latitude,
    s.maxlat,
    s.minlon,
    l.longitude,
    s.maxlon,
    ABS(l.latitude - (s.minlat + (s.maxlat - s.minlat) / 2)) AS dlat,
    ABS(l.longitude - (s.minlon + (s.maxlon - s.minlon) / 2)) AS dlon
FROM summary l
INNER JOIN states s
    ON l.state_name = s.name;

-- Step 2: combine the mean offsets into a single distance per state.
-- The factor 111 is presumably kilometres per degree; confirm.
-- NOTE(review): this is the hypotenuse of the two mean offsets, not the mean of
-- the per-row distances, and longitude uses the same factor as latitude;
-- verify this matches the intended definition.
SELECT
    t.state_name,
    SQRT(POW(AVG(t.dlat) * 111, 2) + POW(AVG(t.dlon) * 111, 2)) AS hkm
FROM aux t
GROUP BY t.state_name;

Overwriting q4.hql


In [25]:
%%file q5.hql

-- Q5: number of distinct monitor locations in each quadrant (NW, NE, SW, SE)
-- of every state, where the quadrants are split at the centre of the state's
-- bounding box.
--
-- IF EXISTS keeps the script re-runnable when the helper tables do not exist yet.
DROP TABLE IF EXISTS statesaux;
DROP TABLE IF EXISTS ne;
DROP TABLE IF EXISTS nw;
DROP TABLE IF EXISTS se;
DROP TABLE IF EXISTS sw;

-- One row per state that appears in both `summary` and `states`, with the
-- bounding box and its centre point.
-- The last column keeps its original name `total` although it holds maxLon.
CREATE TABLE statesaux AS
SELECT DISTINCT
    l.state_code,
    l.state_name,
    u.minLat,
    u.minLat + ABS((u.maxLat - u.minLat) / 2) AS centerLat,
    u.maxLat,
    u.minLon,
    u.minLon + ((u.maxLon - u.minLon) / 2) AS centerLon,
    u.maxLon AS total
FROM summary l
INNER JOIN states u
    ON l.state_name = u.name
ORDER BY state_code;

-- Quadrant tables: distinct (latitude, longitude) pairs per state.
-- The pair is counted as two expressions; adding the coordinates would merge
-- different locations that share the same sum.
-- Comparisons are strict, so a monitor lying exactly on a centre line is not
-- counted in any quadrant.
CREATE TABLE ne AS
SELECT
    l.state_name,
    COUNT(DISTINCT l.latitude, l.longitude) AS value
FROM summary l
INNER JOIN statesaux s
    ON l.state_name = s.state_name
WHERE l.latitude > s.centerLat
  AND l.longitude > s.centerLon
GROUP BY l.state_name;

CREATE TABLE nw AS
SELECT
    l.state_name,
    COUNT(DISTINCT l.latitude, l.longitude) AS value
FROM summary l
INNER JOIN statesaux s
    ON l.state_name = s.state_name
WHERE l.latitude > s.centerLat
  AND l.longitude < s.centerLon
GROUP BY l.state_name;

CREATE TABLE se AS
SELECT
    l.state_name,
    COUNT(DISTINCT l.latitude, l.longitude) AS value
FROM summary l
INNER JOIN statesaux s
    ON l.state_name = s.state_name
WHERE l.latitude < s.centerLat
  AND l.longitude > s.centerLon
GROUP BY l.state_name;

CREATE TABLE sw AS
SELECT
    l.state_name,
    COUNT(DISTINCT l.latitude, l.longitude) AS value
FROM summary l
INNER JOIN statesaux s
    ON l.state_name = s.state_name
WHERE l.latitude < s.centerLat
  AND l.longitude < s.centerLon
GROUP BY l.state_name;

-- Final report: one row per state in `statesaux`.
-- `statesaux` already carries state_code and state_name, so neither `summary`
-- nor `states` is joined again (the latter previously had no join predicate
-- and produced a Cartesian product).
-- LEFT JOINs keep states that have no monitor in some quadrant; the missing
-- count is reported as 0 instead of dropping the state from the result.
SELECT
    s.state_code,
    s.state_name,
    COALESCE(nw.value, 0) AS NW,
    COALESCE(ne.value, 0) AS NE,
    COALESCE(sw.value, 0) AS SW,
    COALESCE(se.value, 0) AS SE
FROM statesaux s
LEFT JOIN nw
    ON nw.state_name = s.state_name
LEFT JOIN ne
    ON ne.state_name = s.state_name
LEFT JOIN sw
    ON sw.state_name = s.state_name
LEFT JOIN se
    ON se.state_name = s.state_name
ORDER BY state_name ASC;



Overwriting q5.hql
