# Project - Spark

Large datasets are being analysed at
an unprecedented frequency. Several technologies have been
developed for this purpose, each with its own strengths and drawbacks
depending on the type of data being processed and on the
processing requirements.

This notebook uses Spark (through its RDD API) to answer a series of questions about a data set on air pollution in the USA.
In the accompanying report, we compare the performance
of five different technologies – MapReduce, Spark RDD,
SparkDF, Spark SQL and Hive.

# Q1:  Which states have more/less monitors? (Rank states!)

In [1]:
#Q1
# Rank states by their number of distinct monitors.
# A monitor is identified by its (latitude, longitude) pair, so many daily
# records from the same site count only once.

import pyspark
# Imported under its own name: aliasing it to `sum` would hide the builtin
# sum() from every cell that runs afterwards in the same session.
from operator import add


sc = pyspark.SparkContext('local[*]')

try:
    rdd = sc.textFile('./epa_hap_daily_summary-small.csv')
    header = rdd.first()
    # Drop the header row and empty lines, then split each record into columns.
    # NOTE(review): a plain split(',') shifts the column indices if any field
    # contains a quoted comma - confirm the data set has none.
    words = (rdd.filter(lambda line: line != header and len(line) > 0)
                .map(lambda line: line.split(',')))
    # Columns 5 and 6 are read as latitude/longitude, column 24 as the state.
    site_state = words.map(lambda cols: ((cols[5], cols[6]), cols[24]))
    # Identical coordinates -> same monitor: keep a single state entry for it.
    unique_sites = site_state.reduceByKey(lambda a, b: a)
    monitors_per_state = unique_sites.map(lambda kv: (kv[1], 1)).reduceByKey(add)
    state_sorted = monitors_per_state.sortBy(lambda kv: kv[1], False)  # descending
    for k, v in state_sorted.collect():
        print(k, v)

except Exception as e:
    # Best-effort notebook cell: report the failure instead of raising.
    print(e)
finally:
    # Always release the context (even on interrupt); otherwise the next cell
    # cannot create its own SparkContext.
    sc.stop()

California 170
Texas 133
Minnesota 94
Michigan 92
Ohio 91
New York 67
South Carolina 64
Montana 62
Pennsylvania 61
Florida 55
Indiana 52
Colorado 51
North Carolina 50
Illinois 48
Washington 43
Louisiana 41
Arizona 38
Kansas 37
Georgia 35
Kentucky 34
Oregon 32
Alabama 31
Tennessee 29
Wisconsin 26
New Jersey 24
Vermont 22
Oklahoma 22
Mississippi 21
Maine 21
Massachusetts 19
Virginia 19
Missouri 18
Iowa 18
New Mexico 18
Country Of Mexico 18
New Hampshire 17
Maryland 17
Idaho 17
Connecticut 15
Rhode Island 13
Utah 12
Alaska 12
Arkansas 11
West Virginia 10
Nevada 9
Wyoming 9
North Dakota 7
South Dakota 7
Nebraska 6
Delaware 6
Puerto Rico 6
Virgin Islands 6
District Of Columbia 5
Hawaii 5


# Q2: Which counties have the best/worst air quality? (Rank counties considering pollutants’ level!)

In [3]:
#Q2
import pyspark 
from operator import add as sum

sc = pyspark.SparkContext('local[*]')

try:
    rdd = sc.textFile('./epa_hap_daily_summary-small.csv')
    header = rdd.first()
    # Drop the header row and empty lines, then split each record into columns.
    # NOTE(review): a plain split(',') shifts the column indices if any field
    # contains a quoted comma - confirm the data set has none.
    words = (rdd.filter(lambda line: line != header and len(line) > 0)
                .map(lambda line: line.split(',')))
    # Key: column 25 (used as the county name); value: column 16 (arithmetic
    # mean), parsed to float once so the reducer works on numbers only.
    # NOTE(review): counties that share a name across different states are
    # merged into one entry; key by (state, county) if that is not intended.
    counties = words.map(lambda cols: (cols[25], float(cols[16])))
    # Accumulate (sum, count) per county and divide to obtain the mean.
    totals = (counties.mapValues(lambda v: (v, 1))
                      .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])))
    counties_mean = totals.mapValues(lambda t: t[0] / t[1])
    counties_sorted = counties_mean.sortBy(lambda kv: kv[1], False)  # descending

    for k, v in counties_sorted.collect():
        print(k, v)

except Exception as e:
    # Best-effort notebook cell: report the failure instead of raising.
    print(e)
finally:
    # Always release the context (even on interrupt); otherwise the next cell
    # cannot create its own SparkContext.
    sc.stop()

Tipton 2556.0
Nassau 19.0
Columbiana 7.385690735785953
CHIHUAHUA STATE 4.5121875
Caldwell 4.116666666666667
Madera 3.7393
Oakland 2.888877848101266
Duval 2.7794603978494625
Kearny 2.3753333333333333
Bucks 2.3674999999999997
San Luis Obispo 2.3333333333333335
Edgecombe 2.325
Pawnee 2.2941176470588234
Westchester 2.239375
Johnston 2.225
Hartford 2.0787055896226416
Granville 2.0285714285714285
Asotin 2.025
Duplin 2.0
Boulder 1.960470901234568
Crittenden 1.9000000000000001
Yancey 1.9
Los Angeles 1.8391350859879247
Iberville 1.8213132266666665
Caswell 1.80075
Pitt 1.7999999999999998
Clinton 1.7583018867924527
Wayne 1.7137964212983239
Imperial 1.6478600515463917
Ozaukee 1.55733332
Stillwater 1.5384615384615385
Crow Wing 1.532046511627907
Boyd 1.485702254901961
Gloucester 1.4814285714285715
Henderson 1.4295509090909093
Kennebec 1.375
Stanislaus 1.2781941269841275
Muscatine 1.2375
Deer Lodge 1.2135793388429752
Mesa 1.2109524924242425
Spokane 1.1469733510638298
East Baton Rouge 1.14447284566596

Citrus 0.000773837638376384
St John 0.0007534323432343237
Socorro 0.0007447961630695445
Cooper 0.0007177586206896551
Sandusky 0.0007133333333333333
Garrett 0.0007109401709401713
Tucker 0.0007039563437926332
Yakima 0.000703049645390071
Bennington 0.000701100569259962
Charlton 0.0006983966244725742
Deschutes 0.0006914285714285715
Routt 0.0006742857142857143
Blount 0.0006720827586206897
Thurston 0.0006664912280701753
Klickitat 0.0006641159135559924
Cedar 0.0006619471153846157
Culberson 0.000659703808180536
Winn 0.000658964143426295
Mineral 0.0006558108108108109
Newton 0.0006340735294117648
Adair 0.0006306818181818183
St. Joseph 0.000625
Mohave 0.0006216441441441443
Taney 0.0006190759753593432
Litchfield 0.0006181384248210027
Jeff Davis 0.0006178873239436621
Vilas 0.0006
Dona Ana 0.0005941818181818182
Avery 0.0005873626373626376
Laurens 0.0005806451612903226
Grand 0.0005775
Chaves 0.0005646680942184157
Brewster 0.0005631002331002332
Schoolcraft 0.0005594671403197158
Coconino 0.000553397947

# Q.3 Which states have the best/worst air quality in each year? (Rank states per year considering pollutants’ level!)

In [51]:
#Q3
import pyspark 
from operator import add as sum

sc = pyspark.SparkContext('local[*]')

try:
    rdd = sc.textFile('./epa_hap_daily_summary-small.csv')
    header = rdd.first()
    # Drop the header row and empty lines, then split each record into columns.
    # NOTE(review): a plain split(',') shifts the column indices if any field
    # contains a quoted comma - confirm the data set has none.
    words = (rdd.filter(lambda line: line != header and len(line) > 0)
                .map(lambda line: line.split(',')))
    # Key: (state, year) = (column 24, first four characters of column 11);
    # value: column 16 (arithmetic mean), parsed to float once.
    state_year_pollut = words.map(
        lambda cols: ((cols[24], cols[11][:4]), float(cols[16])))
    # Accumulate (sum, count) per (state, year) and divide to obtain the mean.
    totals = (state_year_pollut.mapValues(lambda v: (v, 1))
                               .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])))
    sy_mean = totals.mapValues(lambda t: t[0] / t[1])
    # NOTE(review): this orders all (state, year) pairs by mean, descending;
    # it does not group the ranking by year. Sorting on (year, -mean) would
    # give a per-year ranking if that is what the question requires.
    sy_sorted = sy_mean.sortBy(lambda kv: kv[1], False)

    for k, v in sy_sorted.collect():
        print(k, v)

except Exception as e:
    # Best-effort notebook cell: report the failure instead of raising.
    print(e)
finally:
    # Always release the context (even on interrupt); otherwise the next cell
    # cannot create its own SparkContext.
    sc.stop()

('Tennessee', '1990') 170.40093066666665
('Country Of Mexico', '1995') 8.46
('Michigan', '2001') 4.506138716367713
('Massachusetts', '1993') 4.305833285714285
('Colorado', '2017') 4.2250000000000005
('Indiana', '1990') 4.098978378378379
('Illinois', '1992') 3.911825163398692
('Massachusetts', '1994') 3.4609906122448977
('Louisiana', '1995') 3.364348865853659
('Rhode Island', '1994') 3.3635714000000005
('Alabama', '1996') 3.226314057971015
('Connecticut', '1993') 3.0975461538461535
('Massachusetts', '1990') 3.0246823529411766
('Wisconsin', '1994') 2.9504833333333336
('Indiana', '1993') 2.8972258064516128
('Rhode Island', '1995') 2.7313043478260868
('Delaware', '1993') 2.723077
('Indiana', '1992') 2.6606363636363635
('Pennsylvania', '1993') 2.5750862068965517
('District Of Columbia', '1995') 2.5047463333333333
('Wisconsin', '1995') 2.5022444333333334
('Wisconsin', '1998') 2.4918918918918918
('Connecticut', '1998') 2.3851474516129034
('Country Of Mexico', '1993') 2.38
('Connecticut', '199

('Connecticut', '2010') 0.37237493220338985
('Minnesota', '1992') 0.3721225333333333
('Iowa', '2012') 0.3720322357723577
('Rhode Island', '2005') 0.371447238888889
('Massachusetts', '2006') 0.37096272079772075
('West Virginia', '2015') 0.3702613065326634
('Virginia', '2002') 0.36980936303630374
('Massachusetts', '2005') 0.3681584166666667
('Colorado', '2014') 0.3660351738241309
('Mississippi', '2006') 0.36599312499999986
('Illinois', '2012') 0.36544118181818186
('Florida', '2009') 0.3650031369150779
('Florida', '2000') 0.36245085093167706
('Pennsylvania', '2001') 0.3617021698717948
('Kentucky', '2005') 0.36043251724137937
('Arkansas', '1999') 0.36016020833333334
('Utah', '2015') 0.35925959999999996
('Wisconsin', '2015') 0.35923666666666665
('Maryland', '1992') 0.3572955974842767
('Michigan', '1996') 0.35712706788511756
('Oregon', '2007') 0.35673638709677424
('Louisiana', '2006') 0.3556575491679274
('Maine', '1997') 0.35528495833333334
('New Jersey', '2004') 0.3550960498614959
('Utah', 

('Vermont', '2016') 0.20780864197530863
('Maryland', '2005') 0.20677294444444444
('Pennsylvania', '2010') 0.20589755064935072
('Minnesota', '2005') 0.20587308781869695
('Ohio', '2008') 0.20545059308072489
('Tennessee', '2004') 0.2054458302583026
('Utah', '2005') 0.20484141791044774
('Illinois', '2006') 0.20429677861163217
('Virginia', '2004') 0.20417014613180512
('Tennessee', '2008') 0.2031198245614035
('Utah', '2001') 0.2017917021276596
('Ohio', '2014') 0.20172593220338986
('New York', '2013') 0.20153687310344834
('South Carolina', '2000') 0.20153597260273978
('Ohio', '2011') 0.20050315548780487
('Illinois', '2009') 0.20044349173553716
('Delaware', '2006') 0.20009721804511277
('West Virginia', '2014') 0.1999353233830846
('West Virginia', '2012') 0.19949305555555552
('South Carolina', '2004') 0.19915309716599197
('Washington', '2001') 0.19886409090909096
('Ohio', '1995') 0.19825714285714285
('Georgia', '2005') 0.1980022181122449
('Utah', '2006') 0.19790330612244897
('Iowa', '2003') 0.1

('Country Of Mexico', '2016') 0.0144375
('North Carolina', '1990') 0.0143
('Vermont', '2017') 0.01333333333333333
('Montana', '1997') 0.012918898305084747
('Alabama', '2004') 0.012889447513812153
('Montana', '1999') 0.011428888888888891
('North Dakota', '2003') 0.011235400000000001
('West Virginia', '2001') 0.0107925
('Mississippi', '2009') 0.010703125
('Alabama', '2016') 0.010492307692307693
('Puerto Rico', '1990') 0.01005
('Tennessee', '1992') 0.009468055555555556
('Arkansas', '2003') 0.009399148936170214
('Georgia', '1990') 0.008366666666666666
('Alabama', '2008') 0.008139459459459458
('Connecticut', '1990') 0.0081
('Alabama', '2015') 0.007728571428571429
('Montana', '2000') 0.0072489325842696635
('Colorado', '1992') 0.00671141935483871
('New Mexico', '2015') 0.006607999999999999
('Michigan', '1990') 0.006559896373056996
('South Carolina', '1993') 0.005991228070175439
('Tennessee', '1997') 0.005985555555555556
('Missouri', '1990') 0.0056
('Kansas', '2009') 0.00555175
('Colorado', '1

('South Dakota', '2015') 0.00047474747474747476
('South Dakota', '1999') 0.0004745454545454546
('Virgin Islands', '1995') 0.0004714285714285714
('Arkansas', '1992') 0.0004702272727272727
('Idaho', '2013') 0.0004691358024691359
('Wyoming', '1992') 0.0004655813953488372
('New Mexico', '1998') 0.0004642666666666666
('Wyoming', '2012') 0.0004619883040935673
('South Dakota', '2013') 0.0004615384615384616
('Alaska', '2007') 0.0004602941176470588
('Arkansas', '2016') 0.00045652173913043474
('Idaho', '1996') 0.00045601941747572814
('New Mexico', '2010') 0.0004478260869565218
('New Mexico', '1996') 0.0004460344827586207
('Montana', '2007') 0.00044281250000000017
('Hawaii', '1995') 0.0004421874999999999
('Alaska', '1990') 0.00044208333333333334
('Idaho', '1994') 0.00043571428571428575
('Utah', '1994') 0.00043070422535211264
('Arkansas', '1993') 0.00042833333333333335
('Nebraska', '2014') 0.0004246575342465754
('Wyoming', '2015') 0.00042105263157894745
('Nevada', '1990') 0.00042080000000000004
('

# Q.4 For each state, what is the average distance of the monitors to the state center? 

In [8]:
#Q4
import pyspark 
from operator import add as sum

# Kilometres used per degree of latitude/longitude in the conversion below.
KM_PER_DEGREE = 111

# state name -> [centre latitude, centre longitude]; the centre is the midpoint
# of columns 2/3 and of columns 4/5 of usa_states.csv
# (presumably the min/max latitude and min/max longitude - TODO confirm).
coordinates = {}

with open('usa_states.csv', 'r') as coord:
    next(coord, None)  # the first line is a header and is skipped
    for line in coord:
        line = line.strip()
        if not line:
            continue  # ignore blank lines instead of failing on them
        fields = line.split(',')
        media_lat = (float(fields[2]) + float(fields[3])) / 2
        media_long = (float(fields[4]) + float(fields[5])) / 2
        coordinates[fields[1]] = [media_lat, media_long]


sc = pyspark.SparkContext('local[*]')


try:
    rdd = sc.textFile('./epa_hap_daily_summary-small.csv')
    header = rdd.first()
    # Drop the header row and empty lines, then split each record into columns.
    # NOTE(review): a plain split(',') shifts the column indices if any field
    # contains a quoted comma - confirm the data set has none.
    words = (rdd.filter(lambda line: line != header and len(line) > 0)
                .map(lambda line: line.split(',')))
    # Columns 5 and 6 are read as latitude/longitude, column 24 as the state.
    state_coords = words.map(lambda cols: ((cols[5], cols[6]), cols[24]))
    # Identical coordinates -> same monitor: keep a single state entry for it.
    unique_machines = state_coords.reduceByKey(lambda a, b: a)
    # Re-key by state and keep only states present in usa_states.csv.
    state_in_dict = (unique_machines.map(lambda kv: (kv[1], kv[0]))
                                    .filter(lambda kv: kv[0] in coordinates))
    # Per monitor: (|delta lat|, |delta lon|, 1) relative to the state centre,
    # in degrees. Taking abs() here keeps the reducer a plain element-wise sum.
    offsets = state_in_dict.map(lambda kv: (kv[0], (
        abs(float(kv[1][0]) - coordinates[kv[0]][0]),
        abs(float(kv[1][1]) - coordinates[kv[0]][1]),
        1,
    )))
    # Tuples in, tuples out: reduceByKey needs the same shape on both sides.
    sy_counts = offsets.reduceByKey(
        lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] + b[2]))
    # Mean offset per axis converted to km, then combined with Pythagoras.
    # NOTE(review): this is the hypotenuse of the mean |delta lat| and mean
    # |delta lon|, which is not the same as the mean of the per-monitor
    # distances, and longitude is not scaled by cos(latitude). Confirm that
    # this approximation is the intended metric.
    sy_pitagoras = sy_counts.mapValues(lambda t: (
        (t[0] / t[2] * KM_PER_DEGREE) ** 2
        + (t[1] / t[2] * KM_PER_DEGREE) ** 2
    ) ** 0.5)
    sy_sorted = sy_pitagoras.sortBy(lambda kv: kv[0])  # alphabetical by state

    for line in sy_sorted.collect():
        print(line)

except Exception as e:
    # Best-effort notebook cell: report the failure instead of raising.
    print(e)
finally:
    # Always release the context (even on interrupt); otherwise the next cell
    # cannot create its own SparkContext.
    sc.stop()

('Alabama', 154.49260106651096)
('Alaska', 578.2547257424047)
('Arizona', 164.10851532123158)
('Arkansas', 139.88499280806286)
('California', 295.95600977387613)
('Colorado', 162.79022014593346)
('Connecticut', 48.010830374129405)
('Delaware', 49.430957208141784)
('Florida', 324.59197725295155)
('Georgia', 171.86252921573762)
('Hawaii', 147.5039098705215)
('Idaho', 278.0578748063309)
('Illinois', 221.7342238643998)
('Indiana', 171.46564194366175)
('Iowa', 199.63252527145843)
('Kansas', 285.9355026901175)
('Kentucky', 212.76331231552336)
('Louisiana', 169.43912820444015)
('Maine', 161.9282055931588)
('Maryland', 88.90940287294285)
('Massachusetts', 88.93366950483129)
('Michigan', 308.29023891412436)
('Minnesota', 175.28805993470408)
('Mississippi', 166.05266630137885)
('Missouri', 225.74263745981838)
('Montana', 270.8068208508409)
('Nebraska', 305.52929788978076)
('Nevada', 310.21511694598934)
('New Hampshire', 109.73569277263323)
('New Jersey', 78.76283305562232)
('New Mexico', 172.101

# Q.5 How many sensors per quadrant in each state?

In [2]:
#Q5
# Count the distinct monitors in each quadrant (SW, NW, SE, NE) of every state.
# Quadrants are obtained by splitting the state's latitude/longitude range at
# its midpoint.

import pyspark
# Imported under its own name: aliasing it to `sum` would hide the builtin
# sum() from every cell that runs afterwards in the same session.
from operator import add

# state name -> [MinLat, MaxLat, mid lat, MinLon, MaxLon, mid lon]
# (min/max values are kept as read from the file, midpoints are floats).
coordinates = {}

with open('usa_states.csv', 'r') as coord:
    next(coord, None)  # the first line is a header and is skipped
    for line in coord:
        line = line.strip()
        if not line:
            continue  # ignore blank lines instead of failing on them
        fields = line.split(',')
        media_lat = (float(fields[2]) + float(fields[3])) / 2
        media_long = (float(fields[4]) + float(fields[5])) / 2
        coordinates[fields[1]] = [fields[2], fields[3], media_lat,
                                  fields[4], fields[5], media_long]


def _quadrant(state, lat, lon):
    """Return "SW", "NW", "SE" or "NE" for a monitor, or None if unclassified.

    All bounds are strict, so a monitor lying exactly on the mid-line or on
    the edge of the state's range, or outside that range, gets no quadrant
    and is therefore not counted.
    NOTE(review): confirm that dropping such monitors is intended.
    """
    min_lat, max_lat, mid_lat, min_lon, max_lon, mid_lon = (
        float(value) for value in coordinates[state])
    if min_lat < lat < mid_lat:
        north_south = "S"
    elif mid_lat < lat < max_lat:
        north_south = "N"
    else:
        return None
    # Lower longitude half is labelled West, upper half East.
    if min_lon < lon < mid_lon:
        east_west = "W"
    elif mid_lon < lon < max_lon:
        east_west = "E"
    else:
        return None
    return north_south + east_west


sc = pyspark.SparkContext('local[*]')

try:
    rdd = sc.textFile('./epa_hap_daily_summary-small.csv')
    header = rdd.first()
    # Drop the header row and empty lines, then split each record into columns.
    # NOTE(review): a plain split(',') shifts the column indices if any field
    # contains a quoted comma - confirm the data set has none.
    words = (rdd.filter(lambda line: line != header and len(line) > 0)
                .map(lambda line: line.split(',')))
    # Columns 5 and 6 are read as latitude/longitude, column 24 as the state.
    state_coords = words.map(lambda cols: ((cols[5], cols[6]), cols[24]))
    # Identical coordinates -> same monitor: keep a single state entry for it.
    unique_machines = state_coords.reduceByKey(lambda a, b: a)
    # Re-key by state and keep only states present in usa_states.csv.
    state_coords_in_dict = (unique_machines.map(lambda kv: (kv[1], kv[0]))
                                           .filter(lambda kv: kv[0] in coordinates))

    # Label every monitor with its quadrant in a single pass; monitors that
    # fall in no quadrant are discarded.
    all_quads = (
        state_coords_in_dict
        .map(lambda kv: ((kv[0], _quadrant(kv[0], float(kv[1][0]), float(kv[1][1]))), 1))
        .filter(lambda kv: kv[0][1] is not None)
    )
    all_quads_sum = all_quads.reduceByKey(add)
    all_quads_ord = all_quads_sum.sortBy(lambda kv: kv[0])  # by (state, quadrant)

    for line in all_quads_ord.collect():
        print(line)

except Exception as e:
    # Best-effort notebook cell: report the failure instead of raising.
    print(e)
finally:
    # Always release the context (even on interrupt); otherwise the next cell
    # cannot create its own SparkContext.
    sc.stop()

(('Alabama', 'NE'), 5)
(('Alabama', 'NW'), 14)
(('Alabama', 'SE'), 5)
(('Alabama', 'SW'), 7)
(('Alaska', 'NE'), 4)
(('Alaska', 'NW'), 3)
(('Alaska', 'SE'), 2)
(('Alaska', 'SW'), 3)
(('Arizona', 'NE'), 2)
(('Arizona', 'NW'), 10)
(('Arizona', 'SE'), 16)
(('Arizona', 'SW'), 10)
(('Arkansas', 'NE'), 2)
(('Arkansas', 'NW'), 3)
(('Arkansas', 'SE'), 1)
(('Arkansas', 'SW'), 5)
(('California', 'NE'), 2)
(('California', 'NW'), 84)
(('California', 'SE'), 68)
(('California', 'SW'), 16)
(('Colorado', 'NE'), 25)
(('Colorado', 'NW'), 17)
(('Colorado', 'SE'), 4)
(('Colorado', 'SW'), 5)
(('Connecticut', 'NE'), 5)
(('Connecticut', 'NW'), 2)
(('Connecticut', 'SW'), 8)
(('Delaware', 'NW'), 4)
(('Delaware', 'SW'), 2)
(('Florida', 'NE'), 27)
(('Florida', 'NW'), 5)
(('Florida', 'SE'), 23)
(('Georgia', 'NE'), 4)
(('Georgia', 'NW'), 21)
(('Georgia', 'SE'), 5)
(('Georgia', 'SW'), 5)
(('Hawaii', 'NE'), 2)
(('Hawaii', 'NW'), 2)
(('Hawaii', 'SE'), 1)
(('Idaho', 'NW'), 7)
(('Idaho', 'SE'), 3)
(('Idaho', 'SW'), 7)
(