In [113]:
# Small demo RDD holding the integers 0..9. The range is passed directly:
# PySpark slices a range per partition itself, so building a list first is
# unnecessary work.
rdd0 = sc.parallelize(range(10))

In [114]:
# The five largest elements of rdd0, returned in descending order.
rdd0.top(5)

[9, 8, 7, 6, 5]

In [115]:
# RDD of text lines read from book.txt (one element per line).
rdd = sc.textFile('book.txt')

In [116]:
# Peek at the first line of the file.
rdd.first()

'The Project Gutenberg EBook of English Coins and Tokens, by '

In [117]:
# Split every line on whitespace and flatten the results into one RDD of words.
(rdd.flatMap(lambda line: line.split())
    .take(3))

['The', 'Project', 'Gutenberg']

In [118]:
def mapper(rows):
    """Flatten one partition's lines into individual words.

    Args:
        rows: iterable of text lines (the partition handed over by
            ``mapPartitions``).

    Yields:
        Every whitespace-separated word of every line, in input order.
    """
    for line in rows:
        yield from line.split()
# Pair every word with a count of 1 (the usual first step of a word count).
(rdd.mapPartitions(mapper)
    .map(lambda word: (word, 1))
    .take(3))

[('The', 1), ('Project', 1), ('Gutenberg', 1)]

In [120]:
def mapper(_, rows):
    """Flatten one partition's lines into words, ignoring the partition index.

    Args:
        _: partition index supplied by ``mapPartitionsWithIndex`` (unused).
        rows: iterable of text lines for this partition.

    Returns:
        A generator over every whitespace-separated word, in input order.
    """
    return (word for line in rows for word in line.split())
# Classic word count: emit (word, 1) pairs, then sum the ones per word.
(rdd.mapPartitionsWithIndex(mapper)
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
    .take(3))

[('The', 268), ('Project', 79), ('EBook', 2)]

## Task 1: Average SAT math score per borough for large schools (more than 500 students)

## Using RDD

In [27]:
# File names of the two input CSVs: SAT results and the high-school directory.
SAT_FN = 'SAT_Results.csv'
HSD_FN = 'DOE_High_School_Directory_2014-2015.csv'

In [28]:
# SAT results as an RDD of raw text lines; cached because later cells reuse it.
sat = sc.textFile(SAT_FN).cache()

In [34]:
# Show each header column next to its positional index, for use when indexing fields.
list(enumerate(sat.first().split(',')))

[(0, 'DBN'),
 (1, 'SCHOOL NAME'),
 (2, 'Num of SAT Test Takers'),
 (3, 'SAT Critical Reading Avg. Score'),
 (4, 'SAT Math Avg. Score'),
 (5, 'SAT Writing Avg. Score')]

In [70]:
def extractScore(pid,rows):
    """Yield (DBN, test-taker count, math average) string triples from SAT rows.

    Intended for ``mapPartitionsWithIndex``. Rows are parsed with the ``csv``
    module so that quoted fields containing commas (e.g. in the school name)
    do not shift the column positions, which a plain ``split(',')`` would do.

    Args:
        pid: partition index; the first line of partition 0 is skipped as the
            header (assumes the header line lands in partition 0).
        rows: iterator over the partition's raw CSV lines.

    Yields:
        ``(fields[0], fields[2], fields[4])`` as strings, i.e. DBN, number of
        test takers and math average, for rows whose test-taker field is not
        the ``'s'`` marker (presumably a suppressed value; confirm against the
        data dictionary).
    """
    import csv  # local import keeps the function self-contained when shipped to executors

    if pid == 0:
        # A default avoids StopIteration (a RuntimeError inside a generator)
        # when partition 0 happens to be empty.
        next(rows, None)
    for fields in csv.reader(rows):
        # Blank or truncated lines have too few fields to index safely.
        if len(fields) > 4 and fields[2] != 's':
            yield (fields[0], fields[2], fields[4])

# Apply the extractor to every partition; elements are (DBN, takers, math) string triples.
satScores=sat.mapPartitionsWithIndex(extractScore)
satScores.take(5)

[('02M047', '16', '400'),
 ('21K410', '475', '437'),
 ('30Q301', '98', '440'),
 ('17K382', '59', '374'),
 ('18K637', '35', '381')]

In [78]:
def extractScore(pid,rows):
    """Yield (DBN, (test-taker count, math average)) pairs with integer values.

    Intended for ``mapPartitionsWithIndex``; the key/value shape makes the
    result directly joinable on DBN.

    Args:
        pid: partition index; the first line of partition 0 is skipped as the
            header (assumes the header line lands in partition 0).
        rows: iterator over the partition's raw CSV lines.

    Yields:
        ``(fields[0], (int(fields[2]), int(fields[4])))`` for rows whose
        test-taker field is not the ``'s'`` marker (presumably a suppressed
        value; confirm against the data dictionary).

    Raises:
        ValueError: if a retained row has a non-integer count or math score.
    """
    import csv  # local import keeps the function self-contained when shipped to executors

    if pid == 0:
        # A default avoids StopIteration (a RuntimeError inside a generator)
        # when partition 0 happens to be empty.
        next(rows, None)
    for fields in csv.reader(rows):
        # csv.reader returns [] for blank lines; skip anything too short to index.
        if len(fields) > 4 and fields[2] != 's':
            yield (fields[0], (int(fields[2]), int(fields[4])))

# Rebuild satScores with the csv-based extractor; elements are (DBN, (takers, math)) with ints.
satScores=sat.mapPartitionsWithIndex(extractScore)
satScores.take(5)

[('02M047', (16, 400)),
 ('21K410', (475, 437)),
 ('30Q301', (98, 440)),
 ('17K382', (59, 374)),
 ('18K637', (35, 381))]

In [75]:
# High-school directory as an RDD of raw text lines; cached because it is reused below.
schools = sc.textFile(HSD_FN).cache()
#list(enumerate(schools.first().split(',')))
schools

DOE_High_School_Directory_2014-2015.csv MapPartitionsRDD[128] at textFile at NativeMethodAccessorImpl.java:0

In [72]:
def extractSchool(pid,rows):
    """Yield (DBN, borough, total students) for well-formed directory rows.

    Intended for ``mapPartitionsWithIndex``.

    Args:
        pid: partition index; the first line of partition 0 is skipped as the
            header (assumes the header line lands in partition 0).
        rows: iterator over the partition's raw CSV lines.

    Yields:
        ``(fields[0], fields[2], int(fields[17]))`` for rows that parse into
        exactly 58 fields and whose field 17 consists only of digits.
    """
    import csv  # local import keeps the function self-contained when shipped to executors

    # Presumably the directory file's column count; other lengths indicate a
    # record that did not parse cleanly. TODO confirm against the file header.
    expectedFields = 58
    boroIdx = 2        # values look like borough names in the cell output
    studentsIdx = 17   # presumably the total_students column; TODO confirm

    if pid == 0:
        # A default avoids StopIteration (a RuntimeError inside a generator)
        # when partition 0 happens to be empty.
        next(rows, None)
    for fields in csv.reader(rows):
        if len(fields) == expectedFields and fields[studentsIdx].isdigit():
            yield (fields[0], fields[boroIdx], int(fields[studentsIdx]))

# Keep schools with more than 500 students, then reduce each record to a
# (DBN, borough) pair keyed by DBN.
largeSchools = (
    schools.mapPartitionsWithIndex(extractSchool)
           .filter(lambda school: school[2] > 500)
           .map(lambda school: (school[0], school[1]))
)
largeSchools.take(5)

[('01M450', 'Manhattan'),
 ('01M539', 'Manhattan'),
 ('01M696', 'Manhattan'),
 ('02M374', 'Manhattan'),
 ('02M400', 'Manhattan')]

In [68]:
def extractSchool1(pid,rows,minStudents=500):
    """Yield (DBN, borough) for well-formed directory rows of large schools.

    Variant of the extract-then-filter approach that applies the size filter
    inside the partition function. Intended for ``mapPartitionsWithIndex``.

    Args:
        pid: partition index; the first line of partition 0 is skipped as the
            header (assumes the header line lands in partition 0).
        rows: iterator over the partition's raw CSV lines.
        minStudents: a school is kept only if its student count is strictly
            greater than this value. Defaults to 500.

    Yields:
        ``(fields[0], fields[2])`` for rows that parse into exactly 58 fields,
        whose field 17 is all digits and exceeds ``minStudents``.
    """
    import csv  # local import keeps the function self-contained when shipped to executors

    # Presumably the directory file's column count; other lengths indicate a
    # record that did not parse cleanly. TODO confirm against the file header.
    expectedFields = 58
    boroIdx = 2        # values look like borough names in the cell output
    studentsIdx = 17   # presumably the total_students column; TODO confirm

    if pid == 0:
        # A default avoids StopIteration (a RuntimeError inside a generator)
        # when partition 0 happens to be empty.
        next(rows, None)
    for fields in csv.reader(rows):
        if len(fields) != expectedFields or not fields[studentsIdx].isdigit():
            continue
        if int(fields[studentsIdx]) > minStudents:
            yield (fields[0], fields[boroIdx])
# Same (DBN, borough) pairs as largeSchools, with the >500 filter applied inside the extractor.
largeSchools1=schools.mapPartitionsWithIndex(extractSchool1)
largeSchools1.take(5)

[('01M450', 'Manhattan'),
 ('01M539', 'Manhattan'),
 ('01M696', 'Manhattan'),
 ('02M374', 'Manhattan'),
 ('02M400', 'Manhattan')]

In [91]:
# Join on DBN -> (boro, (takers, math)); weight each school's math average by
# its number of test takers, sum per borough, then take the integer
# (floor-divided) weighted average.
(largeSchools.join(satScores)
    .values()
    .mapValues(lambda score: (score[0], score[0] * score[1]))
    .reduceByKey(lambda acc, cur: (acc[0] + cur[0], acc[1] + cur[1]))
    .mapValues(lambda sums: sums[1] // sums[0])
    .take(5))

[('Bronx', 470),
 ('Staten Island', 477),
 ('Manhattan', 514),
 ('Brooklyn', 487),
 ('Queens', 474)]

## Using Data Frames

In [107]:
# Load the SAT results, keep DBN plus the test-taker count and math average
# cast to int, and drop any row left with a null in those columns.
satRaw = spark.read.load(SAT_FN, format='csv', header=True, inferSchema=True)
satTyped = satRaw.select(
    'DBN',
    satRaw['Num of SAT Test Takers'].cast('int').alias('ntakers'),
    # Backticks quote the column name because it contains dots.
    satRaw['`SAT Math Avg. Score`'].cast('int').alias('score'),
).na.drop()

# total = math average * number of takers, per school.
dfScores = satTyped.select(
    'DBN',
    'ntakers',
    (satTyped.score * satTyped.ntakers).alias('total'),
)
dfScores.show()

+------+-------+------+
|   DBN|ntakers| total|
+------+-------+------+
|02M047|     16|  6400|
|21K410|    475|207575|
|30Q301|     98| 43120|
|17K382|     59| 22066|
|18K637|     35| 13335|
|32K403|     50| 18300|
|09X365|     54| 18306|
|11X270|     56| 22064|
|05M367|     33| 12078|
|14K404|     68| 24276|
|30Q575|    135| 66420|
|13K336|      9|  3366|
|04M635|     48| 17712|
|24Q264|     89| 40406|
|17K408|     57| 19494|
|19K618|     60| 22260|
|27Q309|     36| 13644|
|32K552|     67| 24388|
|13K499|     72| 26208|
|07X600|     76| 30400|
+------+-------+------+
only showing top 20 rows



In [111]:
# Load the school directory, discard rows without a borough, keep schools with
# more than 500 students, and reduce to the (dbn, boro) columns.
schoolsWithBoro = (
    spark.read.load(HSD_FN, format='csv', header=True, inferSchema=True)
         .na.drop(subset=['boro'])
)
isLargeSchool = schoolsWithBoro['total_students'] > 500
dfSchools = schoolsWithBoro.filter(isLargeSchool).select('dbn', 'boro')
dfSchools.head()

Row(dbn='01M450', boro='Manhattan')

In [None]:
# largeSchools.join(satScores,)\
#     .values()\
#     .mapValues(lambda x:(x[0],x[0]*x[1]))\
#     .reduceByKey(lambda x,y:(x[0]+y[0],x[1]+y[1]))\
#     .mapValues(lambda x:x[1]//x[0])\
#     .take(5)

In [112]:
# Inner-join large schools with their SAT totals on DBN, then sum takers and
# weighted totals per borough.
schoolScores = dfSchools.join(dfScores, dfSchools.dbn == dfScores.DBN, how='inner')
boroSums = schoolScores.groupBy('boro').sum('ntakers', 'total')

# Column order after the aggregation is: boro, sum of ntakers, sum of total.
weightedAvg = boroSums[2] / boroSums[1]
dfResults = boroSums.withColumn('avg', weightedAvg).select('boro', 'avg')
dfResults.show()

+-------------+------------------+
|         boro|               avg|
+-------------+------------------+
|       Queens| 474.3679400475233|
|     Brooklyn|487.46256168204246|
|Staten Island| 477.9099864130435|
|    Manhattan| 514.9312780989081|
|        Bronx|  470.198606271777|
+-------------+------------------+

