In [1]:
sc

<pyspark.context.SparkContext at 0x633ab00>

### Finding the average amount of people using the MTA during different weather conditions during 2011-2016

In [2]:
def mapper1(index,data):
    # skip header row
    if index==0:
        data.next()
    import csv
    reader = csv.reader(data)
    for row in reader:
        event = row[4]
        numPeople = row[5].split('.')[0]
        if event == 'Normal':
            yield ('Clear Skies', int(numPeople))
        else:
            yield (event, int(numPeople))
        
def mapper2(index,data):
    # skip header row
    if index==0:
        data.next()
    import csv
    reader = csv.reader(data)
    for row in reader:
        date = row[1]
        event = row[4]
        if event == 'Normal':
            yield ((date, 'Clear Skies'), 1)
        else:
            yield ((date, event), 1)

mtaData = sc.textFile('dataset/clean-mta-data/clean-mta-data.csv',use_unicode=False).cache()

# sum up all the entry by weather
# returns (event, total_people)
rdd1 = mtaData.mapPartitionsWithIndex(mapper1) \
                .reduceByKey(lambda x,y: x+y)

# count how many days that have certain weather
# d = date; e = event; c = count;
# returns (event, count)
rdd2 = mtaData.mapPartitionsWithIndex(mapper2) \
                .reduceByKey(lambda x,y: x+y) \
                .map(lambda ((d, e), c): (e, c)) \
                .reduceByKey(lambda x,y: x+y)

# get average Entry in day for weather
# d = date; e = event; c = count; s = summation
# returns (event, average_peple_per_weather)
rdd3 = rdd1.join(rdd2) \
           .map(lambda (e, (s, c)): (e, s/c)) \
           .collect()
rdd3

[('Rain', 159L),
 ('Rain , Snow', 159),
 ('Fog , Snow', 126),
 ('Fog , Rain', 156),
 ('Clear Skies', 157L),
 ('Snow', 151),
 ('Fog', 171),
 ('Fog , Rain , Snow', 146),
 ('Thunderstorm', 75)]

### Find the average amount of people using the MTA by day of the week and weather condition

In [3]:
def mapper3(index,data):
    # skip header row
    if index==0:
        data.next()
    import csv
    import datetime
    reader = csv.reader(data)
    for row in reader:
        date = datetime.datetime.strptime(row[1], '%m/%d/%Y').weekday()
        event = row[4]
        numPeople = row[5].split('.')[0]
        if event == 'Normal':
            yield ((date, 'Clear Skies'), int(numPeople))
        else:
            yield ((date,event), int(numPeople))
            
def mapper4(index,data):
    # skip header row
    if index==0:
        data.next()
    import csv
    import datetime
    reader = csv.reader(data)
    for row in reader:
        date = datetime.datetime.strptime(row[1], '%m/%d/%Y').weekday()
        event = row[4]
        if event == 'Normal':
            yield ((date, 'Clear Skies'), 1)
        else:
            yield ((date, event), 1)

import operator
mtaData = sc.textFile('dataset/clean-mta-data/clean-mta-data.csv',use_unicode=False).cache()

# get the amount of people using the mta on each day of the week for different weathers
# returns ((weekday, event), summation_of_passengers)
rdd4 = mtaData.mapPartitionsWithIndex(mapper3)  \
                .reduceByKey(lambda x,y: x+y)

# get the number of days by each day of the week (hour) for different weathers
# returns ((weekday, event), summation_of_day_hour)
rdd5 = mtaData.mapPartitionsWithIndex(mapper4) \
                .reduceByKey(lambda x,y: x+y) \
    
rdd6 = rdd4.join(rdd5) \
           .map(lambda (e, (s, c)): (e, s/c)) \
           .collect()
rdd6

[((6, 'Rain'), 86),
 ((6, 'Clear Skies'), 88),
 ((1, 'Fog , Snow'), 151),
 ((0, 'Fog , Snow'), 118),
 ((5, 'Fog , Rain , Snow'), 98),
 ((3, 'Rain , Snow'), 181),
 ((3, 'Fog , Rain , Snow'), 155),
 ((3, 'Clear Skies'), 185),
 ((5, 'Snow'), 107),
 ((2, 'Fog'), 194),
 ((3, 'Fog , Snow'), 136),
 ((4, 'Rain , Snow'), 184),
 ((0, 'Snow'), 173),
 ((2, 'Clear Skies'), 186),
 ((4, 'Snow'), 178),
 ((0, 'Clear Skies'), 173),
 ((3, 'Fog , Rain'), 177),
 ((1, 'Fog'), 164),
 ((0, 'Rain , Snow'), 161),
 ((4, 'Fog , Rain'), 177),
 ((0, 'Fog , Rain'), 170),
 ((2, 'Snow'), 190),
 ((5, 'Fog , Snow'), 93),
 ((6, 'Fog'), 82),
 ((4, 'Fog , Snow'), 178),
 ((6, 'Snow'), 85),
 ((1, 'Fog , Rain'), 183),
 ((2, 'Rain , Snow'), 186),
 ((3, 'Snow'), 188),
 ((6, 'Rain , Snow'), 90),
 ((1, 'Fog , Rain , Snow'), 187),
 ((2, 'Fog , Snow'), 135),
 ((6, 'Fog , Rain'), 76),
 ((4, 'Fog , Rain , Snow'), 179),
 ((0, 'Rain'), 170),
 ((2, 'Fog , Rain'), 177),
 ((3, 'Fog'), 185),
 ((5, 'Rain , Snow'), 109),
 ((0, 'Fog , Rain , 