In [1]:
sc

<pyspark.context.SparkContext at 0x63f6b00>

### Finding the average amount of people using the MTA during different weather conditions during 2011-2016

In [2]:
def mapper1(index,data):
    # skip header row
    if index==0:
        data.next()
    import csv
    reader = csv.reader(data)
    for row in reader:
        if row[1].split('/')[2] != '2010':
            date = row[1]
            event = row[4]
            numPeople = row[5].split('.')[0]
            if event == 'Normal':
                yield ((date,'Clear Skies'), int(numPeople))
            else:
                yield ((date,event), int(numPeople))
        
def mapper2(index,data):
    # skip header row
    if index==0:
        data.next()
    import csv
    reader = csv.reader(data)
    for row in reader:
        if row[1].split('/')[2] != '2010':
            date = row[1]
            event = row[4]
            if event == 'Normal':
                yield ((date, 'Clear Skies'), 1)
            else:
                yield ((date, event), 1)

mtaData = sc.textFile('dataset/clean-mta-data/clean-mta-data.csv',use_unicode=False).cache()

# sum up all the entry by weather
# returns (event, total_people)
rdd1 = mtaData.mapPartitionsWithIndex(mapper1) \
                .reduceByKey(lambda x,y: x+y) \
                .map(lambda ((d,e),s): (e,s)) \
                .reduceByKey(lambda x,y: x+y)

# count how many days that have certain weather
# d = date; e = event; c = count;
# returns (event, count)
rdd2 = mtaData.mapPartitionsWithIndex(mapper2) \
               .distinct() \
               .map(lambda ((d,e),c): (e,c)) \
               .reduceByKey(lambda x,y: x+y)

# get average Entry in day for weather
# d = date; e = event; c = count; s = summation
# returns (event, average_peple_per_weather)
rdd3 = rdd1.join(rdd2) \
           .map(lambda (e, (s, c)): (e, s/c)) \
           .collect()
rdd3

[('Rain', 4715522L),
 ('Rain , Snow', 4619726),
 ('Fog , Snow', 3844535),
 ('Fog , Rain', 4722427),
 ('Clear Skies', 4639989L),
 ('Snow', 4354041),
 ('Fog', 5243310),
 ('Fog , Rain , Snow', 4365354),
 ('Thunderstorm', 2311591)]

### Find the average amount of people using the MTA by day of the week and weather condition

In [3]:
def mapper3(index,data):
    # skip header row
    if index==0:
        data.next()
    import csv
    import datetime
    reader = csv.reader(data)
    for row in reader:
        if row[1].split('/')[2] != '2010':
            date = datetime.datetime.strptime(row[1], '%m/%d/%Y').strftime("%A")
            event = row[4]
            numPeople = row[5].split('.')[0]
            if event == 'Normal':
                yield ((date, 'Clear Skies'), int(numPeople))
            else:
                yield ((date,event), int(numPeople))
            
def mapper4(index,data):
    # skip header row
    if index==0:
        data.next()
    import csv
    import datetime
    reader = csv.reader(data)
    for row in reader:
        if row[1].split('/')[2] != '2010':
            date = datetime.datetime.strptime(row[1], '%m/%d/%Y').strftime("%A")
            event = row[4]
            if event == 'Normal':
                yield ((date, 'Clear Skies'), 1)
            else:
                yield ((date, event), 1)

mtaData = sc.textFile('dataset/clean-mta-data/clean-mta-data.csv',use_unicode=False).cache()

# get the amount of people using the mta on each day of the week for different weathers
# returns ((weekday, event), summation_of_passengers)
rdd4 = mtaData.mapPartitionsWithIndex(mapper3)  \
                .reduceByKey(lambda x,y: x+y)

# get the number of days by each day of the week (hour) for different weathers
# returns ((weekday, event), summation_of_day_hour)
rdd5 = mtaData.mapPartitionsWithIndex(mapper4) \
                .distinct() \
                .reduceByKey(lambda x,y: x+y) \
        
# get the average number of people using the MTA on different days of the week and weather condition
# returns ((weekday, event), avg)
rdd6 = rdd4.join(rdd5) \
           .map(lambda (e, (s, c)): (e, s/c)) \
           .sortByKey(True) \
           .collect()

rdd6

[(('Friday', 'Clear Skies'), 1147607261),
 (('Friday', 'Fog , Rain'), 100895970),
 (('Friday', 'Fog , Rain , Snow'), 16036017),
 (('Friday', 'Fog , Snow'), 26094128),
 (('Friday', 'Rain'), 339233210),
 (('Friday', 'Rain , Snow'), 5548018),
 (('Friday', 'Snow'), 36474706),
 (('Monday', 'Clear Skies'), 1063972752),
 (('Monday', 'Fog'), 16086797),
 (('Monday', 'Fog , Rain'), 90753281),
 (('Monday', 'Fog , Rain , Snow'), 18767182),
 (('Monday', 'Fog , Snow'), 7936418),
 (('Monday', 'Rain'), 336108880),
 (('Monday', 'Rain , Snow'), 30425522),
 (('Monday', 'Snow'), 39979332),
 (('Saturday', 'Clear Skies'), 663604977),
 (('Saturday', 'Fog , Rain'), 24319362),
 (('Saturday', 'Fog , Rain , Snow'), 14357643),
 (('Saturday', 'Fog , Snow'), 18822966),
 (('Saturday', 'Rain'), 201335326),
 (('Saturday', 'Rain , Snow'), 5952711),
 (('Saturday', 'Snow'), 33980296),
 (('Sunday', 'Clear Skies'), 545105697),
 (('Sunday', 'Fog'), 2413018),
 (('Sunday', 'Fog , Rain'), 38022551),
 (('Sunday', 'Fog , Snow'),