In [1]:
import pandas as pd
df=pd.read_csv("yelp.csv")
df = df.replace(r'\n',' ', regex=True)
df.to_csv("yelp_modified.csv")

In [2]:
#avg no. of words in each review

In [3]:
%%file wordcount.py
from mrjob.job import MRJob
from statistics import mean
import re

class WordCountAvg(MRJob):

    def mapper(self, _, line):
        text = re.sub(r'[^\w\s]', '', line.lower())
        yield None, sum(1 for word in text.split() if word != "blacks")

    def reducer(self, _, word_counts):
        yield "average", mean(word_counts)

if __name__ == '__main__':
    WordCountAvg.run()

Writing wordcount.py


In [5]:
import wordcount
mr_job = wordcount.WordCountAvg(args=['yelp_modified.csv'])
with mr_job.make_runner() as runner:
    runner.run()
    for key, value in mr_job.parse_output(runner.cat_output()):
        print(key,value)

No configs specified for inline runner


average 130.42485751424857


In [6]:
#count of reviews by YY-MM

In [7]:
%%file countpermonth.py
from mrjob.job import MRJob
import re

class CountPerMonth(MRJob):

    def mapper(self, _, line):
        # Use regular expression to extract date
        date_match = re.search(r'\d{4}-\d{2}', line)
        if date_match:
            year_month = date_match.group(0)
            yield year_month, 1

    def reducer(self, month, counts):
        # Sum up the counts for each month
        yield month, sum(counts)

if __name__ == '__main__':
    CountPerMonth.run()

Writing countpermonth.py


In [9]:
import countpermonth
mr_job = countpermonth.CountPerMonth(args=['yelp_modified.csv'])
with mr_job.make_runner() as runner:
    runner.run()
    for key, value in mr_job.parse_output(runner.cat_output()):
        print(key,value)

No configs specified for inline runner


2012-02 219
2012-03 259
2012-04 265
2012-05 275
2012-06 272
2012-07 281
2010-09 150
2010-10 144
2010-11 147
2010-12 160
2011-01 239
2009-04 101
2009-05 101
2009-06 67
2009-07 95
2009-08 98
2009-09 113
2009-10 101
2009-11 78
2009-12 104
2010-01 154
2010-02 148
2010-03 168
2010-04 148
2010-05 154
2010-06 118
2010-07 160
2010-08 201
2012-11 208
2012-12 196
2013-01 52
2012-08 249
2012-09 239
2012-10 258
2011-08 266
2011-09 193
2011-10 204
2008-07 80
2008-08 75
2008-09 59
2008-10 79
2008-11 66
2008-12 71
2009-01 108
2009-02 79
2009-03 126
2011-02 216
2011-03 263
2011-04 263
2011-05 229
2011-06 230
2011-07 236
2005-04 1
2005-07 2
2005-12 1
2006-01 6
2006-02 9
2006-04 2
2006-05 1
2006-06 5
2006-07 2
2006-08 9
2006-09 4
2006-10 5
2006-11 6
2006-12 6
2007-01 14
2007-02 20
2007-03 42
2007-04 8
2007-05 23
2007-06 12
2007-07 35
2007-08 29
2007-09 26
2007-10 23
2007-11 28
2007-12 25
2008-01 46
2008-02 48
2008-03 47
2008-04 53
2008-05 65
2008-06 76
2011-11 203
2011-12 249
2012-01 304


In [10]:
#avg ratings marked by cool

In [11]:
%%file cool.py
from mrjob.job import MRJob
import csv
from statistics import mean
import re

class cool(MRJob):

    def mapper(self, _, line):
        row = next(csv.reader([line]))
        id, business_id, date, review_id, stars, text, type, user_id, cool, useful, funny = row
        if re.match(r'^[0-9]+$', stars) and int(cool) != 0:
            yield None, int(stars)

    def reducer(self, _, star_ratings):
        # Computing the average star rating of cool reviews
        ratings = list(star_ratings)
        if ratings:
            yield "average", mean(ratings)

if __name__ == '__main__':
    cool.run()

Writing cool.py


In [12]:
import cool

mr_job = cool.cool(args=['yelp_modified.csv'])

with mr_job.make_runner() as runner:
    runner.run()
    for key, value in mr_job.parse_output(runner.cat_output()):
        print("average", value)

No configs specified for inline runner


average 3.8649595687331537
