## HW 5.0
What is a data warehouse? What is a Star schema? When is it used?

## HW 5.1
In the database world, what is 3NF? Does machine learning use data in 3NF? If so, why?   
In what form does ML consume data?  
Why would one use log files that are denormalized?  


## HW 5.2

Using MRJob, implement a hashside (memory-backed, map-side) join that supports left, right and inner joins. 

Run your code on the data used in HW 4.4. (Recall HW 4.4: find the most frequent visitor of each page using mrjob and the output of 4.2, i.e., the transformed log file.) 

In this output, please include the webpage URL, webpage ID and visitor ID.

Justify which table you chose as the Left table in this hashside join.

Please report the number of rows resulting from:

1. Left joining Table Left with Table Right
2. Right joining Table Left with Table Right
3. Inner joining Table Left with Table Right



In [9]:
%load_ext autoreload
%autoreload 2

In [24]:
%%writefile MRJoin_5_2.py

from mrjob.job import MRJob, MRStep
import mrjob
import csv
import sys

class MRJoin(MRJob):
    """Memory-backed, map-side ("hashside") join of page URLs with the visit log.

    Every mapper loads ``processed_urls.data`` (lines such as
    ``1287,/autoroute``) into an in-memory dict and streams the visit log
    (lines such as ``V,1000,1,C,10001``) past it, joining on the page id.
    The in-memory URL table acts as the LEFT table and the streamed log as
    the RIGHT table.  The ``--joinType`` option selects the behaviour:

    * ``inner`` (the default, and any unrecognised value): only log records
      whose page id has a URL.
    * ``right``: every log record; the url is ``None`` for unknown ids.
    * ``left``: the inner-join rows plus one
      ``(url, None, None, None, None)`` row per URL that this mapper never
      matched.

    Output records have the form
    ``page_id -> (url, fields[0], fields[2], fields[3], fields[4])``.

    NOTE(review): for the left join, matched keys are remembered per mapper
    instance only.  If the log is split across several mapper tasks, a URL
    matched by one task would presumably still be emitted as unmatched by
    the others -- confirm the job runs with a single mapper, or add a
    reducer to de-duplicate.
    """

    # Class-level defaults are kept for backward compatibility; mapper_init()
    # rebinds fresh per-instance containers so that state is never shared
    # between job instances created in the same process.
    urls = {}  # key = pageId, value = url
    keys_emitted = set()  # Keys of all emitted urls. Used for left join.

    def configure_options(self):
        """Register the ``--joinType`` pass-through option (default "inner")."""
        super(MRJoin, self).configure_options()
        self.add_passthrough_option(
            '--joinType', type='str', default="inner")

    def load_options(self, args):
        """Copy the parsed ``--joinType`` value onto the job instance."""
        super(MRJoin, self).load_options(args)
        self.joinType = self.options.joinType

    def mapper_init(self):
        """Load the URL table into memory before any log line is mapped."""
        # Fresh containers: do not mutate the shared class-level defaults.
        self.urls = {}
        self.keys_emitted = set()

        # Line format:
        # 1287,/autoroute
        with open("processed_urls.data", "r") as f:
            for fields in csv.reader(f):
                if not fields:  # tolerate blank lines in the data file
                    continue
                self.urls[fields[0]] = fields[1]

    def mapper(self, line_no, line):
        """Join one visit-log line against the in-memory URL table.

        Yields at most one ``(page_id, (url, 'V', ..., visitor_id))`` pair,
        depending on the join type (see the class docstring).
        """
        # Line format:
        # V,1000,1,C,10001
        fields = next(csv.reader([line]))
        if not fields:  # blank input line: nothing to join
            return

        key = fields[1]
        url = self.urls.get(key)  # None when the page id has no URL

        # Only the right join keeps log records that have no matching URL.
        if url is None and self.joinType != "right":
            return

        if self.joinType == "left":
            # Remember what we have emitted so mapper_final can find the
            # URLs that never matched.
            self.keys_emitted.add(key)

        # Output format
        # pageid, url,V,1,C,10001
        yield key, (url, fields[0], fields[2], fields[3], fields[4])

    def mapper_final(self):
        """For a left join, emit every URL that matched no log record."""
        if self.joinType != "left":
            return

        # Emit all the remaining urls, padded with None for the log fields.
        for key in set(self.urls) - self.keys_emitted:
            yield key, (self.urls[key], None, None, None, None)

    def steps(self):
        """Single map-only step: the whole join happens in the mapper."""
        return [
            MRStep(mapper_init=self.mapper_init,
                   mapper=self.mapper,
                   mapper_final=self.mapper_final)
            ]
    
# Run the join job only when this file is executed as a script, not when it
# is imported (the notebook driver imports MRJoin from this module).
if __name__ == '__main__':
    MRJoin.run()


Overwriting MRJoin_5_2.py


In [27]:
from MRJoin_5_2 import MRJoin

# Run the join once per join type; report the row count and a small sample.
for joinType in ["left", "right", "inner"]:
    mr_job = MRJoin(args=['processed_anonymous-msweb.data', 
                        '--file', 'processed_urls.data', # broadcast to every mapper
                        "--strict-protocols",
                        '--joinType', joinType])

    with mr_job.make_runner() as runner: 
        runner.run()

        # Materialise the output so it can be both counted and sampled.
        lines = list(runner.stream_output())

        # Single-argument print(...) behaves identically as a Python 2
        # print statement and as a Python 3 function call.
        print("Join type:" + joinType)
        print("Number of records:" + str(len(lines)))
        print("First 5 lines:")
        # Slice instead of indexing 0..4 so that a join producing fewer
        # than five records does not raise IndexError.
        for line in lines[:5]:
            print(line.strip())

        print("")


Join type:left
Number of records:98663
First 5 lines:
"1000"	["/regwiz", "V", "1", "C", "10001"]
"1001"	["/support", "V", "1", "C", "10001"]
"1002"	["/athome", "V", "1", "C", "10001"]
"1001"	["/support", "V", "1", "C", "10002"]
"1003"	["/kb", "V", "1", "C", "10002"]

Join type:right
Number of records:98654
First 5 lines:
"1000"	["/regwiz", "V", "1", "C", "10001"]
"1001"	["/support", "V", "1", "C", "10001"]
"1002"	["/athome", "V", "1", "C", "10001"]
"1001"	["/support", "V", "1", "C", "10002"]
"1003"	["/kb", "V", "1", "C", "10002"]

Join type:inner
Number of records:98654
First 5 lines:
"1000"	["/regwiz", "V", "1", "C", "10001"]
"1001"	["/support", "V", "1", "C", "10001"]
"1002"	["/athome", "V", "1", "C", "10001"]
"1001"	["/support", "V", "1", "C", "10002"]
"1003"	["/kb", "V", "1", "C", "10002"]



## HW 5.3  EDA of Google n-grams dataset
For the Google n-grams dataset, unit test and regression test your code using the 
first 10 lines of the following file:  
``googlebooks-eng-all-5gram-20090715-0-filtered.txt``

Finally show your results on the Google n-grams dataset. 

In particular, this bucket contains ~200 files (about 10 MB each) in the format:

``	(ngram) \t (count) \t (pages_count) \t (books_count)``

Do some EDA on this dataset using mrjob, e.g., 

- Longest 5-gram (number of characters)
- Top 10 most frequent words (please use the count information), i.e., unigrams
- 20 Most/Least densely appearing words (count/pages_count) sorted in decreasing order of relative frequency 
- Distribution of 5-gram sizes (character length).  E.g., count up (using the count field) how many times a 5-gram of 50 characters shows up. Plot the data graphically using a histogram.


In [2]:
%load_ext autoreload
%autoreload 2

In [58]:
%%writefile MrLongest_5_3a.py

from mrjob.job import MRJob, MRStep
import mrjob
import csv
import sys

# Find Longest 5-gram (number of characters)
class MrLongest(MRJob):
    """Find the longest 5-gram(s), measured in number of characters.

    The mapper keys every n-gram by its length.  The step's jobconf asks
    Hadoop to sort those keys numerically in descending order, so the first
    key group a reducer sees is the longest length; the reducer emits that
    group (all n-grams sharing the maximum length) and ignores the rest.
    """

    def mapper(self, line_no, line):
        """Yield ``(len(ngram), ngram)`` for one input line.

        Line format:
        (ngram) \\t (count) \\t (pages_count) \\t (books_count)
        """
        # Plain tab split rather than csv.reader: the csv module's default
        # quote handling would mis-parse an n-gram that starts with '"' and
        # fold the following count fields into it.
        ngram = line.split('\t', 1)[0]
        yield len(ngram), ngram

    def reducer_init(self):
        """Reset the flag recording whether this reducer has emitted yet."""
        self.emitted = False

    def reducer(self, length, values):
        """Emit ``(length, [ngrams])`` for the first key group only.

        Keys arrive longest-first, so the first group is the longest for
        this reducer; every later group is dropped.
        """
        if self.emitted:
            return
        self.emitted = True
        yield length, list(values)

    def steps(self):
        """Single map/reduce step with a reverse-numeric key comparator."""
        return [
            MRStep(
                mapper=self.mapper,
                reducer_init=self.reducer_init,
                reducer=self.reducer,
                jobconf={
                    # First key is length; sort it in reverse order
                    "stream.num.map.output.key.fields": "1",
                    "mapred.output.key.comparator.class":
                        "org.apache.hadoop.mapred.lib.KeyFieldBasedComparator",
                    "mapred.text.key.comparator.options": "-k1,1nr",
                    # Each reducer emits its own "longest" group, so a
                    # single reducer is required for one global answer.
                    "mapred.reduce.tasks": "1",
                          }
                  )
            ]
    
# Run the longest-5-gram job only when this file is executed as a script,
# not when it is imported (the test modules import MrLongest from here).
if __name__ == '__main__':
    MrLongest.run()


Overwriting MrLongest_5_3a.py


#### Unit Tests

In [33]:
# Prepare test file
!head -n 10 googlebooks-eng-all-5gram-20090715-0-filtered.txt > testData.txt

In [83]:
%%writefile unitTest_5_3a.py
import unittest
from MrLongest_5_3a import MrLongest

class UnitTest_5_3(unittest.TestCase):
    """Unit tests for the MrLongest mapper and reducer.

    The mapper test is driven by the first line of ``testData.txt``.
    """

    def setUp(self):
        # Load the fixture here rather than in __init__, so that a missing
        # file is reported as a failure of the test being run instead of
        # breaking construction of the test case.
        with open('testData.txt', 'r') as f:
            self.first_line = f.readline()
        self.first_ngram = self.first_line.split('\t')[0]

    def test_MrLongest_mapper(self):
        """The mapper yields (character length, ngram) for an input line."""
        # args=[] keeps the job from parsing the test runner's sys.argv.
        j = MrLongest(args=[])
        self.assertEqual(next(j.mapper(None, self.first_line)),
                         (len(self.first_ngram), self.first_ngram))

    def test_MrLongest_reducer(self):
        """The reducer emits its first key group and nothing afterwards."""
        j = MrLongest(args=[])
        ngrams = ["0123456789", "A12345678B"]
        length = len(ngrams[0])

        j.reducer_init()
        self.assertEqual(next(j.reducer(length, ngrams)), (length, ngrams))

        # We only output the first one.
        with self.assertRaises(StopIteration):
            next(j.reducer(length, ngrams))
        
# Run the unit tests when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()

Writing unitTest_5_3a.py


In [84]:
!python unitTest_5_3a.py

..
----------------------------------------------------------------------
Ran 2 tests in 0.004s

OK


#### Full test

In [81]:
%%writefile fullTest_5_3.py
import unittest
from MrLongest_5_3a import MrLongest

class FullTest_5_3a(unittest.TestCase):
    """End-to-end regression test: run MrLongest on testData.txt via Hadoop."""

    def test_full(self):
        """The job reports a single record: length 33 and its two 5-grams."""
        mr_job = MrLongest(
            args=['testData.txt', 
                  # Have to use Hadoop, otherwise custom sort order won't work.
                  '-r', 'hadoop', 
                  '--strict-protocols',
                  # so options from local mrjob.conf don't pollute the test env.
                  '--no-conf', 
                 ])

        with mr_job.make_runner() as runner:
            runner.run()
            # Use the job's specified protocol to read the output
            results = [tuple(mr_job.parse_output_line(line))
                       for line in runner.stream_output()]

        self.assertEqual(len(results), 1)

        length, ngrams = results[0]
        self.assertEqual(length, 33)
        # The job only sorts on the key, so the order of the n-grams inside
        # the group is not something it controls; compare them
        # order-insensitively.
        self.assertEqual(
            sorted(ngrams),
            sorted(['A Circumstantial Narrative of the',
                    'A BILL FOR ESTABLISHING RELIGIOUS']))

# Run the full (Hadoop-backed) test when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()

Overwriting fullTest_5_3.py


In [82]:
!python fullTest_5_3.py

No handlers could be found for logger "mrjob.compat"
.
----------------------------------------------------------------------
Ran 1 test in 37.503s

OK
