## Using the MRJob class below, calculate the KL divergence of the following two objects.

In [None]:
%%writefile kltext.txt
1.Data Science is an interdisciplinary field about processes and systems to extract knowledge or insights from large volumes of data in various forms (data in various forms, data in various forms, data in various forms), either structured or unstructured,[1][2] which is a continuation of some of the data analysis fields such as statistics, data mining and predictive analytics, as well as Knowledge Discovery in Databases.
2.Machine learning is a subfield of computer science[1] that evolved from the study of pattern recognition and computational learning theory in artificial intelligence.[1] Machine learning explores the study and construction of algorithms that can learn from and make predictions on data.[2] Such algorithms operate by building a model from example inputs in order to make data-driven predictions or decisions,[3]:2 rather than following strictly static program instructions.

## MRJob class for calculating pairwise similarity using KL divergence as the similarity measure

Job 1: create inverted index (assume just two objects) <P>
Job 2: calculate the similarity of each pair of objects 

In [None]:
import numpy as np
# Scratch check: np.log is the natural logarithm, so this evaluates ln(3).
np.log(3)

In [1]:
%%writefile kldivergence.py
#!/usr/bin/env python
from mrjob.job import MRJob
from mrjob.step import MRStep
import re
import numpy as np

class kldivergence(MRJob):
    """Compute the KL divergence D(P || Q) of two documents' letter distributions.

    Every input line must look like ``<index>.<text>``. The line whose index
    is 1 is treated as P and the line whose index is 2 as Q.

    Step 1 turns each line into per-letter relative frequencies and emits one
    partial KL term per letter; step 2 adds the partial terms into one total.
    """

    def mapper1(self, _, line):
        """Emit ``letter, [index, frequency]`` for every letter in ``line``.

        ``index`` is the integer in front of the first '.' on the line and
        ``frequency`` is the letter's count divided by the number of letters
        on the line (case-insensitive, non-letters ignored).
        """
        # A blank line (e.g. a trailing newline in the input) has no index to
        # parse; skip it instead of failing on int('').
        if not line.strip():
            return
        index = int(line.split('.', 1)[0])
        # Drop everything that is not a letter, leaving one lower-case string.
        letter_list = re.sub(r"[^A-Za-z]+", '', line).lower()
        count = {}
        for l in letter_list:
            # dict.get works on Python 2 and 3; dict.has_key is Python 2 only.
            count[l] = count.get(l, 0) + 1
        for key in count:
            yield key, [index, count[key] * 1.0 / len(letter_list)]

    def reducer1(self, letter, index_prior_pair):
        """Emit the partial KL term ``p*log(p) - p*log(q)`` for one letter.

        ``index_prior_pair`` yields ``[index, frequency]`` pairs; index 1
        supplies p and index 2 supplies q.

        * p and q both present -> one record with the partial term.
        * only p present       -> one record with 0 (the letter is ignored
          rather than treated as an infinite contribution).
        * p absent             -> nothing is emitted.
        """
        p = None
        q = None
        # Collect both frequencies first; the result is emitted exactly once
        # after the loop so it does not depend on the order the pairs arrive
        # in and no redundant zero record is produced.
        for index, prior in index_prior_pair:
            if index == 1:
                p = float(prior)
            elif index == 2:
                q = float(prior)

        if p is None:
            return
        if q is None:
            yield letter, 0
        else:
            yield letter, p * np.log(p) - p * np.log(q)

    def combiner2(self, key, values):
        """Pre-sum partial terms and re-key them under ``None``."""
        yield None, sum(values)

    def reducer2(self, key, values):
        """Add up all partial terms into the final KL divergence."""
        yield None, sum(values)

    def steps(self):
        """Step 1: per-letter partial terms. Step 2: sum them."""
        return [MRStep(mapper=self.mapper1,
                       reducer=self.reducer1),
                MRStep(combiner=self.combiner2,
                       reducer=self.reducer2)]

# Run the job when this file is executed as a script.
if __name__ == '__main__':
    kldivergence.run()

Overwriting kldivergence.py


In [2]:
from kldivergence import kldivergence

# Run the job in-process on kltext.txt and print every (key, value) result.
mr_job = kldivergence(args=['kltext.txt'])
with mr_job.make_runner() as runner:
    runner.run()
    # stream_output: get access of the output
    for line in runner.stream_output():
        # print(...) with a single argument prints the same on Python 2 and 3,
        # whereas the bare print statement is a syntax error on Python 3.
        print(mr_job.parse_output_line(line))

(None, 0.08088278445318145)


In [None]:
# Make the job script executable, then run it from the shell and save its output.
!chmod a+x kldivergence.py
!./kldivergence.py kltext.txt >kltextoutput.txt

In [None]:
# Display the contents of kltextoutput.txt.
!cat ./kltextoutput.txt

In [13]:
## Part 7:
# Find which letters of the alphabet never occur in either text.
import re
import string

alphabet = string.ascii_lowercase

str1 = "Data Science is an interdisciplinary field about processes and systems to extract knowledge or insights from large volumes of data in various forms (data in various forms, data in various forms, data in various forms), either structured or unstructured,[1][2] which is a continuation of some of the data analysis fields such as statistics, data mining and predictive analytics, as well as Knowledge Discovery in Databases"
str2 = "Machine learning is a subfield of computer science[1] that evolved from the study of pattern recognition and computational learning theory in artificial intelligence.[1] Machine learning explores the study and construction of algorithms that can learn from and make predictions on data.[2] Such algorithms operate by building a model from example inputs in order to make data-driven predictions or decisions,[3]:2 rather than following strictly static program instructions"

# Same normalisation as the mappers: keep letters only, lower-cased.
tot_str = re.sub(r"[^A-Za-z]+", '', str1 + str2).lower()

# Keep the result in a named list so it can be inspected or reused.
missing_letters = [letter for letter in alphabet if letter not in tot_str]

for i in missing_letters:
    # print(...) with a single argument prints the same on Python 2 and 3.
    print(i)

j
q
z


In [11]:
# print(...) with a single argument prints the same on Python 2 and 3.
print(alphabet)

abcdefghijklmnopqrstuvwxyz


In [14]:
%%writefile kldivergencesmooth.py
#!/usr/bin/env python
from mrjob.job import MRJob
from mrjob.step import MRStep
import re
import numpy as np

class kldivergencesmooth(MRJob):
    """Compute a smoothed KL divergence D(P || Q) of two documents' letters.

    Every input line must look like ``<index>.<text>``. The line whose index
    is 1 is treated as P and the line whose index is 2 as Q.

    Step 1 turns each line into add-one smoothed per-letter frequencies and
    emits one partial KL term per letter; step 2 adds the partial terms.
    """

    # Added to the letter total in the denominator of the smoothed frequency.
    # NOTE(review): the notebook's letter check reports only j, q and z as
    # absent from the two texts, i.e. 23 distinct letters; 24 is kept as
    # written so recorded results do not change -- confirm the intended value.
    SMOOTHING_DENOMINATOR = 24

    def mapper1(self, _, line):
        """Emit ``letter, [index, smoothed_frequency]`` for letters in ``line``.

        ``index`` is the integer in front of the first '.' on the line and
        ``smoothed_frequency`` is ``(count + 1) / (letters_on_line +
        SMOOTHING_DENOMINATOR)`` (case-insensitive, non-letters ignored).

        NOTE(review): only letters that occur on the line are emitted, so a
        letter absent from a line receives no smoothed frequency here.
        """
        # A blank line (e.g. a trailing newline in the input) has no index to
        # parse; skip it instead of failing on int('').
        if not line.strip():
            return
        index = int(line.split('.', 1)[0])
        # Drop everything that is not a letter, leaving one lower-case string.
        letter_list = re.sub(r"[^A-Za-z]+", '', line).lower()
        count = {}
        for l in letter_list:
            # dict.get works on Python 2 and 3; dict.has_key is Python 2 only.
            count[l] = count.get(l, 0) + 1
        denominator = len(letter_list) + self.SMOOTHING_DENOMINATOR
        for key in count:
            yield key, [index, (count[key] * 1.0 + 1) / denominator]

    def reducer1(self, letter, index_prior_pair):
        """Emit the partial KL term ``p*log(p) - p*log(q)`` for one letter.

        ``index_prior_pair`` yields ``[index, frequency]`` pairs; index 1
        supplies p and index 2 supplies q.

        * p and q both present -> one record with the partial term.
        * only p present       -> one record with 0.
        * p absent             -> nothing is emitted.
        """
        p = None
        q = None
        # Collect both frequencies first; the result is emitted exactly once
        # after the loop so it does not depend on the order the pairs arrive
        # in and no redundant zero record is produced.
        for index, prior in index_prior_pair:
            if index == 1:
                p = float(prior)
            elif index == 2:
                q = float(prior)

        if p is None:
            return
        if q is None:
            yield letter, 0
        else:
            yield letter, p * np.log(p) - p * np.log(q)

    def combiner2(self, key, values):
        """Pre-sum partial terms and re-key them under ``None``."""
        yield None, sum(values)

    def reducer2(self, key, values):
        """Add up all partial terms into the final KL divergence."""
        yield None, sum(values)

    def steps(self):
        """Step 1: per-letter partial terms. Step 2: sum them."""
        return [MRStep(mapper=self.mapper1,
                       reducer=self.reducer1),
                MRStep(combiner=self.combiner2,
                       reducer=self.reducer2)]

# Run the job when this file is executed as a script.
if __name__ == '__main__':
    kldivergencesmooth.run()

Writing kldivergencesmooth.py


In [15]:
from kldivergencesmooth import kldivergencesmooth

# Run the smoothed job in-process on kltext.txt and print every (key, value) result.
mr_job = kldivergencesmooth(args=['kltext.txt'])
with mr_job.make_runner() as runner:
    runner.run()
    # stream_output: get access of the output
    for line in runner.stream_output():
        # print(...) with a single argument prints the same on Python 2 and 3,
        # whereas the bare print statement is a syntax error on Python 3.
        print(mr_job.parse_output_line(line))

(None, 0.06726997279170045)
