# Text Analysis with Spark RDD API

@Author: USYD COMP5349  
@Date: Mon ~ Thu from 4.11  
@comment1: Print intermediate output is good for debugging.  
@comment2: Create a small working example to test if output works.  
@comment3: Cannot read mutiple csvs file into one RDD if format different.  
@comment4: Emoji is used to highligh important output.  
@comment5: Google style comment is used without the raises keyword.  
@comment6: Code modified based on various Ed posts.

# Import & global variable

In [1]:
import re
import csv
import string
from collections import Counter

import nltk
from nltk.corpus import stopwords as sw
from nltk.tokenize import word_tokenize
# from nltk.stem import WordNetLemmatizer

nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
# nltk.download('wordnet', quiet=True) # for Lemmatizer and better keyword extraction

!pip install pyspark



In [2]:
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("COMP5349 A1 Utilities") \
    .getOrCreate()

# from pyspark import SparkConf, SparkContext

# spark_conf = SparkConf().setAppName("A1")
# sc=SparkContext.getOrCreate(spark_conf) 

In [3]:
governing = spark.read.csv('data/Governing_Law.csv',header=True).rdd  
assignment = spark.read.csv('data/Anti_assignment_CIC_g3.csv',header=True).rdd  

# Convert list to set since it makes checking faster because it uses hash table 
sww = set(sw.words('english') + ['and/or', 'shall', 'thereof', 'without', 'may', 'either', 'neither'])
puncs = set(string.punctuation) 
delimiters = sww | puncs # join two set 
# lemmatizer = WordNetLemmatizer()

# Method 1

## UDF

### assign_row_num

In [4]:
def assign_row_num(row, col:str) -> list:
    """
    Assign a unique id to each row in the csv.
    Args:
        row(spark.csv): raw csv input
        col(str): column name
    Returns:
        res(List[Tuple]): (count, clauses)

    The easiest and better way is to use spark sql / pandas df to add row number,
    however, it is not allowed and so this wasty appraoch is use for better RDD/database-ish design.
    This is because the user defined function takes input one by one and the global counter is not reliable since distributed system.
    Thus, convert the [e1, e2, ...] to (1, [e1, e2, ...]) format
    Then loop through the second elem which is a list and assign row num.
    Why not use filename as the key, it is too expensive when compare key
    """
    row_list = row[1]
   
    res = []
    count = 0
    for row in row_list:
        row_dict = row.asDict()
        clauses = row_dict[col]
        res.append((count, clauses))
        count += 1
    return res 

### split clause

In [5]:
def split_clause(row) -> list:
    """
    Split each row into clauses/documents  by the pattern (page x), then remove (page x)
    Args:
        row(spark.csv): raw csv input
    Returns:
        clause_list(List[Tuple]): (id, clause)
    Note that unique id is in the form: file_number-nth ones in that cell
    """
    file_number, clauses = row
    tokenised = word_tokenize(clauses)
    
    # convert all to lower case, 
    tokenised = [w.lower() for w in tokenised] 
    # e.g. conflicts -> conflict
    # tokenised = [lemmatizer.lemmatize(w) for w in tokenised]
    clause_list = []
    cur_page_idx = 0
    
    count = 0
    for idx, w in enumerate(tokenised):
        # page followed by number pattern
        
        if w.lower() in ['page', 'pages']: 
            try:
                # next word first letter if digit
                if tokenised[idx+1][0].isnumeric(): 
                    # print(cur_page_idx, idx)
                    # everybefore is a clause
                    cutoff = tokenised[cur_page_idx:idx - 1]
                    # ignore single character
                    cutoff = [w for w in cutoff if len(w) > 1]
                    clause_list.append((str(file_number)+'-'+ str(count), cutoff)) # ignroe the '('
                    # update the page index for the next clause
                    # '(', 'Pages', '57-58', ')' so + 3
                    cur_page_idx = idx + 3 
                    count += 1
            except:
                pass
        
    # list of (file_number-clauseNum, count, clause) e.g. 11-3
    return clause_list 

assume page follow by digit is the end of a clause.

### get_candidate_phrase

In [6]:
def get_candidate_phrase(input_:list) -> tuple:
    """
    Split each row into clauses/documents  by the pattern (page x), then remove (page x)
    Args:
        input_(list): a list of individual clause/document
    Returns:
        clause_list(List[Tuple]): (clause_number, content_word_seq, content_words, word_freq)
    Note: 
    1. All content word is Unique.
    2. Assume the maximum length of a keyword is four words. For both two methods.
    3. Assume the page pattern is: (page, n)
    4. Allow duplicate candidate phrase 
    """
    max_seq_len = 4

    clause_number, clause_tokenised = input_

    # candidate phrase
    content_word_seq = [[],] 
    for w in clause_tokenised:
        if w not in sww and w not in puncs: # TODO: now assume the last char is the puncation, loop back to find all punct
            content_word_seq[-1].append(w)

        else: # Here see a delimeter, and thus start new list
            
            last_seq = content_word_seq[-1]
            
            if last_seq: # if last sequence not empty
                
                # check if the last one already exist in the current sequences collection, if so remove it
                # check if exceed the max seq len, if so remove
                # TODO: is_unique_seq = last_seq not in content_word_seq[:-1]
                is_within_len = len(last_seq) <= max_seq_len
                # conditions = []
                
                if not is_within_len: # all(is_unique_seq, conditions): # if any of them is false
                    del content_word_seq[-1]
                                
                # start a new list to collect the next seq
                content_word_seq.append([]) 
    # remove the last seq if it is empty           
    if not content_word_seq[-1]:
        content_word_seq = content_word_seq[:-1]
    
    # get content word - unqiue
    content_words = []
    
    for seq in content_word_seq:
        for w in seq:
            if w not in content_words:
                content_words.append(w)

    # number of times a word occurs in the document
    cleaned = []
    for w in clause_tokenised:
        if w not in sww and w not in puncs:
            cleaned.append(w)
    word_freq = dict(Counter(cleaned))

    return (clause_number, content_word_seq, content_words, word_freq)


### calculate score

In [7]:
def calculate_score_1(cand_data:tuple) -> list:
    """
    Calcuate the score for each candidate phrase by adding up individual unqiue word it consists of
    Args:
        cand_data(tuple): (clause_number, candidate_phrase_list, content_words, word_freq)
    Returns:
        cand_score_list(List[Tuple]): (clause_number, cand_phrase, score)
    Note:
    1. co-occurrence count of a word with itself always equals its frequency
    2. if a word appears in a two-word candidate phrase, it co-occurs with the other word once
       If a word appears in a three-word candidate phrase, it co-occurs with each of the other two words once.
    3. Degree = Sum of its freq & co-occuraences with other words
    4. content word score = ratio of deg to freq
    5. condidate phrase score = sum of its member score
    """
    clause_number, candidate_phrase_list, content_words, word_freq = cand_data

    word_score = {}
    # score for each word
    for candidate_phrase in candidate_phrase_list:
        co_occurance = 0
        degree = 0
        for w in content_words:
            if not w in candidate_phrase:                
                continue
            cand_freq = dict(Counter(candidate_phrase))
            # for a phrase, co occurance with other words = occurance of all word - occurance of the current word 
            co_occurance = len(candidate_phrase) - cand_freq[w]
            degree += co_occurance + word_freq[w]
            
            if w in word_score.keys():
                word_score[w] += degree / word_freq[w] 
            else:
                word_score[w] = degree / word_freq[w] 

    # score for eacn candidate
    cand_score_list = []
    for cand_phrase in candidate_phrase_list:
        score = 0
        # use set to avoid count duplicate word score twice
        for w in set(cand_phrase):
            score += word_score[w]
        cand_score_list.append((clause_number, cand_phrase, score))

    return cand_score_list

### extract_cand_phrase_top4

In [8]:
def extract_cand_phrase_top4(data_) -> tuple:
    """
    Extract top 4 candidate phrases from a document based on their score, no duplicate allow
    Args:
        data_(tuple): (doc_num, cand_phrase_score_list), cand_phrase_score_list: (id, [(phrase, score)])
    Returns:
        final(List[Tuple]): (phrase, docId)
    """
    TOP = 4
    doc_num, cand_phrase_score_list = data_

    # remove duplicate 
    phrase_uniq = []
    res = []

    for c in cand_phrase_score_list:
        if c[0] not in phrase_uniq:
            phrase_uniq.append(c[0])
            res.append(c)
    # top 4 unique phrase for a document
    res = sorted(res, key=lambda x: x[1],  reverse = True)[:TOP]
    res = [(' '.join(k), v) for k, v in res]
    final = []
    for r in res:
        # add doc num to reuse this rdd again for edf section
        final.append((r[0], doc_num))
    return final 

### calculate_rdf

In [9]:
def calculate_rdf(data_) -> tuple:
    """
    Calculate rdf based on number of document choose that  phrase as its candidate 
    Args:
        data_(tuple): phrase, list of document ids, (phrase: str, [id, ...]
    Returns:
        final(tuple): (phrase, (rdf, 'rdf'))
    """
    phrase, list_ = data_
    # unqiue doc count
    doc_unique_list = []
    score = None
    for doc_num in list_:
        assert '-' in doc_num # double check that this is the doc num format
        if doc_num not in doc_unique_list:
            doc_unique_list.append(doc_num)
    rdf = len(doc_unique_list)
    return (phrase, (rdf, 'rdf'))

### calculate_ess

In [10]:
def calculate_ess(data_) -> tuple:
    """
    Calculate ess the formula
    Args:
        data_(tuple): (phrase, ((edf, 'edf'), (rdf, 'rdf')))
    Returns:
        result(tuple): ((phrase, rdf, edf, ess)
    """
    phrase, tuple_ = data_
    edf, rdf = tuple_
    edf = edf[0]
    rdf = rdf[0]
   
    ess = round(edf / rdf * edf, 5)
    return (phrase, rdf, edf, ess) 

## Main Flow

In [11]:
def filter_col(row, col:str) -> bool:
    """check if col name exists"""
    return col in list(row.asDict().keys())

def print_take_1(rdd_, descriptor=''):
    global count_print_1
    p = rdd_.take(1)
    print(f'💎 {count_print_1}: ', descriptor + ': ')
    print(p, sep='\n')
    print()
    count_print_1 += 1

def rake_1_pipeline(rdd, col:str):
    """main logic, print out final result in the format: keyword, rdf, edf and ess scores."""

    # remove all nan and none value
    no_nan = rdd.filter(lambda row: row.asDict()[col] != 'nan' and row.asDict()[col])
    # (1, [row1, row2])
    row_num_pre = no_nan.map(lambda row: (1, [row])).reduceByKey(lambda a, b: a + b) 
    print_take_1(row_num_pre, 'Make row iterable for later assigning unique id')

    # (count, clauses), loop through list of rows and assign row number
    row_num = row_num_pre.flatMap(lambda x: assign_row_num(x, col))
    print_take_1(row_num, 'Loop through row and assign unique id')

    # (id, tokenised clause)
    clause_list = row_num.flatMap(split_clause) # list of list of tokenised words, use flat since duplicate key
    print_take_1(clause_list, 'Split clauses within a cell')

    # (clause_number, content_word_seq, content_words, word_freq)
    cand_phrases = clause_list.map(get_candidate_phrase)
    print_take_1(cand_phrases, '💎 Candidate Phrase:    Given a document, obtain calculation required data:  (clause_number, content_word_seq, content_words, word_freq)')

    # (clause_number, cand_phrase, score)
    cand_phrase_score = cand_phrases.flatMap(calculate_score_1)
    print_take_1(cand_phrase_score, '💎 Candidate Phrase score & Word Score Calculation:   Given list of phrases in a document, calcuate score for each candidate phrase')

    # (clause_number, cand_phrase, score) -> ( clause_number, (cand_phrase, score) )
    cand_phrase_score_remap = cand_phrase_score.map( lambda x: (x[0], (x[1], x[2]) ) )

    # (clause_number, [(phase, score)])
    cand_phrase_score_groupkey = cand_phrase_score_remap.groupByKey().mapValues(list)
    print_take_1(cand_phrase_score_groupkey, 'Group by document id')

    # (phrase, docId)
    cand_phrases_top4 = cand_phrase_score_groupkey.flatMap(extract_cand_phrase_top4)
    print_take_1(cand_phrases_top4, '💎 Top 4 keyword:    within a document')

    # (clause_number, cand_phrase, score) -> (phrase:str, clause_number)
    cand_phrases_duplicate = cand_phrase_score.map(lambda x: (' '.join(x[1]), x[0]))
    rdf = cand_phrases_duplicate.groupByKey().map(calculate_rdf)
    print_take_1(rdf, 'group by phrases and counting the unique doc id for later calculate rdf')

    # (phrase:str, clause_number) -> (phrase:str, -1)
    cand_phrases_unique = cand_phrases_duplicate.keys().distinct().map(lambda x: (x, -1))
    print_take_1(cand_phrases_unique, 'Get all unique candidate phrases, -1 as dummpy to form key value pair')

    edf_join = cand_phrases_top4.join(cand_phrases_unique).groupByKey().mapValues(list)
    print_take_1(edf_join, 'For later edf calculation, join the top 4 phrase list with the unqiue phrase, then group by phrase')

    # (phrase:str, (rdf, 'rdf')
    edf = edf_join.map(lambda x: (x[0], (len(x[1]), 'edf')))
    # (phrase, ( (edf, 'edf'), (rdf, 'rdf') ) )

    ess_pre = edf.join(rdf)
    print_take_1(ess_pre, 'The input data-ready for calculating ess')

    # (phrase, rdf, edf, ess) 
    ess = ess_pre.map(calculate_ess)
    print_take_1(ess, '💎 Essentiality Score Calculation:   Desire output format')

    # Top 20 for each corpus
    print(' Final Results: Final Sorted Output Result based on ess value from largest to smallest:')
    res = ess.sortBy(lambda r: r[3],ascending=False).take(20)
    print(*res, sep='\n')

### governing law

In [12]:
gov = 'Governing Law'
con = 'Change of Control'
assign = 'Anti-assignment'

control = assignment.filter(lambda x: filter_col(x, con))
assignment = assignment.filter(lambda x: filter_col(x, assign))
count_print_1 = 1

print('Method 1 Keyword extraction result:')
print('Top 20 keyword for: ' + gov)
print()
rake_1_pipeline(governing, gov)

Method 1 Keyword extraction result:
Top 20 keyword for: Governing Law

💎 1:  Make row iterable for later assigning unique id: 

💎 2:  Loop through row and assign unique id: 
[(0, 'This Agreement is accepted by Company in the State of Nevada and shall be governed by and construed in accordance with the laws thereof, which laws shall prevail in the event of any conflict. (Page 13)')]

💎 3:  Split clauses within a cell: 
[('0-0', ['this', 'agreement', 'is', 'accepted', 'by', 'company', 'in', 'the', 'state', 'of', 'nevada', 'and', 'shall', 'be', 'governed', 'by', 'and', 'construed', 'in', 'accordance', 'with', 'the', 'laws', 'thereof', 'which', 'laws', 'shall', 'prevail', 'in', 'the', 'event', 'of', 'any', 'conflict'])]

💎 4:  💎 Candidate Phrase:    Given a document, obtain calculation required data:  (clause_number, content_word_seq, content_words, word_freq): 
[('0-0', [['agreement'], ['accepted'], ['company'], ['state'], ['nevada'], ['governed'], ['construed'], ['accordance'], ['laws'],

### Change of Control

In [13]:
print('Method 1 Keyword extraction result:')
print()
print('Top 20 keyword for: ' + con)
count_print_1 = 1

rake_1_pipeline(control, con)

Method 1 Keyword extraction result:

Top 20 keyword for: Change of Control
💎 1:  Make row iterable for later assigning unique id: 

💎 2:  Loop through row and assign unique id: 
[(0, "For purposes of the preceding sentence, and without limiting its generality, any merger, consolidation or reorganization involving Licensee (regardless of whether Licensee is a surviving or disappearing entity) will be deemed to be a transfer of rights, obligations or performance under this Agreement for which Licensor's prior written consent is required. (Page 15)")]

💎 3:  Split clauses within a cell: 
[('0-0', ['for', 'purposes', 'of', 'the', 'preceding', 'sentence', 'and', 'without', 'limiting', 'its', 'generality', 'any', 'merger', 'consolidation', 'or', 'reorganization', 'involving', 'licensee', 'regardless', 'of', 'whether', 'licensee', 'is', 'surviving', 'or', 'disappearing', 'entity', 'will', 'be', 'deemed', 'to', 'be', 'transfer', 'of', 'rights', 'obligations', 'or', 'performance', 'under', 'thi

### anti-assignment

In [14]:
print('Method 1 Keyword extraction result:')
print('Top 20 keyword for: ' + assign)
print()
count_print_1 = 1

rake_1_pipeline(assignment, assign)

Method 1 Keyword extraction result:
Top 20 keyword for: Anti-assignment

💎 1:  Make row iterable for later assigning unique id: 

💎 2:  Loop through row and assign unique id: 
[(0, 'MA may not assign, sell, lease or otherwise transfer in whole or in party any of the rights granted pursuant to this Agreement without prior written approval of Company. (Page 12)')]

💎 3:  Split clauses within a cell: 
[('0-0', ['ma', 'may', 'not', 'assign', 'sell', 'lease', 'or', 'otherwise', 'transfer', 'in', 'whole', 'or', 'in', 'party', 'any', 'of', 'the', 'rights', 'granted', 'pursuant', 'to', 'this', 'agreement', 'without', 'prior', 'written', 'approval', 'of', 'company'])]

💎 4:  💎 Candidate Phrase:    Given a document, obtain calculation required data:  (clause_number, content_word_seq, content_words, word_freq): 
[('0-0', [['assign', 'sell', 'lease'], ['otherwise', 'transfer'], ['whole'], ['party'], ['rights', 'granted', 'pursuant'], ['agreement'], ['prior', 'written', 'approval'], ['company']], [

# Method 2

## UDF

### preprocessing

In [15]:
def preprocess(row, col:str) -> list:    
    """
    Process the data: lowercase. tokenisation, remove page number
    Args:
        row(spark.csv): raw csv input
    Returns:
        res(List[List[str]]): a list of tokenens
    assume page pattern is (page x)
    """
    row_dict = row.asDict()
    cell = row_dict[col]
    tokenised = word_tokenize(cell)
    # convert all to lower case
    tokenised = [w.lower() for w in tokenised]
    # e.g. conflicts -> conflict
    # tokenised = [lemmatizer.lemmatize(w) for w in tokenised]
    cur_page_idx = 0
    res = []
    for idx, w in enumerate(tokenised):
        # page followed by number pattern
        
        if w in ['page', 'pages']: 
            try:
                # next word first letter if digit
                if tokenised[idx+1][0].isnumeric(): 
                    res += tokenised[cur_page_idx:idx - 1]
                    cur_page_idx = idx + 3 
            except:
                pass
    return res

### co-occurance

In [16]:
def get_co_occurence(candidate_phrase: list) -> list:
    """
    Calculate co occurrence given a candidate phrase
    Args:
        candidate_phrase(List[str]): list of tokens
    Returns:
        res(List[List[str]]): (word, co_occurance)
    """

    cleaned = [w for w in candidate_phrase if w not in delimiters]
    content_words = set(cleaned)

    res = []
    # cooccurance for each word
    for w in content_words:
        co_occurance = 0
        cand_freq = dict(Counter(candidate_phrase))
        # for a phrase, co occurance with other words = occurance of all word - occurance of the current word 
        co_occurance = len(candidate_phrase) - cand_freq[w]
        res.append((w, co_occurance))
    return res

### get_cand_phrases_list

In [17]:
def get_cand_phrases_list(tokenised: list) -> list:
    """
    Given a list of tokens, get candidate phrases
    Args:
        tokenised(List[str]): list of tokens
    Returns:
        res(tuple): (phrase, -1)
    """
    max_seq_len = 4
    # candidate phrase
    cand_phrases = [[],] 
    for w in tokenised:
        if w not in sww and w not in puncs: # TODO: now assume the last char is the puncation, loop back to find all punct
            cand_phrases[-1].append(w)

        else: # Here see a delimeter, and thus start new list
            
            last_seq = cand_phrases[-1]
            
            if last_seq: # if last sequence not empty
                
                is_within_len = len(last_seq) <= max_seq_len
                
                if not is_within_len: # all(is_unique_seq, conditions): # if any of them is false
                    del cand_phrases[-1]
                                
                # start a new list to collect the next seq
                cand_phrases.append([]) 

    # remove the last seq if it is empty           
    if not cand_phrases[-1]:
        cand_phrases = cand_phrases[:-1]
    # add dummpy
    res = []
    for c in cand_phrases:
        res.append((' '.join(c), -1))
    return res

### get_freq

In [18]:
def get_freq(tokenised) -> list:
    """
    Given a list of tokens, get frequency for each unique word
    Args:
        tokenised(List[str]): list of tokens
    Returns:
        word_freq(list): (word, freq)
    """
    #  number of times a word occurs in the document
    word_freq = list(Counter(tokenised).items())
    return word_freq

### calculate_score_2

In [19]:
def calculate_score_2(data_) -> tuple:
    """
    Calculate score based on rake formula 
    Args:
        tokenised(List[str]): (word, (freq, co-occurrence))
    Returns:
        score(tuple): (word, freq)
    """
    word, params = data_
    freq, co = params
    degree = freq + co
    score = degree / freq
    return (word, score)

### assign_phrase_id

In [20]:
def assign_phrase_id(data_):
    """
    assign phrase as a unqiue id to each word
    Args:
        data_(List[List[str]]): unqiue phrase list
    Returns:
        score(tuple): (word, freq)
    """
    _, unique_phrase_list = data_
    res = []
    for phrase in unique_phrase_list:
        for word in phrase:
            res.append((word, ' '.join(phrase))) # e.g. recognise which word is in which phrase then later on when unpack can join it back
    return res

### get_unique_score_4_phrase

In [21]:
def get_unique_score_4_phrase(data_):
    """
    calculate score for each phrase by adding up the score of its unique word
    Args:
        data_(List[tuple]): unqiue phrase list
    Returns:
        res(tuple): (phrase, phrase_score)

    If duplicate word occur in phrase, only one will be counted
    """
    phrase, list_ = data_
    word_list = []
    phrase_score = 0
    for tuple_ in list_:
        word_score, word = tuple_
        if word not in word_list:
            word_list.append(word)
            phrase_score += word_score
    return (phrase, phrase_score)

## Main Flow

In [22]:
def filter_col(row, col:str) -> bool:
    """check if col name exists"""
    return col in list(row.asDict().keys())

def print_take_1(rdd_, descriptor=''):
    global count_print_1
    p = rdd_.take(1)
    print(f'💎 {count_print_1}: ', descriptor + ': ')
    print(p, sep='\n')
    print()
    count_print_1 += 1

def rake_2_pipeline(data_, col):
    clean = data_.filter(lambda row: row.asDict()[col] != 'nan' and row.asDict()[col])
    clean = clean.map(lambda x: preprocess(x, col))
    print_take_1(clean, 'Processed input')

    # get candidate phrase
    #(phrase, -1)
    cand_phrases_duplicate = clean.flatMap(get_cand_phrases_list)
    print_take_1(cand_phrases_duplicate, '💎 Candidate Phrase:    Create a list of candidate phrases for each row then flatten them')
    
    # (word, freq)
    # sum up the freq for each word
    freq = clean.flatMap(get_freq).reduceByKey(lambda a, b: a + b) 
    print_take_1(freq, 'Given a phrase, get frequency for each word')

    # (word, co occurance )
    # sum up the co occurance for each word
    cand_phrases_co_occurance = cand_phrases_duplicate.flatMap(get_co_occurence).reduceByKey(lambda a, b: a + b) #.groupByKey().mapValues(list).map(lambda x: (x[0], len(x[1])))
    print_take_1(cand_phrases_co_occurance, 'Given a phrase, get co occurance for each word')

    # (word, (freq, co_occurance))
    score_join = freq.join(cand_phrases_co_occurance)
    print_take_1(score_join, 'Join frequency and co-occurrence for later score calculation')

    # (word, score)
    goven_score_res = score_join.map(calculate_score_2)
    print_take_1(goven_score_res, '💎 Word Score:   Calculate the score for each word')

    # (phrase, -1)
    cand_phrases_unique = cand_phrases_duplicate.groupByKey().keys().distinct().map(lambda x: (x, -1))
    print_take_1(cand_phrases_unique, '💎 candidate phrase: Get unqiue candidate phrase')

    # [str], map phrase from str -> list
    cand_phrases_unique = cand_phrases_unique.map(lambda x: x[0].split(' '))

    goven_phrase_iterable = cand_phrases_unique.map(lambda x: (1, [x])).reduceByKey(lambda a, b: a + b) 
    print_take_1(goven_phrase_iterable, 'Make the phrase iterable by adding elements to a list with a dummy key')

    #  (word, phrase)
    goven_phrase_unpacked = goven_phrase_iterable.flatMap(assign_phrase_id)
    print_take_1(goven_phrase_unpacked, 'assign word with id, the id is the phrase it belongs to')

    # (word, (phrase, 2.0)) -- map -> (phrase, (score, word))
    goven_phrase_join_score = goven_phrase_unpacked.join(goven_score_res).map(lambda x: ( x[1][0], (x[1][1], x[0]) ) )
    print_take_1(goven_phrase_join_score, 'Join the (word, score) with (word, phrase) -> (phrase, (score, word))')

    # (phrase, (score, word)) ->  (phrase, [(score, word), ...]) 
    phrase_score = goven_phrase_join_score.groupByKey().mapValues(list).map(get_unique_score_4_phrase) # .reduceByKey(lambda a, b: a + b).map(lambda x:(x[0],round(x[1], 5))) # TODO:remove duplicate word score for phrase
    print_take_1(phrase_score, '💎 Candidate keyword Score:   get phrase score by adding up word score, only unique word in phrase are added')
    
    print('💎 Final Results:  Sorted Output Result based on ess value from largest to smallest:')
    print(*phrase_score.sortBy(lambda r: r[1],ascending=False).take(20), sep='\n')

### governing law

In [23]:
gov = 'Governing Law'
con = 'Change of Control'
assign = 'Anti-assignment'

control = assignment.filter(lambda x: filter_col(x, con))
assignment = assignment.filter(lambda x: filter_col(x, assign))

count_print_1 = 1

print('Method 1 Keyword extraction result:')
print('Top 20 keyword for: ' + gov)
print()
rake_2_pipeline(governing, gov)

Method 1 Keyword extraction result:
Top 20 keyword for: Governing Law

💎 1:  Processed input: 
[['this', 'agreement', 'is', 'accepted', 'by', 'company', 'in', 'the', 'state', 'of', 'nevada', 'and', 'shall', 'be', 'governed', 'by', 'and', 'construed', 'in', 'accordance', 'with', 'the', 'laws', 'thereof', ',', 'which', 'laws', 'shall', 'prevail', 'in', 'the', 'event', 'of', 'any', 'conflict', '.']]

💎 2:  💎 Candidate Phrase:    Create a list of candidate phrases for each row then flatten them: 
[('agreement', -1)]

💎 3:  Given a phrase, get frequency for each word: 
[('this', 489)]

💎 4:  Given a phrase, get co occurance for each word: 
[('agreement', 448)]

💎 5:  Join frequency and co-occurrence for later score calculation: 
[('agreement', (477, 448))]

💎 6:  💎 Word Score:   Calculate the score for each word: 
[('agreement', 1.9392033542976939)]

💎 7:  💎 candidate phrase: Get unqiue candidate phrase: 
[('agreement', -1)]

💎 8:  Make the phrase iterable by adding elements to a list with 

### Change of Control

In [24]:
print('Method 1 Keyword extraction result:')
print()
print('Top 20 keyword for: ' + con)
count_print_1 = 1

rake_2_pipeline(control, con)

Method 1 Keyword extraction result:

Top 20 keyword for: Change of Control
💎 1:  Processed input: 
[['for', 'purposes', 'of', 'the', 'preceding', 'sentence', ',', 'and', 'without', 'limiting', 'its', 'generality', ',', 'any', 'merger', ',', 'consolidation', 'or', 'reorganization', 'involving', 'licensee', '(', 'regardless', 'of', 'whether', 'licensee', 'is', 'a', 'surviving', 'or', 'disappearing', 'entity', ')', 'will', 'be', 'deemed', 'to', 'be', 'a', 'transfer', 'of', 'rights', ',', 'obligations', 'or', 'performance', 'under', 'this', 'agreement', 'for', 'which', 'licensor', "'s", 'prior', 'written', 'consent', 'is', 'required', '.']]

💎 2:  💎 Candidate Phrase:    Create a list of candidate phrases for each row then flatten them: 
[('purposes', -1)]

💎 3:  Given a phrase, get frequency for each word: 
[('for', 72)]

💎 4:  Given a phrase, get co occurance for each word: 
[('purposes', 12)]

💎 5:  Join frequency and co-occurrence for later score calculation: 
[('purposes', (12, 12))]



### anti-assignment

In [25]:
print('Method 1 Keyword extraction result:')
print('Top 20 keyword for: ' + assign)
print()
count_print_1 = 1

rake_2_pipeline(assignment, assign)

Method 1 Keyword extraction result:
Top 20 keyword for: Anti-assignment

💎 1:  Processed input: 
[['ma', 'may', 'not', 'assign', ',', 'sell', ',', 'lease', 'or', 'otherwise', 'transfer', 'in', 'whole', 'or', 'in', 'party', 'any', 'of', 'the', 'rights', 'granted', 'pursuant', 'to', 'this', 'agreement', 'without', 'prior', 'written', 'approval', 'of', 'company', '.']]

💎 2:  💎 Candidate Phrase:    Create a list of candidate phrases for each row then flatten them: 
[('assign', -1)]

💎 3:  Given a phrase, get frequency for each word: 
[('ma', 1)]

💎 4:  Given a phrase, get co occurance for each word: 
[('assign', 399)]

💎 5:  Join frequency and co-occurrence for later score calculation: 
[('assign', (411, 399))]

💎 6:  💎 Word Score:   Calculate the score for each word: 
[('assign', 1.9708029197080292)]

💎 7:  💎 candidate phrase: Get unqiue candidate phrase: 
[('assign', -1)]

💎 8:  Make the phrase iterable by adding elements to a list with a dummy key: 
[(1, [['assign'], ['sell'], ['lease'