# Data Science with Python and Dask
## Chapter 9: Working with Bags and Arrays

### Section 9.1

In [1]:
# Listing 9.1
# Import bag and read in the data
import dask.bag as bag
import os

os.chdir('/Users/jesse/Documents')
raw_data = bag.read_text('foods.txt')
raw_data

dask.bag<bag-fro..., npartitions=1>

### Section 9.1.1

In [2]:
# Listing 9.2
# Take a small sample of the first few elements of the bag
raw_data.take(10)

('product/productId: B001E4KFG0\n',
 'review/userId: A3SGXH7AUHU8GW\n',
 'review/profileName: delmartian\n',
 'review/helpfulness: 1/1\n',
 'review/score: 5.0\n',
 'review/time: 1303862400\n',
 'review/summary: Good Quality Dog Food\n',
 'review/text: I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.\n',
 '\n',
 'product/productId: B00813GRG4\n')

### Section 9.1.2

In [3]:
# Listing 9.3
# If we try to count across the file, we might run into an encoding error
raw_data.count().compute()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xce in position 2620: invalid continuation byte

In [4]:
# Listing 9.4
raw_data = bag.read_text('foods.txt', encoding='cp1252')
raw_data.count().compute()

5116093

### Section 9.1.3

In [6]:
# Listing 9.5
from dask.delayed import delayed

def get_next_part(file, start_index, span_index=0, blocksize=1024):
    file.seek(start_index)
    buffer = file.read(blocksize + span_index).decode('cp1252')
    delimiter_position = buffer.find('\n\n')
    if delimiter_position == -1:
        return get_next_part(file, start_index, span_index + blocksize)
    else:
        file.seek(start_index)
        return start_index, delimiter_position

In [8]:
# Listing 9.6
with open('foods.txt', 'rb') as file_handle:
    size = file_handle.seek(0,2) - 1
    more_data = True
    output = []
    current_position = next_position = 0
    while more_data:
        if current_position >= size:
            more_data = False
        else:
            current_position, next_position = get_next_part(file_handle, current_position, 0)
            output.append((current_position, next_position))
            current_position = current_position + next_position + 2

In [7]:
# Listing 9.7
def get_item(filename, start_index, delimiter_position, encoding='cp1252'):
    with open(filename, 'rb') as file_handle:
        file_handle.seek(start_index)
        text = file_handle.read(delimiter_position).decode(encoding)
        elements = text.strip().split('\n')
        key_value_pairs = [(element.split(': ')[0], element.split(': ')[1]) 
                               if len(element.split(': ')) > 1 
                               else ('unknown', element) 
                               for element in elements]
        return dict(key_value_pairs)

In [9]:
# Listing 9.8
reviews = bag.from_sequence(output).map(lambda x: get_item('foods.txt', x[0], x[1]))

In [10]:
# Listing 9.9
reviews.take(2)

({'product/productId': 'B001E4KFG0',
  'review/userId': 'A3SGXH7AUHU8GW',
  'review/profileName': 'delmartian',
  'review/helpfulness': '1/1',
  'review/score': '5.0',
  'review/time': '1303862400',
  'review/summary': 'Good Quality Dog Food',
  'review/text': 'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.'},
 {'product/productId': 'B00813GRG4',
  'review/userId': 'A1D87F6ZCVE5NK',
  'review/profileName': 'dll pa',
  'review/helpfulness': '0/0',
  'review/score': '1.0',
  'review/time': '1346976000',
  'review/summary': 'Not as Advertised',
  'review/text': 'Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".'})

In [11]:
# Listing 9.10
from dask.diagnostics import ProgressBar

with ProgressBar():
    count = reviews.count().compute()
count

[########################################] | 100% Completed |  8.7s


568454

### Section 9.2.1

In [12]:
# Listing 9.11
def get_score(element):
    score_numeric = float(element['review/score'])
    return score_numeric

In [13]:
# Listing 9.12
review_scores = reviews.map(get_score)
review_scores.take(10)

(5.0, 1.0, 4.0, 2.0, 5.0, 4.0, 5.0, 5.0, 5.0, 5.0)

In [14]:
# Listing 9.13
def tag_positive_negative_by_score(element):
    if float(element['review/score']) > 3:
        element['review/sentiment'] = 'positive'
    else:
        element['review/sentiment'] = 'negative'
    return element

reviews.map(tag_positive_negative_by_score).take(3)

({'product/productId': 'B001E4KFG0',
  'review/userId': 'A3SGXH7AUHU8GW',
  'review/profileName': 'delmartian',
  'review/helpfulness': '1/1',
  'review/score': '5.0',
  'review/time': '1303862400',
  'review/summary': 'Good Quality Dog Food',
  'review/text': 'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.',
  'review/sentiment': 'positive'},
 {'product/productId': 'B00813GRG4',
  'review/userId': 'A1D87F6ZCVE5NK',
  'review/profileName': 'dll pa',
  'review/helpfulness': '0/0',
  'review/score': '1.0',
  'review/time': '1346976000',
  'review/summary': 'Not as Advertised',
  'review/text': 'Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "

In [15]:
# Listing 9.14
reviews.take(1)

({'product/productId': 'B001E4KFG0',
  'review/userId': 'A3SGXH7AUHU8GW',
  'review/profileName': 'delmartian',
  'review/helpfulness': '1/1',
  'review/score': '5.0',
  'review/time': '1303862400',
  'review/summary': 'Good Quality Dog Food',
  'review/text': 'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.'},)

### Section 9.2.2

In [16]:
# Listing 9.15
specific_item = reviews.filter(lambda element: element['product/productId'] == 'B001E4KFG0')
specific_item.take(5)

  "larger `npartitions` to `take`.".format(n, len(r)))


({'product/productId': 'B001E4KFG0',
  'review/userId': 'A3SGXH7AUHU8GW',
  'review/profileName': 'delmartian',
  'review/helpfulness': '1/1',
  'review/score': '5.0',
  'review/time': '1303862400',
  'review/summary': 'Good Quality Dog Food',
  'review/text': 'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.'},)

In [17]:
# Listing 9.16
keyword = reviews.filter(lambda element: 'dog' in element['review/text'])
keyword.take(5)

({'product/productId': 'B001E4KFG0',
  'review/userId': 'A3SGXH7AUHU8GW',
  'review/profileName': 'delmartian',
  'review/helpfulness': '1/1',
  'review/score': '5.0',
  'review/time': '1303862400',
  'review/summary': 'Good Quality Dog Food',
  'review/text': 'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.'},
 {'product/productId': 'B00171APVA',
  'review/userId': 'A21BT40VZCCYT4',
  'review/profileName': 'Carol A. Reed',
  'review/helpfulness': '0/0',
  'review/score': '5.0',
  'review/time': '1351209600',
  'review/summary': 'Healthy Dog Food',
  'review/text': 'This is a very healthy dog food. Good for their digestion. Also good for small puppies. My dog eats her required amount at every feeding.'},
 {'product/productId': 'B0019CW0HE',
  'review/userId': 'A1FD9E5C06UB6

In [18]:
# Listing 9.17
def is_helpful(element):
    helpfulness = element['review/helpfulness'].strip().split('/')
    number_of_helpful_votes = float(helpfulness[0])
    number_of_total_votes = float(helpfulness[1])
    # Watch for divide by 0 errors
    if number_of_total_votes >= 1:
        return number_of_helpful_votes / number_of_total_votes > 0.75
    else:
        return False

In [19]:
# Listing 9.18
helpful_reviews = reviews.filter(is_helpful)
helpful_reviews.take(2)

({'product/productId': 'B001E4KFG0',
  'review/userId': 'A3SGXH7AUHU8GW',
  'review/profileName': 'delmartian',
  'review/helpfulness': '1/1',
  'review/score': '5.0',
  'review/time': '1303862400',
  'review/summary': 'Good Quality Dog Food',
  'review/text': 'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.'},
 {'product/productId': 'B000LQOCH0',
  'review/userId': 'ABXLMWJIXXAIN',
  'review/profileName': 'Natalia Corres "Natalia Corres"',
  'review/helpfulness': '1/1',
  'review/score': '4.0',
  'review/time': '1219017600',
  'review/summary': '"Delight" says it all',
  'review/text': 'This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coate

### Section 9.2.3

In [20]:
# Listing 9.19
helpful_review_scores = helpful_reviews.map(get_score)

with ProgressBar():
    all_mean = review_scores.mean().compute()
    helpful_mean = helpful_review_scores.mean().compute()
    
print(f"Mean Score of All Reviews: {round(all_mean, 2)}\nMean Score of Helpful Reviews: {round(helpful_mean,2)}")

In [24]:
# Listing 9.20
def get_length(element):
    return len(element['review/text'])

with ProgressBar():
    review_length_helpful = helpful_reviews.map(get_length).mean().compute()
    review_length_unhelpful = reviews.filter(lambda review: not is_helpful(review)).map(get_length).mean().compute()
print(f"Mean Length of Helpful Reviews: {round(review_length_helpful, 2)}\nMean Length of Unhelpful Reviews: {round(review_length_unhelpful,2)}")

[########################################] | 100% Completed | 10.8s
[########################################] | 100% Completed |  9.9s
Mean Length of Helpful Reviews: 459.36
Mean Length of Unhelpful Reviews: 379.32


### Section 9.2.4

In [22]:
# Listing 9.21
def count(accumulator, element):
    return accumulator + 1

def combine(total1, total2):
    return total1 + total2

with ProgressBar():
    count_of_reviews_by_score = reviews.foldby(get_score, count, 0, combine, 0).compute()
count_of_reviews_by_score

[########################################] | 100% Completed |  9.0s


[(5.0, 363122), (1.0, 52268), (4.0, 80655), (2.0, 29769), (3.0, 42640)]

In [None]:
# Listing 9.22
# Listing 9.21 displays the following output:
# [(5.0, 363122), (1.0, 52268), (4.0, 80655), (2.0, 29769), (3.0, 42640)]

### Section 9.3

In [16]:
# Listing 9.23
def get_score_and_helpfulness(element):
    score_numeric = float(element['review/score'])
    helpfulness = element['review/helpfulness'].strip().split('/')
    number_of_helpful_votes = float(helpfulness[0])
    number_of_total_votes = float(helpfulness[1])
    # Watch for divide by 0 errors
    if number_of_total_votes > 0:
        helpfulness_percent = number_of_helpful_votes / number_of_total_votes
    else:
        helpfulness_percent = 0.
    return (score_numeric, helpfulness_percent)

In [None]:
# Listing 9.24
scores_and_helpfulness = reviews.map(get_score_and_helpfulness).to_dataframe(meta={'Review Scores': float, 'Helpfulness Percent': float})

In [None]:
# Listing 9.25
with ProgressBar():
    scores_and_helpfulness_stats = scores_and_helpfulness.describe().compute()
scores_and_helpfulness_stats

### Section 9.4.2

In [28]:
# Listing 9.26
from nltk.corpus import stopwords 
from nltk.tokenize import RegexpTokenizer
from functools import partial

tokenizer = RegexpTokenizer(r'\w+')

def extract_reviews(element):
    return element['review/text'].lower()

def filter_stopword(word, stopwords):
    return word not in stopwords

def filter_stopwords(tokens, stopwords):
    return list(filter(partial(filter_stopword, stopwords=stopwords), tokens))

stopword_set = set(stopwords.words('english'))

(['bought',
  'several',
  'vitality',
  'canned',
  'dog',
  'food',
  'products',
  'found',
  'good',
  'quality',
  'product',
  'looks',
  'like',
  'stew',
  'processed',
  'meat',
  'smells',
  'better',
  'labrador',
  'finicky',
  'appreciates',
  'product',
  'better'],)

In [None]:
# Listing 9.27
review_text = reviews.map(extract_reviews)
review_text_tokens = review_text.map(tokenizer.tokenize)
review_text_clean = review_text_tokens.map(partial(filter_stopwords, stopwords=stopword_set))
review_text_clean.take(1)

In [29]:
# Listing 9.28
def make_bigrams(tokens):
    return set(nltk.bigrams(tokens))

review_bigrams = review_text_clean.map(make_bigrams)
review_bigrams.take(2)

({('appreciates', 'product'),
  ('better', 'labrador'),
  ('bought', 'several'),
  ('canned', 'dog'),
  ('dog', 'food'),
  ('finicky', 'appreciates'),
  ('food', 'products'),
  ('found', 'good'),
  ('good', 'quality'),
  ('labrador', 'finicky'),
  ('like', 'stew'),
  ('looks', 'like'),
  ('meat', 'smells'),
  ('processed', 'meat'),
  ('product', 'better'),
  ('product', 'looks'),
  ('products', 'found'),
  ('quality', 'product'),
  ('several', 'vitality'),
  ('smells', 'better'),
  ('stew', 'processed'),
  ('vitality', 'canned')},
 {('actually', 'small'),
  ('arrived', 'labeled'),
  ('error', 'vendor'),
  ('intended', 'represent'),
  ('jumbo', 'salted'),
  ('labeled', 'jumbo'),
  ('peanuts', 'actually'),
  ('peanuts', 'peanuts'),
  ('product', 'arrived'),
  ('product', 'jumbo'),
  ('represent', 'product'),
  ('salted', 'peanuts'),
  ('sized', 'unsalted'),
  ('small', 'sized'),
  ('sure', 'error'),
  ('unsalted', 'sure'),
  ('vendor', 'intended')})

In [30]:
# Listing 9.29
all_bigrams = review_bigrams.flatten()
all_bigrams.take(10)

(('product', 'better'),
 ('finicky', 'appreciates'),
 ('meat', 'smells'),
 ('looks', 'like'),
 ('good', 'quality'),
 ('vitality', 'canned'),
 ('like', 'stew'),
 ('processed', 'meat'),
 ('labrador', 'finicky'),
 ('several', 'vitality'))

In [31]:
# Listing 9.30
with ProgressBar():
    top10_bigrams = all_bigrams.foldby(lambda x: x, count, 0, combine, 0).topk(10, key=lambda x: x[1]).compute()
top10_bigrams

[########################################] | 100% Completed | 11min  7.6s


[(('br', 'br'), 103258),
 (('amazon', 'com'), 15142),
 (('highly', 'recommend'), 14017),
 (('taste', 'like'), 13251),
 (('gluten', 'free'), 11641),
 (('grocery', 'store'), 11627),
 (('k', 'cups'), 11102),
 (('much', 'better'), 10681),
 (('http', 'www'), 10575),
 (('www', 'amazon'), 10517)]

In [32]:
# Listing 9.31
more_stopwords = {'br', 'amazon', 'com', 'http', 'www', 'href', 'gp'}
all_stopwords = stopword_set.union(more_stopwords)

filtered_bigrams = review_text_tokens.map(partial(filter_stopwords, stopwords=all_stopwords)).map(make_bigrams).flatten()

with ProgressBar():
    top10_bigrams = filtered_bigrams.foldby(lambda x: x, count, 0, combine, 0).topk(10, key=lambda x: x[1]).compute()
top10_bigrams

[########################################] | 100% Completed | 11min 19.9s


[(('highly', 'recommend'), 14024),
 (('taste', 'like'), 13343),
 (('gluten', 'free'), 11641),
 (('grocery', 'store'), 11630),
 (('k', 'cups'), 11102),
 (('much', 'better'), 10695),
 (('tastes', 'like'), 10471),
 (('great', 'product'), 9192),
 (('cup', 'coffee'), 8988),
 (('really', 'good'), 8897)]

### Section 9.4.3

In [61]:
# Listing 9.32
negative_review_text = reviews.filter(lambda review: float(review['review/score']) < 3).map(extract_reviews)
negative_review_text_tokens = negative_review_text.map(tokenizer.tokenize)
negative_review_text_clean = negative_review_text_tokens.map(partial(filter_stopwords, stopwords=all_stopwords))
negative_review_bigrams = negative_review_text_clean.map(make_bigrams)
negative_bigrams = negative_review_bigrams.flatten()

with ProgressBar():
    top10_negative_bigrams = negative_bigrams.foldby(lambda x: x, count, 0, combine, 0).topk(10, key=lambda x: x[1]).compute()
top10_negative_bigrams

[########################################] | 100% Completed |  2min 25.9s


[(('taste', 'like'), 3352),
 (('tastes', 'like'), 2858),
 (('waste', 'money'), 2262),
 (('k', 'cups'), 1892),
 (('much', 'better'), 1659),
 (('thought', 'would'), 1604),
 (('tasted', 'like'), 1515),
 (('grocery', 'store'), 1489),
 (('would', 'recommend'), 1445),
 (('taste', 'good'), 1408)]