In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell 
# Display the value of every top-level expression in a cell, not only the last one
# (IPython's default is "last_expr").
InteractiveShell.ast_node_interactivity = "all"

# Quiz: Refactoring - Wine Quality

# Quiz: Optimizing - Common Books

We want to find all the coding books published within the last two years

**`with` statement in Python**

* A file is accessed by opening it.
    The `open()` function returns a file object, which has methods and attributes for getting information about and manipulating the open file.
* The `with` statement offers cleaner syntax and exception handling.
    It simplifies exception handling by encapsulating common preparation and cleanup tasks. In addition, it automatically closes the file when the block exits, even if an exception is raised. The `with` statement thus ensures that clean-up always runs.

In [2]:
import time

In [3]:
# Read each book list fully into memory and split the text on newlines.
# NOTE(review): if a file ends with a trailing newline, split("\n") yields a
# final empty string entry — confirm the data files do not.
with open("./data/books_published_last_two_years.txt") as recent_handle:
    recent_books = recent_handle.read().split("\n")

with open("./data/all_coding_books.txt") as coding_handle:
    coding_books = coding_handle.read().split("\n")

In [4]:
# Sanity check: report how many entries each list holds.
print("Length recent_books: {}".format(len(recent_books)))
print("Length coding_books: {}".format(len(coding_books)))

Length recent_books: 24159
Length coding_books: 32250


### 1st method : naive double loop

In [5]:
# Baseline: for every recent book, search the whole `coding_books` list.
# `in` on a list is a linear scan, so the total work grows with
# len(recent_books) * len(coding_books).
#
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock meant for measuring elapsed time, whereas time.time()
# follows the system clock and can jump if that clock is adjusted.
start = time.perf_counter()
recent_coding_books = []

for book in recent_books:
    if book in coding_books:
        recent_coding_books.append(book)

end = time.perf_counter()
print("Duration: {:.4f}".format(end-start))
print("\nNo. of recent_coding_books:", len(recent_coding_books))

Duration: 15.4991

No. of recent_coding_books: 96


### 2nd method : numpy intersect

NumPy's `intersect1d` function can be used to get the intersection of the `recent_books` and `coding_books` lists.

- `intersect1d`: Find the intersection of two arrays. Return the sorted, unique values that are in both of the input arrays. 
```python
>>> numpy.intersect1d([1,2,3], [3,1,1])
array([1,3])
```
- [link](https://numpy.org/doc/stable/reference/generated/numpy.intersect1d.html)

In [6]:
# np.intersect1d returns the sorted, unique values found in both inputs.
# Both inputs are Python lists here, so the measured time also includes
# their conversion to NumPy arrays.
#
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the system clock and may be too coarse
# for an interval of a few hundredths of a second.
start = time.perf_counter()
recent_coding_books = np.intersect1d(recent_books, coding_books)
end = time.perf_counter()
print("Duration: {:.4f}".format(end-start))
print("\nNo. of recent_coding_books:", len(recent_coding_books))

Duration: 0.0431

No. of recent_coding_books: 96


### 3rd method : set intersect

What is a set? [link](https://docs.python.org/3/library/stdtypes.html#set-types-set-frozenset)

In [7]:
# Build a set from `recent_books`, then intersect it with `coding_books`.
# The `intersection` method accepts any iterable, so the second list can be
# passed as-is.
#
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the system clock and may be too coarse
# for an interval of roughly ten milliseconds.
start = time.perf_counter()
recent_coding_books = set(recent_books).intersection(coding_books)
end = time.perf_counter()
print("Duration: {:.4f}".format(end-start))
print("\nNo. of recent_coding_books:", len(recent_coding_books))

Duration: 0.0108

No. of recent_coding_books: 96


In [8]:
# Variant: convert BOTH lists to sets before intersecting, to compare against
# passing the second list directly to `intersection`.
#
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the system clock and may be too coarse
# for an interval of roughly ten milliseconds.
start = time.perf_counter()
recent_coding_books = set(recent_books).intersection(set(coding_books))
end = time.perf_counter()
print("Duration: {:.4f}".format(end-start))
print("\nNo. of recent_coding_books:", len(recent_coding_books))

Duration: 0.0106

No. of recent_coding_books: 96


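__Note__: We do not need to convert the 2nd list to a set, because `set.intersection` accepts any iterable. In the single runs above the two timings (0.0108 s without the conversion, 0.0106 s with it) are within measurement noise, so neither variant is shown to be faster; use `%timeit` to compare them reliably.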

# Quiz: Optimizing - Holiday Gifts

- Using vectorized operations and more efficient data structures can optimize the code significantly. 

We'll use this for another example. 

- One million users have listed a gift on a wish list. 
- Prices: `gift_costs.txt`
- Give each customer their gift for free if it costs less than 25 dollars. 
- Calculate the total cost, including 8% tax, of all gifts under 25 dollars.

General notes:

- Check type of your data
- What type of data do you want? In general, numpy arrays are nice to work with and they are fast. 

### Set data type

In [9]:
# Load the raw gift costs: read the whole file and split the text on newlines.
# NOTE(review): a trailing newline in the file would leave a final empty string,
# which would not convert to an integer later — confirm the file has none.
with open("./data/gift_costs.txt") as costs_handle:
    gift_costs_raw = costs_handle.read().split("\n")

In [10]:
# Inspect the type of the first entry to see how the costs were loaded.
print("Costs are stored as {}".format(type(gift_costs_raw[0])))

Costs are stored as <class 'str'>


In [11]:
# Convert the costs from strings to integers so they can be compared and summed.
gift_costs = np.asarray(gift_costs_raw).astype(int)
print("Costs are stored as {}".format(type(gift_costs[0])))

Costs are stored as <class 'numpy.int64'>


### First Method

In [None]:
# First method: plain Python loop over every cost.
# Adds the after-tax price (cost * 1.08) of each gift costing less than 25.
#
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the system clock and can jump if that
# clock is adjusted.
start = time.perf_counter()

total_price = 0

for cost in gift_costs:
    if cost < 25:
        total_price += cost*1.08 # cost after tax

end = time.perf_counter()

print(round(total_price,2))
print("Duration: {:.4f} seconds".format(end-start))

### 2nd Method : conditional numpy

In [13]:
# 2nd method: select the costs under 25 with a NumPy boolean mask, then add
# them with Python's built-in sum(), which iterates over the selected values
# one at a time. Tax (x 1.08) is applied once to the total.
#
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the system clock and can jump if that
# clock is adjusted.
start = time.perf_counter()

total_price = sum(gift_costs[gift_costs<25])*1.08

end = time.perf_counter()

print(round(total_price,2))
print("Duration: {:.4f} seconds".format(end-start))

32765421.24
Duration: 0.6012 seconds


### 3rd Method : conditional numpy + `.sum()` method

In [14]:
# 3rd method: conditional numpy selection plus the array's own .sum() method,
# so both the filtering and the addition are done by NumPy. Tax (x 1.08) is
# applied once to the total.
#
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the system clock and may be too coarse
# for an interval of a few hundredths of a second.
start = time.perf_counter()

total_price = gift_costs[gift_costs<25].sum()*1.08

end = time.perf_counter()

print(round(total_price,2))
print("Duration: {:.4f} seconds".format(end-start))

32765421.24
Duration: 0.0701 seconds
