# Building Fast Queries on a CSV

### Reading in the data

In [46]:
import csv
import time
import random

In [47]:
with open('laptops.csv') as f:
    reader = csv.reader(f)
    rows = list(reader)
    header = rows[0]
    rows = rows[1:]

In [48]:
print(header)
for i in range(5):
    print(rows[i])

['Id', 'Company', 'Product', 'TypeName', 'Inches', 'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight', 'Price']
['6571244', 'Apple', 'MacBook Pro', 'Ultrabook', '13.3', 'IPS Panel Retina Display 2560x1600', 'Intel Core i5 2.3GHz', '8GB', '128GB SSD', 'Intel Iris Plus Graphics 640', 'macOS', '1.37kg', '1339']
['7287764', 'Apple', 'Macbook Air', 'Ultrabook', '13.3', '1440x900', 'Intel Core i5 1.8GHz', '8GB', '128GB Flash Storage', 'Intel HD Graphics 6000', 'macOS', '1.34kg', '898']
['3362737', 'HP', '250 G6', 'Notebook', '15.6', 'Full HD 1920x1080', 'Intel Core i5 7200U 2.5GHz', '8GB', '256GB SSD', 'Intel HD Graphics 620', 'No OS', '1.86kg', '575']
['9722156', 'Apple', 'MacBook Pro', 'Ultrabook', '15.4', 'IPS Panel Retina Display 2880x1800', 'Intel Core i7 2.7GHz', '16GB', '512GB SSD', 'AMD Radeon Pro 455', 'macOS', '1.83kg', '2537']
['8550527', 'Apple', 'MacBook Pro', 'Ultrabook', '13.3', 'IPS Panel Retina Display 2560x1600', 'Intel Core i5 3.1GHz', '8GB', '256GB SSD',

### Creating an inventory class

In [49]:
class Inventory():

    def __init__(self, csv_filename):
        # Read the CSV file provided in csv_filename
        with open(csv_filename) as f:
            reader = csv.reader(f)
            rows = list(reader)
        self.header = rows[0]
        self.rows = rows[1:]
        # Convert the price of each row to an integer
        for row in self.rows:
            row[-1] = int(row[-1])


In [50]:
# Testing the class
inventory = Inventory('laptops.csv')
print(inventory.header)
print(len(inventory.rows))

['Id', 'Company', 'Product', 'TypeName', 'Inches', 'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight', 'Price']
1303


### Writing a function to find a laptop by ID

For this section, I'll implement a get_laptop_from_id() function that given a laptop identifier find the row corresponding to that laptop.

In [51]:
class Inventory():

    def __init__(self, csv_filename):
        # Read the CSV file provided in csv_filename
        with open(csv_filename) as f:
            reader = csv.reader(f)
            rows = list(reader)
        self.header = rows[0]
        self.rows = rows[1:]
        # Convert the price of each row to an integer
        for row in self.rows:
            row[-1] = int(row[-1])
        
    # Creating a function for getting a laptop by ID
    def get_laptop_from_id(self, laptop_id):
        for row in self.rows:
            if row[0] == laptop_id:
                return row
        return None


In [52]:
# Testing the function
inventory = Inventory('laptops.csv')
print(inventory.get_laptop_from_id('3362737'))
print(inventory.get_laptop_from_id('3362736'))

['3362737', 'HP', '250 G6', 'Notebook', '15.6', 'Full HD 1920x1080', 'Intel Core i5 7200U 2.5GHz', '8GB', '256GB SSD', 'Intel HD Graphics 620', 'No OS', '1.86kg', 575]
None


### Implementing a version of the function that is more efficient

Mapping ids to rows in a dictionary to reduce time complexity of the function

In [53]:
class Inventory():

    def __init__(self, csv_filename):
        # Read the CSV file provided in csv_filename
        with open(csv_filename) as f:
            reader = csv.reader(f)
            rows = list(reader)
        self.header = rows[0]
        self.rows = rows[1:]
        # Convert the price of each row to an integer
        for row in self.rows:
            row[-1] = int(row[-1])
        # creating a dictionary with ids as the keys and each row as the values
        self.id_to_row = {}
        for row in self.rows:
            self.id_to_row[row[0]] = row 
    
        
    # Creating a function for getting a laptop by ID
    def get_laptop_from_id(self, laptop_id):
        for row in self.rows:
            if row[0] == laptop_id:
                return row
        return None
    
    # Creating a faster function for getting a laptop by ID
    def get_laptop_from_id_fast(self, laptop_id):
        for row in self.rows:
            if laptop_id in self.id_to_row:
                return self.id_to_row[laptop_id]
        return None

In [54]:
# Testing the function
inventory = Inventory('laptops.csv')
print(inventory.get_laptop_from_id_fast('3362737'))
print(inventory.get_laptop_from_id_fast('3362736'))

['3362737', 'HP', '250 G6', 'Notebook', '15.6', 'Full HD 1920x1080', 'Intel Core i5 7200U 2.5GHz', '8GB', '256GB SSD', 'Intel HD Graphics 620', 'No OS', '1.86kg', 575]
None


### Comparing the performance of the two methods

We will look at how long each method takes to lookup a random set of IDs and compare the total time taken

In [55]:
# Generating a random list of IDs
ids = [str(random.randint(1000000, 9999999)) for _ in range(10000)]
inventory = Inventory('laptops.csv')

In [56]:
# Using the first function
total_time_no_dict = 0

for i in ids:
    start = time.time()
    inventory.get_laptop_from_id(i)
    end = time.time()
    total_time_no_dict += (end-start)

In [57]:
# Using the second function
total_time_dict = 0

for i in ids:
    start = time.time()
    inventory.get_laptop_from_id_fast(i)
    end = time.time()
    total_time_dict += (end-start)

In [58]:
print(total_time_no_dict)
print(total_time_dict)

1.3311679363250732
1.198615312576294


We can see that using the dictionary method reduces the time complexity slightly. If we divide 1.331 by 1.199, there was a 1.11 times improvement on a relatively small dataset. 

## Writing a function to find out whether 2 laptops that add up to a specific amount

In [61]:
class Inventory():

    def __init__(self, csv_filename):
        # Read the CSV file provided in csv_filename
        with open(csv_filename) as f:
            reader = csv.reader(f)
            rows = list(reader)
        self.header = rows[0]
        self.rows = rows[1:]
        # Convert the price of each row to an integer
        for row in self.rows:
            row[-1] = int(row[-1])
        # creating a dictionary with ids as the keys and each row as the values
        self.id_to_row = {}
        for row in self.rows:
            self.id_to_row[row[0]] = row 
    
        
    # Creating a function for getting a laptop by ID
    def get_laptop_from_id(self, laptop_id):
        for row in self.rows:
            if row[0] == laptop_id:
                return row
        return None
    
    # Creating a faster function for getting a laptop by ID
    def get_laptop_from_id_fast(self, laptop_id):
        for row in self.rows:
            if laptop_id in self.id_to_row:
                return self.id_to_row[laptop_id]
        return None
    
    # Creating a function for checking if up to 2 laptops add up to a specific sum
    def check_promotion_dollars(self, dollars):
        for row in self.rows:
            if row[-1] == dollars:
                return True
        for row1 in self.rows:
            for row2 in self.rows:
                if row1[-1] + row2[-1] == dollars:
                    return True
        return False

In [62]:
inventory = Inventory('laptops.csv')
print(inventory.check_promotion_dollars(1000))
print(inventory.check_promotion_dollars(442))

True
False


### Reducing the time complexity of the function

I will do this by storing all prices in a set, which can be searched up with constant time complexity

In [63]:
class Inventory():

    def __init__(self, csv_filename):
        # Read the CSV file provided in csv_filename
        with open(csv_filename) as f:
            reader = csv.reader(f)
            rows = list(reader)
        self.header = rows[0]
        self.rows = rows[1:]
        # Convert the price of each row to an integer
        for row in self.rows:
            row[-1] = int(row[-1])
        # creating a dictionary with ids as the keys and each row as the values
        self.id_to_row = {}
        for row in self.rows:
            self.id_to_row[row[0]] = row 
        # creating a set of prices
        self.prices = set()
        for row in self.rows:
            self.prices.add(row[-1])
        
    # Creating a function for getting a laptop by ID
    def get_laptop_from_id(self, laptop_id):
        for row in self.rows:
            if row[0] == laptop_id:
                return row
        return None
    
    # Creating a faster function for getting a laptop by ID
    def get_laptop_from_id_fast(self, laptop_id):
        for row in self.rows:
            if laptop_id in self.id_to_row:
                return self.id_to_row[laptop_id]
        return None
    
    # Creating a function for checking if up to 2 laptops add up to a specific sum
    def check_promotion_dollars(self, dollars):    
        for row in self.rows:                   
            if row[-1] == dollars:
                return True
        for row1 in self.rows:                  
            for row2 in self.rows:
                if row1[-1] + row2[-1] == dollars:
                    return True
        return False
    
    # Creating a faster function for checking if up to 2 laptops add up to a specific sum
    def check_promotion_dollars_fast(self, dollars):
        if dollars in self.prices:
            return True
        for price in self.prices:
            if dollars - price in self.prices:
                return True
        return False       

In [64]:
inventory = Inventory('laptops.csv')
print(inventory.check_promotion_dollars_fast(1000))
print(inventory.check_promotion_dollars_fast(442))

True
False


### Comparing the performance of the 2 price functions

In [73]:
# Generating a random list of prices
prices = [random.randint(100, 5000) for _ in range(100)]
inventory = Inventory('laptops.csv')

In [74]:
# Using the first function
total_time_no_set = 0

for p in prices:
    start = time.time()
    inventory.check_promotion_dollars(p)
    end = time.time()
    total_time_no_set += (end-start)

In [75]:
# Using the second function
total_time_set = 0

for p in prices:
    start = time.time()
    inventory.check_promotion_dollars_fast(p)
    end = time.time()
    total_time_set += (end-start)

In [76]:
print(total_time_no_set)
print(total_time_set)

1.0748622417449951
0.0005741119384765625


We can see that using a set significantly improves the time complexity of the algorithm. If we divide 1.0749 by 0.0006, it is a 1792 times improvement in the speed.

## Finding all laptops within a customer's budget

In [79]:
def row_price(row):
    return row[-1]

class Inventory():

    def __init__(self, csv_filename):
        # Read the CSV file provided in csv_filename
        with open(csv_filename) as f:
            reader = csv.reader(f)
            rows = list(reader)
        self.header = rows[0]
        self.rows = rows[1:]
        # Convert the price of each row to an integer
        for row in self.rows:
            row[-1] = int(row[-1])
        # creating a dictionary with ids as the keys and each row as the values
        self.id_to_row = {}
        for row in self.rows:
            self.id_to_row[row[0]] = row 
        # creating a set of prices
        self.prices = set()
        for row in self.rows:
            self.prices.add(row[-1])
        # Sorting rows by price
        self.rows_by_price = sorted(self.rows, key=row_price)
        
    # Creating a function for getting a laptop by ID
    def get_laptop_from_id(self, laptop_id):
        for row in self.rows:
            if row[0] == laptop_id:
                return row
        return None
    
    # Creating a faster function for getting a laptop by ID
    def get_laptop_from_id_fast(self, laptop_id):
        for row in self.rows:
            if laptop_id in self.id_to_row:
                return self.id_to_row[laptop_id]
        return None
    
    # Creating a function for checking if up to 2 laptops add up to a specific sum
    def check_promotion_dollars(self, dollars):    
        for row in self.rows:                   
            if row[-1] == dollars:
                return True
        for row1 in self.rows:                  
            for row2 in self.rows:
                if row1[-1] + row2[-1] == dollars:
                    return True
        return False
    
    # Creating a faster function for checking if up to 2 laptops add up to a specific sum
    def check_promotion_dollars_fast(self, dollars):
        if dollars in self.prices:
            return True
        for price in self.prices:
            if dollars - price in self.prices:
                return True
        return False
       
    # Defining a function to find the first laptop that costs > budget
    def find_first_laptop_more_expensive(self, target_price):
        range_start = 0                                   
        range_end = len(self.rows_by_price) - 1                   
        while range_start < range_end:
            range_middle = (range_end + range_start) // 2  
            price = self.rows_by_price[range_middle][-1]
            if price > target_price:
                range_end = range_middle
            else:
                range_start = range_middle + 1
        if self.rows_by_price[range_start][-1] <= target_price:                  
            return -1                                   
        return range_start

In [80]:
inventory = Inventory('laptops.csv')
print(inventory.find_first_laptop_more_expensive(1000))
print(inventory.find_first_laptop_more_expensive(10000))

683
-1


Possible next steps:
- Imagine that we extend our budget query to take as input a range of prices, min_price and max_price, rather than a single price. Write a query that finds all laptops whose price is in the given range.
- Sometimes, a customer wants a laptop with some characteristics such as, for instance, 8GB or RAM and a 256GB hard drive. It would be interesting for those customers to provide a way to find the cheapest laptop that matches the desired characteristics. For simplicity, focus only on the amount of RAM and hard drive capacity. You might need to convert those values to integers rather than using strings.