In [141]:
import random

# Build a strictly increasing list of 10 million integers: every element is the
# previous one plus a random gap between 1 and 10, so the list is sorted and
# contains no duplicates.
numbers_list = []
last = 0
for i in range(10_000_000):
    last += random.randint(1, 10)
    # append() grows the list in place; `+= [last]` built a temporary
    # one-element list on every iteration.
    numbers_list.append(last)

# Preview the first few values.
numbers_list[0:5]

[2, 11, 14, 22, 30]

In [142]:
import math


def btree_organize(numbers_list):
    """Recursively arrange ``numbers_list`` into a nested-dict binary search tree.

    Despite the name this is a plain binary tree, not a B-tree: every internal
    node is ``{"middle": pivot, "left": subtree, "right": subtree}``.

    Lists with fewer than 10 items are returned unchanged and act as leaves.
    The pivot is the element stored just before the positional midpoint, which
    is the median only when the input is already sorted. Values equal to the
    pivot are represented by the node itself and are not copied into either
    child.
    """
    size = len(numbers_list)
    if size < 10:
        return numbers_list

    pivot = numbers_list[size // 2 - 1]

    below, above = [], []
    for value in numbers_list:
        if value < pivot:
            below.append(value)
        if value > pivot:
            above.append(value)

    return {
        "middle": pivot,
        "left": btree_organize(below),
        "right": btree_organize(above),
    }


# Organize the whole list into the nested-dict tree, then display only the
# first 400 characters of its text form (the full repr is far too large to show).
numbers_btree = btree_organize(numbers_list)
f"{str(numbers_btree)[:400]}..."

"{'middle': 27496659, 'left': {'middle': 13749787, 'left': {'middle': 6871657, 'left': {'middle': 3436995, 'left': {'middle': 1719890, 'left': {'middle': 859066, 'left': {'middle': 430153, 'left': {'middle': 215304, 'left': {'middle': 108006, 'left': {'middle': 53983, 'left': {'middle': 26851, 'left': {'middle': 13459, 'left': {'middle': 6658, 'left': {'middle': 3325, 'left': {'middle': 1651, 'left..."

In [145]:
def btree_search(number, numbers_btree, steps=0):
    """Look ``number`` up in a tree produced by ``btree_organize``.

    Returns ``{"steps": n, "found": value_or_None}``.

    Step accounting: each descent from a node into one of its children costs
    one step; matching a node's own ``middle`` costs nothing extra; inside a
    leaf list the cost is the 1-based position of the match. A miss in a leaf
    adds nothing beyond the descents already counted.
    """
    node = numbers_btree

    # Internal nodes are dicts carrying a 'middle' key; leaves are plain lists.
    while 'middle' in node:
        pivot = node['middle']
        if number < pivot:
            node = node['left']
        elif number > pivot:
            node = node['right']
        else:
            return { "steps": steps, "found": pivot }
        steps += 1

    if number not in node:
        return { "steps": steps, "found": None }

    return { "steps": steps + node.index(number) + 1, "found": number }
        
# Example lookup of a value that happened to be generated in this run.
btree_search(number=4242, numbers_btree=numbers_btree)

{'steps': 25, 'found': 4242}

In [146]:
from statistics import mean

# Draw 1000 values that are known to be present in the list.
sample = random.sample(numbers_list, 1000)

# Tree lookup: collect the step count reported for each sampled value.
%time steps = [btree_search(s, numbers_btree)['steps'] for s in sample]
print("Avg number of btree steps", mean(steps))
print("\n")
# Baseline: list.index scans from the front, so the index it returns is also
# the number of elements it had to skip. This line is slow (minutes in the
# recorded run).
%time indexes = [numbers_list.index(s) for s in sample]
print("Avg number of indexes", mean(indexes))

CPU times: user 33.6 ms, sys: 6.47 ms, total: 40.1 ms
Wall time: 40.9 ms
Avg number of btree steps 23.769


CPU times: user 3min 17s, sys: 1.16 s, total: 3min 18s
Wall time: 3min 26s
Avg number of indexes 5019753.768


In [147]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Feature matrix: a single column holding every stored value.
X = np.array(numbers_list).reshape(-1, 1)
# Target: the position of each value in the list. np.arange builds this
# directly instead of iterating a Python range element by element.
y = np.arange(len(numbers_list))

# Learn a linear map value -> position, then report R^2 on the training data.
model = LinearRegression().fit(X, y)
model.score(X, y)

0.9999999912347628

In [150]:
def predict_index(number, model):
    """Return the model's estimate of the list position of ``number``.

    ``model.predict`` is called with a single one-feature row and the first
    prediction is rounded up to an integer. The result is not clamped, so it
    can fall outside the valid index range of the list.
    """
    single_row = [[number]]
    estimate = model.predict(single_row)[0]
    return math.ceil(estimate)


# Compare the model's guess with the true position of one known value.
print(
    "Predicted index", predict_index(37123789, model),
    "Actual index", numbers_list.index(37123789),
)

Predicted index 6751099 Actual index 6751341


In [152]:
def regression_search(number, numbers_list, model, steps=0):
    """Find ``number`` by jumping to the model's predicted index and scanning outwards.

    Returns ``{"steps": n, "found": number}`` on success and
    ``{"steps": n, "found": None}`` when the value is not in the list (the
    same result shape as ``btree_search``).

    Step accounting: the probe at the predicted index costs one step. Each
    widening round then checks one position to the right and one to the left
    of it; a hit in a round adds one step, a round with no hit adds two.

    The predicted index is clamped to ``[0, len(numbers_list) - 1]`` and
    positions outside the list are never probed. Without these bounds a probe
    past the end raised ``IndexError`` and a negative index wrapped around to
    the end of the list.
    """
    if len(numbers_list) == 0:
        return { "steps": steps, "found": None }

    maximum = len(numbers_list) - 1
    # Clamp on both sides: the regression can overshoot the end or go below 0.
    predicted_index = max(0, min(predict_index(number, model), maximum))
    steps += 1
    if numbers_list[predicted_index] == number:
        return { "steps": steps, "found": number }

    offset = 1
    # Keep widening while at least one side still has positions inside the list.
    while predicted_index + offset <= maximum or predicted_index - offset >= 0:
        right_index = predicted_index + offset
        left_index = predicted_index - offset
        if right_index <= maximum and numbers_list[right_index] == number:
            return { "steps": steps + 1, "found": number }
        if left_index >= 0 and numbers_list[left_index] == number:
            return { "steps": steps + 1, "found": number }
        offset += 1
        steps += 2

    # Both directions are exhausted: the value is not in the list.
    return { "steps": steps, "found": None }
    
# Example lookup of a value that happened to be generated in this run.
regression_search(number=37123789, numbers_list=numbers_list, model=model)

{'steps': 484, 'found': 37123789}

In [153]:
# Fresh sample of 1000 values that are known to be present in the list.
sample = random.sample(numbers_list, 1000)

# 1) Tree lookup: step count reported for each sampled value.
%time steps = [btree_search(s, numbers_btree)['steps'] for s in sample]
print("Avg number of btree steps", mean(steps))
print("\n")

# 2) Regression-guided lookup on the flat list. `steps` is reused here, so the
#    tree results above are overwritten after they have been printed.
%time steps = [regression_search(s, numbers_list, model)['steps'] for s in sample]
print("Avg number of regression steps", mean(steps))
print("\n")

# 3) Baseline: list.index scans from the front, so the index it returns is also
#    the number of elements it had to skip. Slow (minutes in the recorded run).
%time indexes = [numbers_list.index(s) for s in sample]
print("Avg number of indexes", mean(indexes))

CPU times: user 31.2 ms, sys: 1.96 ms, total: 33.1 ms
Wall time: 36.6 ms
Avg number of btree steps 23.667


CPU times: user 303 ms, sys: 10.1 ms, total: 313 ms
Wall time: 309 ms
Avg number of regression steps 427.863


CPU times: user 3min 27s, sys: 2.18 s, total: 3min 29s
Wall time: 4min 9s
Avg number of indexes 5011172.64


In [184]:
%%bash

pip install pympler hurry.filesize



In [191]:
from pympler import asizeof
from hurry.filesize import size

# Measure the deep in-memory size of each structure first, then report them
# in human-readable units.
list_size = asizeof.asizeof(numbers_list)
btree_size = asizeof.asizeof(numbers_btree)
model_size = asizeof.asizeof(model)

print("Size of list", size(list_size))
print("Size of btree", size(btree_size))
print("Size of model", size(model_size))

Size of list 382M
Size of btree 754M
Size of model 1K


# Conclusion

Using the regression model to find an item in the list is still not faster than the b-tree: although it can jump to a position very close to the desired index, it then has to walk left and right from there until it finds the item (about 428 steps on average, versus about 24 for the b-tree).

On the other hand, the regression model is not too slow either: roughly 300 ms for 1,000 lookups in a list of 10 million items might be acceptable. It is also far lighter (about 1K). You still need the original list in memory to find the item (382M), but you don't pay the price for the whole b-tree structure (754M).