In [1]:
import random
import string

import numpy as np
import pandas as pd

from IPython.display import display, HTML

In [30]:
random.seed(123)
np.random.seed(42)

# Helper Functions

In [2]:
def prefix_generator(size):
    """
    Helper function to generate a random phone prefix from a set of digits
    """
    return ''.join(random.choices(string.digits, k=size))


# Testing
display(prefix_generator(1))
display(prefix_generator(3))
display(prefix_generator(5))
display(prefix_generator(7))

'5'

'296'

'51868'

'6741102'

In [3]:
def price_list_generator(max_size, max_prefix_size, min_price=0.1, max_price=3.0):
    """
    Helper function to generate a price list with :
        - random size : the list size is randomly selected between 1 and `max_size`
        - random prefixes : each prefix has a length randomly selected between 1 and `max_prefix_size`
        - random prices : to each prefix is matched a price from a uniform random distribution [min_price, max_price]
    """
    return sorted([
        [
            prefix_generator(random.randint(1, max_prefix_size)),
            np.round(random.uniform(min_price, max_price), 2)
        ]
        for _ in range(random.randint(1, max_size))
    ])


# Testing
display(price_list_generator(max_size=3, max_prefix_size=5))
display(price_list_generator(max_size=5, max_prefix_size=5))
display(price_list_generator(max_size=7, max_prefix_size=5))

[['37', 1.17], ['45', 1.18], ['5', 1.62]]

[['19400', 0.62], ['4599', 1.29]]

[['0', 1.21],
 ['1', 0.61],
 ['2', 1.7],
 ['42', 1.09],
 ['639', 1.6],
 ['74272', 1.27],
 ['8818', 2.45]]

In [4]:
def create_prices_dataset(num_operators, max_size, max_prefix_size=5):
    """
    Helper function for creating a dataset holding the operators, the prefixes for
    each operator, and the price for each prefix
    """
    
    # Since operator names are unknown, we create a list of N letters ['A', 'B', ...]
    operators = list(string.ascii_uppercase)[:num_operators]
    
    # Create a DataFrame holding the operator name, the prefixes for that operator, and the price for each prefix
    df_list = []
    
    for op in operators:
        df = pd.DataFrame(price_list_generator(max_size, max_prefix_size), columns=["Prefix", "Price"])
        df["Operator"] = op
        
        df_list.append(df)
        
    # Return a concatenated dataframe
    return pd.concat(df_list)

# Extract Best Prefix Match for each Operator

### Helper Functions

In [5]:
def get_operator_best_match(number, data):
    """
    Helper function to extract the best prefix match
    
    @arg number (string): the number to call
    @arg data (pd.DataFrame): the prices dataset
    
    @return : a pd.DataFrame holding the data (Operator / Prefix that matches best / Price)
    """
    # Get all prefixes that match the number to call
    all_prefixes = data[data.apply(lambda x: number.startswith(x["Prefix"]), axis=1)]
    
    # Find the longest prefix for each operator
    result = {"Operator": [], "Prefix": [], "Price": []}
    
    for op, group in all_prefixes.groupby("Operator"):
        result["Operator"].append(op)
        result["Prefix"].append(group.iloc[-1]["Prefix"])
        result["Price"].append(group.iloc[-1]["Price"])
        
    return pd.DataFrame.from_dict(result)

In [6]:
def cheapest_operator(price_per_operator):
    """
    Helper function to find the cheapest operator for a given number
    
    @arg price_per_operator (pd.DataFrame): a dictionary holding the data (Operator/Prefix that matches best/Price)
    
    @return: the operator name
    """
    
    return price_per_operator.iloc[price_per_operator["Price"].idxmin()]["Operator"]

#### Unit Test

In [7]:
test_df = pd.DataFrame.from_dict({
    "Prefix"   : ["1", "12", "123", "12", "123456", "1", "99"],
    "Price"    : [0.4, 2.9, 1.4, 2.3, 1.6, 1.3, 2.1],
    "Operator" : ["A", "A", "A", "B", "B", "C", "C"]
})

display(test_df)

num = "123456789"
match_df = get_operator_best_match(num, test_df)

display(HTML("<h3>We can see that the function behaves correctly by returning the expected " \
             "values, which are '123' for 'A' and '123456' for 'B' and '1' for 'C'"))
display(match_df)

display(HTML("<h3>Furthermore, the assertion that the cheapest operator is 'C' is true."))
display(HTML("<h6>cheapest_operator(match_df) == 'C'"))

assert(cheapest_operator(match_df) == "C")

Unnamed: 0,Prefix,Price,Operator
0,1,0.4,A
1,12,2.9,A
2,123,1.4,A
3,12,2.3,B
4,123456,1.6,B
5,1,1.3,C
6,99,2.1,C


Unnamed: 0,Operator,Prefix,Price
0,A,123,1.4
1,B,123456,1.6
2,C,1,1.3


# Testing on a Large DataSet

#### Generating random numbers to call

In [31]:
CALL_SIZE = 10     # How many numbers to call
NUM_SIZE  = 10     # The size of each number (e.g. we'd want to call 0123456789, which has a size of 10 digits)

# The function `prefix_generator` comes in very handy to generate random numbers to call
call_list = [prefix_generator(NUM_SIZE) for _ in range(CALL_SIZE)]

print("We'll be calling these numbers :")
display(call_list[:5])

We'll be calling these numbers :


['0041905381', '3320405034', '9017095287', '3826583375', '6668463071']

### Generate a random dataset

In [35]:
MAX_PREFIX_SIZE = 3
NUM_OPERATORS   = 5     # Number of operators
MAX_SIZE        = 100   # Max list size for each operator

price_df = create_prices_dataset(num_operators=NUM_OPERATORS, max_size=MAX_SIZE, max_prefix_size=MAX_PREFIX_SIZE)

# Remove duplicated prefixes
price_df = price_df.drop_duplicates("Prefix")

###### Display a sub-dataset for each operator

In [33]:
for op, prices in price_df.groupby(["Operator"]):
    display(prices.sample(frac=1).head(5).sort_values(by="Prefix"))

Unnamed: 0,Prefix,Price,Operator
0,0,0.29,A
5,2,2.73,A
24,25,0.37,A
37,35,2.1,A
84,83,2.04,A


Unnamed: 0,Prefix,Price,Operator
1,80,1.78,B
7,179,1.84,B
8,26,1.46,B
12,37,2.91,B
21,540,2.26,B


Unnamed: 0,Prefix,Price,Operator
3,229,0.79,C
4,27,0.2,C
15,49,0.33,C
19,71,1.85,C
27,93,2.52,C


Unnamed: 0,Prefix,Price,Operator
2,19,1.62,D
3,831,0.22,D


Unnamed: 0,Prefix,Price,Operator
16,17,1.79,E
49,63,2.86,E
53,686,0.97,E
60,75,2.04,E
73,902,1.73,E


### Adding save / load capabilities

##### Save to CSV format

In [37]:
price_df.to_csv("price-list.csv", header=True, index=False)

##### Load from CSV

In [12]:
price_df = pd.read_csv("price-list.csv", dtype={"Operator": str, "Prefix": str, "Price": np.float})

In [36]:
for number in call_list:
    display(HTML("<h3># The number to call : {}".format(number)))
    display(HTML("<h4>The best matches by operator :"))
    
    match_df = get_operator_best_match(number, price_df)
    display(match_df)
    
    display(HTML("<h3>The cheapest operator is : {}".format(cheapest_operator(match_df))))

Unnamed: 0,Operator,Prefix,Price
0,A,0,2.24
1,B,0,1.7


Unnamed: 0,Operator,Prefix,Price
0,B,3,1.54
1,E,33,1.07


Unnamed: 0,Operator,Prefix,Price
0,B,9,2.05
1,E,90,0.72


Unnamed: 0,Operator,Prefix,Price
0,B,3,1.54


Unnamed: 0,Operator,Prefix,Price
0,A,6,1.18
1,E,66,2.39


Unnamed: 0,Operator,Prefix,Price
0,B,8,0.35


Unnamed: 0,Operator,Prefix,Price
0,B,1,1.47
1,C,10,2.34


Unnamed: 0,Operator,Prefix,Price
0,B,996,1.65


Unnamed: 0,Operator,Prefix,Price
0,B,1,1.47


Unnamed: 0,Operator,Prefix,Price
0,B,0,1.7
1,C,4,0.79
