In [1]:
import math, random, uuid
from math import log as ln
import pandas as pd

#### Project outcomes we wish to model (dependent variables)

In [2]:
# Dependent variables: each outcome column mapped to the kind of quantity it holds.
outcomes_schema = dict(
    project_id='unique_id',
    cost_overrun_absolute='continuous',
    time_overrun_absolute='continuous',
    benefit_deficits='continuous',
)

#### Project characteristics that may influence outcomes (independent variables)

In [3]:
# Independent variables. Every characteristic carries a 'kind' (which decides
# how two values of it are compared) and a 'weight' (its share of the overall
# closeness score). All weights are currently equal.
characteristics_schema = {
    name: {'kind': kind, 'weight': 1.0}
    for name, kind in (
        ('original_budget', 'continuous zero to infinity'),
        ('original_time', 'continuous zero to infinity'),
        ('country', 'discrete'),
        ('#risks_in_register', 'continuous zero to infinity'),
        ('total_risks_amounts', 'continuous zero to infinity'),
        ('#opportunities_in_register', 'continuous zero to infinity'),
        ('total_opportunities_amounts', 'continuous zero to infinity'),
        ('main_contractor', 'discrete'),
        ('customer', 'discrete'),
        ('pct_electrical', 'continuous zero to one'),
        ('pct_mechanical', 'continuous zero to one'),
        # etc.
    )
}

We need to transform the characteristics to a quantified form.

We take the log of continuous zero to infinity variables (positive real numbers) to map them onto real numbers.

Discrete variables 'country', 'main_contractor' and 'customer' could become one-hot encoded vectors, a.k.a. indicator variables. It might be a nice idea to perform some more sophisticated dimensionality reduction on those in the future, in order to allow better 'closeness' measures. At the moment, one-hot encoding does not allow anything better than a binary measure of closeness, i.e. 'the same' vs. 'not the same'.

#### Define distance measure
Once every variable lives on the whole real line (negative infinity to positive infinity), the closeness of two projects with respect to that variable is the exponential of the negative absolute difference between the two values, $e^{-|x - y|}$. This behaves like a normalised dot product: it lies in the range (0, 1], equals 1 when the two values are identical, and tends to 0 as they move apart.

In [4]:
def characteristic_nearness(x, y, kind):
    """
    Return a similarity score for two values of a single characteristic.

    One means the two values are the same; the score falls towards zero as
    the values move further apart.

    Parameters
    ----------
    x, y
        The two values to compare. Numbers for the continuous kinds; any
        objects comparable with ``==`` for 'discrete'.
    kind : str
        How the values are compared:

        * 'continuous zero to infinity' -- difference of logs, i.e. the ratio
          of the smaller value to the larger one. A zero is treated as the
          limit of that ratio: 1.0 against another zero, 0.0 against any
          positive value.
        * 'continuous zero to one' -- difference of ``math.tan`` of the values.
        * 'discrete' -- 1 if the values are equal, otherwise 0.
        * 'minus to plus infinity' -- plain absolute difference.

    Raises
    ------
    ValueError
        If a negative number is given for 'continuous zero to infinity'.
    NotImplementedError
        If ``kind`` is not one of the kinds listed above.

    >>> characteristic_nearness(100, 102, "continuous zero to infinity")
    0.980392156862746
    """
    if kind == 'continuous zero to infinity':
        if x < 0 or y < 0:
            raise ValueError(
                "'continuous zero to infinity' values must be non-negative, "
                "got {!r} and {!r}".format(x, y)
            )
        if x == 0 or y == 0:
            # log(0) is undefined, so use the limit of min/max instead:
            # two zeros are identical, zero vs. positive is infinitely far.
            return 1.0 if x == y else 0.0
        return math.exp(-abs(math.log(x) - math.log(y)))
    elif kind == 'continuous zero to one':
        # NOTE(review): tan is finite on [0, 1] (tan(1) is about 1.557), so
        # this does not stretch the unit interval over the whole real line;
        # a logit-style mapping may have been intended -- confirm.
        return math.exp(-abs(math.tan(x) - math.tan(y)))
    elif kind == 'discrete':
        return 1 if (x == y) else 0
    elif kind == 'minus to plus infinity':
        return math.exp(-abs(x - y))
    else:
        raise NotImplementedError(
            "unsupported characteristic kind: {!r}".format(kind)
        )

# Worked example: the same characteristic measured on two projects, 2% apart.
x_1, x_2 = 100, 102    # project 1, project 2
print(characteristic_nearness(x_1, x_2, 'continuous zero to infinity'))

0.980392156862746


Then we take a weighted average of the closeness measure over all the variables (the weighted sum divided by the total weight), where the weights are given exogenously (for now, but these could be inferred / optimised / learned from other data).

In [5]:
def project_closeness(project_1, project_2, characteristics_schema):
    """
    Return the weighted-average nearness of two projects over a schema.

    characteristics_schema as implemented just duplicates the project
    characteristics; however, having it as an argument allows comparing
    projects on a subset of the available characteristics.

    Parameters
    ----------
    project_1, project_2 : dict
        Map each characteristic name to a dict that may hold a "value" entry.
    characteristics_schema : dict
        Maps each characteristic name to a dict with "kind" and "weight".

    Returns
    -------
    float
        Sum of ``weight * nearness`` divided by the sum of all weights in the
        schema. A characteristic whose "value" is missing from either project
        adds nothing to the numerator but its weight still counts in the
        denominator, so missing data is scored as "not close". Returns 0.0
        when the schema is empty or its weights sum to zero.

    Raises
    ------
    KeyError
        If a characteristic named in the schema is absent from a project.
    """
    closeness = 0
    tot_weight = 0
    for ch, values in characteristics_schema.items():
        entry_1 = project_1[ch]
        entry_2 = project_2[ch]
        # Require the value on both sides so the measure is symmetric and a
        # gap in project_2 does not raise KeyError.
        if "value" in entry_1 and "value" in entry_2:
            closeness += values["weight"] * characteristic_nearness(
                entry_1["value"], entry_2["value"], values["kind"]
            )
        tot_weight += values["weight"]
    if tot_weight == 0:
        # Nothing to compare on: avoid ZeroDivisionError.
        return 0.0
    return closeness / tot_weight

#### Create mock historical projects and a candidate 'live' project

In [6]:
# Use a function to generate them
def mock_up_project():
    """
    Build one randomly generated project record.

    The record holds every characteristic as a dict with 'kind', 'weight'
    and 'value', followed by a 'project_id' and the three outcome columns.
    A coin flip decides whether the project counts as successful; that flag
    shifts the risk/opportunity counts and the range of the overruns.

    TODO: improve this to have some patterns rather than purely random
    """
    fudge = 1 if random.choice([True, False]) else 0
    spread = ln(10)

    def feature(kind, value):
        # Common wrapper shared by every characteristic.
        return {'kind': kind, 'weight': 1.0, 'value': value}

    def positive(median=1):
        # Log-normal draw centred (in log space) on ln(median).
        return feature('continuous zero to infinity',
                       random.lognormvariate(ln(median), spread))

    def share():
        return feature('continuous zero to one', random.uniform(0, 1))

    # The order of the draws below is kept deliberately: it fixes what a
    # seeded random generator produces.
    budget = positive()
    duration = positive()
    record = {
        'original_budget': budget,
        'original_time': duration,
        'country': feature('discrete', 'GB'),
        '#risks_in_register': positive(1 + fudge),
        'total_risks_amounts': positive(),
        '#opportunities_in_register': positive(1 + fudge),
        'total_opportunities_amounts': positive(),
        'main_contractor': feature('discrete', random.choice(['XYZ', 'ZYX', 'YZX'])),
        'customer': feature('discrete', random.choice(['ABC', 'CBA', 'BCA'])),
        'pct_electrical': share(),
        'pct_mechanical': share(),
        # etc.
        'project_id': uuid.uuid4(),
    }

    # Outcomes: overruns are a random multiple of the original figure.
    record['cost_overrun_absolute'] = budget['value'] * random.uniform(-0.1*fudge, 2-fudge)
    record['time_overrun_absolute'] = duration['value'] * random.uniform(-0.1*fudge, 2-fudge)
    record['benefit_deficits'] = -math.exp(random.uniform(0, 1))
    return record

In [7]:
# 100 mock historical projects, then one 'live' candidate to compare against them.
projects = [mock_up_project() for _ in range(100)]
live_project = mock_up_project()

#### Function to rank list of projects according to closeness with candidate project

In [8]:
def rank_close_projects(candidate, historic_set, how_many=None, values_only=False):
    """
    Rank historic projects by their closeness to a candidate project.

    Parameters
    ----------
    candidate : dict
        The project to compare against.
    historic_set : iterable of dict
        The projects to rank. They are not modified.
    how_many : int or None
        If given, only the first ``how_many`` projects of ``historic_set``
        are scored; the cut is made before ranking, not after.
    values_only : bool
        If True, each returned row maps a characteristic name straight to
        its "value" (None when absent) instead of to the full descriptor
        dict; non-dict entries are passed through unchanged.

    Returns
    -------
    list of dict
        One new dict per scored project with an added "closeness" key,
        sorted from closest to least close.

    Closeness is computed with ``project_closeness`` over the module-level
    ``characteristics_schema``.
    """
    ranked = []
    # Score the projects that were passed in, not the notebook-level globals.
    for proj in list(historic_set)[:how_many]:
        if values_only:
            row = {
                k: (v.get("value", None) if isinstance(v, dict) else v)
                for k, v in proj.items()
            }
        else:
            # Shallow copy so the "closeness" key is not written back into
            # the caller's project.
            row = dict(proj)
        row["closeness"] = project_closeness(candidate, proj, characteristics_schema)
        ranked.append(row)
    ranked.sort(key=lambda r: r["closeness"], reverse=True)
    return ranked

#### Create relevance set of historical projects
- All projects are included
- More relevant projects have greater weight in sample

In [9]:
# Score every historic project against the live one, flattened to plain values.
ranked = rank_close_projects(
    candidate=live_project,
    historic_set=projects,
    values_only=True,
)
# To inspect the scores:
# for p in ranked:
#     print(p["closeness"])

In [12]:
# One row per ranked project, closest first; show the three closest.
df = pd.DataFrame(data=ranked)
df.head(n=3)

Unnamed: 0,#opportunities_in_register,#risks_in_register,benefit_deficits,closeness,cost_overrun_absolute,country,customer,main_contractor,original_budget,original_time,pct_electrical,pct_mechanical,project_id,time_overrun_absolute,total_opportunities_amounts,total_risks_amounts
0,0.464283,8.139406,-1.880157,0.599979,0.336961,GB,BCA,XYZ,0.418868,6.574401,0.253196,0.29904,56191f6e-4448-4ddf-bf83-48dd35528322,12.887219,0.283103,2.586267
1,3.067311,10.425362,-2.478276,0.582955,1.00786,GB,BCA,XYZ,1.143041,7.802482,0.151233,0.428948,482962e9-c516-47ac-b46d-2e3c9f834764,-0.234664,42.09535,0.077519
2,3.749437,20.526532,-1.177634,0.580161,40.603175,GB,BCA,XYZ,41.89843,4.004891,0.15419,0.232062,14ab20e9-f31c-4b65-ac5f-782d20609f8c,2.364141,2.677962,0.003981


In [11]:
# Save the ranking; the row index is written as the first, unnamed column.
df.to_csv("ranked.csv", index=True)