In [51]:
# import libraries
import pandas as pd 
from itertools import product
import numpy as np

A= ['carrier', 'origin', 'destination'] 
M= ['arrivaldelay', 'departurdelay', 'weatherdelay']
F= ['SUM', 'AVG', 'MIN', 'MAX', 'STD']

df = pd.DataFrame(list(product(A,M,F)), columns=['Attributes', 'Meassure', 'Function'])
df['Utility'] = np.random.uniform(0, 0.5, df.shape[0])
df.head()

Unnamed: 0,Attributes,Meassure,Function,Utility
0,carrier,arrivaldelay,SUM,0.387516
1,carrier,arrivaldelay,AVG,0.483967
2,carrier,arrivaldelay,MIN,0.34517
3,carrier,arrivaldelay,MAX,0.353826
4,carrier,arrivaldelay,STD,0.141805


In [85]:
# Diversity functions
def check_attribute(attr):
    time_att = ['year','quarter','month','week','day']
    loc_att = ['country','state','region','city']
    product_att = ['category','subcategory','product_name']
    
    if attr in time_att:
        return 1
    elif attr in loc_att:
        return 2
    elif attr in product_att:
        return 3
    else:
        return 0

def time_hierarchy(at1, at2):
    time_att = ['year','quarter','month','week','day']
    val = [1,2,3,4,5]
    dict_time = dict(zip(time_att,val))
    d = abs(dict_time[at1] - dict_time[at2])
    return float(d)/len(time_att)
    
def loc_hierarchy(at1, at2):
    loc_att = ['country','state','region','city']
    val = [1,2,3,4]
    dict_loc = dict(zip(loc_att,val))
    d = abs(dict_loc[at1] - dict_loc[at2])
    return float(d)/len(loc_att)

def product_hierarchy(at1, at2):
    product_att = ['category','subcategory','product_name']
    val = [1,2,3]
    dict_prod = dict(zip(product_att,val))
    d = abs(dict_prod[at1] - dict_prod[at2])
    return float(d)/len(product_att)

def diversity(a,b):
    la = list(a)
    lb = list(b)
    v1 = get_value_d(la[0],lb[0])
    v2 = get_value_d(la[1],lb[1])
    v3 = get_value_d(la[2],lb[2])
    res = float(((v1*3)+(v2*2)+(v3*1)))/3
    return res
    
def get_value_d(attr1, attr2):
    d = None 
    if check_attribute(attr1) != 0:
        if check_attribute(attr1) == check_attribute(attr2):
            if check_attribute(attr1) == 1:
                d = time_hierarchy(attr1, attr2)
            elif check_attribute(attr1) == 2:
                d = loc_hierarchy(attr1, attr2)
            elif check_attribute(attr1) == 3:
                d = product_hierarchy(attr1, attr2)
            else:
                pass
        else:
            if (attr1) == (attr2):
                d = 0.0
            else:
                d = 1.0
    else:
        if (attr1) == (attr2):
                d = 0.0
        else:
            d = 1.0
    return d

def most_distant_two_views(data):
    max_distance = 0
    result = []
    for i in range(0,len(data)):
        for j in range(0,len(data)):
            d = diversity(data[i],data[j])
            if d > max_distance:
                max_distance = d
                result = [data[i],data[j]]
    return result

def div_compute(item, S_list):
    S_new = S_list.copy()
    S_new.append(item)
    k = len(S_new)
    new_series = pd.Series(S_new)
    series_set = new_series.apply(lambda row: set(row))
    new_df = series_set.apply(lambda a: series_set.apply(lambda b: diversity(a,b)))
    new_df['tot'] = new_df.sum(axis=1)
    div = (sum(new_df['tot'])/2)/(k*(k-1))
    return div

def max_compute(div, tradeoff=0.9):
    util = 1.4
    objf = ((1-tradeoff)*util)+(tradeoff*div)
    return objf

def min_compute(div, tradeoff=0.9):
    util = 0.0
    objf = ((1-tradeoff)*util)+(tradeoff*div)
    return objf
    

# Synthetic Dataset

In [86]:
# Synthetic dataset
df.head()

Unnamed: 0,Attributes,Meassure,Function,Utility
0,order_date,profit,avg,0.547994
1,postal_code,profit,avg,0.542089
2,order_date,profit,sum,0.536141
3,ship_date,profit,sum,0.529703
4,customer_name,profit,sum,0.503661


In [87]:
df_greedy = df.drop(['Utility'],axis=1)
dataset = df_greedy.reset_index(drop=True)
data = dataset.values.tolist()
X = data.copy()
S = most_distant_two_views(data)
X.remove(S[0])
X.remove(S[1])
i = len(S)

In [88]:
X[0] #Example of X

['postal_code', 'profit', 'avg']

In [89]:
S # two most distant views

[['order_date', 'profit', 'avg'], ['ship_mode', 'sales', 'sum']]

### First iteration to get the third point

In [90]:
#Creating dataframe of X 
x_df = pd.DataFrame(X, columns=['Attributes', 'Meassure', 'Function']) 

In [91]:
# Calculate diversity value of each X to current set S
x_df['div'] = x_df[['Attributes', 'Meassure', 'Function']].apply(lambda row: div_compute(row,S), axis=1)

In [92]:
# Calculate max and min values of each X
x_df['max'] = x_df['div'].apply(lambda row: max_compute(row))
x_df['min'] = x_df['div'].apply(lambda row: min_compute(row))

In [93]:
x_df.head(8)

Unnamed: 0,Attributes,Meassure,Function,div,max,min
0,postal_code,profit,avg,1.0,1.04,0.9
1,order_date,profit,sum,0.777778,0.84,0.7
2,ship_date,profit,sum,0.944444,0.99,0.85
3,customer_name,profit,sum,0.944444,0.99,0.85
4,ship_date,profit,avg,1.0,1.04,0.9
5,customer_name,profit,avg,0.944444,0.99,0.85
6,city,profit,avg,1.0,1.04,0.9
7,product_name,profit,avg,0.888889,0.94,0.8


In [94]:
# is there any diversity value which has minimum value of MAX less than maximum value of MIN? 

In [95]:
# Minimum value of Max
min_max = min(x_df['max'])

In [96]:
# Maximum value of Min
max_min = max(x_df['min'])

In [97]:
new_df = x_df[x_df['max'] < max(x_df['min'])]
len(new_df)

26

# Flights Dataset

In [98]:
xl = pd.ExcelFile("flights_dataset.xlsx")
df = xl.parse("Sheet1", header=0)

In [99]:
df_greedy = df.drop(['Utility'],axis=1)
dataset = df_greedy.reset_index(drop=True)
data = dataset.values.tolist()
X = data.copy()
S = most_distant_two_views(data)
X.remove(S[0])
X.remove(S[1])
i = len(S)

In [100]:
X[0] #Example of X

['carrier', 'departuredelay', 'avg']

In [101]:
S # two most distant views

[['carrier', 'weatherdelay', 'max'], ['origin', 'departuredelay', 'avg']]

In [102]:
#Creating dataframe of X 
x_df = pd.DataFrame(X, columns=['Attributes', 'Meassure', 'Function']) 
# Calculate diversity value of each X to current set S
x_df['div'] = x_df[['Attributes', 'Meassure', 'Function']].apply(lambda row: div_compute(row,S), axis=1)
# Calculate max and min values of each X
x_df['max'] = x_df['div'].apply(lambda row: max_compute(row))
x_df['min'] = x_df['div'].apply(lambda row: min_compute(row))
x_df.head(8)

Unnamed: 0,Attributes,Meassure,Function,div,max,min
0,carrier,departuredelay,avg,0.833333,0.89,0.75
1,carrier,weatherdelay,avg,0.666667,0.74,0.6
2,carrier,distance,avg,0.833333,0.89,0.75
3,carrier,departuredelay,max,1.0,1.04,0.9
4,carrier,arrivaldelay,max,1.0,1.04,0.9
5,carrier,arrivaldelay,avg,0.833333,0.89,0.75
6,carrier,distance,max,0.833333,0.89,0.75
7,carrier,distance,sum,0.888889,0.94,0.8


In [103]:
# is there any rows that minimum value of MAX less than maximum value of MIN? 
len(x_df)

82

In [104]:
# Minimum value of Max
min(x_df['max'])

0.73999999999999999

In [105]:
# Maximum value of Min
max(x_df['min'])

0.90000000000000002

In [106]:
new_df = x_df[x_df['max'] < max(x_df['min'])]
len(new_df)

32

In [107]:
# Minimum and Maximum of diversity value 
min(x_df['div']), max(x_df['div'])

(0.66666666666666663, 1.0)

# Superstore Dataset

In [108]:
xl = pd.ExcelFile("superstore_dataset.xlsx")
df = xl.parse("Sheet1", header=0)
df_greedy = df.drop(['Utility'],axis=1)
dataset = df_greedy.reset_index(drop=True)
data = dataset.values.tolist()
X = data.copy()
S = most_distant_two_views(data)
X.remove(S[0])
X.remove(S[1])
i = len(S)

In [109]:
X[0] #Example of X

['postal_code', 'profit', 'avg']

In [110]:
S # two most distant views

[['order_date', 'profit', 'avg'], ['ship_mode', 'sales', 'sum']]

In [111]:
#Creating dataframe of X 
x_df = pd.DataFrame(X, columns=['Attributes', 'Meassure', 'Function']) 
# Calculate diversity value of each X to current set S
x_df['div'] = x_df[['Attributes', 'Meassure', 'Function']].apply(lambda row: div_compute(row,S), axis=1)
# Calculate max and min values of each X
x_df['max'] = x_df['div'].apply(lambda row: max_compute(row))
x_df['min'] = x_df['div'].apply(lambda row: min_compute(row))
x_df.head(8)

Unnamed: 0,Attributes,Meassure,Function,div,max,min
0,postal_code,profit,avg,1.0,1.04,0.9
1,order_date,profit,sum,0.777778,0.84,0.7
2,ship_date,profit,sum,0.944444,0.99,0.85
3,customer_name,profit,sum,0.944444,0.99,0.85
4,ship_date,profit,avg,1.0,1.04,0.9
5,customer_name,profit,avg,0.944444,0.99,0.85
6,city,profit,avg,1.0,1.04,0.9
7,product_name,profit,avg,0.888889,0.94,0.8


In [112]:
# is there any rows that minimum value of MAX less than maximum value of MIN? 

In [113]:
# Minimum value of Max
min(x_df['max'])

0.73999999999999999

In [114]:
len(x_df)

154

In [115]:
# Maximum value of Min
max(x_df['min'])

0.90000000000000002

In [116]:
new_df = x_df[x_df['max'] < max(x_df['min'])]
len(new_df)

26

In [117]:
# Minimum and Maximum of diversity value 
min(x_df['div']), max(x_df['div'])

(0.66666666666666663, 1.0)

# Heart Dataset

In [118]:
xl = pd.ExcelFile("heart_dataset.xlsx")
df = xl.parse("Sheet1", header=0)
df_greedy = df.drop(['Utility'],axis=1)
dataset = df_greedy.reset_index(drop=True)
data = dataset.values.tolist()
X = data.copy()
S = most_distant_two_views(data)
X.remove(S[0])
X.remove(S[1])
i = len(S)

In [119]:
X[0] #Example of X

['thal', 'oldpeak', 'sum']

In [120]:
S # two most distant views

[['cp', 'oldpeak', 'sum'], ['restecg', 'chol', 'max']]

In [121]:
#Creating dataframe of X 
x_df = pd.DataFrame(X, columns=['Attributes', 'Meassure', 'Function']) 
# Calculate diversity value of each X to current set S
x_df['div'] = x_df[['Attributes', 'Meassure', 'Function']].apply(lambda row: div_compute(row,S), axis=1)
# Calculate max and min values of each X
x_df['max'] = x_df['div'].apply(lambda row: max_compute(row))
x_df['min'] = x_df['div'].apply(lambda row: min_compute(row))
x_df.head(8)

Unnamed: 0,Attributes,Meassure,Function,div,max,min
0,thal,oldpeak,sum,0.888889,0.94,0.8
1,thal,restbp,sum,1.0,1.04,0.9
2,thal,thalach,sum,1.0,1.04,0.9
3,thal,chol,sum,1.0,1.04,0.9
4,cp,chol,sum,0.833333,0.89,0.75
5,cp,restbp,sum,0.833333,0.89,0.75
6,thal,age,sum,1.0,1.04,0.9
7,cp,thalach,sum,0.833333,0.89,0.75


In [122]:
# is there any rows that minimum value of MAX less than maximum value of MIN? 
len(x_df)

103

In [123]:
# Minimum value of Max
min(x_df['max'])

0.73999999999999999

In [124]:
# Maximum value of Min
max(x_df['min'])

0.90000000000000002

In [125]:
new_df = x_df[x_df['max'] < max(x_df['min'])]
len(new_df)

22

In [126]:
# Minimum and Maximum of diversity value 
min(x_df['div']), max(x_df['div'])

(0.66666666666666663, 1.0)

In [127]:
len(X)

103