In [1]:
import pandas as pd
import time
import seaborn as sns
import json
import networkx as nx
from collections import Counter
import numpy as np
import seaborn as sns

### Get the relationship between two subspaces

In [2]:
def get_relation_type(s1, s2):
    """Classify how two subspaces relate to each other.

    Each subspace is either a '_'-joined string or an already split
    sequence; both must have the same number of positions. A '*' entry is a
    wildcard.

    Returns a ``(relation, position)`` tuple:
      * ``('>', k)``        - equal everywhere except position k, where s1
                              is '*' and s2 is not (s1 is the parent).
      * ``('<', k)``        - equal everywhere except position k, where s2
                              is '*' and s1 is not (s2 is the parent).
      * ``('same', None)``  - every position is equal.
      * ``('temporal', 0)`` - equal everywhere except position 0, which
                              holds two different non-wildcard values.
      * ``('brothers', k)`` - like 'temporal', but the differing position k
                              is not 0.
      * ``(None, None)``    - any other combination.
    """
    EQUAL, LEFT_WILD, RIGHT_WILD, DIFFERENT, UNKNOWN = 2, 3, 0, 1, -1

    def _compare(a, b):
        # Per-position comparison code.
        if a == b:
            return EQUAL
        if a == '*' and b != '*':
            return LEFT_WILD
        if a != '*' and b == '*':
            return RIGHT_WILD
        if a != b and (a != '*' and b != '*'):
            return DIFFERENT
        return UNKNOWN

    left = s1.split('_') if type(s1) is str else s1
    right = s2.split('_') if type(s2) is str else s2
    assert len(left) == len(right)
    size = len(left)

    codes = [_compare(left[k], right[k]) for k in range(size)]
    n_equal = codes.count(EQUAL)
    all_but_one_equal = n_equal == size - 1

    if codes.count(LEFT_WILD) == 1 and all_but_one_equal:
        return '>', codes.index(LEFT_WILD)
    if codes.count(RIGHT_WILD) == 1 and all_but_one_equal:
        return '<', codes.index(RIGHT_WILD)
    if n_equal == size:
        return 'same', None
    if codes.count(DIFFERENT) == 1 and all_but_one_equal:
        kind = 'temporal' if codes[0] == DIFFERENT else 'brothers'
        return kind, codes.index(DIFFERENT)
    return None, None

def community_detection(edge_list):
    """Partition the graph described by ``edge_list`` into communities.

    An undirected networkx graph is built from the edges and passed to
    ``greedy_modularity_communities``; each resulting community is returned
    as a plain list of nodes.
    """
    from networkx import community

    graph = nx.Graph()
    graph.add_edges_from(edge_list)
    return [list(members) for members in community.greedy_modularity_communities(graph)]

def subspace_extention(sub_list):
    """List the one-step generalisations of a subspace.

    For every position that is not already the wildcard '*', a copy of
    ``sub_list`` with that position replaced by '*' is produced. The result
    is a list of ``(position, generalised_copy)`` tuples in position order;
    ``sub_list`` itself is left untouched.
    """
    def _with_wildcard(position):
        generalised = sub_list[:]
        generalised[position] = "*"
        return generalised

    return [
        (position, _with_wildcard(position))
        for position, value in enumerate(sub_list)
        if value != "*"
    ]

def get_topic_id(sub_list):
    """Return the '_'-joined id of a subspace.

    ``sub_list`` is either the list of subspace entries itself or a mapping
    that carries that list under the key 'subs_list'.
    """
    parts = sub_list if type(sub_list) is list else sub_list['subs_list']
    return '_'.join(parts)

## Load data

In [5]:
# Raw table; only the rows of the promotion object (matched on 'name') are
# kept in obj_df.
raw_df = pd.read_csv('data/NBA_raw.csv')
name = 'Kobe Bryant'
print('Promotion object: ', name)
obj_df = raw_df[raw_df['name'] == name]

# Promotion facts; presumably the mining output for this object - confirm.
promo_df = pd.read_csv('data/NBA_K.csv')

# Measure whose facts are analysed in the rest of the notebook.
print('measures: ', promo_df['measure'].unique())
measure = 'PTS'
print('Select measure: ', measure)

# Column names given, in order, to the parts of a '_'-separated subspace.
attributes = ['year', 'age', 'team_name', 'lg_name', 'pos_name']
print('Attributes: ', attributes)

Promotion object:  Kobe Bryant
measures:  ['AST' 'PTS']
Select measure:  PTS
Attributes:  ['year', 'age', 'team_name', 'lg_name', 'pos_name']


In [6]:
# Summarise the promotion object's year span, teams and positions.
print("Kobe year range: {}-{}: {}".format(obj_df['year'].min(), obj_df['year'].max(), obj_df['year'].max() - obj_df['year'].min()))
print("Kobe team: total: {}, {}".format(len(obj_df['team_name'].unique()), obj_df['team_name'].unique()))
print("Kobe pos name: total: {}, {}".format(len(obj_df['pos_name'].unique()), obj_df['pos_name'].unique()))

# Keep only the facts of the selected measure. The explicit .copy() makes
# promo_df an independent frame, so the column assignments below do not write
# into a slice of the loaded table (pandas' SettingWithCopyWarning).
promo_df = promo_df[promo_df['measure'] == measure].copy()
promo_df['id'] = promo_df.index
# Split each subspace string into its parts and expose them as one column
# per attribute.
promo_df['subs_list'] = promo_df['subspace'].apply(lambda subspace : subspace.split('_'))
promo_df.loc[:,attributes] = promo_df['subs_list'].to_list()

Kobe year range: 1997-2015: 18
Kobe team: total: 1, ['Los Angeles Lakers']
Kobe pos name: total: 1, ['SG']


### Create the topic graph

In [7]:
topic_summary = {}
topic_map = {}
# Every fact row as a plain dict, round-tripped through JSON so that the
# values are JSON-native Python types.
facts = [json.loads(promo_df.iloc[row].to_json()) for row in range(len(promo_df))]

# A fact feeds each "parent" topic obtained by replacing one of its concrete
# attributes with '*'. Per topic, the breakdown maps the replaced attribute
# position to the positions (in `facts`) of the facts that reach it that way.
for fact_pos, fact in enumerate(facts):
    for wildcard_pos, parent in subspace_extention(fact['subs_list']):
        topic_key = get_topic_id(parent)
        topic = topic_summary.setdefault(topic_key, {
            'subspace': topic_key,
            'breakdown': {}
        })
        topic['breakdown'].setdefault(wildcard_pos, []).append(fact_pos)

topics = list(topic_summary.values())

In [8]:
# Compare every unordered pair of topics once and record a link for each
# pair whose subspaces are related; src/dst are positions in `topics`.
relations = []
for i, left in enumerate(topics):
    for j in range(i + 1, len(topics)):
        right = topics[j]
        result = get_relation_type(left['subspace'], right['subspace'])
        if result[0] is not None:
            relations.append({
                'src': i,
                'dst': j,
                'id': "{}_{}".format(left['subspace'], right['subspace']),
                'type': result[0],
                'ind': result[1]
            })


In [9]:
# Save the topic graph for the visualisation cell; the file is named after
# the first word of `name`.
file_name = 'topic_{}.json'.format(name.split(' ')[0])
with open(file_name, 'w') as output_file:
    output_file.write(json.dumps({'nodes': topics, 'links': relations}))

## Visualize the topic graph

In [10]:
%%javascript

// Make D3 v7 available through the notebook's RequireJS loader.
require.config({
    paths: { 
        d3: 'https://d3js.org/d3.v7.min'
    }
});

require(['d3'], function(d3) {
    const width = 800, height = 500;
    // `element` is provided to %%javascript cells; the SVG is appended to it.
    const svg = d3.select(element.get(0)).append('svg').attr('width', width).attr('height', height);

    // d3.json returns a promise in D3 v7 and its second argument is a fetch
    // init object, so no callback is passed.
    d3.json('./topic_Kobe.json').then(data => {
        const container = svg.append('g').attr('class', 'container');

        const nodes = data['nodes'];
        // Links name their end points by position in `nodes` (src/dst); the
        // link force resolves them through each node's `index`.
        const links = data['links'].map(link => ({source: link.src, target: link.dst}));
        // Nodes have no coordinates before the simulation has initialised them.
        const loc = d => d.x != undefined ? [d.x, d.y] : [0, 0];

        const forceNode = d3.forceManyBody()
            .distanceMax(300)
            .distanceMin(2)
            .theta(0.1)
            .strength(-80);

        const forceLink = d3.forceLink(links).distance(2).id(d => d.index);

        // 'stroke-width' (not 'width') is the attribute that sets the
        // thickness of an SVG line.
        const lines = container.selectAll('line').data(links).enter().append('line')
            .attr('x1', d => d.source.x).attr('y1', d => d.source.y).attr('x2', d => d.target.x).attr('y2', d => d.target.y)
            .attr('stroke', 'grey').attr('stroke-width', 3);

        const circleContainer = container.selectAll('.node').data(nodes).enter().append('g').attr('class', 'node')
            .attr('transform', d => 'translate(' + loc(d) + ')');

        // Largest number of facts in any single breakdown entry; it is the
        // upper end of the radius scale's domain.
        const maxSize = d3.max(nodes, node => d3.max(Object.values(node.breakdown), b => b.length));
        const sizeScale = d3.scaleLinear().domain([1, maxSize]).range([2, 10]);

        circleContainer.each(function(d) {
            const group = d3.select(this);
            // Red circle: the topic itself, with its subspace as tooltip.
            const circle = group.append('circle').attr('r', 8).attr('stroke', 'red').attr('fill', 'white');
            circle.append('title').text(d.subspace);
            // One blue ring per breakdown entry, sized by its number of facts.
            const objs = Object.keys(d.breakdown).map(key => ({breakdown: key, size: d.breakdown[key].length}));
            group.selectAll('.breakdown').data(objs).enter().append('circle').attr('class', 'breakdown')
                .attr('stroke', 'blue').attr('r', obj => sizeScale(obj.size)).attr('fill', 'none');
        });

        // Move nodes and link end points to the positions computed by the simulation.
        const ticked = function() {
            circleContainer.attr('transform', d => 'translate(' + loc(d) + ')');
            lines.attr('x1', d => d.source.x).attr('y1', d => d.source.y).attr('x2', d => d.target.x).attr('y2', d => d.target.y);
        };

        d3.forceSimulation(nodes)
            .force("link", forceLink)
            .force("charge", forceNode)
            .velocityDecay(0.4)
            .force("center", d3.forceCenter(width / 2, height / 2))
            .on("tick", ticked);
    }).catch(error => {
        // Surface load/parse failures instead of leaving an unhandled rejection.
        console.error('Could not load ./topic_Kobe.json', error);
    });
});

<IPython.core.display.Javascript object>

### Create fact graph

In [11]:
# Compare every unordered pair of facts once and record a link for each pair
# whose subspaces are related; src/dst are positions in `facts`.
fact_relations = []
for i, left in enumerate(facts):
    for j in range(i + 1, len(facts)):
        right = facts[j]
        relation = get_relation_type(left['subspace'], right['subspace'])
        if relation[0] is None:
            continue

        # Use this pair's own relation. The original read `result`, a
        # variable left over from the topic-relation cell, so every link
        # carried the same stale type/index.
        fact_relations.append({
            'src': i,
            'dst': j,
            'id': "{}_{}".format(left['subspace'], right['subspace']),
            'type': relation[0],
            'ind': relation[1]
        })


In [12]:
# Save the fact graph for the visualisation cell; the file is named after
# the first word of `name`.
file_name = 'fact_{}.json'.format(name.split(' ')[0])
with open(file_name, 'w') as output_file:
    output_file.write(json.dumps({'nodes': facts, 'links': fact_relations}))

In [13]:
%%javascript

// Make D3 v7 available through the notebook's RequireJS loader.
require.config({
    paths: { 
        d3: 'https://d3js.org/d3.v7.min'
    }
});

require(['d3'], function(d3) {
    const width = 400, height = 400;
    // `element` is provided to %%javascript cells; the SVG is appended to it.
    const svg = d3.select(element.get(0)).append('svg').attr('width', width).attr('height', height);

    // d3.json returns a promise in D3 v7 and its second argument is a fetch
    // init object, so no callback is passed.
    d3.json('./fact_Kobe.json').then(data => {
        const nodes = data['nodes'];
        // Links name their end points by position in `nodes` (src/dst); the
        // link force resolves them through each node's `index`.
        const links = data['links'].map(link => ({source: link.src, target: link.dst}));
        const container = svg.append('g').attr('class', 'container');

        // Nodes have no coordinates before the simulation has initialised them.
        const loc = d => d.x != undefined ? [d.x, d.y] : [0, 0];

        const forceNode = d3.forceManyBody()
            .distanceMax(300)
            .distanceMin(2)
            .theta(0.1)
            .strength(-80);

        const forceLink = d3.forceLink(links).distance(2).id(d => d.index);

        // 'stroke-width' (not 'width') is the attribute that sets the
        // thickness of an SVG line.
        const lines = container.selectAll('line').data(links).enter().append('line')
            .attr('x1', d => d.source.x).attr('y1', d => d.source.y).attr('x2', d => d.target.x).attr('y2', d => d.target.y)
            .attr('stroke', 'grey').attr('stroke-width', 3);

        const circleContainer = container.selectAll('.node').data(nodes).enter().append('g').attr('class', 'node')
            .attr('transform', d => 'translate(' + loc(d) + ')');
        // One circle per fact, with the fact's subspace as tooltip.
        const circles = circleContainer.append('circle').attr('fill', 'white').attr('stroke', 'red').attr('r', 5);
        circles.append('title').text(d => d['subspace']);

        // Move nodes and link end points to the positions computed by the simulation.
        const ticked = function() {
            circleContainer.attr('transform', d => 'translate(' + loc(d) + ')');
            lines.attr('x1', d => d.source.x).attr('y1', d => d.source.y).attr('x2', d => d.target.x).attr('y2', d => d.target.y);
        };

        d3.forceSimulation(nodes)
            .force("link", forceLink)
            .force("charge", forceNode)
            .velocityDecay(0.4)
            .force("center", d3.forceCenter(width / 2, height / 2))
            .on("tick", ticked);
    }).catch(error => {
        // Surface load/parse failures instead of leaving an unhandled rejection.
        console.error('Could not load ./fact_Kobe.json', error);
    });
});

<IPython.core.display.Javascript object>

### Score
The factors needed to be considered:
- Fact score/meta-fact score:
 - (1) Promotiveness: use the score produced by the mining step directly
 - <s>Impact (Meta-fact and fact should be evalulated with same fits-all approach);</s>
 - <s>Measure rank;</s>
 - <s>Fact rank;</s>
- Relation score:
 - (2) Monotonic increase;
 - (3) Logical connection;
   - temporal brother: 1
   - parents->children: 0.8
   - brothers: 0.3
   - others: 0
- Story logic score:
 - (4) Trade-off between topic diversity and context uniformity
 - (5) Trade-off between coverage and overlap
 - (6) Length score

## Score calculation

In [14]:
# Test story: the positions 0..9, used as positional (iloc) row indices.
test_story_cases = list(range(10))

In [15]:
# Independent copy of the raw table; not referenced again in the cells shown here.
full_df = raw_df.copy()

In [16]:
# Display the promotion facts of the selected measure, including the derived
# id, subs_list and per-attribute columns.
promo_df

Unnamed: 0,measure,subspace,rank,target rank,total score,rank score,impact score,id,subs_list,year,age,team_name,lg_name,pos_name
80,PTS,*_*_*_NBA_SG,1,1,-2.0,-1,-1.0,80,"[*, *, *, NBA, SG]",*,*,*,NBA,SG
81,PTS,*_*_*_NBA_*,2,3,-3.5,-3,-0.5,81,"[*, *, *, NBA, *]",*,*,*,NBA,*
82,PTS,2007_*_*_NBA_*,3,1,-5.0,-1,-4.0,82,"[2007, *, *, NBA, *]",2007,*,*,NBA,*
83,PTS,2008_*_*_NBA_*,4,1,-5.0,-1,-4.0,83,"[2008, *, *, NBA, *]",2008,*,*,NBA,*
84,PTS,2003_*_*_NBA_*,5,1,-5.0,-1,-4.0,84,"[2003, *, *, NBA, *]",2003,*,*,NBA,*
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,PTS,2000_*_Los Angeles Lakers_NBA_SG,76,1,-257.0,-1,-256.0,155,"[2000, *, Los Angeles Lakers, NBA, SG]",2000,*,Los Angeles Lakers,NBA,SG
156,PTS,1999_*_Los Angeles Lakers_NBA_SG,77,1,-257.0,-1,-256.0,156,"[1999, *, Los Angeles Lakers, NBA, SG]",1999,*,Los Angeles Lakers,NBA,SG
157,PTS,2002_*_Los Angeles Lakers_NBA_SG,78,1,-257.0,-1,-256.0,157,"[2002, *, Los Angeles Lakers, NBA, SG]",2002,*,Los Angeles Lakers,NBA,SG
158,PTS,2003_*_Los Angeles Lakers_NBA_SG,79,1,-257.0,-1,-256.0,158,"[2003, *, Los Angeles Lakers, NBA, SG]",2003,*,Los Angeles Lakers,NBA,SG


In [17]:
# promo_df['p2_score'] = [p2fact(promo_df.iloc[i]) for i in range(0, promo_df.shape[0])]

##### Preprocessing (requires promo_df)

In [18]:
def get_relation_type(s1, s2):
    """Classify how two subspaces relate to each other.

    NOTE(review): a function with this name and the same logic is already
    defined in an earlier cell of this notebook; consider keeping a single
    definition.

    Each subspace is either a '_'-joined string or an already split
    sequence; both must have the same number of positions. A '*' entry is a
    wildcard.

    Returns a ``(relation, position)`` tuple:
      * ``('>', k)``        - equal everywhere except position k, where s1
                              is '*' and s2 is not (s1 is the parent).
      * ``('<', k)``        - equal everywhere except position k, where s2
                              is '*' and s1 is not (s2 is the parent).
      * ``('same', None)``  - every position is equal.
      * ``('temporal', 0)`` - equal everywhere except position 0, which
                              holds two different non-wildcard values.
      * ``('brothers', k)`` - like 'temporal', but the differing position k
                              is not 0.
      * ``(None, None)``    - any other combination.
    """
    EQUAL, LEFT_WILD, RIGHT_WILD, DIFFERENT, UNKNOWN = 2, 3, 0, 1, -1

    def _compare(a, b):
        # Per-position comparison code.
        if a == b:
            return EQUAL
        if a == '*' and b != '*':
            return LEFT_WILD
        if a != '*' and b == '*':
            return RIGHT_WILD
        if a != b and (a != '*' and b != '*'):
            return DIFFERENT
        return UNKNOWN

    left = s1.split('_') if type(s1) is str else s1
    right = s2.split('_') if type(s2) is str else s2
    assert len(left) == len(right)
    size = len(left)

    codes = [_compare(left[k], right[k]) for k in range(size)]
    n_equal = codes.count(EQUAL)
    all_but_one_equal = n_equal == size - 1

    if codes.count(LEFT_WILD) == 1 and all_but_one_equal:
        return '>', codes.index(LEFT_WILD)
    if codes.count(RIGHT_WILD) == 1 and all_but_one_equal:
        return '<', codes.index(RIGHT_WILD)
    if n_equal == size:
        return 'same', None
    if codes.count(DIFFERENT) == 1 and all_but_one_equal:
        kind = 'temporal' if codes[0] == DIFFERENT else 'brothers'
        return kind, codes.index(DIFFERENT)
    return None, None

class StoryEvaluator:
    """Scores a story, i.e. an ordered selection of facts from a promotion table.

    The table must provide the columns 'total score', 'measure' and
    'subspace' (a '_'-separated string with one part per entry of
    ``ATTRIBUTES``). ``story_list`` holds positional (iloc) row indices into
    the table after it has been filtered to ``measure``.

    Relies on the module-level ``get_relation_type`` function.
    """

    # Column names given, in order, to the parts of a subspace string.
    ATTRIBUTES = ['year', 'age', 'team_name', 'lg_name', 'pos_name']

    # Score of the relation between two consecutive facts; anything else is 0.
    # NOTE(review): the notebook's "Score" section lists parents->children as
    # 0.8 and brothers as 0.3, the opposite of these weights - confirm which
    # is intended.
    RELATION_WEIGHTS = {'temporal': 1, 'brothers': 0.8, '>': 0.3}

    def __init__(self, df=None, story_list=None, measure='PTS'):
        """Optionally load a promotion table and a story right away.

        ``measure`` selects the rows kept from the table (default 'PTS',
        the value that used to be hard-coded).
        """
        self.promo_df = None
        self.story_list = []
        self.story_facts = None
        self.measure = measure
        if df is not None and story_list is not None:
            self.set_df_and_story_list(df, story_list)

    def set_df_and_story_list(self, df, story_list, measure=None):
        """Prepare the promotion table and select the story's facts.

        Works on a copy, so the caller's frame is not modified. Adds
        'relative_score', 'id', 'subs_list' and one column per attribute,
        and keeps only the rows of the evaluator's measure (overridden by
        ``measure`` when given).
        """
        if measure is not None:
            self.measure = measure

        promo = df.copy()
        # Share of rows (of the frame as passed in, before the measure
        # filter) whose 'total score' is <= the row's own score.
        # rank(method='max') is exactly that count; a missing score compares
        # as False against everything and therefore gets 0.
        promo['relative_score'] = (promo['total score'].rank(method='max') / promo.shape[0]).fillna(0.0)

        promo = promo[promo['measure'] == self.measure].copy()
        promo['id'] = promo.index
        promo['subs_list'] = promo['subspace'].apply(lambda subspace: subspace.split('_'))
        for position, attribute in enumerate(self.ATTRIBUTES):
            promo[attribute] = [parts[position] for parts in promo['subs_list']]

        self.promo_df = promo
        self.set_story_list(story_list)

    def set_story_list(self, story_list):
        """Replace the story; ``story_list`` are positional row indices."""
        self.story_list = story_list
        self.story_facts = self.promo_df.iloc[self.story_list]

    def calc_promotiveness(self):
        """(1) Mean relative score of the story's facts (0.0 for an empty story)."""
        n_facts = self.story_facts.shape[0]
        if n_facts == 0:
            return 0.0
        return self.story_facts['relative_score'].sum() / n_facts

    def calc_tension_score(self):
        """(2) Placeholder; always 0."""
        return 0

    def calc_connection_score(self):
        """(3) Mean relation weight over consecutive fact pairs.

        Returns 0 when the story has fewer than two facts.
        """
        n_pairs = len(self.story_facts) - 1
        if n_pairs < 1:
            return 0

        relation_score = 0
        for i in range(n_pairs):
            relation = get_relation_type(self.story_facts.iloc[i]['subs_list'],
                                         self.story_facts.iloc[i + 1]['subs_list'])
            relation_score += self.RELATION_WEIGHTS.get(relation[0], 0)
        return relation_score / n_pairs

    def calc_uniformity(self):
        """(4) Share of consecutive fact pairs that break the running context.

        From the second pair on, a pair counts as a break when it has no
        relation position (None) or when its position differs from that of
        the previous pair. The count is divided by the number of pairs.
        Returns 0 when the story has fewer than two facts.
        """
        n_pairs = len(self.story_facts) - 1
        if n_pairs < 1:
            return 0

        n_diff = 0
        pre_index = -1  # sentinel: no pair seen yet
        for i in range(n_pairs):
            # Compare the story's own facts (not the first rows of promo_df).
            fact1 = self.story_facts.iloc[i]
            fact2 = self.story_facts.iloc[i + 1]
            (_, current_index) = get_relation_type(fact1['subs_list'], fact2['subs_list'])
            if pre_index != -1 and (current_index is None or pre_index != current_index):
                n_diff += 1
            pre_index = current_index
        return n_diff / n_pairs

    def calc_coverage(self):
        """(5) Number of story facts divided by the number of promotion facts."""
        return self.story_facts.shape[0] / self.promo_df.shape[0]

    def calc_length_score(self):
        """(6) Placeholder; always 0."""
        return 0

    def calc_score(self):
        """Return all six component scores in a dict keyed by score name."""
        return {
            'promotiveness': self.calc_promotiveness(),
            'tension': self.calc_tension_score(),
            'connection': self.calc_connection_score(),
            'uniformity': self.calc_uniformity(),
            'coverage': self.calc_coverage(),
            'length': self.calc_length_score()
        }

In [24]:
# Evaluate the test story against the promotion facts.
story_evaluator = StoryEvaluator(df=promo_df, story_list=test_story_cases)

In [25]:
# All six component scores of the test story.
story_evaluator.calc_score()

{'promotiveness': 0.9575000000000001,
 'tension': 0,
 'connection': 0.5888888888888889,
 'uniformity': 0.4444444444444444,
 'coverage': 0.125,
 'length': 0}

In [26]:
# Tension score (2) on its own; the method currently always returns 0.
story_evaluator.calc_tension_score()

0

In [27]:
# Connection score (3) on its own.
story_evaluator.calc_connection_score()

0.5888888888888889

In [28]:
# Relative score of a fact = share of facts whose 'total score' is less than
# or equal to its own. rank(method='max') yields exactly that count for each
# row, replacing the row-by-row rescan of the whole frame; a missing score
# compares as False against everything and therefore gets 0.
promo_df['relative_score'] = (
    promo_df['total score'].rank(method='max') / promo_df.shape[0]
).fillna(0.0)

#### (1) fact score is

In [29]:
def get_promotiveness_score(df, story_list):
    """Average 'relative_score' of the rows of ``df`` selected by ``story_list``.

    ``story_list`` holds positional (iloc) row indices.
    """
    selected = df.iloc[story_list]
    total = selected['relative_score'].sum()
    return total / len(selected)

# Promotiveness (1) of the test story.
get_promotiveness_score(promo_df, test_story_cases)

0.9575000000000001

#### (2) tension increasing score

In [30]:
def get_tension_score():
    """Placeholder for the tension score (2); always returns 0."""
    return 0

#### (3) logic connection score is

In [31]:
def relation_scoring(fact1, fact2):
    """Score the logical connection between two consecutive facts.

    The relation type of the facts' 'subs_list' entries is looked up in a
    weight table; relation types without an entry score 0.

    NOTE(review): the notebook's "Score" section lists parents->children as
    0.8 and brothers as 0.3, the opposite of the weights used here - confirm
    which is intended.
    """
    weights = {'temporal': 1, 'brothers': 0.8, '>': 0.3}
    kind = get_relation_type(fact1['subs_list'], fact2['subs_list'])[0]
    return weights.get(kind, 0)


# Add up the connection score of every pair of consecutive facts in the
# test story.
test_facts = promo_df.iloc[test_story_cases]
relation_score = sum(
    (relation_scoring(test_facts.iloc[k], test_facts.iloc[k + 1])
     for k in range(len(test_facts) - 1)),
    0,
)
# Only the last expression of the cell is displayed.
"logic relation score is: {}".format(relation_score)
"avg logic relation score is: {}".format(relation_score / (1 * (test_facts.shape[0] - 1)))

'avg logic relation score is: 0.5888888888888889'

#### (4) Uniformity

In [32]:
# Count, from the second pair of consecutive story facts on, the pairs that
# break the running context: pairs with no relation position (None) or with
# a position different from the previous pair's.
# The facts are taken from the story (test_story_cases); indexing promo_df
# with the loop counter only works when the story is rows 0..n-1.
story_facts = promo_df.iloc[test_story_cases]
n_diff = 0
pre_index = -1  # sentinel: no pair seen yet
for i in range(0, len(story_facts) - 1):
    fact1 = story_facts.iloc[i]
    fact2 = story_facts.iloc[i+1]
    (t, current_index) = get_relation_type(fact1['subs_list'], fact2['subs_list'])
    if pre_index != -1 and (current_index is None or pre_index != current_index):
        n_diff += 1
    pre_index = current_index
print("No of incorrenct relationship: {}".format(n_diff))   
print("Score of relationship: {}".format(n_diff /  (len(test_story_cases) - 1)))   


No of incorrenct relationship: 4
Score of relationship: 0.4444444444444444


#### (5) Coverage

In [33]:
"coverage score is {}".format(len(set(test_story_cases)) / promo_df.shape[0])

'coverage score is 0.125'

#### (6) Length score 
- Need distribution

In [34]:
# Number of distinct facts in the test story.
print(len(list(set(test_story_cases))))

10
