# Extract graphs for two time periods

In [None]:
import requests
import pandas as pd
from collections import OrderedDict
import numpy as np
from itertools import permutations 
import tqdm

In [None]:
# cluster1 = ['http://dbpedia.org/resource/PHP']
# cluster2 = ['http://dbpedia.org/resource/JavaScript']

# cluster1 = ['http://dbpedia.org/resource/PHP', 'http://dbpedia.org/resource/ECMAScript']
# cluster2 = ['http://dbpedia.org/resource/JavaScript', 'http://dbpedia.org/resource/Dynamic_HTML', 'http://dbpedia.org/resource/PHP']

# cluster1 = ['http://dbpedia.org/resource/Fortran', 'http://dbpedia.org/resource/JavaScript', 'http://dbpedia.org/resource/PHP', 'http://dbpedia.org/resource/C++']
# cluster2 = ['http://dbpedia.org/resource/ECMAScript', 'http://dbpedia.org/resource/JavaScript', 'http://dbpedia.org/resource/Dynamic_HTML', 'http://dbpedia.org/resource/VBScript']

In [None]:
# getPathsBetweenTwoNodes Function returns dataframe of All possible paths between two Nodes
def getPathsBetweenTwoNodes(A, B): 
  """Query the DBpedia SPARQL endpoint for paths of up to three hops from A to B.

  A and B are resource IRIs (strings). The result is a DataFrame with one row
  per path and the columns source_node, r1, node1, r2, node2, r3, target_node;
  positions that a shorter path does not use hold the empty string.
  """
  url = 'http://dbpedia.org/sparql/'
  # One UNION branch per path length (1, 2 and 3 hops).
  query = """
  SELECT *
  WHERE
  {
    {
    <%(source)s>  ?r1 <%(target)s> .
    }
    UNION
    {
      <%(source)s> ?r1 ?node1 .
      ?node1 ?r2 <%(target)s> .
    }
    UNION
    {
      <%(source)s> ?r1 ?node1 .
      ?node1 ?r2 ?node2 .
      ?node2 ?r3 <%(target)s> .
    }
    # Add additional UNION for each length of path you want up to your upper bound
  }
  """ % {'source': A, 'target': B}
  response = requests.get(url, params = {'format': 'json', 'query': query})
  bindings = response.json()['results']['bindings']

  def optional_value(binding, key):
    # Variables not bound by the matching UNION branch are absent from the binding.
    return binding[key]['value'] if key in binding else ""

  paths = []
  for binding in bindings:
    paths.append(OrderedDict([
      ('source_node', A),
      ('r1', binding['r1']['value']),
      ('node1', optional_value(binding, 'node1')),
      ('r2', optional_value(binding, 'r2')),
      ('node2', optional_value(binding, 'node2')),
      ('r3', optional_value(binding, 'r3')),
      ('target_node', B),
    ]))

  return pd.DataFrame(paths)

In [None]:
def getFullGraph(cluster1, cluster2):
  """Collect the paths between every ordered pair of distinct nodes of both clusters.

  cluster1, cluster2: lists of node IRIs. Duplicates across the two lists are
  removed before pairing. Returns the concatenation of the DataFrames produced
  by getPathsBetweenTwoNodes for each ordered pair. When there are fewer than
  two distinct nodes there is nothing to pair, and an empty DataFrame with the
  path columns is returned.
  """
  # dict.fromkeys de-duplicates while keeping first-seen order.
  nodes = list(dict.fromkeys(cluster1 + cluster2))

  frames = [getPathsBetweenTwoNodes(source, target) for source, target in permutations(nodes, 2)]

  if not frames:
    # pd.concat raises ValueError on an empty list, so short-circuit here.
    return pd.DataFrame(columns=['source_node', 'r1', 'node1', 'r2', 'node2', 'r3', 'target_node'])

  return pd.concat(frames)

#Get the set of all unique Nodes and Edge sets

In [None]:
def getUniqueNodesFromGraph(nodes_filter_graph):
  """Build a lookup table of the distinct node values found in a path table.

  Reads the source_node, node1, node2 and target_node columns and returns a
  DataFrame with a 'Node' column (sorted distinct values, including the empty
  placeholder if present) and an 'id' column equal to the row index.
  """
  gathered = []
  for column_name in ('source_node', 'node1', 'node2', 'target_node'):
    gathered = gathered + list(getattr(nodes_filter_graph, column_name).unique())

  # np.unique both de-duplicates across columns and sorts.
  distinct_nodes = np.unique(gathered)

  unique_nodes_df = pd.DataFrame(distinct_nodes.tolist(), columns=['Node'])
  unique_nodes_df["id"] = unique_nodes_df.index
  return unique_nodes_df

def getUniqueRelationsFromGraph(graphFull):
  """Build a lookup table of the distinct relation values found in a path table.

  Reads the r1, r2 and r3 columns and returns a DataFrame with an 'Edge'
  column (sorted distinct values, including the empty placeholder if present)
  and an 'id' column equal to the row index.
  """
  relation_columns = ['r1', 'r2', 'r3']
  relations_only = graphFull[relation_columns]

  gathered = []
  for column_name in relation_columns:
    gathered = gathered + list(relations_only[column_name].unique())

  # np.unique both de-duplicates across columns and sorts.
  distinct_edges = np.unique(gathered)

  unique_edges_df = pd.DataFrame(distinct_edges.tolist(), columns=['Edge'])
  unique_edges_df["id"] = unique_edges_df.index
  return unique_edges_df

#Map node and edge to respective id

In [None]:
def mapNodeWithId(node, nodes_df=None):
  """Return the id of `node` as a string, or "" for the empty placeholder.

  node: node value to look up.
  nodes_df: optional lookup table with 'Node' and 'id' columns (the shape
    produced by getUniqueNodesFromGraph). When omitted, the module-level
    `unique_nodes_df` is used, which must then exist.

  Raises IndexError when the node is not present in the table.
  """
  if node == "":
    return ""
  if nodes_df is None:
    nodes_df = unique_nodes_df
  # Boolean mask instead of a string-built query, so values containing quote
  # characters cannot break the expression.
  matching_ids = nodes_df.loc[nodes_df["Node"] == node, "id"].tolist()
  return str(matching_ids[0])

def mapPredicateWithId(predicate, edges_df=None):
  """Return the id of `predicate` as a string, or "" for the empty placeholder.

  predicate: relation value to look up.
  edges_df: optional lookup table with 'Edge' and 'id' columns (the shape
    produced by getUniqueRelationsFromGraph). When omitted, the module-level
    `unique_edges_df` is used, which must then exist.

  Raises IndexError when the predicate is not present in the table.
  """
  if predicate == "":
    return ""
  if edges_df is None:
    edges_df = unique_edges_df
  # Boolean mask instead of a string-built query, so values containing quote
  # characters cannot break the expression.
  matching_ids = edges_df.loc[edges_df["Edge"] == predicate, "id"].tolist()
  return str(matching_ids[0])

**Get the Edge set**

In [None]:
# Widen the DataFrame repr so long IRIs are not truncated.
# Fully-qualified option names are used because pandas resolves abbreviated
# keys by pattern match, and 'max_rows' matches more than one option in
# recent pandas versions (which raises OptionError).
pd.set_option('display.max_rows', 99999)
pd.set_option('display.max_colwidth', 400)
pd.describe_option('display.max_colwidth')

display.max_colwidth : int or None
    The maximum width in characters of a column in the repr of
    a pandas data structure. When the column overflows, a "..."
    placeholder is embedded in the output. A 'None' value means unlimited.
    [default: 50] [currently: 400]


In [None]:
def getAllRelationSets(graphFull):
  """Flatten a path table into individual (start, relation, end) triples.

  graphFull: DataFrame with the columns source_node, r1, node1, r2, node2, r3
    and target_node, where unused intermediate positions hold ''.

  Returns a DataFrame with the columns 'start', 'relation' and 'end' holding
  one row per hop: one row for a direct path, two for a path through node1,
  three for a path through node1 and node2. No all-empty placeholder triples
  are emitted for the hops a shorter path does not have.
  """
  # Subject -- Predicate -- object
  relation_set = []
  for _, row in graphFull.iterrows():
    source, target = row["source_node"], row["target_node"]
    node1, node2 = row["node1"], row["node2"]

    if node1 == '':
      # Direct path: r1 links source and target, there is no r2/r3 hop.
      relation_set.append([source, row["r1"], target])
      continue

    relation_set.append([source, row["r1"], node1])

    if node2 == '':
      # Two-hop path: r2 reaches the target directly.
      relation_set.append([node1, row["r2"], target])
      continue

    relation_set.append([node1, row["r2"], node2])
    relation_set.append([node2, row["r3"], target])

  relation_Dataframe = pd.DataFrame(relation_set, columns= ['start', 'relation', 'end'])
  return relation_Dataframe

# Group predicates, objects and predicate-object pairs with their number of occurrences

**Unique Relation Dataset**

In [None]:
def getWeightCalcHelpers(relation_Dataframe):
  """Derive the distinct triples and the occurrence counts used for weighting.

  relation_Dataframe: DataFrame with 'start', 'relation' and 'end' columns.

  Returns a 4-tuple of DataFrames:
    unique_relation_Dataframe - distinct (start, relation, end) triples,
    predicate_count_df        - 'relation' with its occurrence count in 'size',
    object_count_df           - 'end' with its occurrence count in 'size',
    pred_obj_count_df         - ('relation', 'end') pairs with their count in 'size'.
  """
  def _count_by(keys):
    # Occurrences per distinct key combination, as a flat frame with a 'size' column.
    return relation_Dataframe.groupby(keys).size().to_frame(name = 'size').reset_index()

  # drop() returns a new frame; the result has to be kept for the helper
  # 'size' column to actually disappear from the distinct-triple table.
  unique_relation_Dataframe = _count_by(['start', 'relation', 'end']).drop(['size'], axis = 1)

  predicate_count_df = _count_by(['relation'])
  object_count_df = _count_by(['end'])
  pred_obj_count_df = _count_by(['relation', 'end'])

  return unique_relation_Dataframe, predicate_count_df, object_count_df, pred_obj_count_df

#Semantic relation weighting

**Calculate IC for predicate and object**

In [None]:
def getICpredicate(predicate, predicate_count_df, all_pred_count):
  """Information content of a predicate: -log(count(predicate) / all_pred_count).

  predicate_count_df must have 'relation' and 'size' columns. Raises
  IndexError when the predicate has no row in it.
  """
  matching_rows = predicate_count_df.query(f'relation == "{predicate}"', inplace = False)
  occurrences = list(matching_rows["size"])[0]
  probability = occurrences/all_pred_count
  return -np.log(probability)

In [None]:
def getICobj(obj, object_count_df, all_obj_count):
  """Information content of an object: -log(count(obj) / all_obj_count).

  object_count_df must have 'end' and 'size' columns. Raises IndexError when
  the object has no row in it.
  """
  matching_rows = object_count_df.query(f'end == "{obj}"', inplace = False)
  occurrences = list(matching_rows["size"])[0]
  probability = occurrences/all_obj_count
  return -np.log(probability)

In [None]:
def getICobj_given_pred(obj, pred, predicate_count_df, pred_obj_count_df, total_relation):
  """Conditional information content of an object given its predicate.

  Computes -log(P(obj, pred) / P(pred)), with both probabilities taken as
  counts divided by total_relation. predicate_count_df needs 'relation' and
  'size' columns; pred_obj_count_df needs 'relation', 'end' and 'size'.
  Raises IndexError when either lookup finds no row.
  """
  pred_rows = predicate_count_df.query(f'relation == "{pred}"', inplace = False)
  p_pred = list(pred_rows["size"])[0]/total_relation

  pair_rows = pred_obj_count_df.query(f'end == "{obj}" and relation == "{pred}"', inplace = False)
  p_pair = list(pair_rows["size"])[0]/total_relation

  p_obj_given_pred = p_pair/p_pred
  return -np.log(p_obj_given_pred)

In [None]:
def getPMI(obj, pred, total_relation, predicate_count_df, pred_obj_count_df, object_count_df):
  """Pointwise mutual information of a predicate/object pair.

  Computes log(P(obj | pred) / P(obj)), where every probability is a count
  divided by total_relation. Raises IndexError when any of the three count
  lookups finds no row.
  """
  pred_rows = predicate_count_df.query(f'relation == "{pred}"', inplace = False)
  p_pred = list(pred_rows["size"])[0]/total_relation

  pair_rows = pred_obj_count_df.query(f'end == "{obj}" and relation == "{pred}"', inplace = False)
  p_pair = list(pair_rows["size"])[0]/total_relation

  p_obj_given_pred = p_pair/p_pred

  obj_rows = object_count_df.query(f'end == "{obj}"', inplace = False)
  p_obj = list(obj_rows["size"])[0]/total_relation

  return np.log(p_obj_given_pred/p_obj)

**Joint IC, combIC and ic+pmi**

In [None]:
def jointIC(pred, obj, predicate_count_df, pred_obj_count_df, all_pred_count, total_relation):
  """Joint-IC edge weight: IC(pred) + IC(obj | pred)."""
  predicate_part = getICpredicate(pred, predicate_count_df, all_pred_count)
  conditional_part = getICobj_given_pred(obj, pred, predicate_count_df, pred_obj_count_df, total_relation)
  return predicate_part + conditional_part

In [None]:
def combIC(pred, obj, predicate_count_df, object_count_df, all_pred_count, all_obj_count):
  """Combined-IC edge weight: IC(pred) + IC(obj)."""
  predicate_part = getICpredicate(pred, predicate_count_df, all_pred_count)
  object_part = getICobj(obj, object_count_df, all_obj_count)
  return predicate_part + object_part

In [None]:
def ic_and_pmi(pred, obj, predicate_count_df, pred_obj_count_df, object_count_df, all_pred_count, total_relation):
  """IC+PMI edge weight: IC(pred) + PMI(pred, obj)."""
  predicate_part = getICpredicate(pred, predicate_count_df, all_pred_count)
  pmi_part = getPMI(obj, pred, total_relation, predicate_count_df, pred_obj_count_df, object_count_df)
  return predicate_part + pmi_part

#Assign Weights on relation Dataframe

**Joint IC, combIC, ic&pmi Scheme**

In [None]:
def assignWeightsToEdges(unique_relation_Dataframe, predicate_count_df, object_count_df, pred_obj_count_df, all_pred_count, all_obj_count, total_relation):
  """Attach the three weighting schemes to every complete distinct triple.

  Triples whose start, relation or end is the empty string are skipped.
  Returns a DataFrame with the columns Start, Relation, End, Joint_IC_Weight,
  Comb_IC_Weight and IC_PMI_weight.
  """
  weighted_edges = []
  for _, triple in unique_relation_Dataframe.iterrows():
    start, predicate, obj = triple["start"], triple["relation"], triple["end"]
    if start == "" or predicate == "" or obj == "":
      continue
    weighted_edges.append([
      start,
      predicate,
      obj,
      jointIC(predicate, obj, predicate_count_df, pred_obj_count_df, all_pred_count, total_relation),
      combIC(predicate, obj, predicate_count_df, object_count_df, all_pred_count, all_obj_count),
      ic_and_pmi(predicate, obj, predicate_count_df, pred_obj_count_df, object_count_df, all_pred_count, total_relation),
    ])

  return pd.DataFrame(weighted_edges, columns= ['Start', 'Relation', 'End', 'Joint_IC_Weight', 'Comb_IC_Weight', 'IC_PMI_weight'])

#Compute Cost of all the Edges
---
$c(e) = w_{\max} - w(e)$


In [None]:
def assignCostToEdges(edge_set_weight_DF):
  """Add a cost next to each edge weight, where cost = (column maximum) - weight.

  edge_set_weight_DF needs the columns Start, Relation, End, Joint_IC_Weight,
  Comb_IC_Weight and IC_PMI_weight. The result interleaves each weight with
  its cost: Joint_IC_Weight, Joint_Cost, Comb_IC_Weight, Comb_cost,
  IC_PMI_weight, IC_PMI_Cost.
  """
  weight_columns = ('Joint_IC_Weight', 'Comb_IC_Weight', 'IC_PMI_weight')
  # The heaviest edge under each scheme becomes the zero-cost edge.
  heaviest = {name: edge_set_weight_DF[name].max() for name in weight_columns}

  costed_edges = []
  for _, edge in edge_set_weight_DF.iterrows():
    record = [edge["Start"], edge["Relation"], edge["End"]]
    for name in weight_columns:
      weight = edge[name]
      record.append(weight)
      record.append(heaviest[name] - weight)
    costed_edges.append(record)

  return pd.DataFrame(costed_edges, columns= ['Start', 'Relation', 'End', 'Joint_IC_Weight', 'Joint_Cost', 'Comb_IC_Weight', 'Comb_cost', 'IC_PMI_weight', 'IC_PMI_Cost'])

#Assign total weight and total cost (with 3 schemes) of all paths in graphFull dataset

In [None]:
def assignWeightAndCostToGraph(graphFull, edge_set_weight_cost_DF):
  """Sum the per-edge weights and costs along every path of the path table.

  graphFull: path table (source_node, r1, node1, r2, node2, r3, target_node)
    with '' in unused intermediate positions.
  edge_set_weight_cost_DF: per-edge table keyed by Start, Relation, End with
    the Joint/Comb/IC_PMI weight and cost columns.

  Returns one row per path with the path columns followed by Joint_Weight,
  Comb_Weight, IC_PMI_Weight, Joint_Cost, Comb_Cost and IC_PMI_Cost. Raises
  IndexError when a hop of a path has no matching edge row.
  """
  # Per-edge metric columns, in the order of the output total columns.
  edge_metrics = ["Joint_IC_Weight", "Comb_IC_Weight", "IC_PMI_weight", "Joint_Cost", "Comb_cost", "IC_PMI_Cost"]

  def find_edge(start, relation, end):
    return edge_set_weight_cost_DF.query(f'Start == "{start}" and Relation == "{relation}" and End == "{end}"', inplace = False)

  path_rows = []
  for _, path in graphFull.iterrows():
    source, target = path["source_node"], path["target_node"]

    if path["node1"] == '':
      # only one relation between source and target
      hops = [(source, path["r1"], target)]
      middle = ["", "", "", ""]
    elif path["node2"] == '':
      # one intermediate node between source and target
      hops = [(source, path["r1"], path["node1"]), (path["node1"], path["r2"], target)]
      middle = [path["node1"], path["r2"], "", ""]
    else:
      # two intermediate nodes between source and target
      hops = [
        (source, path["r1"], path["node1"]),
        (path["node1"], path["r2"], path["node2"]),
        (path["node2"], path["r3"], target),
      ]
      middle = [path["node1"], path["r2"], path["node2"], path["r3"]]

    matches = [find_edge(*hop) for hop in hops]

    totals = []
    for metric in edge_metrics:
      # Accumulate hop by hop, in path order.
      running = matches[0][metric].values[0]
      for later_match in matches[1:]:
        running = running + later_match[metric].values[0]
      totals.append(running)

    path_rows.append([source, path["r1"]] + middle + [target] + totals)

  return pd.DataFrame(path_rows, columns= ['source_node', 'r1', 'node1', 'r2', 'node2', 'r3', 'target_node', 'Joint_Weight', 'Comb_Weight', 'IC_PMI_Weight', 'Joint_Cost', 'Comb_Cost', 'IC_PMI_Cost'])

**Normalize all the cost**

In [None]:
def getGraphWithNormalizedCost(graphFullWith_ic_Weights_Cost):
  """Scale each path-cost column by its maximum.

  graphFullWith_ic_Weights_Cost: path table with Joint_Weight, Comb_Weight,
    IC_PMI_Weight, Joint_Cost, Comb_Cost and IC_PMI_Cost columns.

  Returns a DataFrame with the path columns followed by Joint_Weight,
  Joint_Cost, Comb_Weight, Comb_cost, IC_PMI_Weight, IC_PMI_Cost, where each
  cost has been divided by the largest cost of its scheme. Note that the
  combined cost column is named 'Comb_cost' in the output. When the largest
  cost of a scheme is 0 the normalized cost is 0.0 rather than NaN.
  """
  max_joint_cost = graphFullWith_ic_Weights_Cost['Joint_Cost'].max()
  max_comb_cost = graphFullWith_ic_Weights_Cost['Comb_Cost'].max()
  max_ic_pmi_cost = graphFullWith_ic_Weights_Cost['IC_PMI_Cost'].max()

  def _normalize(cost, maximum):
    # A zero maximum means no path has any cost under this scheme;
    # dividing would yield NaN (0/0), so report zero cost instead.
    if maximum == 0:
      return 0.0
    return cost / maximum

  graphWithNormalizedCost = []
  for _, row in graphFullWith_ic_Weights_Cost.iterrows():
    graphWithNormalizedCost.append([
      row["source_node"], row["r1"], row["node1"], row["r2"], row["node2"], row["r3"], row["target_node"],
      row["Joint_Weight"], _normalize(row["Joint_Cost"], max_joint_cost),
      row["Comb_Weight"], _normalize(row["Comb_Cost"], max_comb_cost),
      row["IC_PMI_Weight"], _normalize(row["IC_PMI_Cost"], max_ic_pmi_cost),
    ])

  graphWithNormalizedCost_DF = pd.DataFrame(graphWithNormalizedCost, columns= ['source_node', 'r1', 'node1', 'r2', 'node2', 'r3', 'target_node', 'Joint_Weight', 'Joint_Cost', 'Comb_Weight', 'Comb_cost', 'IC_PMI_Weight', 'IC_PMI_Cost'])
  return graphWithNormalizedCost_DF

**Get Minimum cost between a pair of Nodes**

In [None]:
def getMinimumCost(df, source, target, weight_scheme):
  """Return the cheapest path row(s) from source to target under a scheme.

  weight_scheme selects the cost column: 'comb_ic' -> Comb_cost,
  'ic_pmi' -> IC_PMI_Cost, anything else (including 'joint_ic') -> Joint_Cost.
  All rows tied for the minimum are returned. When df has no path for the
  pair, an empty DataFrame with the normalized-cost columns is returned.
  """
  candidates = df.query(f'source_node == "{source}" and target_node == "{target}"', inplace = False)

  if candidates.empty:
    cols = ['source_node', 'r1', 'node1', 'r2', 'node2', 'r3', 'target_node', 'Joint_Weight', 'Joint_Cost', 'Comb_Weight', 'Comb_cost', 'IC_PMI_Weight', 'IC_PMI_Cost']
    return pd.DataFrame({name: [] for name in cols})

  if weight_scheme == 'comb_ic':
    cost_column = 'Comb_cost'
  elif weight_scheme == 'ic_pmi':
    cost_column = 'IC_PMI_Cost'
  else:
    # 'joint_ic' and every unrecognized scheme use the joint cost.
    cost_column = 'Joint_Cost'

  costs = getattr(candidates, cost_column)
  return candidates[costs == costs.min()]

In [None]:
def getCheapestPath(df, source, target, weight_scheme):
  """Return the lower of the cheapest source->target and target->source costs.

  The cost column follows weight_scheme ('comb_ic' -> Comb_cost,
  'ic_pmi' -> IC_PMI_Cost, otherwise Joint_Cost). A direction with no path
  counts as cost 1.
  """
  forward = getMinimumCost(df, source, target, weight_scheme)
  backward = getMinimumCost(df, target, source, weight_scheme)

  if weight_scheme == 'comb_ic':
    cost_name = 'Comb_cost'
  elif weight_scheme == 'ic_pmi':
    cost_name = 'IC_PMI_Cost'
  else:
    cost_name = 'Joint_Cost'

  # 1 is the fallback cost for a direction without any path.
  forward_cost = 1 if forward.empty else forward[cost_name].values[0]
  backward_cost = 1 if backward.empty else backward[cost_name].values[0]

  return forward_cost if forward_cost < backward_cost else backward_cost

# Mapping skills to DBpedia links

In [None]:
def annotate_with_Dbpedia_spotlight(text, confidence):
  """Annotate `text` with DBpedia Spotlight and return the first resource URI.

  text: skill name; underscores and hyphens are replaced by spaces first.
  confidence: confidence threshold sent to the service.

  Returns the '@URI' of the first annotated resource, or 0 when the service
  answers with a non-200 status or reports no resources.
  """
  # text preprocessing
  text = text.replace("_", " ").replace("-", " ")
  URL = "https://api.dbpedia-spotlight.org/en/annotate"
  HEADERS = {'Accept': 'application/json'}
  # Let requests URL-encode the parameters: pasting the raw text into the
  # query string corrupts characters such as '+', '#' and '&' (e.g. "c++").
  response = requests.get(URL, params={'text': text, 'confidence': str(confidence)}, headers=HEADERS)
  if response.status_code != 200:
    return 0

  resources = response.json().get("Resources")
  if resources:
    return resources[0]['@URI']
  return 0

# annotate_with_Dbpedia_spotlight('Machine learning', 0.5)

In [None]:
# Skill lookup table loaded from CSV; map_skills filters it on the 'name'
# column and reads the 'link' column as a fallback for unresolved skills.
skill_df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Knowledge graph files/skill_name_list.csv")

In [None]:
# Preview the first ten rows of the skill lookup table.
skill_df.head(10)

**Map Skill name to Link**

In [None]:
def map_skills(skills_to_be_mapped):
  """Map skill names to DBpedia links.

  skills_to_be_mapped: iterable of skill names, e.g.
    ['mysql', 'oracle', 'postgresql', 'database'].

  For each skill, DBpedia Spotlight is asked with decreasing confidence
  (0.5 down to 0.0) and the first link found is kept. If Spotlight finds
  nothing, the 'link' of the row in skill_df whose 'name' equals the
  lower-cased skill is used. Unresolved skills are mapped to 0. Each result
  is printed. Returns the list of links in input order.
  """
  cluster = []
  for skill in skills_to_be_mapped:
    link = 0
    # Explicit thresholds avoid float drift from repeated subtraction; stop at
    # the first hit so a higher-confidence match is not overwritten by a
    # lower-confidence one and no needless requests are made.
    for confidence in (0.5, 0.4, 0.3, 0.2, 0.1, 0.0):
      link = annotate_with_Dbpedia_spotlight(skill, confidence)
      if link != 0:
        break
    if link == 0:
      link_list = list(skill_df.loc[skill_df["name"] == skill.lower(), "link"])
      link = link_list[0] if link_list else 0
    cluster.append(link)
    print(link)
  return cluster

# Build Graph with Nodes

- And Assign weights and Cost to connecting Edges

In [None]:
def buildGraphWithCost(cluster1, cluster2):
  """Build the path graph between two clusters and attach normalized costs.

  cluster1, cluster2: lists of DBpedia resource IRIs.

  Pipeline: fetch all paths (getFullGraph), flatten them into triples, count
  predicates/objects, weight every distinct edge under the three schemes,
  turn weights into costs, sum them per path and normalize the path costs.
  Returns the DataFrame produced by getGraphWithNormalizedCost.
  """
  print('Runing Sparql')
  graphFull = getFullGraph(cluster1, cluster2)

  relation_Dataframe = getAllRelationSets(graphFull)

  # Totals exclude the empty placeholder values.
  all_pred_count = relation_Dataframe.query('relation != ""', inplace = False).shape[0]
  all_obj_count = relation_Dataframe.query('end != ""', inplace = False).shape[0]
  # Same quantity as all_pred_count: the number of triples with a relation.
  total_relation = all_pred_count

  unique_relation_Dataframe, predicate_count_df, object_count_df, pred_obj_count_df = getWeightCalcHelpers(relation_Dataframe)

  # Assign Weights to Edge Sets
  print('Assigning Weights')
  edge_set_weight_DF = assignWeightsToEdges(unique_relation_Dataframe, predicate_count_df, object_count_df, pred_obj_count_df, all_pred_count, all_obj_count, total_relation)

  # Compute Cost for Edge sets
  edge_set_weight_cost_DF = assignCostToEdges(edge_set_weight_DF)

  # Compute total weight and cost for all connecting paths between Nodes in the graph
  print('Assigning Cost to Graph')
  graphFullWith_ic_Weights_Cost = assignWeightAndCostToGraph(graphFull, edge_set_weight_cost_DF)

  # Normalized Cost
  return getGraphWithNormalizedCost(graphFullWith_ic_Weights_Cost)

In [None]:
# cluster1 = ['.net','perl','nosql', 'groovy','.net-framework', 'selenium','javascript','sql','ansible','relational-database','git']
# cluster2 = ['.net','continuous-integration','perl','nosql','groovy','postgresql','.net-framework','c++','selenium','oracle','mysql','scripting-language','javascript','sql','ansible','relational-database','database','git']

# if len(cluster1) <= len(cluster2):
#   smaller_cluster_len = len(cluster1)
# else:
#   smaller_cluster_len = len(cluster2)
# # count common_elements
# num_common_elements = (len(list(set(cluster1).intersection(cluster2)))/smaller_cluster_len) * 100

# # remove the common nodes from two clusters
# cluster1_new = list(set(cluster1) - set(cluster2))
# cluster2_new = list(set(cluster2) - set(cluster1))
# num_common_elements
# # normalized_cost_df = buildGraphWithCost(cluster1, cluster2)

100.0

# Prepare Edit Distance Matrix


Matrix:

```
#             Ecma  Javascript HTML VB
# Fortran    [ 0     x         y     z ]
# Javascript [ a     b         c     z ]
# PHP        [ a     b         c     z ]
# C++        [ a     b         c     z ]
```





**IC+PMI edit Distance Matrix**

In [None]:
from scipy.optimize import linear_sum_assignment

**Prepare Matrix**

In [None]:
def prepare_edit_cost_matrix(weight_scheme, graphWithNormalizedCost_DF, row_cluster=None, col_cluster=None):
  """Build the edit-cost matrix between two clusters of nodes.

  weight_scheme: cost scheme name passed on to getCheapestPath.
  graphWithNormalizedCost_DF: path table with normalized costs.
  row_cluster, col_cluster: node lists for the matrix rows and columns. When
    omitted, the module-level `cluster1` and `cluster2` are used.

  Returns a float64 array where entry [i, j] is 0 when the two nodes are
  equal and otherwise the cheapest path cost between them.
  """
  if row_cluster is None:
    row_cluster = cluster1
  if col_cluster is None:
    col_cluster = cluster2

  cost_rows = []
  for item in row_cluster:
    cost_rows.append([
      0 if item == element else getCheapestPath(graphWithNormalizedCost_DF, element, item, weight_scheme)
      for element in col_cluster
    ])
  return np.array(cost_rows, np.float64)

In [None]:
# Node IRIs must use the /resource/ namespace: /page/ URLs are the HTML views
# and do not match any subject or object in the SPARQL dataset.
cluster1 = ['http://dbpedia.org/resource/DevOps', 'http://dbpedia.org/resource/Security', 'http://dbpedia.org/resource/Scrum_(software_development)',
            'http://dbpedia.org/resource/Microservices', 'http://dbpedia.org/resource/Data_science']
cluster2 = ['http://dbpedia.org/resource/Robotics', 'http://dbpedia.org/resource/DevOps', 'http://dbpedia.org/resource/Linux',
            'http://dbpedia.org/resource/Microservices']
# normalized_cost_df = buildGraphWithCost(cluster1, cluster2)

# # computing costs as per 3 different schemes
# ic_pmi_matrix = prepare_edit_cost_matrix('ic_pmi', normalized_cost_df)
# row_ind, col_ind = linear_sum_assignment(ic_pmi_matrix)
# ic_pmi_similarity_score = ic_pmi_matrix[row_ind, col_ind].sum()

# joint_ic_matrix = prepare_edit_cost_matrix('joint_ic', normalized_cost_df)
# row_ind, col_ind = linear_sum_assignment(joint_ic_matrix)
# joint_ic_similarity_score = joint_ic_matrix[row_ind, col_ind].sum()

# comb_ic_matrix = prepare_edit_cost_matrix('comb_ic', normalized_cost_df)
# row_ind, col_ind = linear_sum_assignment(comb_ic_matrix)
# comb_ic_similarity_score = comb_ic_matrix[row_ind, col_ind].sum()

IndexError: ignored

In [None]:
# Print the edit-cost matrix and similarity score of each weighting scheme.
# NOTE(review): these names are only assigned in the commented-out computation
# in the previous cell, which has to be re-enabled and run first.
print(ic_pmi_matrix)
print(ic_pmi_similarity_score)
print("==============")
print(joint_ic_matrix)
print(joint_ic_similarity_score)
print("==============")
print(comb_ic_matrix)
print(comb_ic_similarity_score)

[[0.76217169 0.         0.55189819 0.3261407 ]
 [0.46872839 0.6523435  0.4632961  0.67804354]
 [1.         1.         1.         1.        ]
 [0.65185475 0.3261407  0.53154725 0.        ]
 [0.32616079 0.84601947 0.65318381 0.97752862]]
0.7894568909479087
[[0.4179354  0.         0.17179201 0.18384327]
 [0.2207421  0.24131767 0.16309381 0.36684558]
 [1.         1.         1.         1.        ]
 [0.43918076 0.18384327 0.21843866 0.        ]
 [0.11733579 0.36667588 0.24726281 0.36599191]]
0.2804296004108966
[[0.68835032 0.         0.56225123 0.29842561]
 [0.46052879 0.57643795 0.48086796 0.66327371]
 [1.         1.         1.         1.        ]
 [0.60524661 0.29842561 0.52176652 0.        ]
 [0.34254474 0.7090817  0.58752702 0.73799421]]
0.8234127002206915


In [None]:
# Display the Comb-IC similarity score (requires the commented-out computation cell to have been run).
comb_ic_similarity_score

In [None]:
# Print the three similarity scores (requires the commented-out computation cell to have been run).
print(ic_pmi_similarity_score)
print(joint_ic_similarity_score)
print(comb_ic_similarity_score)

# Compare the graphs of two time periods

In [None]:
# list(skill_df.query('name == "compiler"', inplace = False)["link"])

[]

In [None]:
# Map one example cluster of skill names to DBpedia links.
cluster = map_skills(['version-control','software-engineering','continuous-integration','hybris','agile','c++','scripting-language'])
print(cluster)


http://dbpedia.org/resource/Version_control
0
http://dbpedia.org/resource/Continuous_integration
http://dbpedia.org/resource/Hubris
http://dbpedia.org/resource/Agile_software_development
http://dbpedia.org/resource/C++
http://dbpedia.org/resource/Scripting_language
['http://dbpedia.org/resource/Version_control', 0, 'http://dbpedia.org/resource/Continuous_integration', 'http://dbpedia.org/resource/Hubris', 'http://dbpedia.org/resource/Agile_software_development', 'http://dbpedia.org/resource/C++', 'http://dbpedia.org/resource/Scripting_language']


In [None]:
# period_a: list of skill clusters, each a list of skill-name strings
# (presumably the clusters of the first of the two time periods being compared).
period_a = [
	['devops','security','scrum','microservices','data-science'],
	['mysql','oracle','postgresql','database'],
	['cad','data-conversion'],
	['sql','ansible','.net','groovy','perl','.net-framework'],
	['tcl','project-management'],
	['statistics','compiler','monitoring','mongodb'],
	['responsive-web-design','design','multithreading','virtualization','sharepoint'],
	['data-management','data-warehouse','business-intelligence','data-modeling','big-data','architect','computer-science'],
	['data-analysis','elasticsearch','machine-learning','data-mining'],
	['javascript','selenium'],
	['embedded-linux','linux'],
	['software-architecture','matlab'],
	['html','user-interface','user-experience'],
	['nosql','relational-database','git'],
	['version-control','software-engineering','continuous-integration','hybris','agile','c++','scripting-language']
]

# period_b: list of skill clusters in the same format
# (presumably the clusters of the second time period).
period_b = [
	['sqlite','continuous-integration','scripting-language'],
	['autosar','software-design'],
	['.net-framework','.net','sql','groovy','perl'],
	['robotics','devops','linux','microservices'],
	['oracle','matlab','json','xml','gradle','database','image-processing'],
	['mongodb','simulation','hardware','scada','design'],
	['extreme-programming','scrum'],
	['lucene','architect','data-science','big-data'],
	['artificial-intelligence','product-management','javascript','data-warehouse','jira','html','data-mining'],
	['data-analysis','business-intelligence','software-architecture','cloud'],
	['linked-data','relational-database','nosql','project-management'],
	['monitoring','version-control','git','ansible','elasticsearch'],
	['html5','amazon-web-services'],
	['sharepoint','refactoring','software-engineering','ios'],
	['php','c++','unit-testing','mysql','angularjs','symfony'],
	['statistics','computer-science','data-conversion','compiler','user-experience','user-interface'],
	['security','machine-learning','data-management','opengl','r']
]

In [None]:
# Preview the pairing of the first period_a cluster with every period_b cluster.
for cluster_a in period_a[:1]:
  for cluster_b in period_b:
    print(cluster_a)
    print(cluster_b)

['devops', 'security', 'scrum', 'microservices', 'data-science']
['sqlite', 'continuous-integration', 'scripting-language']
['devops', 'security', 'scrum', 'microservices', 'data-science']
['autosar', 'software-design']
['devops', 'security', 'scrum', 'microservices', 'data-science']
['.net-framework', '.net', 'sql', 'groovy', 'perl']
['devops', 'security', 'scrum', 'microservices', 'data-science']
['robotics', 'devops', 'linux', 'microservices']
['devops', 'security', 'scrum', 'microservices', 'data-science']
['oracle', 'matlab', 'json', 'xml', 'gradle', 'database', 'image-processing']
['devops', 'security', 'scrum', 'microservices', 'data-science']
['mongodb', 'simulation', 'hardware', 'scada', 'design']
['devops', 'security', 'scrum', 'microservices', 'data-science']
['extreme-programming', 'scrum']
['devops', 'security', 'scrum', 'microservices', 'data-science']
['lucene', 'architect', 'data-science', 'big-data']
['devops', 'security', 'scrum', 'microservices', 'data-science']
['ar

# get exclusivity of a relation

In [None]:
# def getWeightOfEdge(dataset, source, predicate, target):
#   # calculate exclusivity
#   outgoingPredicatesCount = dataset.query(
#             'start == "'+ source + '" and relation == "' + predicate + '"', inplace = False).shape[0]
#   incomingPredicatesCount = dataset.query(
#             'end == "'+ target + '" and relation == "' + predicate + '"', inplace = False).shape[0]
#   exclusivity = 1/(outgoingPredicatesCount + incomingPredicatesCount - 1)
#   return exclusivity

In [None]:
# d = getWeightOfEdge(relation_Dataframe, 'http://dbpedia.org/resource/JavaScript', 'http://dbpedia.org/ontology/wikiPageWikiLink', 'http://dbpedia.org/resource/Google_Chrome')
# d

In [None]:
#  # Assign Exclusivity to Edge set 
#  edge_set_with_exclusivity = []
#  for index, row in relation_Dataframe.iterrows():
#    if row["start"] != "" and row["relation"] != "" and row["end"] != "":
#      e = getWeightOfEdge(relation_Dataframe, row["start"], row["relation"], row["end"])
#      edge_set_with_exclusivity.append([row["start"], row["relation"], row["end"], e])
#  edge_set_with_exclusivity_DF = pd.DataFrame(edge_set_with_exclusivity, columns= ['Start', 'Relation', 'End', 'Exclusivity']) 
#  edge_set_with_exclusivity_DF.head(5)

In [None]:
# edge_set_with_exclusivity_DF_filtered = edge_set_with_exclusivity_DF.copy()
# edge_set_with_exclusivity_DF_filtered.shape


In [None]:
# edge_set_with_exclusivity_DF_filtered = edge_set_with_exclusivity_DF_filtered.drop_duplicates()
# edge_set_with_exclusivity_DF_filtered.head()
# edge_set_with_exclusivity_DF_filtered.shape

# Assign total weight of all paths in graphFull dataset

In [None]:
# graphWithPathWeight = []
# for index, row in graphFull.iterrows():
#   weight = 0
#   total_rev_exclusivity = 0
#   if row["node1"] == '':
#     df = edge_set_with_exclusivity_DF_filtered.query('Start == "'+ row["source_node"] + '" and Relation == "' + row["r1"] + '" and End == "' + row["target_node"] + '"', inplace = False)
#     total_rev_exclusivity = 1/ df["Exclusivity"].values[0]
#     weight = 1/ total_rev_exclusivity
#     graphWithPathWeight.append([row["source_node"], row["r1"], "", "", "", "", row["target_node"], weight])
#   elif row["node1"] != '' and row["node2"] == '':
#     df1 = edge_set_with_exclusivity_DF_filtered.query('Start == "'+ row["source_node"] + '" and Relation == "' + row["r1"] + '" and End == "' + row["node1"] + '"', inplace = False)
#     df2 = edge_set_with_exclusivity_DF_filtered.query('Start == "'+ row["node1"] + '" and Relation == "' + row["r2"] + '" and End == "' + row["target_node"] + '"', inplace = False)
#     total_rev_exclusivity = (1/df1["Exclusivity"].values[0]) + (1/df2["Exclusivity"].values[0])
#     weight = 1/ total_rev_exclusivity
#     graphWithPathWeight.append([row["source_node"], row["r1"], row["node1"], row["r2"], "", "", row["target_node"], weight])
#   else:
#     df1 = edge_set_with_exclusivity_DF_filtered.query('Start == "'+ row["source_node"] + '" and Relation == "' + row["r1"] + '" and End == "' + row["node1"] + '"', inplace = False)
#     df2 = edge_set_with_exclusivity_DF_filtered.query('Start == "'+ row["node1"] + '" and Relation == "' + row["r2"] + '" and End == "' + row["node2"] + '"', inplace = False)
#     df3 = edge_set_with_exclusivity_DF_filtered.query('Start == "'+ row["node2"] + '" and Relation == "' + row["r3"] + '" and End == "' + row["target_node"] + '"', inplace = False)
#     total_rev_exclusivity = (1/df1["Exclusivity"].values[0]) + (1/df2["Exclusivity"].values[0]) + (1/df3["Exclusivity"].values[0])
#     weight = 1/ total_rev_exclusivity
#     graphWithPathWeight.append([row["source_node"], row["r1"], row["node1"], row["r2"], row["node2"], row["r3"], row["target_node"], weight])

# graphFullWithWeights = pd.DataFrame(graphWithPathWeight, columns= ['source_node', 'r1', 'node1', 'r2', 'node2', 'r3', 'target_node', 'Weight'])
# graphFullWithWeights.shape


In [None]:
# graphFullWithWeights.tail(10)