# Cluster Mapping (using 3 rules)

Rules
----------------------------------
1. If two distinct subjects share the same predicate and, for that predicate, the same object, then the two subjects are weighted as 'similar'.
2. If two distinct subjects have similar direct neighbor nodes, then they are considered similar (a threshold can be set on the number of direct neighbor nodes that must be shared).
3. If two distinct subjects have neighbor nodes that share many attributes (excluding common properties such as rdf:type), then the two subjects can be considered similar.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.optimize import linear_sum_assignment

import warnings
warnings.filterwarnings("ignore")

**Reading files for two adjacent time frames**

In [None]:
# Labels of the two time frames being compared.
period_1, period_2 = 'AprJun2016', 'JulSep2016'

In [None]:
# Both result tables get the same eleven column names, replacing whatever
# header row pd.read_csv picked up from the files.
_result_columns = ['Network','Label','Density','Centrality','Quadrant','TimeLabel','Country','NodeInternalList','EdgeInternalList','EdgeExternalList','NodeExternalList']

# keep_default_na=False: pandas does not turn its default NA markers into NaN.
t1_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Particle Density Algo/results/AprJun2016.csv", keep_default_na=False)
t1_df.columns = list(_result_columns)

t2_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Particle Density Algo/results/JulSep2016.csv", keep_default_na=False)
t2_df.columns = list(_result_columns)

In [None]:
def getClusterNode(network_id, network_df):
  """Return the internal node names of one network as a list of strings.

  The first row whose ``Network`` value equals ``network_id`` is selected and
  its ``NodeInternalList`` cell, a ``#``-separated string, is split into
  names. Empty fragments (leading, trailing or doubled ``#``) are dropped.

  Args:
    network_id: Value to match against the ``Network`` column.
    network_df: DataFrame with ``Network`` and ``NodeInternalList`` columns.

  Returns:
    List of the non-empty node names of the first matching row.

  Raises:
    IndexError: If no row matches ``network_id``.
  """
  # Compare values directly instead of splicing network_id into a query
  # string; the spliced form failed on ids that contain a double quote.
  matches = network_df.loc[network_df["Network"] == network_id, "NodeInternalList"]
  # .iloc[0] raises IndexError on an empty selection, like the list lookup did.
  return [name for name in matches.iloc[0].split('#') if name]

# Mapping skill names to DBpedia links

In [None]:
import requests
def annotate_with_Dbpedia_spotlight(text, confidence):
  """Return the URI of the first DBpedia Spotlight annotation found in ``text``.

  Args:
    text: Phrase to annotate; ``_`` and ``-`` are replaced by spaces first.
    confidence: Confidence threshold sent to the Spotlight service.

  Returns:
    The ``@URI`` of the first resource in the response, or ``0`` when the
    service does not answer with HTTP 200 or reports no resources.
  """
  # text preprocessing
  text = text.replace("_", " ").replace("-", " ")
  URL = "https://api.dbpedia-spotlight.org/en/annotate"
  HEADERS = {'Accept': 'application/json'}
  # Passing the values through ``params`` lets requests percent-encode them.
  # Concatenating raw text into the URL mangles skills such as "c#" (the '#'
  # starts a URL fragment) or "c++" ('+' decodes to a space).
  response = requests.get(
      URL,
      params={'text': text, 'confidence': str(confidence)},
      headers=HEADERS,
      timeout=30,  # fail instead of hanging forever on an unresponsive service
  )
  if response.status_code != 200:
    return 0

  # A missing or empty "Resources" entry both mean "nothing recognised".
  resources = response.json().get("Resources")
  if resources:
    return resources[0]['@URI']
  return 0

In [None]:
def map_skills(skills_to_be_mapped):
  """Map skill names to DBpedia resource links.

  Every skill is sent to DBpedia Spotlight at confidence 0.5, then 0.4, 0.3,
  0.2, 0.1 and 0.0, stopping at the first level that yields a link. A skill
  Spotlight cannot resolve at any level is looked up by its lower-cased name
  in skill_links_map.csv.

  Args:
    skills_to_be_mapped: Iterable of skill names,
      e.g. ['mysql', 'oracle', 'postgresql', 'database'].

  Returns:
    List with one entry per skill, in input order: the link, or ``0`` when
    neither Spotlight nor the CSV knows the skill.
  """
  skill_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Knowledge graph files/skill_links_map.csv')
  # Explicit levels avoid the float drift of repeatedly subtracting 0.1
  # (0.30000000000000004, ...), which would leak into the request.
  confidence_levels = (0.5, 0.4, 0.3, 0.2, 0.1, 0.0)

  cluster = []
  for skill in skills_to_be_mapped:
    link = 0
    for confidence in confidence_levels:
      link = annotate_with_Dbpedia_spotlight(skill, confidence)
      if link != 0:
        # Keep the hit from the highest confidence level; going on would
        # overwrite it with a lower-confidence answer.
        break

    if link == 0:
      # NOTE(review): this reads a ``name`` column, while getPeriodsClusters
      # reads the same file through a ``skill`` column - confirm which one
      # the CSV actually has.
      fallback = skill_df.loc[skill_df["name"] == skill.lower(), "link"].tolist()
      if fallback:
        link = fallback[0]
    cluster.append(link)
  return cluster

In [None]:
def getPeriodsClusters(period_a, period_b):
  """Load the clusters of two periods with their skills translated to links.

  Reads skill_links_map.csv (``skill`` and ``link`` columns) and
  clusters_details.csv (``TimeLabel`` and ``Cluster`` columns). Each
  ``Cluster`` cell is a ``#``-separated string of skill names; trailing ``#``
  characters are stripped before splitting.

  Args:
    period_a: ``TimeLabel`` value selecting the first period's clusters.
    period_b: ``TimeLabel`` value selecting the second period's clusters.

  Returns:
    Tuple ``(period_a_links, period_b_links)``. Each element is a list of
    clusters, and each cluster is the list of links of its skills in order.

  Raises:
    IndexError: If a skill in a cluster has no row in the mapping file.
  """
  skill_link_map_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Knowledge graph files/skill_links_map.csv')
  cluster_detail_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Particle Density Algo/results/mapping-results/clusters_details.csv')

  # Build the skill -> link lookup once instead of scanning the mapping table
  # for every skill. setdefault keeps the first row of a repeated skill.
  link_by_skill = {}
  for skill, link in zip(skill_link_map_df["skill"], skill_link_map_df["link"]):
    link_by_skill.setdefault(skill, link)

  def links_for_period(period):
    # Translate every cluster recorded for ``period`` into a list of links.
    clusters = cluster_detail_df.loc[cluster_detail_df["TimeLabel"] == period, "Cluster"]
    period_links = []
    for cluster in clusters:
      link_list = []
      for item in cluster.rstrip('#').split('#'):
        if item not in link_by_skill:
          # Same exception type as an empty lookup result, but it names the
          # skill that is missing from the mapping file.
          raise IndexError("no link mapped for skill {!r} (period {!r})".format(item, period))
        link_list.append(link_by_skill[item])
      period_links.append(link_list)
    return period_links

  return links_for_period(period_a), links_for_period(period_b)

In [None]:
 # Quick check of map_skills on two skill names; the recorded result is shown below.
 map_skills(['Database', 'Java'])

['http://dbpedia.org/resource/Database', 'http://dbpedia.org/resource/Java']

**Get Dbpedia Triples of all the skills**

In [None]:
from collections import OrderedDict

def getTriples(A):
  """Fetch the triples that have resource ``A`` as subject from DBpedia.

  Sends a SPARQL ``SELECT`` to http://dbpedia.org/sparql/ and keeps only the
  triples whose object value starts with ``http://dbpedia.org``.

  Args:
    A: URI string of the subject resource.

  Returns:
    DataFrame with columns ``source_node`` (always ``A``), ``r1`` (predicate)
    and ``target_node`` (object), one row per kept triple. The three columns
    are present even when no triple is kept.
  """
  url = 'http://dbpedia.org/sparql/'
  query = """
  SELECT *
  WHERE
  {
    {
    <""" + A + """>  ?r1 ?n2 .
    }
  }
  """
  # timeout: fail instead of hanging forever on an unresponsive endpoint.
  r = requests.get(url, params = {'format': 'json', 'query': query}, timeout=60)
  data = r.json()

  subgraph = [
      {
          'source_node': A,
          'r1': item['r1']['value'],
          'target_node': item['n2']['value'],
      }
      for item in data['results']['bindings']
      if item['n2']['value'].startswith('http://dbpedia.org')
  ]

  # Naming the columns keeps the schema stable for an empty result, so later
  # filtering on ``source_node`` still works.
  return pd.DataFrame(subgraph, columns=['source_node', 'r1', 'target_node'])

In [None]:
def getConnectingRelation(node):
  """Return the distinct predicates of the triples that have ``node`` as subject.

  The triples are requested from the SPARQL endpoint at
  http://dbpedia.org/sparql/. The order of the returned list is not defined.
  """
  sparql = """
  SELECT *
  WHERE
  {
    {
    <""" + node + """>  ?r1 ?n2 .
    }
  }
  """
  response = requests.get('http://dbpedia.org/sparql/', params = {'format': 'json', 'query': sparql})
  bindings = response.json()['results']['bindings']

  # A set comprehension removes repeated predicates in one pass.
  return list({binding['r1']['value'] for binding in bindings})

In [None]:
# getConnectingRelation('http://dbpedia.org/resource/Sybase')

In [None]:
def getDbpediaTriplesOfCluster(cluster):
  """Collect the triples of every resource in ``cluster`` into one table.

  Args:
    cluster: Iterable of resource URIs; each one is passed to ``getTriples``.

  Returns:
    DataFrame of the per-resource triple tables stacked together with
    duplicate rows removed. Row labels of the individual tables are kept.
    An empty cluster yields an empty table with the columns
    ``source_node``, ``r1`` and ``target_node``.
  """
  frames = [getTriples(item) for item in cluster]
  if not frames:
    # pd.concat raises ValueError on an empty list; return an empty table
    # with the triple columns instead.
    return pd.DataFrame(columns=['source_node', 'r1', 'target_node'])
  return pd.concat(frames).drop_duplicates()

In [None]:
# l1 = [['a', 1, 2], ['a', 3, 4], ['a', 5, 7]]
# l2 = [['b', 1, 2], ['b', 3, 4], ['a', 5, 9]]
# d1 = pd.DataFrame(l1, columns=['name', 'one', 'two'])
# d2 = pd.DataFrame(l2, columns=['name', 'one', 'two'])
# d = pd.merge(d1, d2, how='inner', on=['one', 'two'])
# d = d[['one', 'two']]
# d

In [None]:
def firstRuleOfSimilarity(s1, s2, s1_triples, s2_triples):
  """Score two subjects by the (predicate, object) pairs they share.

  The score is the Jaccard index of the two subjects' sets of
  ``(r1, target_node)`` pairs: shared pairs divided by all distinct pairs.

  Args:
    s1: URI of the first subject, looked up in ``s1_triples``.
    s2: URI of the second subject, looked up in ``s2_triples``.
    s1_triples: DataFrame with ``source_node``, ``r1``, ``target_node`` columns.
    s2_triples: DataFrame with the same columns.

  Returns:
    ``0`` when no pair is shared, otherwise a float in (0, 1].
  """
  def pairs_of(subject, triples):
    # Filter by comparing values. Splicing the URI into a query string broke
    # on URIs that contain an apostrophe.
    rows = triples[triples['source_node'] == subject]
    return set(zip(rows['r1'], rows['target_node']))

  pairs1 = pairs_of(s1, s1_triples)
  pairs2 = pairs_of(s2, s2_triples)

  common = pairs1 & pairs2
  if not common:
    return 0
  # len(pairs1 | pairs2) equals len(pairs1) + len(pairs2) - len(common).
  return len(common) / len(pairs1 | pairs2)

In [None]:
# print(firstRuleOfSimilarity('http://dbpedia.org/resource/JavaScript', 'http://dbpedia.org/resource/JavaScript', t1_triples_df, t2_triples_df))

In [None]:
def secondRuleOfSimilarity(s1, s2, s1_triples, s2_triples):
  """Score two subjects by the direct neighbor nodes they share.

  The score is the Jaccard index of the two subjects' ``target_node`` sets:
  shared neighbors divided by all distinct neighbors. No threshold is applied.

  Args:
    s1: URI of the first subject, looked up in ``s1_triples``.
    s2: URI of the second subject, looked up in ``s2_triples``.
    s1_triples: DataFrame with ``source_node`` and ``target_node`` columns.
    s2_triples: DataFrame with the same columns.

  Returns:
    ``0`` when no neighbor is shared, otherwise a float in (0, 1].
  """
  # Filter by comparing values. Splicing the URI into a query string broke on
  # URIs that contain an apostrophe.
  neighbors1 = set(s1_triples.loc[s1_triples['source_node'] == s1, 'target_node'])
  neighbors2 = set(s2_triples.loc[s2_triples['source_node'] == s2, 'target_node'])

  shared = neighbors1 & neighbors2
  if not shared:
    return 0
  return len(shared) / len(neighbors1 | neighbors2)

In [None]:
# print(secondRuleOfSimilarity('http://dbpedia.org/resource/JavaScript', 'http://dbpedia.org/resource/ECMAScript', t1_triples_df, t2_triples_df))

In [None]:
def thirdRuleOfSimilarity(s1, s2, s1_triples, s2_triples, excluded_predicates=None):
  """Score two subjects by the predicates they have in common.

  The score is the Jaccard index of the two subjects' ``r1`` sets after the
  excluded predicates have been removed from both.

  NOTE(review): the rule text at the top of the notebook speaks of attributes
  of neighbor nodes; this code compares the predicates of the two subjects
  themselves - confirm which is intended.

  Args:
    s1: URI of the first subject, looked up in ``s1_triples``.
    s2: URI of the second subject, looked up in ``s2_triples``.
    s1_triples: DataFrame with ``source_node`` and ``r1`` columns.
    s2_triples: DataFrame with the same columns.
    excluded_predicates: Optional iterable of predicate URIs to ignore.
      Defaults to ignoring only ``rdf:type``.

  Returns:
    ``0`` when no predicate is shared, otherwise a float in (0, 1].
  """
  if excluded_predicates is None:
    excluded = {'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'}
  else:
    excluded = set(excluded_predicates)

  # Filter by comparing values. Splicing the URI into a query string broke on
  # URIs that contain an apostrophe.
  predicates1 = set(s1_triples.loc[s1_triples['source_node'] == s1, 'r1']) - excluded
  predicates2 = set(s2_triples.loc[s2_triples['source_node'] == s2, 'r1']) - excluded

  shared = predicates1 & predicates2
  if not shared:
    return 0
  return len(shared) / len(predicates1 | predicates2)

In [None]:
# print(thirdRuleOfSimilarity('http://dbpedia.org/resource/JavaScript', 'http://dbpedia.org/resource/ECMAScript', t1_triples_df, t2_triples_df))

#CODE

In [None]:
def mapClusters(t1_cluster, t2_cluster, t1_triples_df, t2_triples_df, weights=(0.5, 0.3, 0.2)):
  """Return the total similarity of the best one-to-one pairing of two clusters.

  A score matrix is built with one row per ``t1_cluster`` item and one column
  per ``t2_cluster`` item. Equal items score 1. Any other pair scores the
  weighted sum of the three similarity rules, rounded to 4 decimals. The
  Hungarian algorithm then picks the pairing with the highest total.

  Args:
    t1_cluster: Sequence of resource URIs of the first cluster.
    t2_cluster: Sequence of resource URIs of the second cluster.
    t1_triples_df: Triples table passed to the rules for ``t1_cluster`` items.
    t2_triples_df: Triples table passed to the rules for ``t2_cluster`` items.
    weights: Weights of the first, second and third rule, in that order.

  Returns:
    Sum of the scores of the chosen pairs, rounded to 3 decimals. The value
    is not normalised by cluster size. ``0.0`` if either cluster is empty.
  """
  # An empty t1_cluster would give a 1-D array that linear_sum_assignment
  # rejects; there is nothing to pair, so the total is zero.
  if len(t1_cluster) == 0 or len(t2_cluster) == 0:
    return 0.0

  w_first, w_second, w_third = weights
  score_matrix = []
  for t1_item in t1_cluster:
    row_list = []
    for t2_item in t2_cluster:
      if t1_item == t2_item:
        # Identical resources are a perfect match; skip the rule lookups.
        row_list.append(1)
        continue
      first_sim_score = firstRuleOfSimilarity(t1_item, t2_item, t1_triples_df, t2_triples_df)
      second_sim_score = secondRuleOfSimilarity(t1_item, t2_item, t1_triples_df, t2_triples_df)
      third_sim_score = thirdRuleOfSimilarity(t1_item, t2_item, t1_triples_df, t2_triples_df)
      weighted = w_first * first_sim_score + w_second * second_sim_score + w_third * third_sim_score
      row_list.append(float("{:.4f}".format(weighted)))
    score_matrix.append(row_list)
  score_matrix = np.array(score_matrix, dtype=float)

  # applying hungarian algo to get max similarity score
  row_ind, col_ind = linear_sum_assignment(score_matrix, maximize=True)
  return float("{:.3f}".format(score_matrix[row_ind, col_ind].sum()))

# Testing

t1_cluster = [JavaScript, Python]

t2_cluster = [GitHub, JavaScript]

Score: 0.53


t1_cluster = [JavaScript, Python]

t2_cluster = [Ecmascript, R]

Score: 0.27


JavaScript vs. ECMAScript — rule scores (first, second, third): 0.23, 0.21, 0.53

Python vs. R — rule scores (first, second, third): 0.10, 0.10, 0.48





In [None]:
# Two hand-written clusters of DBpedia resources used to try out the scoring.
t1_cluster = [
    'http://dbpedia.org/resource/Python_(programming_language)',
    'http://dbpedia.org/resource/JavaScript',
    'http://dbpedia.org/resource/Java',
    'http://dbpedia.org/resource/Database',
    'http://dbpedia.org/resource/MySQL',
]
t2_cluster = [
    'http://dbpedia.org/resource/Java',
    'http://dbpedia.org/resource/Oracle_Database',
    'http://dbpedia.org/resource/JavaScript',
    'http://dbpedia.org/resource/Security',
    'http://dbpedia.org/resource/R_(programming_language)',
]

# One triples table per cluster.
t1_triples_df = getDbpediaTriplesOfCluster(t1_cluster)
t2_triples_df = getDbpediaTriplesOfCluster(t2_cluster)

# Size of the smaller cluster; the commented-out lines divide the score by it.
min_cluster_size = min(map(len, (t1_cluster, t2_cluster)))
# sim_score = mapClusters(t1_cluster, t2_cluster, t1_triples_df, t2_triples_df)
# norm_sim_score = sim_score / min_cluster_size
# norm_sim_score

In [None]:
# Show the rows at positions 330 to 586 of the first cluster's triples table.
t1_triples_df.iloc[330:587]

Unnamed: 0,source_node,r1,target_node
330,http://dbpedia.org/resource/Python_(programmin...,http://dbpedia.org/ontology/wikiPageWikiLink,http://dbpedia.org/resource/Symbian
331,http://dbpedia.org/resource/Python_(programmin...,http://dbpedia.org/ontology/wikiPageWikiLink,http://dbpedia.org/resource/Syntactic_sugar
332,http://dbpedia.org/resource/Python_(programmin...,http://dbpedia.org/ontology/wikiPageWikiLink,http://dbpedia.org/resource/Tcl
333,http://dbpedia.org/resource/Python_(programmin...,http://dbpedia.org/ontology/wikiPageWikiLink,http://dbpedia.org/resource/Standard_library
334,http://dbpedia.org/resource/Python_(programmin...,http://dbpedia.org/ontology/wikiPageWikiLink,http://dbpedia.org/resource/Infix_notation
...,...,...,...
71,http://dbpedia.org/resource/JavaScript,http://dbpedia.org/ontology/wikiPageWikiLink,http://dbpedia.org/resource/Tamarin_(software)
72,http://dbpedia.org/resource/JavaScript,http://dbpedia.org/ontology/wikiPageWikiLink,http://dbpedia.org/resource/Web_application
73,http://dbpedia.org/resource/JavaScript,http://dbpedia.org/ontology/wikiPageWikiLink,http://dbpedia.org/resource/High-level_program...
74,http://dbpedia.org/resource/JavaScript,http://dbpedia.org/ontology/wikiPageWikiLink,http://dbpedia.org/resource/Scripting_language


In [None]:
# Un-normalised total similarity of the best pairing between the two test clusters.
mapClusters(t1_cluster, t2_cluster, t1_triples_df, t2_triples_df)

2.576

In [None]:
# Print the first, second and third rule scores for the Python / R pair,
# one value per line.
_python_uri = 'http://dbpedia.org/resource/Python_(programming_language)'
_r_uri = 'http://dbpedia.org/resource/R_(programming_language)'
for _rule in (firstRuleOfSimilarity, secondRuleOfSimilarity, thirdRuleOfSimilarity):
  print(_rule(_python_uri, _r_uri, t1_triples_df, t2_triples_df))

0.09957325746799431
0.11336717428087986
0.47368421052631576


# Mapping

In [None]:
#JulSep2016, OctDec2016, JanMar2017
period_a = "JanMar2017"
period_b = "AprJun2017"
period_a_links, period_b_links = getPeriodsClusters(period_a, period_b)

In [None]:
# Fetch the DBpedia triples of every cluster once, up front. Doing it inside
# the pair loop would repeat the same network requests for every combination
# of a period-a cluster with a period-b cluster.
period_a_triples = [getDbpediaTriplesOfCluster(cluster) for cluster in period_a_links]
period_b_triples = [getDbpediaTriplesOfCluster(cluster) for cluster in period_b_links]

# sim_matrix[i][j] is the normalised similarity between cluster i of period_a
# and cluster j of period_b.
sim_matrix = []

for t1_cluster, t1_triples_df in zip(period_a_links, period_a_triples):
  sim_series = []
  for t2_cluster, t2_triples_df in zip(period_b_links, period_b_triples):
    print(t1_cluster)
    print(t2_cluster)

    # Normalise by the smaller cluster: at most that many pairs can be matched.
    min_cluster_size = min(len(t1_cluster), len(t2_cluster))
    sim_score = mapClusters(t1_cluster, t2_cluster, t1_triples_df, t2_triples_df)
    norm_sim_score = float("{:.3f}".format(sim_score / min_cluster_size))
    print(norm_sim_score)
    print(min_cluster_size)
    print("==============")
    sim_series.append(norm_sim_score)
  sim_matrix.append(sim_series)
sim_matrix = np.array(sim_matrix)
print(sim_matrix)

['http://dbpedia.org/resource/ASP.NET', 'http://dbpedia.org/resource/Perl', 'http://dbpedia.org/resource/NoSQL', 'http://dbpedia.org/resource/Groovy_(programming_language)', 'http://dbpedia.org/resource/.NET_Framework', 'http://dbpedia.org/resource/Selenium_(software)', 'http://dbpedia.org/resource/JavaScript', 'http://dbpedia.org/resource/SQL', 'http://dbpedia.org/resource/Ansible', 'http://dbpedia.org/resource/Relational_database', 'http://dbpedia.org/resource/Git']
['http://dbpedia.org/resource/MySQL', 'http://dbpedia.org/resource/AngularJS', 'http://dbpedia.org/resource/Oracle', 'http://dbpedia.org/resource/Database', 'http://dbpedia.org/resource/JSON', 'http://dbpedia.org/resource/PHP', 'http://dbpedia.org/resource/XML', 'http://dbpedia.org/resource/Symfony', 'http://dbpedia.org/resource/HTML', 'http://dbpedia.org/resource/Gradle', 'http://dbpedia.org/resource/Unit_testing', 'http://dbpedia.org/resource/JavaScript']
0.274
11
['http://dbpedia.org/resource/ASP.NET', 'http://dbpedia.

In [None]:
# Unlabelled list of 27 numbers; the notebook does not say what they measure.
a = [
    4.260274313648173, 2.470715521736053, 3.9306079892661225, 2.302067008355732, 3.367561565650502,
    3.0, 1.0, 3.0,
    1.3498491956962968, 2.6338078523750066, 0.2838260668994744, 1.7763223592241961, 2.814083804376451,
    3.0, 1.4690731458689448, 2.790897954698911, 2.172939948380468,
    4.0, 4.0, 4.0,
    2.0, 1.4938994577033764, 2.0, 1.0,
    2.466519841349037, 2.467401321708789, 2.0,
]

In [None]:
# Scratch arithmetic: mean of five numbers; the notebook does not say what they measure.
(100 + 41 + 53 + 70 + 72)/ 5

67.2

In [None]:
# Scratch arithmetic: mean of six numbers; the notebook does not say what they measure.
(77 + 43 + 74 + 69 + 120 + 23)/ 6

67.66666666666667