In [None]:
# """
#     Random walk generator

#     Author:
#         Zeyu Li <zyli@cs.ucla.edu> or <zeyuli@ucla.edu>

#     Description:
#         Generating random walks on our Uq, Ua, and Q network using NetworkX.


# """

# import os, sys
# import networkx as nx
# import random
# import numpy as np
# import math

# from collections import Counter
# import itertools



# class MetaPathGenerator:
#     """MetaPathGenerator

#     Args:
#         dataset     - the dataset to work on
#         length      - the length of random walks to be generated
#         num_walks   - the number of random walks start from each node
#     """

#     def __init__(self, dataset, length=100, coverage=10000):
#         self._walk_length = length
#         self._coverage = coverage
#         self._dataset = dataset
#         self.G = nx.Graph()

#         self.walks = []
#         self.pairs = []

#         self.initialize()

#     def initialize(self):
#         """ Initialize Graph

#         Initialize graph with Uq-Q pairs and Q-Ua pairs.
#         We use following Uppercase letter

#         Args:
#             QR_file - Input file containing Q-R pairs
#             QA_file - Input file containing Q-A pairs

#         """

#         DATA_DIR = os.getcwd() + "/data/parsed/" + self._dataset + "/"
#         QR_file = DATA_DIR + "Q_R.txt"
#         QA_file = DATA_DIR + "Q_A.txt"
#         G = self.G
#         # Read in Uq-Q pairs
#         with open(QR_file, "r") as fin:
#             lines = fin.readlines()
#             RQ_edge_list = []
#             for line in lines:
#                 unit = line.strip().split()
#                 RQ_edge_list.append(["Q_" + unit[0],
#                                      "R_" + unit[1]])
#             G.add_edges_from(RQ_edge_list)
#         with open(QA_file, "r") as fin:
#             lines = fin.readlines()
#             QA_edge_list = []
#             for line in lines:
#                 unit = line.strip().split()
#                 QA_edge_list.append(["Q_" + unit[0],
#                                      "A_" + unit[1]])
#             G.add_edges_from(QA_edge_list)

#     def get_nodelist(self, type=None):
#         """ Get specific type or all nodes of nodelist in the graph

#         Args:
#             type - The entity type of the entity.
#                    If set as `None`, then all types of nodes would be returned.

#         Return:
#             nodelist - the list of node with `type`
#         """
#         G = self.G

#         if not G.number_of_edges() or not G.number_of_nodes():
#             sys.exit("Graph should be initialized before get_nodelist()!")

#         if not type:
#             return list(G.nodes)
#         return [node for node in list(G.nodes)
#                 if node[0] == type]

#     def generate_metapaths(self, patterns, alpha):
#         """ Generate Random Walk

#         Generating random walks from the tripartite graph.
#         A candidate pattern pool is:
#             "A-Q-R-Q-A": specifies two A's who answered questions raised by the same R
#             "A-Q-A": specifies two A's who answered the same question

#         Args:
#             meta_pattern - the pattern that guides the walk generation
#             alpha - probability of restart

#         Return:
#             walks - a set of generated random walks
#         """
#         G = self.G
#         num_walks, walk_len = self._coverage, self._walk_length
#         rand = random.Random(0)

#         print("Generating Meta-paths ...")

#         if not G.number_of_edges() or not G.number_of_nodes():
#             sys.exit("Graph should be initialized before generate_walks()!")

#         walks = []

#         for meta_pattern in patterns:  # Generate by patterns
#             print("\tNow generating meta-paths from pattern: \"{}\" ..."
#                   .format(meta_pattern))
#             start_entity_type = meta_pattern[0]
#             start_node_list = self.get_nodelist(start_entity_type)
#             for cnt in range(num_walks):  # Iterate the node set for cnt times
#                 print("Count={}".format(cnt))
#                 rand.shuffle(start_node_list)
#                 total = len(start_node_list)                
#                 for ind, start_node in enumerate(start_node_list):
#                     if ind % 3000 == 0:
#                         print("Finished {:.2f}".format(ind/total))

#                     walks.append(
#                         self.__meta_path_walk(
#                             start=start_node,
#                             alpha=alpha,
#                             pattern=meta_pattern))

#         print("Done!")
#         self.walks = walks
#         return

#     def generate_metapaths_2(self):
#         """ Generate Random Walk

#         Generating random walk from the Tripartite graph
#         Args:
#             meta_pattern - the pattern that guides the walk generation
#             alpha - probability of restart

#         Return:
#             walks - a set of generated random walks
#         """
#         G = self.G
#         num_walks, walk_len = self._coverage, self._walk_length
#         rand = random.Random(0)

#         print("Generating Meta-paths ...")

#         if not G.number_of_edges() or not G.number_of_nodes():
#             sys.exit("Graph should be initialized before generate_walks()!")

#         walks = []

#         print("\tNow generating meta-paths from deepwalk ...")
#         start_node_list = self.get_nodelist()
#         for cnt in range(num_walks):  # Iterate the node set for cnt times
#             print("Count={}".format(cnt))
#             rand.shuffle(start_node_list)
#             total = len(start_node_list)
#             for ind, start_node in enumerate(start_node_list):
#                 if ind % 3000 == 0:
#                     print("Finished {:.2f}".format(ind/total))
#                 walks.append(
#                     self.__random_walk(start=start_node))

#         print("Done!")
#         self.walks = walks
#         return

#     def __random_walk(self, start=None):
#         """Single Random Walk Generator

#         Args:
#             rand - an random object to generate random numbers
#             start - starting node

#         Return:
#             walk - the single walk generated
#         """
#         G = self.G
#         rand = random.Random()
#         walk = [start]
#         cur_node = start
#         while len(walk) <= self._walk_length:
#             possible_next_nodes = [neighbor
#                                    for neighbor in G.neighbors(cur_node)]
#             next_node = rand.choice(possible_next_nodes)
#             walk.append(next_node)
#             cur_node = next_node

#         return " ".join(walk)

#     def __meta_path_walk(self, start=None, alpha=0.0, pattern=None):
#         """Single Walk Generator

#         Generating a single random walk that follows a meta path of `pattern`

#         Args:
#             rand - an random object to generate random numbers
#             start - starting node
#             alpha - probability of restarts
#             pattern - (string) the pattern according to which to generate walks
#             walk_len - (int) the length of the generated walk

#         Return:
#             walk - the single walk generated

#         """
#         def type_of(node_id):
#             return node_id[0]

#         rand = random.Random()
#         # Checking pattern is correctly initialized
#         if not pattern:
#             sys.exit("Pattern is not specified when generating meta-path walk")

#         G = self.G
#         n, pat_ind = 1, 1

#         walk = [start]

#         cur_node = start

#         # Generating meta-paths
#         while len(walk) <= self._walk_length or pat_ind != len(pattern):

#             # Updating the pattern index
#             pat_ind = pat_ind if pat_ind != len(pattern) else 1

#             # Decide whether to restart
#             if rand.random() >= alpha:
#                 # Find all possible next neighbors
#                 possible_next_node = [neighbor
#                                       for neighbor in G.neighbors(cur_node)
#                                       if type_of(neighbor) == pattern[pat_ind]]
#                 # Random choose next node
#                 next_node = rand.choice(possible_next_node)
#             else:
#                 next_node = walk[0]

#             walk.append(next_node)
#             cur_node = next_node
#             pat_ind += 1

#         return " ".join(walk)

#     def write_metapaths(self):
#         """Write Metapaths to files

#         Args:
#             walks - The walks generated by `generate_walks`
#         """

#         print("Writing Generated Meta-paths to files ...", end=" ")

#         DATA_DIR = os.getcwd() + "/metapath/"
#         OUTPUT = DATA_DIR + self._dataset + "_" \
#                  + str(self._coverage) + "_" + str(self._walk_length) + ".txt"
#         if not os.path.exists(DATA_DIR):
#             os.mkdir(DATA_DIR)
#         with open(OUTPUT, "w") as fout:
#             for walk in self.walks:
#                 print("{}".format(walk), file=fout)

#         print("Done!")

#     def path_to_pairs(self, window_size):
#         """Convert all metapaths to pairs of nodes

#         Args:
#             walks - all the walks to be translated
#             window_size - the sliding window size
#         Return:
#             pairs - the *shuffled* pair corpus of the dataset
#         """
#         pairs = []
#         if not self.walks:
#             sys.exit("Walks haven't been created.")
#         for walk in self.walks:
#             walk = walk.strip().split(' ')
#             for pos, token in enumerate(walk):
#                 lcontext, rcontext = [], []
#                 lcontext = walk[pos - window_size: pos] \
#                     if pos - window_size >= 0 \
#                     else walk[:pos]

#                 if pos + 1 < len(walk):
#                     rcontext = walk[pos + 1: pos + window_size] \
#                         if pos + window_size < len(walk) \
#                         else walk[pos + 1:]

#                 context_pairs = [[token, context]
#                                  for context in lcontext + rcontext]
#                 pairs += context_pairs
#         np.random.shuffle(pairs)
#         self.pairs = pairs
#         return

#     def write_pairs(self):
#         """Write all pairs to files
#         Args:
#             pairs - the corpus
#         Return:
#         """
#         print("Writing Generated Pairs to files ...")
#         DATA_DIR = os.getcwd() + "/corpus/"
#         OUTPUT = DATA_DIR + self._dataset + "_" + \
#                  str(self._coverage) + "_" + str(self._walk_length) + ".txt"
#         if not os.path.exists(DATA_DIR):
#             os.mkdir(DATA_DIR)
#         with open(OUTPUT, "w") as fout:
#             for pair in self.pairs:
#                 print("{} {}".format(pair[0], pair[1]), file=fout)
#         return

#     def down_sample(self):
#         """Down sampling the training sets
        
#         1. Remove all the duplicate tuples such as "A_11 A_11"
#         2. Take log of all tuples as a down sampling
#         """

#         pairs = self.pairs
#         pairs = [(pair[0], pair[1])
#                  for pair in pairs
#                  if pair[0] != pair[1]]
#         cnt = Counter(pairs)
#         down_cnt = [[pair] * math.ceil(math.log(count))
#                     for pair, count in cnt.items()]
#         self.pairs = list(itertools.chain(*down_cnt))
#         np.random.shuffle(self.pairs)








In [1]:
def random_choice(seq, prob):
    """Pick one element of `seq` at random according to `prob`.

    A single uniform draw from `random.random()` is located inside the
    running (cumulative) sum of `prob`; the element whose interval contains
    the draw is returned.  Entries with a non-positive probability are
    never selected.

    Args:
        seq  - the candidate items
        prob - one probability per item, expected to sum to (about) 1

    Return:
        the selected item of `seq`

    Raises:
        ValueError - if no item has a positive probability (this includes
                     an empty `seq`)
    """
    draw = random.random()
    cumulative = 0.0
    found_positive = False
    last_positive = None
    for item, weight in zip(seq, prob):
        if weight <= 0:
            continue
        found_positive = True
        last_positive = item
        cumulative += weight
        if draw < cumulative:
            return item
    if not found_positive:
        raise ValueError("random_choice() needs at least one positive probability")
    # The draw fell past the accumulated total (probabilities summing to
    # slightly less than 1 because of rounding): fall back to the last
    # selectable item instead of failing on an unassigned result.
    return last_positive

In [2]:
import os, sys
import networkx as nx
import random
import numpy as np
import math

from collections import Counter
import itertools



class MetaPathGenerator:
    """Generate random walks / meta-path walks and the derived pair corpus.

    The graph is an undirected NetworkX graph whose nodes are strings; the
    first character of a node name is its entity type ("R", "Q" or "A"),
    e.g. "Q_12".  Walks are stored as space-joined strings in `self.walks`
    and (token, context) pairs in `self.pairs`.

    Args:
        dataset     - the dataset to work on (sub-directory of data/parsed/)
        length      - the length of random walks to be generated
        coverage    - the number of random walks started from each node
    """

    def __init__(self, dataset, length=100, coverage=10000):
        self._walk_length = length
        self._coverage = coverage
        self._dataset = dataset
        self.G = nx.Graph()

        self.walks = []
        self.pairs = []

        self.initialize()

    def initialize(self):
        """ Initialize Graph

        Reads `<cwd>/data/parsed/<dataset>/Q_R.txt` and `Q_A.txt` and adds
        one edge per non-blank line.

        Q_R.txt lines are "<q> <r>" and become unweighted Q-R edges.
        Q_A.txt lines are "<q> <a> <vote>" and become Q-A edges with a
        `weight` attribute: `vote + 1` when the vote is non-negative,
        the (negative) vote itself otherwise.
        """
        data_dir = os.getcwd() + "/data/parsed/" + self._dataset + "/"
        graph = self.G

        with open(data_dir + "Q_R.txt", "r") as fin:
            for line in fin:
                unit = line.split()
                if not unit:  # tolerate blank lines (e.g. trailing newline)
                    continue
                graph.add_edge("Q_" + unit[0], "R_" + unit[1])

        with open(data_dir + "Q_A.txt", "r") as fin:
            for line in fin:
                unit = line.split()
                if not unit:
                    continue
                vote = int(unit[2])
                # Shift non-negative votes by one so that a zero-vote answer
                # still has a positive weight; negative votes stay negative
                # and are therefore excluded from weighted sampling.
                weight = vote + 1 if vote >= 0 else vote
                graph.add_edge("Q_" + unit[0], "A_" + unit[1], weight=weight)

    def get_nodelist(self, type=None):
        """ Get specific type or all nodes of nodelist in the graph

        Args:
            type - The entity type (first character of the node name).
                   If falsy (`None`), all nodes are returned.

        Return:
            nodelist - the list of nodes with `type`
        """
        self._require_graph("get_nodelist()")
        nodes = self.G.nodes
        if not type:
            return list(nodes)
        return [node for node in nodes if node[0] == type]

    def generate_metapaths(self, patterns, alpha):
        """ Generate meta-path walks with uniform neighbor selection

        Args:
            patterns - iterable of pattern strings; each character is an
                       entity type and consecutive characters must be
                       adjacent types in the graph, e.g. "RQAQR" or "AQA"
                       (no separators).  The first and last type of a
                       pattern are expected to be the same so the pattern
                       can be repeated.
            alpha - probability of restarting from the start node (< 1)

        Return:
            None; the walks are stored in `self.walks`
        """
        self.walks = self._generate_pattern_walks(
            patterns, alpha, self.__meta_path_walk)

    def generate_expertisemetapaths(self, patterns, alpha):
        """ Generate meta-path walks biased by Q-A edge weights

        Same as `generate_metapaths`, except that steps between a Q node
        and an A node are sampled proportionally to the positive edge
        weights (see `initialize`).

        Args:
            patterns - iterable of pattern strings, e.g. ["RQAQR"]
            alpha - probability of restarting from the start node (< 1)

        Return:
            None; the walks are stored in `self.walks`
        """
        self.walks = self._generate_pattern_walks(
            patterns, alpha, self.__metaexpertise_path_walk)

    def generate_metapaths_2(self):
        """ Generate plain (deepwalk-style) random walks from every node

        Return:
            None; the walks are stored in `self.walks`
        """
        print("Generating Meta-paths ...")
        self._require_graph("generate_walks()")
        print("\tNow generating meta-paths from deepwalk ...")

        step_rng = random.Random()  # one generator shared by all walks

        def walk_from(start_node):
            return self.__random_walk(start=start_node, rand=step_rng)

        self.walks = self._collect_walks(
            self.get_nodelist(), random.Random(0), walk_from)
        print("Done!")

    def _require_graph(self, caller):
        """Exit with the original message if the graph has no edges/nodes."""
        graph = self.G
        if not graph.number_of_edges() or not graph.number_of_nodes():
            sys.exit("Graph should be initialized before {}!".format(caller))

    def _collect_walks(self, start_nodes, shuffler, walk_from):
        """Run `walk_from` on every start node, `coverage` times over.

        Args:
            start_nodes - list of start nodes (shuffled in place each round)
            shuffler - seeded `random.Random` used only for the shuffling
            walk_from - callable mapping a start node to a walk string
        """
        walks = []
        total = len(start_nodes)
        for cnt in range(self._coverage):
            print("Count={}".format(cnt))
            shuffler.shuffle(start_nodes)
            for ind, start_node in enumerate(start_nodes):
                if ind % 3000 == 0:
                    print("Finished {:.2f}".format(ind / total))
                walks.append(walk_from(start_node))
        return walks

    def _generate_pattern_walks(self, patterns, alpha, walker):
        """Shared driver of the two pattern-based generators."""
        print("Generating Meta-paths ...")
        self._require_graph("generate_walks()")

        shuffler = random.Random(0)  # fixed seed: reproducible visit order
        step_rng = random.Random()   # one generator shared by all walks
        walks = []
        for meta_pattern in patterns:
            print("\tNow generating meta-paths from pattern: \"{}\" ..."
                  .format(meta_pattern))

            def walk_from(start_node, pattern=meta_pattern):
                return walker(start=start_node, alpha=alpha,
                              pattern=pattern, rand=step_rng)

            start_nodes = self.get_nodelist(meta_pattern[0])
            walks.extend(self._collect_walks(start_nodes, shuffler, walk_from))

        print("Done!")
        return walks

    def __random_walk(self, start=None, rand=None):
        """Single Random Walk Generator

        Args:
            start - starting node
            rand - `random.Random` to draw from (a fresh one if omitted)

        Return:
            walk - the walk as a space-joined string of
                   `walk_length + 1` node names
        """
        if rand is None:
            rand = random.Random()
        graph = self.G
        walk = [start]
        cur_node = start
        while len(walk) <= self._walk_length:
            cur_node = rand.choice(list(graph.neighbors(cur_node)))
            walk.append(cur_node)
        return " ".join(walk)

    def __meta_path_walk(self, start=None, alpha=0.0, pattern=None, rand=None):
        """Single meta-path walk with uniform neighbor selection

        Args:
            start - starting node
            alpha - probability of restarts
            pattern - (string) the pattern according to which to generate walks
            rand - `random.Random` to draw from (a fresh one if omitted)

        Return:
            walk - the single walk generated, space-joined
        """
        return self._pattern_walk(start, alpha, pattern, rand,
                                  self._uniform_step)

    def __metaexpertise_path_walk(self, start=None, alpha=0.0, pattern=None,
                                  rand=None):
        """Single meta-path walk with weight-biased Q<->A steps

        Args:
            start - starting node
            alpha - probability of restarts
            pattern - (string) the pattern according to which to generate walks
            rand - `random.Random` to draw from (a fresh one if omitted)

        Return:
            walk - the single walk generated, space-joined
        """
        return self._pattern_walk(start, alpha, pattern, rand,
                                  self._expertise_step)

    def _pattern_walk(self, start, alpha, pattern, rand, choose):
        """Walk from `start` following `pattern`, repeating it as needed.

        The walk goes on until it holds more than `walk_length` nodes AND
        the current pattern repetition is complete.  It stops early when
        the current node has no neighbor of the required type.

        Args:
            choose - callable(rand, cur_node, wanted_type, candidates)
                     returning the next node
        """
        if not pattern:
            sys.exit("Pattern is not specified when generating meta-path walk")
        if alpha >= 1:
            # With certain restarts the pattern could never be completed.
            raise ValueError("alpha (restart probability) must be < 1")
        if rand is None:
            rand = random.Random()

        graph = self.G
        pattern_len = len(pattern)
        walk = [start]
        cur_node = start
        pat_ind = 1

        while len(walk) <= self._walk_length or pat_ind != pattern_len:
            if pat_ind == pattern_len:
                # Pattern finished on a node of the start type: repeat it.
                pat_ind = 1

            if rand.random() >= alpha:
                wanted = pattern[pat_ind]
                candidates = [neighbor
                              for neighbor in graph.neighbors(cur_node)
                              if neighbor[0] == wanted]
                if not candidates:
                    break  # dead end: keep the partial walk
                cur_node = choose(rand, cur_node, wanted, candidates)
                pat_ind += 1
            else:
                # Restart: we are back on the start node, so the pattern
                # has to be followed from its second position again.
                cur_node = walk[0]
                pat_ind = 1

            walk.append(cur_node)

        return " ".join(walk)

    @staticmethod
    def _uniform_step(rand, cur_node, wanted, candidates):
        """Choose the next node uniformly among `candidates`."""
        return rand.choice(candidates)

    def _expertise_step(self, rand, cur_node, wanted, candidates):
        """Choose the next node, weighting Q<->A steps by edge weight.

        Only candidates with a positive weight take part in the weighted
        draw; if none has one, or the step is not between a Q and an A
        node, the choice is uniform over all candidates.
        """
        if {cur_node[0], wanted} != {"Q", "A"}:
            return rand.choice(candidates)

        edges = self.G[cur_node]
        positive = [(node, edges[node]['weight'])
                    for node in candidates
                    if edges[node]['weight'] > 0]
        if not positive:
            return rand.choice(candidates)
        nodes, weights = zip(*positive)
        return rand.choices(nodes, weights=weights, k=1)[0]

    def random_choice(self, seq, prob):
        """Pick one element of `seq` at random according to `prob`.

        Kept for callers; the walkers above use `random.Random.choices`.

        Args:
            seq  - the candidate items
            prob - one probability per item, expected to sum to (about) 1

        Return:
            the selected item; items with non-positive probability are
            never selected

        Raises:
            ValueError - if no item has a positive probability
        """
        draw = random.random()
        cumulative = 0.0
        found_positive = False
        last_positive = None
        for item, weight in zip(seq, prob):
            if weight <= 0:
                continue
            found_positive = True
            last_positive = item
            cumulative += weight
            if draw < cumulative:
                return item
        if not found_positive:
            raise ValueError(
                "random_choice() needs at least one positive probability")
        # Rounding left the total slightly below the draw: use the last
        # selectable item rather than failing on an unassigned result.
        return last_positive

    def _write_lines(self, folder, suffix, lines):
        """Write `lines` to <cwd>/<folder>/<dataset>_<coverage>_<length><suffix>.txt"""
        data_dir = os.getcwd() + "/" + folder + "/"
        output = data_dir + self._dataset + "_" \
            + str(self._coverage) + "_" + str(self._walk_length) \
            + suffix + ".txt"
        os.makedirs(data_dir, exist_ok=True)
        with open(output, "w") as fout:
            for line in lines:
                print(line, file=fout)

    def write_metapaths(self):
        """Write `self.walks`, one per line, to the metapath/ directory"""
        print("Writing Generated Meta-paths to files ...", end=" ")
        self._write_lines("metapath", "", self.walks)
        print("Done!")

    def write_expertmetapaths(self):
        """Write `self.walks` to metapath/, file name suffixed with "e" """
        print("Writing Generated Meta-paths to files ...", end=" ")
        self._write_lines("metapath", "e", self.walks)
        print("Done!")

    def path_to_pairs(self, window_size):
        """Convert all metapaths to pairs of nodes

        For every position of every walk, the token is paired with up to
        `window_size` tokens on each side.

        Args:
            window_size - the sliding window size (per side)
        Return:
            None; the *shuffled* list of [token, context] pairs is stored
            in `self.pairs`
        """
        pairs = []
        if not self.walks:
            sys.exit("Walks haven't been created.")
        for walk in self.walks:
            tokens = walk.strip().split(' ')
            for pos, token in enumerate(tokens):
                left = tokens[max(0, pos - window_size):pos]
                # Symmetric window: same number of tokens as on the left.
                right = tokens[pos + 1:pos + 1 + window_size]
                pairs.extend([token, context] for context in left + right)
        np.random.shuffle(pairs)
        self.pairs = pairs

    def write_pairs(self):
        """Write `self.pairs` as "<token> <context>" lines to corpus/"""
        print("Writing Generated Pairs to files ...")
        self._write_lines(
            "corpus", "",
            ("{} {}".format(pair[0], pair[1]) for pair in self.pairs))

    def write_epairs(self):
        """Write `self.pairs` to corpus/, file name suffixed with "e" """
        print("Writing Generated Pairs to files ...")
        self._write_lines(
            "corpus", "e",
            ("{} {}".format(pair[0], pair[1]) for pair in self.pairs))

    def down_sample(self):
        """Down sampling the training sets

        1. Remove all self pairs such as "A_11 A_11"
        2. Keep each distinct pair ceil(ln(count)) times

        NOTE: ceil(ln(1)) == 0, so a pair that occurs exactly once is
        dropped altogether.

        Return:
            None; `self.pairs` becomes a shuffled list of (token, context)
            tuples
        """
        counts = Counter((pair[0], pair[1])
                         for pair in self.pairs
                         if pair[0] != pair[1])
        kept = []
        for pair, count in counts.items():
            kept.extend([pair] * math.ceil(math.log(count)))
        np.random.shuffle(kept)
        self.pairs = kept

In [3]:
if __name__ == "__main__":
    # Build the graph for one dataset, generate uniform meta-path walks,
    # derive the down-sampled pair corpus and write both to disk.
    dataset = 'Biology'
    gw = MetaPathGenerator(dataset=dataset, coverage=5, length=5)

    # Pattern-guided walks; swap in gw.generate_metapaths_2() for plain
    # (deepwalk-style) random walks instead.
    gw.generate_metapaths(alpha=0, patterns=["RQAQR"])

    gw.path_to_pairs(window_size=5)
    gw.down_sample()

    gw.write_metapaths()
    gw.write_pairs()

Generating Meta-paths ...
	Now generating meta-paths from pattern: "RQAQR" ...
Count=0
Finished 0.00
Finished 0.66
Count=1
Finished 0.00
Finished 0.66
Count=2
Finished 0.00
Finished 0.66
Count=3
Finished 0.00
Finished 0.66
Count=4
Finished 0.00
Finished 0.66
Done!
Writing Generated Meta-paths to files ... Done!
Writing Generated Pairs to files ...


In [None]:
if __name__ == "__main__":
    # Same pipeline as the previous cell, but with the expertise-weighted
    # walker; outputs go to the "e"-suffixed files.
    dataset = 'Biology'
    gw = MetaPathGenerator(dataset=dataset, coverage=5, length=5)

    # Pattern-guided walks; swap in gw.generate_metapaths_2() for plain
    # (deepwalk-style) random walks instead.
    gw.generate_expertisemetapaths(alpha=0, patterns=["RQAQR"])

    gw.path_to_pairs(window_size=5)
    gw.down_sample()

    gw.write_expertmetapaths()
    gw.write_epairs()

Generating Meta-paths ...
	Now generating meta-paths from pattern: "RQAQR" ...
Count=0
Finished 0.00
Finished 0.66
Count=1
Finished 0.00
Finished 0.66
Count=2
Finished 0.00
Finished 0.66
Count=3
Finished 0.00
Finished 0.66


In [1]:
from data_loader import DataLoader

In [2]:
# Construct the data loader for the 'Biology' dataset.
# NOTE(review): `id` is not assigned in any visible cell, so ID=id presumably
# passes Python's builtin `id` function - confirm the intended run identifier.
# NOTE(review): coverage=5 / length=5 presumably select the "Biology_5_5"
# corpus file named in the loader's log output - verify in DataLoader.
dl = DataLoader(dataset="Biology"
                     , ID=id
                     , include_content=0
                     , coverage=5
                     , length=5
                     , answer_sample_ratio=3
                     )

Initializing data_loader ...
	Loading dataset .../home/dutir/qianlingfei/network-embedding-er/corpus/Biology_5_5.txt
	Counting dataset ...
	Initializing sample table ...
	Loading word2vec model ...
	Loading questions text ...
	Creating user-index mapping ...
data_loader: user_count 6342
	Loading rqa ...
	Creating qid embeddings map ...
	Loading test sets ...
Done - Data Loader!


In [3]:
# Fetch one training batch and unpack the nine values it returns.
# NOTE(review): the meaning of each returned value is defined by
# DataLoader.get_train_batch, which is not visible here - check there.
u, v,upos, vpos, npos, aqr, accqr,point_wise, point=dl.get_train_batch(
                        batch_size=100,
                        neg_ratio=3)

----- 0.3181818181818182


In [5]:
point

array([1.        , 2.55555556, 1.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 1.        ,
       0.16666667, 1.        , 0.5       , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 1.        , 1.        , 2.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 1.        , 1.        , 0.        , 0.        ,
       0.        ])

In [6]:
point_wise

array([[  129,    68,   753],
       [  129,   107,   753],
       [  129,    68,   753],
       [  129,   752,   753],
       [  129, 19454,   753],
       [  129,  1202,   753],
       [  129,  3059,   753],
       [  129, 35570,   753],
       [  129, 16917,   753],
       [ 4108, 14849, 19692],
       [ 4108, 11829, 19692],
       [ 4108, 14849, 19692],
       [ 4108, 23053, 19692],
       [ 4108, 13266, 19692],
       [ 4108,  3444, 19692],
       [ 4108, 29777, 19692],
       [ 4108, 30418, 19692],
       [ 4108,  8598, 19692],
       [ 4108, 35811, 19692],
       [ 4108, 32974, 19692],
       [ 4108, 42293, 19692],
       [ 4108,  9054, 19692],
       [  389,  3632,  3279],
       [  389,  3632,  3279],
       [  389,  1305,  3279],
       [  389,  3536,  3279],
       [  389, 41761,  3279],
       [  389,  9358,  3279],
       [  389, 30418,  3279],
       [  389, 28096,  3279],
       [  389, 24565,  3279],
       [  491,   107,  1018],
       [  491,   107,  1018],
       [  

In [7]:
# if __name__ == "__main__":

#     dataset='Biology'
#     gw = MetaPathGenerator(length=5, coverage=5, dataset=dataset)

#     # Uncomment the first line for metapath-based
#     gw.generate_expertisemetapaths(patterns=["RQAQR"], alpha=0)
#     #gw.generate_metapaths_2()
#     gw.path_to_pairs(window_size=5)
#     gw.down_sample()
#     gw.write_expertmetapaths()
#     gw.write_epairs()

Generating Meta-paths ...
	Now generating meta-paths from pattern: "RQAQR" ...
Count=0
Finished 0.00
Finished 0.66
Count=1
Finished 0.00
Finished 0.66
Count=2
Finished 0.00
Finished 0.66
Count=3
Finished 0.00
Finished 0.66
Count=4
Finished 0.00
Finished 0.66
Done!
Writing Generated Meta-paths to files ... Done!
Writing Generated Pairs to files ...


In [8]:
# if __name__ == "__main__":

#     dataset='Biology'
#     gw = MetaPathGenerator(length=5, coverage=5, dataset=dataset)

#     # Uncomment the first line for metapath-based
#     #gw.generate_metapaths(patterns=["RQAQR"], alpha=0)
#     gw.generate_metapaths_2()
#     gw.path_to_pairs(window_size=5)
#     gw.down_sample()
#     gw.write_expertmetapaths()
#     gw.write_epairs()

In [110]:
if max([-1,-2])>0:
    print(1)

In [17]:
gw = MetaPathGenerator(length=5, coverage=5, dataset=dataset)

In [80]:
np.array([12,3,3])/5

array([2.4, 0.6, 0.6])

In [10]:
# if __name__ == "__main__":

#     dataset='English'
#     gw = MetaPathGenerator(length=5, coverage=5, dataset=dataset)

#     # Uncomment the first line for metapath-based
#     gw.generate_metapaths(patterns=["QSASQ"], alpha=0)
#     #gw.generate_metapaths_2()
#     gw.path_to_pairs(window_size=5)
#     gw.down_sample()
#     gw.write_metapaths()
#     gw.write_pairs()

Generating Meta-paths ...
	Now generating meta-paths from pattern: "QSASQ" ...
Count=0
Finished 0.00
Count=1
Finished 0.00
Count=2
Finished 0.00
Count=3
Finished 0.00
Count=4
Finished 0.00
Done!
Writing Generated Meta-paths to files ... Done!
Writing Generated Pairs to files ...


In [12]:
if __name__ == "__main__":

    # Build the walk generator for the 'English' dataset
    # (walk length 5, coverage 5).
    dataset='English'
    gw = MetaPathGenerator(length=5, coverage=5, dataset=dataset)

    # Meta-path based generation following the "SQRQS" pattern.
    # generate_metapaths_2() below is the commented-out alternative.
    gw.generate_metapaths(patterns=["SQRQS"], alpha=0)
    #gw.generate_metapaths_2()
    # Convert the walks to training pairs, then down-sample those pairs.
    gw.path_to_pairs(window_size=5)
    gw.down_sample()
    # Write the generated meta-paths and the generated pairs to files.
    gw.write_metapaths()
    gw.write_pairs()

Generating Meta-paths ...
	Now generating meta-paths from pattern: "SQRQS" ...
Count=0
Finished 0.00
Count=1
Finished 0.00
Count=2
Finished 0.00
Count=3
Finished 0.00
Count=4
Finished 0.00
Done!
Writing Generated Meta-paths to files ... Done!
Writing Generated Pairs to files ...


In [45]:
# Rebuild [token, context] pairs from the generated walks without shuffling,
# so the raw pair order can be inspected.
window_size = 5
pairs = []
if not gw.walks:
    sys.exit("Walks haven't been created.")
for walk in gw.walks:
    walk = walk.strip().split(' ')
    for pos, token in enumerate(walk):
        # Left context: up to window_size tokens before `pos`. The start is
        # clamped at 0 so a negative index never wraps around to the tail.
        lcontext = walk[max(pos - window_size, 0):pos]
        # Right context: the slice ends at pos + window_size, so it holds at
        # most window_size - 1 tokens; slicing clamps at the end of the walk.
        # NOTE(review): this is one token narrower than the left context -
        # confirm whether the asymmetry is intended.
        rcontext = walk[pos + 1:pos + window_size]
        pairs.extend([token, context] for context in lcontext + rcontext)
# Shuffling stays disabled in this cell:
#np.random.shuffle(pairs)

In [46]:
pairs

[['A_184706', 'Q_9371'],
 ['A_184706', 'R_61075'],
 ['A_184706', 'Q_9371'],
 ['A_184706', 'S_9374'],
 ['Q_9371', 'A_184706'],
 ['Q_9371', 'R_61075'],
 ['Q_9371', 'Q_9371'],
 ['Q_9371', 'S_9374'],
 ['Q_9371', 'Q_9371'],
 ['R_61075', 'A_184706'],
 ['R_61075', 'Q_9371'],
 ['R_61075', 'Q_9371'],
 ['R_61075', 'S_9374'],
 ['R_61075', 'Q_9371'],
 ['R_61075', 'R_61075'],
 ['Q_9371', 'A_184706'],
 ['Q_9371', 'Q_9371'],
 ['Q_9371', 'R_61075'],
 ['Q_9371', 'S_9374'],
 ['Q_9371', 'Q_9371'],
 ['Q_9371', 'R_61075'],
 ['Q_9371', 'Q_9386'],
 ['S_9374', 'A_184706'],
 ['S_9374', 'Q_9371'],
 ['S_9374', 'R_61075'],
 ['S_9374', 'Q_9371'],
 ['S_9374', 'Q_9371'],
 ['S_9374', 'R_61075'],
 ['S_9374', 'Q_9386'],
 ['S_9374', 'S_9387'],
 ['Q_9371', 'A_184706'],
 ['Q_9371', 'Q_9371'],
 ['Q_9371', 'R_61075'],
 ['Q_9371', 'Q_9371'],
 ['Q_9371', 'S_9374'],
 ['Q_9371', 'R_61075'],
 ['Q_9371', 'Q_9386'],
 ['Q_9371', 'S_9387'],
 ['R_61075', 'Q_9371'],
 ['R_61075', 'R_61075'],
 ['R_61075', 'Q_9371'],
 ['R_61075', 'S_9374

In [48]:
from collections import Counter
Counter([1,1,1,2,3,4])

Counter({1: 3, 2: 1, 3: 1, 4: 1})

In [49]:
2e7

20000000.0

In [None]:
# Scratch copy of the sample-table construction: each item gets a share of
# table_size slots proportional to its count ** 0.75.
# NOTE(review): `self` is not defined in any visible cell, so this cell would
# presumably raise NameError if run as-is; `a` is assigned but not used here.
a=Counter({1: 3, 2: 1, 3: 1, 4: 1})
count = [ele[1] for ele in self.count]
pow_freq = np.array(count) ** 0.75
ratio = pow_freq / sum(pow_freq)
table_size = 2e7 # total number of slots shared out among the items below
# Per-item slot counts; note this rebinds `count` from raw counts to slots.
count = np.round(ratio * table_size).astype(np.int64)
sample_table = []

# Repeat each item as many times as it has slots.
for i in range(len(self.count)):
    sample_table += [self.count[i][0]] * count[i]

In [55]:
b=Counter({1: 3, 2: 1, 3: 1, 4: 1})

In [57]:
a=Counter.most_common(b)

In [58]:
a

[(1, 3), (2, 1), (3, 1), (4, 1)]

In [59]:
sample_table = []
count=[1,3,4,5]
for i in range(len(a)):
    sample_table += [a[i][0]] * count[i]

In [54]:
a[1]

3

In [60]:
sample_table

[1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4]

In [62]:
i=1
[a[i][0]] * count[i]

[2, 2, 2]

In [65]:
[1]*2

[1, 1]

In [68]:
# Count how often each token appears across all generated walks.
count_dict = {}  # not used in this cell
counter = Counter()
for line in gw.walks:
    # Each walk is a single space-separated string of tokens.
    line = line.strip().split(" ")
    counter.update(line)
# List of (token, count) tuples, most frequent first.
a=counter.most_common()

In [79]:

def __init_sample_table(a, table_size=200000):
    """
    create sample tables by p()^(3/4)

    Every item is given a number of slots proportional to its count raised
    to the power 0.75, so picking a uniformly random slot from the returned
    table draws items according to that smoothed frequency distribution.

    args:
        a           -  sequence of (item, count) entries, e.g. the list
                       returned by Counter.most_common()
        table_size  -  target total number of slots shared out among the
                       items; the real length can differ slightly because
                       each item's share is rounded independently

    return:
        (sample_table)  -  the created sample table, a numpy array in which
                           each item is repeated once per slot; empty when
                           `a` is empty or all counts are zero
    """
    pow_freq = np.array([entry[1] for entry in a], dtype=np.float64) ** 0.75
    total = pow_freq.sum()
    if total == 0:
        # Nothing to sample from; avoid the 0/0 -> NaN -> int64 conversion.
        return np.array([])

    slots = np.round(pow_freq / total * table_size).astype(np.int64)
    sample_table = []
    for entry, n_slots in zip(a, slots):
        sample_table.extend([entry[0]] * n_slots)
    return np.array(sample_table)

In [83]:
zip(*[[1,3],[2,3],[4,5]])

<zip at 0x7f10f8f2c6e0>

In [84]:
u,v=zip(*[[1,3],[2,3],[4,5]])

In [85]:
u

(1, 2, 4)

In [94]:
# Split a walk's "<type>_<id>" tokens into a per-type matrix: one row per
# entity type, one column per position in the walk. Each column holds the
# numeric id in the row of that token's type and 0 in the other rows.
# `entity_seq` must already be defined by another cell.
D = {"A": 1, "Q": 2, "R": 0,"S":3}  # type letter -> row index
sep = np.zeros(shape=(4, len(entity_seq)))
for index, item in enumerate(entity_seq):
    split = item.split("_")
    ent_type, ent_id = D[split[0]], int(split[1])
    sep[ent_type][index] = ent_id
#return sep.astype(np.int64)

In [97]:
sep.astype(np.int64).shape

(4, 9)

In [98]:
sep

array([[     0.,      0.,  61075.,      0.,      0.,      0.,  61075.,
             0.,      0.],
       [184706.,      0.,      0.,      0.,      0.,      0.,      0.,
             0.,      0.],
       [     0.,   9371.,      0.,   9371.,      0.,   9371.,      0.,
          9386.,      0.],
       [     0.,      0.,      0.,      0.,   9374.,      0.,      0.,
             0.,   9387.]])

In [88]:
a=gw.walks[0].split(' ')

In [89]:
a

['A_184706',
 'Q_9371',
 'R_61075',
 'Q_9371',
 'S_9374',
 'Q_9371',
 'R_61075',
 'Q_9386',
 'S_9387']

In [90]:
entity_seq=a

In [109]:
aid_samples=list(["111"])
more_ans = np.random.choice(a, replace=False,
                            size=16 - len(a))
aid_samples += list(more_ans) 

In [110]:
aid_samples

['111',
 'S_9387',
 'Q_9371',
 'Q_9371',
 'Q_9386',
 'R_61075',
 'R_61075',
 'A_184706']

In [113]:
if a:
    print(111)

In [112]:
a=0

In [118]:
len( [[0.0] * 300 for _ in range(200)][0])

300

In [119]:
a=Counter({1: 3, 2: 1, 3: 1, 4: 1})

In [122]:
a

Counter({1: 3, 2: 1, 3: 1, 4: 1})

In [123]:
q=[1,2,3]

In [128]:
if q:
    a[q].tolist()

TypeError: unhashable type: 'list'

In [133]:
v=list(['a','b','c'])

In [134]:
question='a b c a a b fd s'

In [135]:
question = [x for x in question.strip().split(" ")
              if x in v]


In [136]:
question

['a', 'b', 'c', 'a', 'a', 'b']

In [None]:
if question:
    qvecs = self.w2vmodel[question].tolist()
    q_len = len(question)

In [137]:
import gensim
PATH = "GoogleNews-vectors-negative300.bin"
model = gensim.models.KeyedVectors.load_word2vec_format(
    fname=PATH, binary=True)


In [141]:
question=['the','panda']

In [152]:
type((model[question]).tolist()[0])

list

In [1]:
a=[1,2,3]

In [2]:
a[:1]

[1]

In [3]:
import torch
import torch
from torch.autograd import Variable

In [10]:
qvlen = Variable(torch.LongTensor(20))

In [11]:
qvlen

tensor([    139874355735712,     139874355735712,                   0,
                          0,                   1,      93854824136920,
                          1,      93854824136576, 2314885530279477276,
            139874271168688,                   2,     139872584770305,
                          1,         -4294967268,     139874326054448,
                -4294967295,     139872585295664,         34359738369,
                          1,      93854824136824])

In [14]:
qulen = qvlen.unsqueeze(1).expand(-1, 256).unsqueeze(1)

In [15]:
qulen.shape

torch.Size([20, 1, 256])

In [18]:
import torch
# Demo of torch.gather along dim 1: for every row, pick the elements at the
# given 1-based positions.
# NOTE(review): the name `input` shadows Python's builtin input().
input = [
    [2, 3, 4, 5, 0, 0],
    [1, 4, 3, 0, 0, 0],
    [4, 2, 2, 5, 7, 0],
    [1, 0, 0, 0, 0, 0]
]
input = torch.tensor(input)
# Pay attention to the type of the index (a LongTensor here)
length = torch.LongTensor([[4,1],[3,1],[5,1],[1,1]])
# The index is decremented by 1 because the sequence dimension is counted from 0
out = torch.gather(input, 1, length-1)
out

tensor([[5, 2],
        [3, 1],
        [7, 4],
        [1, 1]])

In [19]:
length.shape

torch.Size([4, 2])