In [26]:
import pickle
import sys
sys.path.append("../complexity_hunters/")  # to make utils importable
sys.path.append(".")  # to make utils importable
sys.path.append("..")  # to make utils importable

import utils.data_worker
import utils.consts

from graph.graph import build_graph
import igraph

from complexity_hunters.extra_metrics import sets_iou


In [3]:
# Build the graph. Per the captured log output, this loads Posts.xml and
# Badges.xml and dumps the resulting graph to a pickle file on disk.
build_graph()

INFO: Loading dataset ../data/Posts.xml...
INFO: Loading dataset ../data/Badges.xml...


extracting user pairs based on tags: 100%|██████████████████| 2013/2013 [00:01<00:00, 1686.46it/s]


INFO: Dumped graph into ./data/graph.pkl


In [4]:
# Load the pickled graph; it is used below through `graph.nodes` and
# `graph.subgraph`, i.e. as a networkx-style graph.
# NOTE(review): the build step logs "./data/graph.pkl" while this cell reads
# "../data/graph.pkl" — confirm both paths resolve to the same file.
# SECURITY: pickle.load can execute arbitrary code while deserialising; only
# load graph files that were produced locally by a trusted run.
with open("../data/graph.pkl", "rb") as graph_file:  # closes the handle deterministically
    graph = pickle.load(graph_file)

In [5]:
def build_partition(graph):
    """Split the nodes of ``graph`` into communities using Leiden clustering.

    The graph is converted with ``igraph.Graph.from_networkx`` and clustered
    with ``community_leiden`` using the modularity objective.

    NOTE(review): no random seed is set here, so the partition may differ
    between runs — confirm whether reproducibility is required.

    Args:
        graph: a graph accepted by ``igraph.Graph.from_networkx``.

    Returns:
        A tuple ``(node_to_community, communities)``:
        ``node_to_community`` maps every original node name to the index of
        its community; ``communities`` is a list of node sets, indexed by
        community. ``communities`` is empty when the membership vector is
        empty.
    """
    i_graph = igraph.Graph.from_networkx(graph)
    partition = i_graph.community_leiden(
        objective_function="modularity", n_iterations=1000
    )

    # Read both sequences once instead of re-fetching them for every node
    # inside the loops (each access goes back through the igraph wrapper).
    node_names = i_graph.vs["_nx_name"]
    membership = partition.membership

    # Vertex i of the igraph graph carries the original name node_names[i]
    # and belongs to community membership[i].
    node_to_community = dict(zip(node_names, membership))

    # default=-1 gives zero communities for an empty membership vector
    # instead of letting max() raise ValueError.
    community_count = max(membership, default=-1) + 1
    communities = [set() for _ in range(community_count)]
    for node, community_id in node_to_community.items():
        communities[community_id].add(node)

    return node_to_community, communities


In [8]:
# Cluster the subgraph made of nodes whose "type" attribute is "user".
user_nodes = [
    node for node, attrs in graph.nodes(data=True) if attrs["type"] == "user"
]
users_to_community, user_communities = build_partition(graph.subgraph(user_nodes))

# For each community build a "stereotype": tag -> (occurrences of the tag
# among the community's members) / (community size).
user_stereotypes = []
for community in user_communities:
    community_size = len(community)

    # Gather the tag vocabulary of the community first; the result dict is
    # keyed in the iteration order of this set.
    community_tags = set()
    for member in community:
        community_tags.update(set(graph.nodes[member]["tags"]))

    tag_counts = dict.fromkeys(community_tags, 0)
    for member in community:
        for tag in graph.nodes[member]["tags"]:
            tag_counts[tag] += 1

    user_stereotypes.append(
        {tag: count / community_size for tag, count in tag_counts.items()}
    )


In [10]:
# Cluster the subgraph made of nodes whose "type" attribute is "question".
question_nodes = [
    node for node, attrs in graph.nodes(data=True) if attrs["type"] == "question"
]
question_to_community, question_communities = build_partition(
    graph.subgraph(question_nodes)
)

# For each community build a "stereotype": tag -> (occurrences of the tag
# among the community's questions) / (community size).
question_stereotypes = []
for community in question_communities:
    community_size = len(community)

    # Gather the tag vocabulary of the community first; the result dict is
    # keyed in the iteration order of this set.
    community_tags = set()
    for member in community:
        community_tags.update(set(graph.nodes[member]["tags"]))

    tag_counts = dict.fromkeys(community_tags, 0)
    for member in community:
        for tag in graph.nodes[member]["tags"]:
            tag_counts[tag] += 1

    question_stereotypes.append(
        {tag: count / community_size for tag, count in tag_counts.items()}
    )


In [16]:
# Load the posts dataset (debug slice), keep the essential columns and pass
# the frame through posts_fill_na (presumably fills missing values — verify
# in utils.data_worker).
posts = utils.data_worker.load_dataset(utils.consts.POSTS_DATA_PATH, debug_slice=True)
posts = posts[utils.consts.POST_ESSENTIAL_COLUMNS]
posts = utils.data_worker.posts_fill_na(posts)

# Convert every post body with html_to_str (presumably HTML -> plain text).
posts["Body"] = posts["Body"].apply(utils.data_worker.html_to_str)

INFO: Loading dataset ../data/Posts.xml...


In [24]:
# Draw one random row among the posts with PostTypeId == 1 (treated as
# questions in this notebook). No random_state is passed, so the pick
# changes from run to run.
is_question = posts["PostTypeId"] == 1
brand_new_question = posts.loc[is_question].sample()

In [25]:
# Display the sampled question row.
brand_new_question

Unnamed: 0,Id,PostTypeId,AcceptedAnswerId,Score,ViewCount,Body,OwnerUserId,Tags,AnswerCount,CommentCount,ParentId
3023,9095,1,12291,75,15167,F# and Scala are both functional programming l...,18,|programming-languages|comparison|functional-p...,4,4,-1


In [37]:
def semi_weighted_sets_iou(set1, set2):
    """Average, over the items of ``set1``, of their weight in ``set2``.

    Args:
        set1: collection of tags (iterated and measured with ``len``).
        set2: mapping from tag to weight; tags missing from it count as 0.

    Returns:
        The sum of ``set2[tag]`` for the tags of ``set1`` present in
        ``set2``, divided by ``len(set1)``; ``0.0`` when ``set1`` is empty.
    """
    if not len(set1):
        return 0.0

    total = 0.0
    for tag in set1:
        total += set2[tag] if tag in set2 else 0.0

    return total / len(set1)

def weighted_sets_iou(set1, set2):
    """Weighted intersection-over-union of two tag -> weight mappings.

    Every tag present in both mappings contributes
    ``1 - abs(set1[tag] - set2[tag])``; the total is divided by the number
    of distinct tags across both mappings.

    Args:
        set1: mapping from tag to weight.
        set2: mapping from tag to weight.

    Returns:
        The weighted overlap as a float; ``0.0`` when both mappings are empty.
    """
    if len(set1) == 0 and len(set2) == 0:
        return 0.0

    keys1 = set(set1.keys())
    keys2 = set(set2.keys())
    shared_tags = keys1 & keys2
    all_tags = keys1 | keys2

    agreement = 0.0
    for tag in shared_tags:
        agreement += 1 - abs(set1[tag] - set2[tag])

    return agreement / len(all_tags)

In [51]:
# Find the question community whose stereotype best matches the tags of the
# sampled question. Index 0 / weight 0.0 remain the answer when no community
# scores above zero.
best_qcommunity, best_weight = 0, 0.0

# extract_tags_from_str receives the whole Tags column of the one-row frame;
# the first element of its result is used as the question's tags.
tags = utils.data_worker.extract_tags_from_str(brand_new_question.Tags)[0]

print(tags)

# question_stereotypes holds one entry per question community.
for community_index, stereotype in enumerate(question_stereotypes):
    match_weight = semi_weighted_sets_iou(tags, stereotype)

    # Strict comparison: the earliest community wins ties.
    if match_weight > best_weight:
        best_qcommunity, best_weight = community_index, match_weight

print(best_qcommunity, best_weight)
# print(list(question_stereotypes[best_qcommunity].items())[:5])

{'f#', 'comparison', 'functional-programming', 'programming-languages', 'scala'}
3 0.20416666666666666


In [54]:
# Find the user community whose stereotype is closest to the stereotype of
# the selected question community. Index 0 / weight 0.0 remain the answer
# when no community scores above zero.
best_ucommunity, best_weight = 0, 0.0
target_stereotype = question_stereotypes[best_qcommunity]

# user_stereotypes holds one entry per user community.
for community_index, stereotype in enumerate(user_stereotypes):
    match_weight = weighted_sets_iou(stereotype, target_stereotype)

    # Strict comparison: the earliest community wins ties.
    if match_weight > best_weight:
        best_ucommunity, best_weight = community_index, match_weight

print(best_ucommunity, best_weight)
print(len(user_communities[best_ucommunity]))

4 0.04200552208835342
160


In [59]:
# Load the full badges dataset and keep only the badges of users that own
# at least one of the loaded posts.
badges = utils.data_worker.load_dataset(utils.consts.BADGES_DATA_PATH, debug_slice=False)
post_owner_ids = posts["OwnerUserId"].unique()
badges = badges.loc[badges["UserId"].isin(post_owner_ids)]



INFO: Loading dataset ../data/Badges.xml...


In [62]:
# Pick, inside the best-matching user community, the user with the highest
# mean answer score, then compare that user's badge names with the badge
# names of the sampled question's owner.

# None (not 0) marks "no candidate yet": the node name is sliced below, which
# would raise TypeError on an int if no user qualified.
best_defendant = None
best_score = 0.0

# Answers are the posts with PostTypeId == 2; filter them once, not per user.
all_answers = posts[posts.PostTypeId == 2]

for user in user_communities[best_ucommunity]:
    # User node names are a one-character prefix followed by the numeric id.
    user_id = int(user[1:])
    avg_score = all_answers.loc[all_answers.OwnerUserId == user_id, "Score"].mean()

    # mean() of an empty selection is NaN and NaN > x is False, so users
    # without answers never become the best candidate.
    if avg_score > best_score:
        best_score = avg_score
        best_defendant = user

if best_defendant is None:
    print("No user in the selected community has a positive average answer score.")
else:
    # brand_new_question is a one-row frame; take the scalar explicitly, since
    # int() on a one-element Series is deprecated in recent pandas.
    asker_id = int(brand_new_question.OwnerUserId.iloc[0])
    print(
        sets_iou(
            set(badges[badges.UserId == int(best_defendant[1:])].Name),
            set(badges[badges.UserId == asker_id].Name)
        )
    )

0.6046511627906976


  set(badges[badges.UserId == int(brand_new_question.OwnerUserId)].Name)
