In [1]:
import pandas as pd
import networkx as nx
import numpy as np

from user_likelihood_metrics import sparse_user_tags_likelihood, _make_user_to_answers, sparse_user_answers_likelihood

In [2]:
df = pd.read_csv("../data/cache/prepared_questions_metrics.csv")
answers = pd.read_csv("../data/cache/prepared_answers_metrics.csv", low_memory=False)
df.sample(3)

Unnamed: 0,Id,PostTypeId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastEditorUserId,LastEditDate,...,ContentLicense,ParentId,OwnerDisplayName,LastEditorDisplayName,FavoriteCount,words_cnt,tech_words_cnt,negative_answers,is_dummy,is_complex
44294,350784,1,,2017-06-13T15:19:49.400,0,158.0,"<p>I work for a small startup, &lt; 10 employe...",134846.0,,,...,CC BY-SA 3.0,,,,,112,0,0,True,False
51046,384349,1,384408.0,2018-12-20T13:21:48.053,-1,185.0,<p>Im confuesd about the usage of <code>std::m...,300368.0,300368.0,2018-12-21T08:46:57.880,...,CC BY-SA 4.0,,,,,441,18,0,True,False
1489,28174,1,28175.0,2010-12-20T06:33:42.590,5,269.0,<p>I'm diving into web development after ten y...,7158.0,7158.0,2010-12-20T06:50:34.860,...,CC BY-SA 2.5,,,,,235,0,0,True,False


In [3]:
df.AcceptedAnswerId.isna().sum(), df.shape[0]

(26935, 63423)

In [4]:
min_ = df.Score.min()
max_ = df.Score.max()
min_, max_  # max is too high

(-11, 2182)

In [5]:
mean = df["Score"].mean()
std = df["Score"].std()

df_filtered = df[(df["Score"] >= mean - 3 * std) & (df["Score"] <= mean + 3 * std)]

min_f = df_filtered.Score.min()
max_f = df_filtered.Score.max()
min_f, max_f

(-11, 81)

In [6]:
df = df_filtered
answers = answers[answers["ParentId"].isin(df.Id)]
df.shape, answers.shape

((62665, 27), (161867, 22))

In [7]:
df = df.dropna(subset=["Tags"])
tag_series = df["Tags"]
tags = [str(tags).split("|")[1:-1] for tags in tag_series]
df["Tags"] = tags

In [8]:
question_to_tags = {
    q.Id: set(q.Tags)
    for _, q in df.iterrows()
    if q.Tags and not np.isnan(q.Id)
}

users = df.OwnerUserId.dropna().unique()
user_to_tags = {user: set() for user in users}
for _, q in df.iterrows():
    if not np.isnan(q.OwnerUserId) and q.Tags:
        user_to_tags[q.OwnerUserId] |= set(q.Tags)

In [9]:
len(question_to_tags), len(user_to_tags)

(62665, 32026)

In [10]:
tags_graph = nx.Graph()

In [11]:
user_attributes = {}
for user in users:
    tags_graph.add_node('u' + str(user))
    user_attributes[user] = {"type": "user", "tags": user_to_tags[user]}
nx.set_node_attributes(tags_graph, user_attributes)

In [13]:
likelihood_pairs = sparse_user_tags_likelihood(df.Tags, df.OwnerUserId, user_to_tags)

for user1, user2, w in likelihood_pairs:
    tags_graph.add_edge('u' + str(user1), 'u' + str(user2), weight=w)

extracting user pairs based on tags: 100%|██████████| 32026/32026 [04:42<00:00, 113.21it/s] 


In [14]:
tags_graph.number_of_edges()

74663

In [15]:
answers_graph = nx.Graph()

In [16]:
for user in users:
    answers_graph.add_node('u' + str(user))

In [17]:
answers["ParentId"].isin(df["Id"]).value_counts()  # great

ParentId
True    161867
Name: count, dtype: int64

In [18]:
df["AcceptedAnswerId"].isin(answers["Id"]).value_counts()  # sus!

AcceptedAnswerId
True     34532
False    28133
Name: count, dtype: int64

In [19]:
users = answers.OwnerUserId.dropna().unique()
user_to_questions = _make_user_to_answers(answers.ParentId, answers.OwnerUserId)

In [20]:
likelihood_pairs = sparse_user_answers_likelihood(answers.ParentId, answers.OwnerUserId)

for user1, user2, w in likelihood_pairs:
    answers_graph.add_edge('u' + str(user1), 'u' + str(user2), weight=w)

extracting user pairs based on answers: 100%|██████████| 24999/24999 [02:23<00:00, 174.19it/s] 


In [21]:
answers_graph.number_of_edges()

5544

In [22]:
nx.write_edgelist(tags_graph, "../data/cache/tagsG.edgelist")
nx.write_edgelist(answers_graph, "../data/cache/answersG.edgelist")