In [1]:
import pandas as pd
import networkx as nx
import numpy as np

from user_likelihood_metrics import _make_user_to_tags, sparse_user_tags_likelihood, _make_user_to_answers, sparse_user_answers_likelihood

In [2]:
# Load the prepared questions table from the local cache directory.
df = pd.read_csv("../data/cache/prepared_questions_metrics.csv")
# Preview three randomly chosen rows (no random_state is passed, so the rows differ between runs).
df.sample(3)

Unnamed: 0,Id,PostTypeId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastEditorUserId,LastEditDate,...,ContentLicense,ParentId,OwnerDisplayName,LastEditorDisplayName,FavoriteCount,words_cnt,tech_words_cnt,negative_answers,is_dummy,is_complex
41089,334964,1,334966.0,2016-10-30T20:40:02.187,3,3189.0,I'm trying to figure out how to create abstrac...,247362.0,247362.0,2016-10-30T20:59:38.297,...,CC BY-SA 3.0,,,,,425,18,0,False,True
30659,273463,1,273488.0,2015-02-17T06:44:38.750,20,7123.0,I came across Event Sourcing design and I woul...,81940.0,6605.0,2015-02-17T07:06:02.323,...,CC BY-SA 3.0,,,,,297,2,0,True,False
20663,205936,1,,2013-07-24T14:37:03.467,1,985.0,Normally TCP/IP Sequence and Acknowledge numbe...,,,,...,CC BY-SA 3.0,,user15279,,,71,0,0,True,False


In [3]:
# Rows whose AcceptedAnswerId is missing, shown next to the total number of rows.
df["AcceptedAnswerId"].isna().sum(), len(df)

(26935, 63423)

In [4]:
# Smallest and largest question Score before any filtering.
min_, max_ = df["Score"].min(), df["Score"].max()
min_, max_  # max is too high

(-11, 2182)

In [5]:
# Drop score outliers: keep only rows whose Score lies within three standard
# deviations of the mean (both bounds inclusive).
mean = df["Score"].mean()
std = df["Score"].std()

# .copy() makes the filtered frame independent of the frame it was sliced from,
# so later column assignments do not raise SettingWithCopyWarning.
df_filtered = df[(df["Score"] >= mean - 3 * std) & (df["Score"] <= mean + 3 * std)].copy()
min_f = df_filtered.Score.min()
max_f = df_filtered.Score.max()

# The rest of the notebook works on the filtered frame.
# NOTE: this rebinds `df`, so re-running this cell filters the already
# filtered data again with a freshly computed mean/std.
df = df_filtered

# Display the remaining score range. It has to be the last expression of the
# cell; placed earlier, its value is discarded and nothing is shown.
min_f, max_f

In [6]:
# Turn the raw Tags column into a list of tag names per question.
# Rows without tags are dropped first; .copy() gives an independent frame so
# that the column assignment below does not trigger SettingWithCopyWarning.
df = df.dropna(subset=["Tags"]).copy()
tag_series = df["Tags"]
# The raw value is split on "|" and the first and last pieces are discarded
# (presumably the string is wrapped in leading/trailing "|" — confirm against the data).
# Values that are already lists are passed through unchanged, so re-running
# this cell does not reduce every tag list to [].
tags = [
    raw_tags if isinstance(raw_tags, list) else str(raw_tags).split("|")[1:-1]
    for raw_tags in tag_series
]
df["Tags"] = tags

In [7]:
# Map each question Id to the set of its tags, skipping questions whose tag
# list is empty or whose Id is NaN.
# The two columns are iterated directly instead of using DataFrame.iterrows(),
# which constructs a full Series object for every row.
question_to_tags = {
    question_id: set(question_tags)
    for question_id, question_tags in zip(df["Id"], df["Tags"])
    if question_tags and not np.isnan(question_id)
}

# Distinct question owners; rows with a missing OwnerUserId are ignored.
users = df.OwnerUserId.dropna().unique()
# Built by the project helper from the Tags and OwnerUserId columns; it is
# indexed by user id further down in the notebook.
user_to_tags = _make_user_to_tags(df.Tags, df.OwnerUserId)

In [8]:
# Sanity check: number of entries in the question->tags and user->tags mappings.
len(question_to_tags), len(user_to_tags)

(62665, 33536)

In [9]:
# Empty undirected graph; user nodes and tag-likelihood edges are added in the following cells.
tags_graph = nx.Graph()

In [10]:
# Add one node per question owner and attach its type and tag collection.
# Nodes are named 'u' + str(user), so the attribute mapping has to use the same
# node name as its key: nx.set_node_attributes ignores keys that are not nodes
# of the graph, which means attributes keyed by the raw user id are never set.
user_attributes = {}
for user in users:
    node = 'u' + str(user)
    tags_graph.add_node(node)
    user_attributes[node] = {"type": "user", "tags": user_to_tags[user]}
nx.set_node_attributes(tags_graph, user_attributes)

In [11]:
# (user1, user2, weight) triples produced by the project helper from the
# Tags and OwnerUserId columns.
likelihood_pairs = sparse_user_tags_likelihood(df.Tags, df.OwnerUserId)

# Every triple becomes one edge between the two prefixed user nodes, with the
# third element stored under the "weight" edge attribute.
tags_graph.add_weighted_edges_from(
    ('u' + str(user1), 'u' + str(user2), w)
    for user1, user2, w in likelihood_pairs
)

100%|██████████| 32026/32026 [05:16<00:00, 101.25it/s] 


In [12]:
# Number of edges now present in the tag graph.
tags_graph.number_of_edges()

74663

In [13]:
# Empty undirected graph; user nodes and answer-likelihood edges are added in the following cells.
answers_graph = nx.Graph()

In [14]:
# Seed the answers graph with one prefixed node per entry of `users`.
# NOTE(review): at this point `users` still holds the question owners taken
# from df; it is only rebound to answer authors in a later cell — confirm
# that seeding with question owners is intended.
answers_graph.add_nodes_from('u' + str(user) for user in users)

In [15]:
# Load the prepared answers table from the local cache directory.
answers = pd.read_csv("../data/cache/prepared_answers_metrics.csv", low_memory=False)

In [16]:
# Count answers whose ParentId does / does not match a question Id present in df.
answers["ParentId"].isin(df["Id"]).value_counts()  # the large majority of answers match a kept question

ParentId
True     161867
False      9939
Name: count, dtype: int64

In [17]:
# Count questions whose AcceptedAnswerId does / does not appear among the answer Ids.
# NOTE(review): rows with a missing AcceptedAnswerId presumably also land in the
# False bucket, which may account for most of it — confirm by dropping NaNs first.
df["AcceptedAnswerId"].isin(answers["Id"]).value_counts()  # suspiciously many False values

AcceptedAnswerId
True     34532
False    28133
Name: count, dtype: int64

In [18]:
# From here on `users` holds the distinct answer authors (missing ids dropped),
# replacing the question owners it held before.
users = answers["OwnerUserId"].dropna().unique()
# Built by the project helper from each answer's parent question id and author id.
user_to_questions = _make_user_to_answers(answers["ParentId"], answers["OwnerUserId"])

In [19]:
# (user1, user2, weight) triples produced by the project helper from each
# answer's parent question id and author id.
likelihood_pairs = sparse_user_answers_likelihood(answers.ParentId, answers.OwnerUserId)

# Every triple becomes one edge between the two prefixed user nodes, with the
# third element stored under the "weight" edge attribute.
answers_graph.add_weighted_edges_from(
    ('u' + str(user1), 'u' + str(user2), w)
    for user1, user2, w in likelihood_pairs
)

100%|██████████| 26861/26861 [03:05<00:00, 144.83it/s] 


In [20]:
# Number of edges now present in the answers graph.
answers_graph.number_of_edges()

15122

In [21]:
# Persist both graphs as edge-list files in the cache directory.
# NOTE(review): an edge list only records edges, so nodes without any edge are
# presumably absent from these files — confirm that this is acceptable.
for graph_to_save, target_path in (
    (tags_graph, "../data/cache/tagsG.edgelist"),
    (answers_graph, "../data/cache/answersG.edgelist"),
):
    nx.write_edgelist(graph_to_save, target_path)