In [1]:
import pandas as pd
import networkx as nx
import numpy as np

In [2]:
df = pd.read_csv("prepared_questions.csv")
df.sample(3)

Unnamed: 0,Id,PostTypeId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastEditorUserId,LastEditDate,...,ContentLicense,ParentId,OwnerDisplayName,LastEditorDisplayName,FavoriteCount,words_cnt,tech_words_cnt,negative_answers,is_dummy,is_complex
37406,314947,1,314975.0,2016-04-06T18:54:09.233,3,192.0,Is there a standard or widely accepted term fo...,220571.0,161917.0,2016-04-21T06:44:43.553,...,CC BY-SA 3.0,,,,,169,4,0,False,False
11047,137666,1,137676.0,2012-02-29T20:34:58.887,6,1358.0,I'm currently using decorators to apply variou...,45957.0,45957.0,2012-02-29T22:29:51.037,...,CC BY-SA 3.0,,,,,98,5,0,False,False
1063,20800,1,20820.0,2010-11-23T08:04:39.047,7,4345.0,I just want to hear some pro and con's - it's ...,,25936.0,2012-03-07T17:06:59.700,...,CC BY-SA 2.5,,oneWhoNeedGuidenss,,,70,1,0,True,False


In [3]:
df.AcceptedAnswerId.isna().sum(), df.shape[0]

(26935, 63423)

In [4]:
min_ = df.Score.min()
max_ = df.Score.max()
min_, max_ # max is too high

(-11, 2182)

In [5]:
mean = df["Score"].mean()
std = df["Score"].std()

df_filtered = df[(df["Score"] >= mean - 3 * std) & (df["Score"] <= mean + 3 * std)]
min_f = df_filtered.Score.min()
max_f = df_filtered.Score.max()
min_f, max_f  # MAYBE USE FILTERED DF?

(-11, 81)

In [6]:
df = df.dropna(subset=["Tags"])
tag_series = df["Tags"]
tags = [str(tags).split("|")[1:-1] for tags in tag_series]
df["Tags"] = tags

In [7]:
question_to_tags = {
    q.Id: set(q.Tags)
    for _, q in df.iterrows()
    if q.Tags and not np.isnan(q.Id)
}

users = df.OwnerUserId.dropna().unique()
user_to_tags = {user: set() for user in users}
for _, q in df.iterrows():
    if not np.isnan(q.OwnerUserId) and q.Tags:
        user_to_tags[q.OwnerUserId] |= set(q.Tags)

In [8]:
len(question_to_tags), len(user_to_tags)

(63423, 32217)

In [9]:
G = nx.Graph()

In [10]:
user_attributes = {}
for user in users:
    G.add_node('u' + str(user))
    user_attributes[user] = {"type": "user", "tags": user_to_tags[user]}
nx.set_node_attributes(G, user_attributes)

In [11]:
bound = 0.8
for user1 in users:
    for user2 in users:
        if not np.isnan(user1) and not np.isnan(user2) and user1 < user2:
            user1_tags = user_to_tags[user1]
            user2_tags = user_to_tags[user2]
            if len(user1_tags & user2_tags) != 0 and (w := (len(user1_tags & user2_tags) / len(user1_tags | user2_tags))) >= bound:
                G.add_edge('u' + str(user1), 'u' + str(user2), weight=w)

In [12]:
G.number_of_edges()

74923

In [13]:
G2 = nx.Graph()

In [14]:
answers = pd.read_csv("prepared_answers.csv", low_memory=False)

In [15]:
answers["ParentId"].isin(df["Id"]).value_counts()  # great

ParentId
True    171806
Name: count, dtype: int64

In [16]:
df["AcceptedAnswerId"].isin(answers["Id"]).value_counts()  # sus!

AcceptedAnswerId
True     35097
False    28326
Name: count, dtype: int64

In [17]:
users = answers.OwnerUserId.dropna().unique()
user_to_questions = {user: set() for user in users}
for _, answ in answers.iterrows():
    if not np.isnan(answ.OwnerUserId) and answ.ParentId:
        user_to_questions[answ.OwnerUserId].add(answ.ParentId)

In [23]:
bound = 0.8
for user1 in users:
    for user2 in users:
        if not np.isnan(user1) and not np.isnan(user2) and user1 < user2:
            user1_answered = user_to_questions[user1]
            user2_answered = user_to_questions[user2]
            if len(user1_answered & user2_answered) != 0 and (w := (len(user1_answered & user2_answered) / len(user1_answered | user2_answered))) >= bound:
                G2.add_edge('u' + str(user1), 'u' + str(user2), weight=w)

In [26]:
G2.number_of_edges()

368749

In [25]:
nx.write_edgelist(G, "tagsG.edgelist")
nx.write_edgelist(G2, "questionsG.edgelist")