In [1]:
from IPython.core.display import display, HTML
from string import Template
import pandas as pd
import json, random
from helpers import *
import time
from tqdm import tqdm_notebook
from fa2 import ForceAtlas2
import matplotlib.pyplot as plt
from dateutil import relativedelta
from datetime import timedelta

In [2]:
# The three Wikispeedia tables are header-less, tab-separated files in which
# lines starting with '#' are comments; the parser options are shared.
_TSV_OPTIONS = dict(sep='\t', comment='#', header=None)

articles = pd.read_csv(
    "./Data/wikispeedia_paths-and-graph/articles.tsv",
    names=['article'],
    **_TSV_OPTIONS,
)
categories = pd.read_csv(
    "./Data/wikispeedia_paths-and-graph/categories.tsv",
    names=['article', 'category'],
    **_TSV_OPTIONS,
)
links = pd.read_csv(
    "./Data/wikispeedia_paths-and-graph/links.tsv",
    names=['linkSource', 'linkTarget'],
    **_TSV_OPTIONS,
)

KEEP ONLY SCIENCE ARTICLES

Code to keep more categories than only science:
categories_filtered = filterCategories(categories, top = 1)

GET SCIENCE.SUB_SUBJECT

In [3]:
# Build the node table from the category table. `filterScienceCategory` is a
# helper defined outside this notebook; presumably it keeps only science
# articles and their sub-subject -- confirm in helpers.
nodes = filterScienceCategory(categories)

GET USER_ACTIVITY

In [4]:
# Start (s) and end (e) of the period over which user activity is requested.
# NOTE(review): the `datetime` module is not imported explicitly in this
# notebook; it presumably arrives through `from helpers import *` -- confirm.
s = datetime.datetime(year=2015,month=11,day=1)
e = datetime.datetime(year=2018,month=11,day=1)

In [5]:
# Fetch the user-activity series of every node over [s, e], remember the nodes
# whose series is too short, and throttle the calls to the API rate limit.
MAX_REQUESTS_PER_WINDOW = 100   # rate limit to respect: 100 requests ...
RATE_WINDOW_SECONDS = 1.0       # ... per second

timer = time.monotonic()        # start of the current rate-limit window
requests = 0                    # requests issued in the current window
requests_tot = 0                # requests issued overall
granularity = 'monthly'
user_activity_dict = {}
to_drop = []

# A series with this many points or fewer is considered incomplete and its
# node is dropped. Granularities other than monthly/daily are never filtered.
if granularity == 'monthly':
    max_incomplete_len = int(monthdelta(s, e) - 1)
elif granularity == 'daily':
    max_incomplete_len = int((e - s).days)
else:
    max_incomplete_len = None

for index, row in tqdm_notebook(nodes.iterrows(), total=nodes.shape[0]):
    requests += 1
    requests_tot += 1
    values = getUserActivity(article=row['article'], granularity=granularity, start=s, end=e)

    if max_incomplete_len is not None and len(values) <= max_incomplete_len:
        print("drop " + row['article'])
        # Only record the index here: the frame must not be mutated while
        # iterrows() is still walking over it.
        to_drop.append(index)
    else:
        user_activity_dict[index] = values

    # Throttling. A monotonic clock is used because differences of wall-clock
    # struct_time fields wrap around (seconds at 60, minutes at 60) and can
    # therefore not measure elapsed time.
    elapsed = time.monotonic() - timer
    if elapsed >= RATE_WINDOW_SECONDS:
        # The window expired on its own: open a new one.
        timer = time.monotonic()
        requests = 0
    elif requests >= MAX_REQUESTS_PER_WINDOW:
        # Budget exhausted before the window ended: wait for the remainder.
        time.sleep(RATE_WINDOW_SECONDS - elapsed)
        timer = time.monotonic()
        requests = 0

# Remove all incomplete nodes in one go, once the iteration is finished.
nodes = nodes.drop(index=to_drop)

HBox(children=(IntProgress(value=0, max=1105), HTML(value='')))

drop Arp2_3_complex



In [6]:
# `iterative_filter` (helper defined outside this notebook) takes the node
# table and the raw link table and returns the node table together with an
# edge table; presumably it removes links/nodes that no longer have a
# counterpart -- confirm in helpers.
# NOTE(review): `nodes` is overwritten here, so this cell depends on the state
# left by the previous cells.
nodes, edges = iterative_filter(nodes,links)

In [7]:
# Number of nodes remaining after filtering.
len(nodes)

1097

In [8]:
# Number of edges remaining after filtering.
len(edges)

13971

In [8]:
# Turn the {node index: activity series} dictionary into a DataFrame
# (`create_df_activity` is a helper defined outside this notebook; the later
# cells index the result by node index -- layout of the columns to be
# confirmed in helpers).
user_activity_df = create_df_activity(user_activity_dict)

In [9]:
# Normalise the activity values with the helper `normalize_activity`.
# NOTE(review): the meaning of `factor=40` and `byColumn=False` is defined in
# helpers and is not visible here -- confirm before changing them.
normalized_user_activity_df = normalize_activity(user_activity_df, factor=40, byColumn=False)

In [10]:
# Copy of the normalised activity frame extended with two per-row summary
# columns: 'mean' and 'std', both computed over the original columns only.
user_activity_df_mean = normalized_user_activity_df.assign(
    mean=normalized_user_activity_df.mean(axis=1),
    std=normalized_user_activity_df.std(axis=1),
)

CREATE GRAPH

In [11]:
# Build the graph object from the node and edge tables (`createGraph` is a
# helper defined outside this notebook; the result is later passed to a
# networkx layout function and exposes `.nodes`).
G = createGraph(nodes, edges)

In [12]:
# Sanity check: the graph must contain exactly as many nodes as the node table.
len(nodes) == len(G.nodes)

True

COMPUTE POSITIONS

In [13]:
# Compute one 2-D position per graph node with the Kamada-Kawai layout; the
# result is a dict keyed by node. The positions are attached to the `nodes`
# table in a later cell.
# NOTE(review): `nx` is not imported explicitly in this notebook; it
# presumably arrives through `from helpers import *` -- confirm.
positions = nx.kamada_kawai_layout(G)

In [14]:
# Per-axis mean and standard deviation of the layout coordinates.
# `positions` maps each node to an (x, y) pair; stack the pairs into an
# (n_nodes, 2) array so that the statistics can be computed column-wise.
layout_xy = np.array(list(positions.values()), dtype=float)

x_mean = layout_xy[:, 0].mean()
y_mean = layout_xy[:, 1].mean()

# Population standard deviation, sqrt(mean((v - mean) ** 2)). The previous
# hand-written loop accumulated |v - mean| instead of the squared deviation,
# which does not yield a standard deviation.
x_std = layout_xy[:, 0].std()
y_std = layout_xy[:, 1].std()

In [15]:
# Attach to every node the layout position of its article; an article that is
# not a key of `positions` gets NaN.
nodes['coord'] = nodes['article'].map(positions)

COMPUTE COLORS

In [16]:
# Colour lookup built by the helper `create_color` (defined outside this
# notebook); it is indexed by category in the cells below.
color_dict = create_color(nodes)

Add the category of the source article (linkSource) to each edge so that the edge colors can be computed.

In [17]:
# Left-join the node table onto the edges through the source article, so that
# every edge carries the columns of its source node (the category is used
# below to colour the edge).
# NOTE(review): `edges` is overwritten, so re-running this cell merges again.
edges = pd.merge(edges,nodes,left_on='linkSource',right_on='article',how='left')

In [18]:
# The 'article' column brought in by the merge duplicates 'linkSource'.
edges = edges.drop(labels=['article'],axis= 1)

In [19]:
def _category_to_rgba(category):
    """Return the colour of `category` as an "rgba(...)" string without spaces."""
    compact_tuple = str(color_dict[category]).replace(' ', '')
    return "rgba" + compact_tuple


# Edges take the colour of their source node's category, nodes the colour of
# their own category.
edges['color'] = edges['category'].apply(_category_to_rgba)
nodes['color'] = nodes['category'].apply(_category_to_rgba)

COMPUTE SPANISH USER ACTIVITY

In [21]:
# For every node, look up the title of the article in the target language and,
# when one exists, fetch its user-activity series on that language's project.
# Nodes without a translated title, or with an incomplete series, are recorded
# as "unknown" in `lang_dict`.
# NOTE(review): `timer`, `requests_` and `requests_tot` are maintained but never
# used to throttle the calls in this cell.
timer = time.localtime(time.time())
requests_ = 0
requests_tot = 0
language_code = "es"
lang_dict = {}
user_activity_dict_es = {}

for index, row in tqdm_notebook(nodes.iterrows(), total=nodes.shape[0]):
    # Up to two calls per node: title lookup and activity lookup.
    requests_ += 2
    requests_tot += 2

    resolved_title = "unknown"
    new_title = getArticleName(row['article'], language_code)
    if new_title != "":
        values = getUserActivity(
            article=new_title,
            project=language_code + ".wikipedia.org",
            granularity=granularity,
            start=s,
            end=e,
        )
        # The series is kept only when it is longer than the threshold that
        # applies to the current granularity.
        is_complete = (
            (len(values) > int(monthdelta(s, e) - 1) or granularity != 'monthly')
            and (len(values) > int((e - s).days) or granularity != 'daily')
        )
        if is_complete:
            user_activity_dict_es[index] = values
            resolved_title = new_title

    lang_dict[row['article']] = resolved_title
   

HBox(children=(IntProgress(value=0, max=1097), HTML(value='')))




In [22]:
# New column 'article_<language_code>' holding, for each node, the translated
# title recorded in `lang_dict` ("unknown" when none was kept).
nodes['article_'+language_code] = nodes['article'].map(lang_dict)

In [23]:
# Same conversion as for the English data, applied to the Spanish series
# (`create_df_activity` is a helper defined outside this notebook).
user_activity_df_es = create_df_activity(user_activity_dict_es)

In [24]:
# Normalise the Spanish activity with the same parameters as the English one.
# NOTE(review): the meaning of `factor=40` and `byColumn=False` is defined in
# helpers and is not visible here -- confirm before changing them.
normalized_user_activity_df_es = normalize_activity(user_activity_df_es, factor=40, byColumn=False)

In [25]:
# Copy of the normalised Spanish activity frame extended with two per-row
# summary columns: 'mean' and 'std', both computed over the original columns.
user_activity_df_mean_es = normalized_user_activity_df_es.assign(
    mean=normalized_user_activity_df_es.mean(axis=1),
    std=normalized_user_activity_df_es.std(axis=1),
)

CONSTRUCT FINAL JSON

In [26]:
def parse_article_title(title):
    """Turn a URL-encoded article identifier into a readable title.

    Percent-escapes are decoded first, then every underscore is replaced by
    a space.
    """
    decoded = urllib.parse.unquote(title)
    return ' '.join(decoded.split('_'))
def parse_color(color):
    """Format `color` as "rgba" followed by its string form without spaces."""
    compact = ''.join(str(color).split(' '))
    return "rgba" + compact

In [32]:
def _node_record(node_id, node):
    """Assemble the JSON record of one node.

    The record starts at a coordinate returned by `w_stye_coordinate()` with a
    neutral colour and size, and carries the final ("correct_*") layout
    position, size and colour next to the activity data.
    NOTE(review): every key suffixed with "_fr" is filled from the Spanish
    ("es") data; the key names are kept unchanged.
    """
    x, y = w_stye_coordinate()
    has_translation = node['article_es'] != "unknown"

    # Look the per-node rows up once instead of once per time step.
    sizes_en = normalized_user_activity_df.loc[node_id]
    sizes_es = normalized_user_activity_df_es.loc[node_id] if has_translation else None
    stats_en = user_activity_df_mean.loc[node_id]

    title_en = parse_article_title(node['article'])
    title_es = parse_article_title(node['article_es'])

    # Keys are inserted in a fixed order so that the serialised output keeps
    # the same layout.
    record = {}
    for step in range(len(sizes_en)):
        record[str(step) + "_size"] = sizes_en[step]
        if has_translation:
            record[str(step) + "_size_fr"] = sizes_es[step]

    record.update({
        'label': title_en,
        'label_en': title_en,
        'label_fr': title_es,
        'x': x,
        'y': y,
        'id': node['article'],
        'id_en': title_en,
        'id_fr': title_es,
        'attributes': {"category": node['category']},
        'user_activity': user_activity_dict[node_id],
    })

    if has_translation:
        stats_es = user_activity_df_mean_es.loc[node_id]
        record.update({
            'user_activity_fr': user_activity_dict_es[node_id],
            'activity_mean_fr': stats_es['mean'],
            'activity_std_fr': stats_es['std'],
        })

    record.update({
        'color': 'rgb(30,30,30)',
        'originalColor': node['color'],
        'size': 10,
        'square_x': x,
        'square_y': y,
        'square_size': 10,
        'square_color': 'rgb(30,30,30)',
        'correct_x': node['coord'][0],
        'correct_y': node['coord'][1],
        'correct_size': sizes_en[0],
        'correct_color': node['color'],
        'activity_mean': stats_en['mean'],
        'activity_std': stats_en['std'],
    })
    return record


def _edge_record(edge_id, edge):
    """Assemble the JSON record of one edge, coloured like its source node."""
    return {
        "id": str(edge_id),
        "source": edge['linkSource'],
        "target": edge['linkTarget'],
        "attributes": {},
        "color": edge['color'],
        "originalColor": edge['color'],
        "size": 0.1,
        "type": 'curve',
    }


graph_data = {
    'nodes': [_node_record(node_id, node) for node_id, node in nodes.iterrows()],
    'edges': [_edge_record(edge_id, edge) for edge_id, edge in edges.iterrows()],
}

In [33]:
# Write the assembled graph to the JSON file of the website checkout
# (`save_graph_json` is a helper defined outside this notebook).
# NOTE(review): the target path points outside this repository and must exist
# on the machine running the notebook.
save_graph_json('../raffiot.github.io/raffiot.github.io/wikispedia/wikispedia_en_es.json',graph_data)

In [None]:
# Persist the English activity frame, the node table and the edge table with
# the helper `save_pkle` (defined outside this notebook; destination and
# format are not visible here).
save_pkle(user_activity_df, nodes, edges)