In [1]:
from urllib.request import urlopen
import json
import re
from urllib.parse import quote
import pandas as pd
from pandas.io.json import json_normalize
import collections
import itertools
import networkx as nx
import matplotlib.pyplot as plt
from fa2 import ForceAtlas2
# conda install -c phlya adjusttext 
from adjustText import adjust_text

In [2]:
# Get data
n = 2500

# Initial query - get most active wikis
query = "http://www.wikia.com/api/v1/Wikis/List?expand=1&limit="+str(n)+"&batch=1"
response = urlopen(query)
wikisource = response.read()
data = json.loads(wikisource)
df = pd.DataFrame(data['items'])
fullDataSet = df
 
# Search for more wikis (based on the different hub names)
def searchForWiki(hub, n):
    querySearch = "http://www.wikia.com/api/v1/Wikis/ByString?expand=1&string="+str(hub)+"&limit="+str(n)+"&batch=1&includeDomain=true"
    response = urlopen(querySearch)
    data = json.loads(response.read())
    return pd.DataFrame(data['items']) 
    
# use hubs in dataset to searsh for wikis
for hub in set(df["hub"]):
    df2 = searchForWiki(hub, n)
    fullDataSet = fullDataSet.append(df2)
    print("%s wikis found in %s" % (str(len(df2)), hub))
    

250 wikis found in Games
250 wikis found in Other
30 wikis found in Lifestyle
250 wikis found in Books
250 wikis found in TV
250 wikis found in Comics
250 wikis found in Movies
250 wikis found in Music


In [43]:
# remove inactive wikis
def findActiveWikis(dataSet):
    activeDataSet = pd.DataFrame()
    for s in dataSet['stats']:
        data = dataSet.loc[dataSet['stats']==s]
        if s['activeUsers']>1:
            activeDataSet = activeDataSet.append(data)
    return activeDataSet

activeDataSet = findActiveWikis(fullDataSet)
print(len(activeDataSet))
print(len(fullDataSet))

263
2030


In [44]:
# remove duplicates
print(len(activeDataSet))
activeDataSet = activeDataSet.drop_duplicates(subset="id")
activeDataSet = activeDataSet.reset_index()
print(len(activeDataSet))

263
259


In [45]:
def findUsernames(userID):
    querySearch = "http://community.wikia.com/wiki/Special:Search?search="+userID+"&fulltext=Search&ns2=1"
    response = urlopen(querySearch)
    wikisource = response.read()
    usernames = set(re.findall("User:(\w+)?",str(wikisource)))
    return usernames

In [None]:
def findWikisThroughUser(usernames):
    wikis = []
    for username in usernames:
        querySearch = "http://community.wikia.com/index.php?limit=1000&tagfilter=&title=Special%3AContributions&target="+username+"&namespace=&tagfilter=&year=&month=-1"
        response = urlopen(querySearch)
        wikisource = response.read()
        wikis.append(set(re.findall("Adoption:(\w+)?",str(wikisource))))
    return wikis

In [None]:
def addUserToWiki(wiki,userID, dataSet):
    series = dataSet.loc[dataSet['name'] == wiki]
    topUsers = list(series['topUsers'].values)
    topUsers = list(itertools.chain.from_iterable(topUsers))
    if userID not in topUsers:
        topUsers.append(int(userID))
        dataSet.at[series.index.values[0], "topUsers"] = set(topUsers)
    return dataSet

In [None]:
def printUserInWiki(userID, dataSet):
    i = 0
    j = 0
    for users in list(dataSet['topUsers'].values):
        if int(userID) in list(users):
            i+=1
        else:
            j+=1
    #print("%s user found in %s wikis, not in %s wikis" % (userID,i,j))  
    return i

In [52]:
def addWikis(userID, dataSet):
    wikiList = findWikisThroughUser(findUsernames(userID))
    i = printUserInWiki(userID, dataSet)
    
    for wikis in list(wikiList):
        for wiki in set(list(wikis)):
            wiki = wiki.replace("_", " ")
            if wiki in list(dataSet['name']):
                #print("%s found in Dataset" % wiki)
                dataSet = addUserToWiki(wiki,userID, dataSet)
            else:
                wiki = wiki.replace(" ", "_")
                wikidf = searchForWiki(wiki, 250)
                # wikidf = findActiveWikis(wikidf)
                dataSet = dataSet.append(wikidf)
                dataSet = dataSet.drop_duplicates(subset="name")
                dataSet = dataSet.reset_index(drop=True)
                wiki = wiki.replace("_", " ")
                if wiki in dataSet['name']:
                    dataSet = addUserToWiki(wiki,userID, dataSet)
                    print("wiki %s added to Dataset and user added" % wiki)
                #else:
                    #print("--- %s not found in Dataset" % wiki)
    
    #j = printUserInWiki(userID, fullDataSet)
    #print("User added to %s new wikis" % j-i)
    return dataSet

In [None]:
# make graph
G = nx.Graph()

# find topusers
#topUsers = []
topUsers = [x for x in activeDataSet["topUsers"]]
topUsers = list(itertools.chain.from_iterable(topUsers))
for i in range(len(topUsers)):
    activeDataSet = addWikis(str(topUsers[i]), activeDataSet)
    activeDataSet = findActiveWikis(activeDataSet)
    if(i%100==0):
        print("%s af %s, wiki count = %s" % (str(i),str(len(topUsers)),str(len(activeDataSet))))
        
topUsers = [x for x in activeDataSet["topUsers"]]
topUsers = list(itertools.chain.from_iterable(topUsers))

counter=collections.Counter(topUsers)
#print(counter)
#print(counter.values())
#print(counter.keys())
print(counter.most_common(7))

# add nodes
for wiki in activeDataSet["name"]:
    G.add_node(wiki, hub=set(activeDataSet['hub'].loc[activeDataSet['name'] ==wiki].values))


0 af 2039, wiki count = 259


In [None]:
print(len(topUsers))

In [None]:
# find the wiki names of the users
users = {}
for user in counter.most_common():
    user = user[0]
    for k, v in df.T.items():
        if user in v["topUsers"]:
            if user not in users:
                users[user] = []
            users[user].append(v['name'])

# create edges between all wikis which share users
def createEdges(G,userWikis):
    length = len(userWikis)
    if length==1:
        return G
    elif length>1:
        wiki = userWikis.pop()
        for w in userWikis:
            if G.has_edge(wiki, w):
                G[wiki][w]['weight']+=1
            else:
                G.add_edge(wiki,w,weight=1)
        return createEdges(G,userWikis)
    else:
        return G

# iterate over all users
for k,v in set(users.items()):
    print(set(v))
    G = createEdges(G,set(v))

In [None]:
# Set layout
forceatlas2 = ForceAtlas2(
                          # Behavior alternatives
                          outboundAttractionDistribution=False,  # Dissuade hubs
                          linLogMode=False,  # NOT IMPLEMENTED
                          adjustSizes=False,  # Prevent overlap (NOT IMPLEMENTED)
                          edgeWeightInfluence=1,

                          # Performance
                          jitterTolerance=0.5,  #1 # Tolerance
                          barnesHutOptimize=True,
                          barnesHutTheta=1.2,
                          multiThreaded=False,  # NOT IMPLEMENTED

                          # Tuning
                          scalingRatio=0.01, #0.01
                          strongGravityMode=False, #False
                          gravity=10, #15

                          # Log
                          verbose=True)


# Calculate Positions
positions = forceatlas2.forceatlas2_networkx_layout(G, pos=None, iterations=4000)

In [None]:
# plot figure
plt.figure(figsize=(18, 18))             
node_size = [(G.degree(node, weight='weight')*20) for node in G.nodes()]  

labels = {}    
for node in G.nodes():
    if G.degree(node, weight='weight')*20 > 80:
        #set the node name as the key and the label as its value 
        labels[node] = node

node_label = [node for node in G.nodes() ]
nx.draw_networkx_nodes(G, positions, nodelist=G.nodes, node_size=node_size,cmap=plt.get_cmap('jet'))
nx.draw_networkx_edges(G, positions, width=0.5, cmap=plt.get_cmap('jet'))
nx.draw_networkx_labels(G, positions, labels=labels, font_size=18, font_color='k', font_weight='normal', alpha=2.0)
#texts = [plt.text(n, labels[n], ha='center', va='center') for n in G.nodes()]
#adjust_text(texts)#, only_move='y', arrowprops=dict(arrowstyle="->", color='r', lw=0.5))
plt.axis('off')
plt.show()

In [None]:
len(G.edges())

In [None]:
len(G.nodes())