In [16]:
# imports
from urllib.request import urlopen
import json
import re
from urllib.parse import quote
import pandas as pd
from pandas.io.json import json_normalize
import collections
import itertools
import networkx as nx
import matplotlib.pyplot as plt
from fa2 import ForceAtlas2
import numpy as np
import datetime
from lxml import html
import glob
# conda install -c phlya adjusttext 
#from adjustText import adjust_text

In [17]:
###### Get inital data #####
n = 250

# Initial query - get top wikis by pageviews using API
query = "http://www.wikia.com/api/v1/Wikis/List?expand=1&limit="+str(n)+"&batch=1"
response = urlopen(query)
wikisource = response.read()
data = json.loads(wikisource)

# Save data at data/wikis/top-wikis-date.json
date = datetime.date.today().strftime("%d-%m-%Y")
with open("data/wikis/top-wikis-"+date+".json", 'w') as outfile:
    json.dump(data, outfile)

In [3]:
##### Read initial wiki data  ######
df = pd.DataFrame()
with open("data/wikis/top-wikis-"+date+".json") as f:
    json_file = json.loads(f.read())
df = pd.DataFrame(json_file['items']) 

In [18]:
# Search for more wikis (primarily based on the different hub names)
def saveSearchWikis(search, n):
    querySearch = "http://www.wikia.com/api/v1/Wikis/ByString?expand=1&string="+str(search)+"&limit="+str(n)+"&batch=1&includeDomain=true"
    response = urlopen(querySearch)
    data = json.loads(response.read())
    
    if data['total'] > 0:
        date = datetime.date.today().strftime("%d-%m-%Y")
        searchWikiPath = "data/wikis/search-"+search+"-wikis-"+date+".json"
        with open(searchWikiPath, 'w') as outfile:
            json.dump(data, outfile)
    else:
        print("Data not found in %s: %s" % (search, data['items']))
    return pd.DataFrame(data['items']) 

In [5]:
# use hubs in dataset to searsh for new wikis
fullDataSet = pd.DataFrame()
for hub in set(df["hub"]):
    df = saveSearchWikis(hub, n)
    fullDataSet = fullDataSet.append(df)
    print("%s wikis found in %s" % (str(len(df)), hub))
fullDataSet = fullDataSet.reset_index(drop=True)

250 wikis found in Games
250 wikis found in TV
250 wikis found in Movies
250 wikis found in Comics
250 wikis found in Other
250 wikis found in Books
30 wikis found in Lifestyle


In [19]:
# search online to find the username based on the userID
def saveUsernames(keyword):
    keyword = str(keyword)
    querySearch = "http://community.wikia.com/wiki/Special:Search?search="+keyword+"&fulltext=Search&ns2=1"
    response = urlopen(querySearch)
    wikisource = response.read()
    usernames = re.findall("wiki\/User:([(\w)]+)?",str(wikisource))
    
    # Read exsisting data to avoid duplicates
    filename = "data/users/userNames.txt"
    fileRead = open(filename, "r")
    contents = fileRead.read()
    fileRead.close() 
    
    # Print new usernames
    for user in set(usernames):
        userData = user+"\n"
        if userData not in contents: # not a duplicate
            fileWrite = open(filename,"a")
            fileWrite.write(userData)
            fileWrite.close()

    return list(set(usernames))

In [8]:
# search online to find the username based on the userID
def saveUsernamesByUserID(userID):
    
    if userID == None:
        print("user ID is %s" % userID)
        return []
    
    if int(userID) < 1:
        print("user ID less than 1: %s" % userID)
        return []
    
    userID = str(userID)
    querySearch = "http://www.wikia.com/api/v1/User/Details?ids="+userID
    response = urlopen(querySearch)
    wikisource = response.read()
    data = json.loads(wikisource)
    
    # Read exsisting data to avoid duplicates
    filename = "data/users/userNames.txt"
    fileRead = open(filename, "r")
    contents = fileRead.read()
    fileRead.close() 
    
    usernames = []
    # Print new usernames
    for name in data['items']:
        userData = name['title']+" "+name['user_id']+"\n"
        usernames.append(name['title'])
        if userData not in contents: # not a duplicate
            fileWrite = open(filename,"a")
            fileWrite.write(userData)
            fileWrite.close()
        elif name['title'] in contents:
            content.replace(name['title'],userData)
            fileWrite = open(filename,"w")
            fileWrite.write(content)
            fileWrite.close()

    return usernames

In [14]:
# search online to find the wiki names which the user use, 
# This wiki saves the users wikis and all the wiki names
def saveNewWikisAndUsersThroughUser(usernames):
    wikis = []
    for username in usernames:
        
        # encode username correct
        username = str(username.encode('utf-8'))
        username = username.replace("'", "")
        username = username.replace("b", "")
        
        # find users activity
        querySearch = "http://community.wikia.com/index.php?limit=1000&tagfilter=&title=Special%3AContributions&target="+username+"&namespace=&tagfilter=&year=&month=-1.html"
        response = urlopen(querySearch)
        wikisource = response.read()
        
        
        # search html for wikis
        usersWikis = re.findall("wiki/Adoption:(\w+)?",str(wikisource))
        usersWikis = [wiki for wiki in usersWikis if (wiki != "Requests" and wiki != '')]
        usersWikis = list(set(usersWikis))
        
        # Save users and the individual user's wikis 
        userData = username+": "+str(usersWikis)+"\n"
        
        filename = "data/users/user-wikis.txt"
        fileRead = open(filename, "r")
        contents =fileRead.read()
        
        # Ensure no duplicates
        if userData not in contents: # not a duplicate
            fileWrite = open(filename,"a")
            fileWrite.write(userData)
            fileWrite.close()
        fileRead.close() 
        
        
        [wikis.append(wiki) for wiki in usersWikis]
        wikis = list(set(wikis))
        
    # Save all found wiki names and read exsisting data to avoid duplicates
    filename = "data/wikis/wikiNames.txt"
    fileRead = open(filename, "r")
    contents = fileRead.read()
    fileRead.close() 
    for wiki in set(wikis):
        wikiData = wiki+"\n"
        if wikiData not in contents:
            # Print new usernames
            fileWrite = open(filename,"a")
            fileWrite.write(wikiData)
            fileWrite.close()
    
    return wikis

In [10]:
def collectAndSaveData(userID):
    # Get and save all usernames
    username = saveUsernamesByUserID(userID)
    
    # Get and save all wikis found connected to the usernames
    wikiList = saveNewWikisAndUsersThroughUser(username)


In [11]:
# remove inactive wikis and duplicates
def findActiveWikis(dataSet):
    activeDataSet = pd.DataFrame()
    dataSet = dataSet.drop_duplicates(subset="id")
    # find stats in dataset
    for s in dataSet['stats']:
        # only add when at least 1 active
        if s['activeUsers']>1:
            data = dataSet.loc[dataSet['stats']==s]

            activeDataSet = activeDataSet.append(data)#    
    # remove duplicates
    #activeDataSet = activeDataSet.reset_index(drop=True)
    return activeDataSet

In [13]:
# Get active wikis
activeDataSet = findActiveWikis(fullDataSet)
print("full data set has %s data, while only %s is actively used." % (len(fullDataSet),len(activeDataSet)))
print()


# Find more users and their connected Wikis
# find topusers
topUsers = [x for x in activeDataSet["topUsers"]]
topUsers = list(set(itertools.chain.from_iterable(topUsers)))

# find founders
founders = [x for x in activeDataSet['founding_user_id']]

users = list(set(topUsers + founders))

for i in range(0,len(users)):
    collectAndSaveData(users[i])
    if i % 500 == 0:
        print(i, len(users))


full data set has 35039 data, while only 5845 is actively used.



TypeError: must be str, not int

In [12]:
##### Read Wiki Names ######
# 22:49 - 
wikiDataPath = [f for f in glob.glob("data/wikis/*.json")]
print(len(wikiDataPath))
i = 0
for dataPath in wikiDataPath:
    i+=1
    with open(dataPath) as f:
        json_file = json.loads(f.read())
    fullDataSet = fullDataSet.append(pd.DataFrame(json_file['items']))
    if(i%200 ==0):
        print(i)

fullDataSet = fullDataSet.drop_duplicates(subset="id")    
fullDataSet = fullDataSet.reset_index(drop=True) 

print(len(fullDataSet))
fullDataSet = findActiveWikis(fullDataSet)
print(len(fullDataSet))

2254
200
400
600
800
1000
1200
1400
1600
1800
2000
2200


35039

In [52]:
# Save Collected data
date = datetime.date.today().strftime("%d-%m-%Y")
fullDataSet.to_csv("data/sortedWikiData-"+date+".csv", sep='\t')

In [15]:
##### add Wiki Names ######
# 12.21
# Read exsisting data to avoid duplicates
wikiNames = []
filename = "data/wikis/wikiNames.txt"
f = open(filename, "r")
for line in f.readlines():
    wikiNames.append(line.replace("\n",""))
f.close() 

i = 0
for wikiName in wikiNames:
    i+=1
    df = saveSearchWikis(wikiName, n)
    fullDataSet = fullDataSet.append(df)
    if i%100==0:
        print("%i af %s" % (i,len(wikiNames)))

fullDataSet = fullDataSet.drop_duplicates(subset="name")    
fullDataSet = fullDataSet.reset_index(drop=True)    
len(fullDataSet)



Data not found in Adpotion_Request: []
Data not found in Annunaki_Genesis_Wiki: []
Data not found in Me_at_sp_in_: []
Data not found in Tintin_Wiki_4: []
Data not found in The_Miners_haven_project_Wikia: []
Data not found in Cyber_Nantions_Wiki: []
Data not found in Can_I_become_a_administrator_and_bureaucrat_on_this_wiki_communities: []
Data not found in Cyberantions: []
Data not found in Enter_wiki_name_here_: []
Data not found in Brave_Frontier_Global_Wiki: []
Data not found in Spleef_League_Wiki: []
Data not found in I_made_a_new_account: []
Data not found in Arabic_Steven_Universe_Wiki: []
Data not found in TheThroneOfGlass_Wiki: []
Data not found in Sakure_Hime_Kaden: []
Data not found in Test_Drive_Wiki_: []
100 af 2564
Data not found in 3d_movies_wiki: []
Data not found in Happy_Labs_Happy_Mall_Story_Wiki: []
Data not found in Ugly_Betty_Wikia: []
Data not found in Gruntipedia_: []
Data not found in The_Throne_of_Glass_Wiki: []
Data not found in Yuma100_Wiki: []
Data not found 

Data not found in Warriors_Answers_Wiki: []
Data not found in Mad_Cartoon_Network_Wiki_: []
Data not found in SoNyuhShiDae_Wiki_: []
Data not found in Encyclopedia_Scumdoggia_Wiki_: []
Data not found in Singapore_Transport: []
Data not found in Macne_Series_wikia: []
Data not found in Encyclopeadia_Scumdoggia_Wiki_: []
Data not found in Far_Cry_Wiki_2: []
Data not found in Rhett_and_Link_Wiki_: []
Data not found in Bizaardvark_Wiki_: []
Data not found in Fluffypedia_Wiki_3: []
Data not found in Prisoner_Cell_Block_H_Wiki_: []
Data not found in Singapore_Transport_Wiki_: []
1900 af 2564
Data not found in Crash_and_Fluttershy_Fanon_Wiki: []
Data not found in Dancedancerevolutionddr: []
Data not found in Spider_Riders_Wiki: []
Data not found in Czech_Naruto_Wiki: []
Data not found in Spider_Riders_Center: []
Data not found in The_Magic_School_Bus_wiki_: []
Data not found in RemiseTale_Wiki: []
Data not found in MOGAI_Encyclopedia_Wikia: []
Data not found in Encyclopaedia_Scumdoggia_Wiki_:

31479

In [55]:
users = [x for x in fullDataSet["topUsers"]]
users.append([x for x in activeDataSet['founding_user_id']])
              
users = list(itertools.chain.from_iterable(users))

counter=collections.Counter(users)
#print(counter)
#print(counter.values())
#print(counter.keys())
print(counter.most_common(7))

print(len(set(users)))

[(8, 1111), (22224, 1093), (957747, 1087), (36156286, 774), (5275700, 616), (4784321, 562), (29325840, 459)]
35983


In [None]:
# find the wiki names of the users
userWiki = {}
for user in list(set(users)):
    for k, v in fullDataSet.T.items():
        if user == v["topUsers"] or user == v['founding_user_id']:
            if user not in userWiki:
                userWiki[user] = []
            userWiki[user].append(v['name'])