Permalink
Browse files

Fixed broken graph construction

Several things:
- networkx graphs are constructed correctly from the manual blogroll
- get titles of blogs from the json files instead of searching the internet every time
(saves about 15 min)
- write .dot, .gml, or .pickle files based on the output filename's extension
- updated some documentation
  • Loading branch information...
randomjohn committed Nov 5, 2012
1 parent 6578b29 commit aa6a90296072b8fae9943e5a819c7d11a60709a6
Showing with 11,178 additions and 6,178 deletions.
  1. +1 −0 README
  2. +29 −16 build_graph.py
  3. +15 −15 manual_blogroll.txt
  4. +1,316 −0 out/blogs.dot
  5. +3,650 −1,093 out/blogs.gml
  6. +6,148 −5,043 out/blogs.pickle
  7. BIN out/statistics_blogs.gephi
  8. +19 −11 sna_project.MININT-JSFC1SN.John.pui
View
1 README
@@ -27,6 +27,7 @@ Caveats
* Links to Andrew Gelman's blog are very diverse. He has several addresses. I standardized them to http://www.andrewgelman.com
* Same with Simply Stats, standardized to http://simplystatistics.org
+* And Flowing Data, all standardized to http://www.flowingdata.com
* There are many links from inside to outside the statistics web, for example to econometrics, sociology, mathematics, and CS. I had to stop following them somewhere, and sometimes the break may seem arbitrary. I had to balance time and return on value to the project.
How to run the analysis:
View
@@ -50,13 +50,24 @@ def build_graphs_from_json(blog_file):
def build_graph_from_manual( blog_file, add_labels=False, filename="out/blogs_manual.dot" ):
blog_list = [line for line in file(blog_file)]
blog_gr=nx.DiGraph()
- print filename
+
+ # create a dictionary of blog url to title out of the json files
+ url_titles = {}
+ json_files = [fil for fil in os.listdir("out/") if fil.endswith(".json")]
+ for json_file in json_files:
+ blog_props=json.load(file("out/"+json_file))
+ url_titles[blog_props[0]['blogurl'].strip().replace('"','')]=blog_props[0]['title']
for blog in blog_list:
blog=blog.strip()
# split on the semicolon, on the left is the node and the right are outlinks
bl_list = blog.split(';',2)
- blog_gr.add_node(bl_list[0])
+ try:
+ # add node with a label that is the title
+ blog_gr.add_node(bl_list[0],label=url_titles[bl_list[0]])
+ except KeyError:
+ # if we didn't find it in json files, just default to url as label
+ blog_gr.add_node(bl_list[0],label=bl_list[0])
if len(bl_list)>1 and bl_list[1]!="":
if bl_list[0].find("visualcomplexity.com")>-1:
# some debugging
@@ -69,30 +80,32 @@ def build_graph_from_manual( blog_file, add_labels=False, filename="out/blogs_ma
outlink=outlink[-1]
elif (len(outlink)==1 or outlink==''):
# skip some slop
- continue
+ continue
+ # explicitly add node if not already in
+ if outlink not in blog_gr.nodes():
+ try:
+ # add node with a label that is the title
+ blog_gr.add_node(outlink,label=url_titles[outlink])
+ except KeyError:
+ # if we didn't find it in json files, just default to url as label
+ blog_gr.add_node(outlink,label=outlink)
+
blog_gr.add_edge(bl_list[0],outlink)
- # add labels to nodes, if we need to
- if (add_labels):
- for n in blog_gr:
- blog_gr[n]['title'] = ''
- try:
- webpage_title = le.extract_title_from_url(n)
- print >> sys.stderr, 'Note: web page at ' + n + ' has title ' + webpage_title
- blog_gr[n]['title'] = webpage_title.strip().replace('\n','').replace(' ','')
- except:
- print >> sys.stderr, 'Note: Could not parse ' + n
- blog_gr[n]['title'] = n
if filename.endswith(".dot"):
#write out by hand
- node_dot = ['"%s" [label="%s"]' % (n,blog_gr[n]['title']) for n in blog_gr]
+ node_dot = ['"%s" [label="%s"]' % (n[0],n[1]['label'].replace('"','')) for n in blog_gr.nodes(data=True)]
edge_dot = ['"%s" -> "%s"' % (n1, n2) for n1,n2 in blog_gr.edges() if n2!="title"]
f = open(filename,'w')
f.write('strict digraph{\n%s\n%s\n}' % (";\n".join(node_dot).encode('ascii','ignore'),";\n".join(edge_dot).encode('ascii','ignore')))
f.close()
elif filename.endswith(".pickle"):
f = open(filename,'wb')
cPickle.dump(blog_gr,f)
- f.close()
+ f.close()
+ elif filename.endswith(".gml"):
+ f=open(filename,'w')
+ nx.write_gml(blog_gr,f)
+ f.close()
return
def make_feedlist_from_file(blog_file,out_file="feedlist_manual.txt"):
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
Oops, something went wrong.

0 comments on commit aa6a902

Please sign in to comment.