**Author: S. Grunert**     
**Revised: January 31, 2016**     
**Topic: File and API connections in Python.**     
**Versions: Py 2.7**  

### Flat File Sources

In [12]:
#Basic file write and read.
sentence = "the quick brown fox jumps over the lazy dog\n"
print "Output to File:", sentence

outfile = open("sentence.txt", "w")
outfile.write(sentence)
outfile.close()

infile = open("sentence.txt", 'r')
print "Input from File:", infile.read()
infile.close()

Output to File: the quick brown fox jumps over the lazy dog

Input from File: the quick brown fox jumps over the lazy dog



In [3]:
# Pulling email addresses from a text file with regex and conversion into a single list.
import re

fhand = open('mbox-short.txt')
addr_list = list()
for line in fhand:
    line = line.rstrip()
    x = re.findall('^From: (\S+@\S+)', line)
    if not x : continue
    y = x[0]
    addr_list.append(y)
print addr_list

['stephen.marquard@uct.ac.za', 'louis@media.berkeley.edu', 'zqian@umich.edu', 'rjlowe@iupui.edu', 'zqian@umich.edu', 'rjlowe@iupui.edu', 'cwen@iupui.edu', 'cwen@iupui.edu', 'gsilver@umich.edu', 'gsilver@umich.edu', 'zqian@umich.edu', 'gsilver@umich.edu', 'wagnermr@iupui.edu', 'zqian@umich.edu', 'antranig@caret.cam.ac.uk', 'gopal.ramasammycook@gmail.com', 'david.horwitz@uct.ac.za', 'david.horwitz@uct.ac.za', 'david.horwitz@uct.ac.za', 'david.horwitz@uct.ac.za', 'stephen.marquard@uct.ac.za', 'louis@media.berkeley.edu', 'louis@media.berkeley.edu', 'ray@media.berkeley.edu', 'cwen@iupui.edu', 'cwen@iupui.edu', 'cwen@iupui.edu']


### URL Sources

In [24]:
#Collect child URL anchors from a parent. BeautifulSoup must be in the present working directory.
import urllib
import ssl
import re
from BeautifulSoup import *

#Pull the html into BeautifulSoup.
url = 'https://www.coursera.org/'
scontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
uh = urllib.urlopen(url, context=scontext)
html = uh.read()
soup = BeautifulSoup(html)

#Pull the anchors starting with http.
tags = soup('a')
counter = 0
search_limit = 50
for tag in tags:
    counter = counter + 1
    child_url = tag.get('href', None)
    if re.search('^http', child_url): print child_url
    if counter == search_limit: break

https://itunes.apple.com/app/apple-store/id736535961?pt=2334150&ct=Coursera%20Web%20Promo%20Banner&mt=8
http://play.google.com/store/apps/details?id=org.coursera.android
http://tech.coursera.org/app-platform
https://plus.google.com/+Coursera
http://twitter.com/coursera
https://www.facebook.com/Coursera
http://blog.coursera.org
http://tech.coursera.org


### XML Sources

In [31]:
#Pulling a node from an XML document.
import urllib
import xml.etree.ElementTree as ET

url = 'http://www.w3schools.com/xml/simple.xml'
uh = urllib.urlopen(url)
data = uh.read()
tree = ET.fromstring(data)
menu = tree.findall('food') # Use 'food/name' format if accessing a child node.
for item in menu:
    print item.find('name').text


Belgian Waffles
Strawberry Belgian Waffles
Berry-Berry Belgian Waffles
French Toast
Homestyle Breakfast


### JSON Sources

In [44]:
#Read data from JSON key-value pairs.
import json

data = '''{"employees":[
    {"firstName":"John", "lastName":"Doe"},
    {"firstName":"Anna", "lastName":"Smith"},
    {"firstName":"Peter", "lastName":"Jones"}
]}'''
info = json.loads(data)
for item in info['employees']:
    print item["firstName"], item["lastName"]


John Doe
Anna Smith
Peter Jones


In [45]:
#Manipulate data from a remote JSON file.
import urllib
import json

url = 'http://python-data.dr-chuck.net/comments_42.json'
uh = urllib.urlopen(url)
data = uh.read()
info = json.loads(data)
comments = info["comments"]
ttl = 0
count = 0
for item in comments:
    num = int(item["count"])
    ttl = ttl + num
    count = count + 1
print 'Count',count
print 'Sum',ttl

Count 50
Sum 2482


### Google Geo API

In [15]:
#Pull back the geo JSON from the google api and parse out longitude and latitude.
import urllib
import json

serviceurl = 'http://maps.googleapis.com/maps/api/geocode/json?'
address = raw_input("Enter location: ")
if len(address) < 1 : address  = "San Francisco"
url = serviceurl + urllib.urlencode({'sensor':'false', 'address': address})
print 'Retrieving:', url
uh = urllib.urlopen(url)
data = uh.read()
js = json.loads(str(data))
if 'status' not in js or (js['status'] != 'OK' and js['status'] != 'ZERO_RESULTS') : 
    print '==== Failure To Retrieve ===='
print 'Long Name:', js["results"][0]["formatted_address"]
print 'Longitude:', js["results"][0]["geometry"]["location"]["lng"]
print 'Latitude:', js["results"][0]["geometry"]["location"]["lat"]

Enter location: Los Angeles
Retrieving: http://maps.googleapis.com/maps/api/geocode/json?sensor=false&address=Los+Angeles
Long Name: Los Angeles, CA, USA
Longitude: -118.2436849
Latitude: 34.0522342


### Twitter API

In [9]:
#Retrieve the friends for a twitter account.
#Requires hidden files in pwd to generate oauth url.

import urllib
import json
import twurl #Hidden file.

TWITTER_URL = 'https://api.twitter.com/1.1/friends/list.json'

while True:
    print ''
    acct = raw_input('Enter Twitter Account:')
    if ( len(acct) < 1 ) : break
    url = twurl.augment(TWITTER_URL,
        {'screen_name': acct, 'count': '5'} )
    #print 'Retrieving', url
    connection = urllib.urlopen(url)
    data = connection.read()
    headers = connection.info().dict
    print 'Remaining', headers['x-rate-limit-remaining']
    js = json.loads(data)
    #print json.dumps(js, indent=4)

    for u in js['users'] :
        print u['screen_name']
        s = u['status']['text']
        print '  ',s[:50]


Enter Twitter Account:leanpub
Remaining 14
rdpeng
   RT @novalsi: @astVintageSpace @rdpeng TIL there we
jurgenappelo
   A sweet spot of perspectives... What's the Purpose
IvaCheung
   @jamesday76 Cool! Good luck with it. Took me a whi
taylorotwell
   @ianlandsman @fideloper daily newsletter on how yo
jeffrey_way
   RT @davert: In other news: @codeception is going t

Enter Twitter Account:


### Socket Connections

In [8]:
#Make a socket connection to the XML document used above.

import socket

mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
mysock.connect(('www.w3schools.com', 80))
mysock.send('GET http://www.w3schools.com/xml/simple.xml HTTP/1.0\n\n')

while True:
    data = mysock.recv(1024)
    if ( len(data) < 1 ) :
        break
    print data;

mysock.close()

HTTP/1.0 200 OK
Cache-Control: public,max-age=3600,public
Content-Type: text/xml
Date: Mon, 01 Feb 2016 05:12:30 GMT
Etag: "df2594a8329d01:0+ident"
Last-Modified: Tue, 06 Jan 2015 07:35:05 GMT
Server: ECS (sjc/4FBE)
Vary: Accept-Encoding
X-Cache: HIT
X-Powered-By: ASP.NET
Content-Length: 1102
Connection: close

<?xml version="1.0" encoding="UTF-8"?>
<breakfast_menu>
	<food>
		<name>Belgian Waffles</name>
		<price>$5.95</price>
		<description>Two of our famous Belgian Waffles with plenty of real maple syrup</description>
		<calories>650</calories>
	</food>
	<food>
		<name>Strawberry Belgian Waffles</name>
		<price>$7.95</price>
		<description>Light Belgian waffles covered with strawberries and whipped cream</description>
		<calories>900</calories>
	</food>
	<food>
		<name>Berry-Berry Belgian Waffles</name>
		<price>$8.95</price>
		<description>Light Belgian waffles covered with an assortment of fresh berries and whipped cream</description>
		<calories>900<