In [170]:
%matplotlib inline
import pandas as pd
import re 
import csv
import requests
import sys
import numpy as np
from io import StringIO

In [2]:
# data from the house
LIMIT = 1000
from requests_futures.sessions import FuturesSession

session = FuturesSession(max_workers=5)
futures = [session.get('https://www.govtrack.us/congress/votes/115-2017/h%s/export/csv' 
                       %i) for i in range(LIMIT)]

In [3]:
files = []
for i in futures:
    files.append(i.result().text)

In [4]:
text = [f for f in files if (len(re.findall(r'DOCTYPE html', f)) == 0)]

In [35]:
def create_votes_df(text): # list of text
    expression = re.compile('(.*?)(\s)(#)([0-9]+)(\s)(.*?)(\s-\s)(.*)')
    df = pd.DataFrame()
    for t in text:
        des = t.split('\n')[0]
        sdes = re.match(expression, des)
        temp = pd.read_csv(StringIO(t), skiprows=1)
        temp['chamber'] = sdes.group(1)
        temp['date']  = pd.to_datetime(sdes.group(6))
        temp['number'] = sdes.group(4)
        temp['title'] = sdes.group(8)
        df = df.append(temp)
    return df

In [36]:
house_votes = create_votes_df(text)

In [119]:
house_votes.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 241265 entries, 2017-01-03 12:35:00 to 2017-10-10 19:01:00
Data columns (total 9 columns):
person      241265 non-null int64
state       241265 non-null object
district    241265 non-null int64
vote        241265 non-null object
name        241265 non-null object
party       241265 non-null object
chamber     241265 non-null object
number      241265 non-null object
title       241265 non-null object
dtypes: int64(2), object(7)
memory usage: 23.4+ MB


In [None]:
house_votes.set_index('date', inplace=True)

In [39]:
house_votes.head()

Unnamed: 0_level_0,person,state,district,vote,name,party,chamber,number,title
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-01-03 12:35:00,400004,AL,4,Present,Rep. Robert Aderholt [R],Republican,House Vote,1,Call by States
2017-01-03 12:35:00,400018,TX,6,Present,Rep. Joe Barton [R],Republican,House Vote,1,Call by States
2017-01-03 12:35:00,400021,CA,34,Present,"Rep. Xavier Becerra [D, 2013-2017]",Democrat,House Vote,1,Call by States
2017-01-03 12:35:00,400029,UT,1,Present,Rep. Rob Bishop [R],Republican,House Vote,1,Call by States
2017-01-03 12:35:00,400030,GA,2,Present,Rep. Sanford Bishop Jr. [D],Democrat,House Vote,1,Call by States


In [88]:
house = house_votes.loc['2017-7':'2017-9']

In [122]:
house.head()

Unnamed: 0_level_0,person,state,district,vote,name,party,chamber,number,title,most,share_vote
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2017-07-11 18:53:00,400004,AL,4,Yea,Rep. Robert Aderholt [R],Republican,House Vote,345,"H.R. 1397: To authorize, direct, facilitate, a...",Yea,0.0
2017-07-11 18:53:00,400018,TX,6,Yea,Rep. Joe Barton [R],Republican,House Vote,345,"H.R. 1397: To authorize, direct, facilitate, a...",Yea,0.0
2017-07-11 18:53:00,400029,UT,1,Yea,Rep. Rob Bishop [R],Republican,House Vote,345,"H.R. 1397: To authorize, direct, facilitate, a...",Yea,0.0
2017-07-11 18:53:00,400030,GA,2,Yea,Rep. Sanford Bishop Jr. [D],Democrat,House Vote,345,"H.R. 1397: To authorize, direct, facilitate, a...",Yea,0.0
2017-07-11 18:53:00,400032,TN,7,Yea,Rep. Marsha Blackburn [R],Republican,House Vote,345,"H.R. 1397: To authorize, direct, facilitate, a...",Yea,0.0


In [123]:
house.vote.value_counts()

In [191]:
months = {'June':6, 'July':7, 'August':8, 'September':9, 'October':10}

In [195]:
list(months.values())

[6, 7, 8, 9, 10]

In [126]:
house = house.loc[house.vote != 'Not Voting',:]
house.shape

In [174]:
mode = lambda x: x.mode() if len(x) > 2 else np.array(x)

In [128]:
house['most'] = house.groupby([house.index, 'number','party'])['vote'].transform(mode)

house.loc[house.vote == house.most, 'share_vote'] = 0 # vote as always
house.loc[house.vote != house.most, 'share_vote'] = 1 # vote different from party

In [158]:
house.share_vote.value_counts() 

0.0    77284
1.0     5774
Name: share_vote, dtype: int64

In [140]:
house.loc[(house.number=='347') & (house.party=='Democrat'),:]

In [137]:
house.loc[house.share_vote==1].head()

In [183]:
senate.resample('D').count()

Unnamed: 0_level_0,person,state,district,vote,name,party,chamber,number,title
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-07-10,192,192,0,192,192,192,192,192,192
2017-07-11,0,0,0,0,0,0,0,0,0
2017-07-12,200,200,0,200,200,200,200,200,200
2017-07-13,98,98,0,98,98,98,98,98,98
2017-07-14,0,0,0,0,0,0,0,0,0
2017-07-15,0,0,0,0,0,0,0,0,0
2017-07-16,0,0,0,0,0,0,0,0,0
2017-07-17,94,94,0,94,94,94,94,94,94
2017-07-18,99,99,0,99,99,99,99,99,99
2017-07-19,99,99,0,99,99,99,99,99,99


In [141]:
pd.crosstab(house.share_vote, house.party)

party,Democrat,Republican
share_vote,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,35943,41341
1.0,1414,4360


In [159]:
# data from the senate
LIMIT = 1000

session = FuturesSession(max_workers=5)
futures = [session.get('https://www.govtrack.us/congress/votes/115-2017/s%s/export/csv' 
                       %i) for i in range(LIMIT)]

In [160]:
files = []
for i in futures:
    files.append(i.result().text)

In [161]:
text = [f for f in files if (len(re.findall(r'DOCTYPE html', f)) == 0)]

In [163]:
senate_votes = create_votes_df(text)

In [164]:
senate_votes.shape

(21604, 10)

In [165]:
senate_votes.head()

Unnamed: 0,person,state,district,vote,name,party,chamber,date,number,title
0,300002,TN,,Yea,Sen. Lamar Alexander [R],Republican,Senate Vote,2017-01-04 12:38:00,1,Motion to Proceed on S.Con.Res. 3: A concurren...
1,300018,WA,,Nay,Sen. Maria Cantwell [D],Democrat,Senate Vote,2017-01-04 12:38:00,1,Motion to Proceed on S.Con.Res. 3: A concurren...
2,300019,DE,,Nay,Sen. Thomas Carper [D],Democrat,Senate Vote,2017-01-04 12:38:00,1,Motion to Proceed on S.Con.Res. 3: A concurren...
3,300023,MS,,Yea,Sen. Thad Cochran [R],Republican,Senate Vote,2017-01-04 12:38:00,1,Motion to Proceed on S.Con.Res. 3: A concurren...
4,300025,ME,,Yea,Sen. Susan Collins [R],Republican,Senate Vote,2017-01-04 12:38:00,1,Motion to Proceed on S.Con.Res. 3: A concurren...


In [166]:
senate_votes.set_index('date', inplace=True)
senate = senate_votes.loc['2017-7':'2017-9']

In [167]:
senate.vote.value_counts()

Yea           3585
Nay           1528
Not Voting     145
Present         43
Name: vote, dtype: int64

In [168]:
senate = senate.loc[senate.vote != 'Not Voting',:]
senate.shape

In [172]:
senate['most'] = senate.groupby([senate.index, 'number','party'])['vote'].transform(mode)

senate.loc[senate.vote == senate.most, 'share_vote'] = 0 # vote as always
senate.loc[senate.vote != senate.most, 'share_vote'] = 1 # vote different from party

ValueError: Wrong number of items passed 2, placement implies 50

In [157]:
senate.share_vote.value_counts()

0.0    77284
1.0     5774
Name: share_vote, dtype: int64

In [81]:
votes = house_votes.append(senate_votes)

# Get questions and votes

In [86]:
# senate

url  = 'https://www.senate.gov/legislative/LIS/roll_call_lists/vote_menu_115_1.xml'
response = requests.get(url)
question = re.findall(r'<title>([^>].*[^<])</title>', response.content.decode('utf-8'))
vote_number = re.findall(r'<vote_number>([^>].*[^<])</vote_number>', response.content.decode('utf-8'))
vote_date = re.findall(r'<vote_date>([^>].*[^<])</vote_date>', response.content.decode('utf-8'))

senate = {'question' : question, 'number' : vote_number, 'date' : vote_date}
df_senate = pd.DataFrame(senate)

# house

url  = 'https://www.senate.gov/legislative/LIS/roll_call_lists/vote_menu_115_1.xml'
response = requests.get(url)
question = re.findall(r'<title>([^>].*[^<])</title>', response.content.decode('utf-8'))
vote_number = re.findall(r'<vote_number>([^>].*[^<])</vote_number>', response.content.decode('utf-8'))
vote_date = re.findall(r'<vote_date>([^>].*[^<])</vote_date>', response.content.decode('utf-8'))

senate = {'question' : question, 'number' : vote_number, 'date' : vote_date}
df_senate = pd.DataFrame(senate)



In [117]:
from selenium import webdriver 
import time

driver = webdriver.PhantomJS(executable_path=phantom_path)
driver.get("https://www.govtrack.us/congress/votes") 
time.sleep(3)

NameError: name 'phantom_path' is not defined

In [116]:
driver.close()

NameError: name 'driver' is not defined

In [16]:
print(driver.find_element_by_xpath('//@href').text)

InvalidSelectorException: Message: {"errorMessage":"The result of the xpath expression \"//@href\" is: [object Attr]. It should be an element.","request":{"headers":{"Accept":"application/json","Accept-Encoding":"identity","Connection":"close","Content-Length":"91","Content-Type":"application/json;charset=UTF-8","Host":"127.0.0.1:62304","User-Agent":"Python http auth"},"httpVersion":"1.1","method":"POST","post":"{\"using\": \"xpath\", \"value\": \"//@href\", \"sessionId\": \"aa7472b0-9b1b-11e7-bb4c-2b0fc7a27664\"}","url":"/element","urlParsed":{"anchor":"","query":"","file":"element","directory":"/","path":"/element","relative":"/element","port":"","host":"","password":"","user":"","userInfo":"","authority":"","protocol":"","source":"/element","queryKey":{},"chunks":["element"]},"urlOriginal":"/session/aa7472b0-9b1b-11e7-bb4c-2b0fc7a27664/element"}}
Screenshot: available via screen


In [13]:
print(driver.find_element_by_id("content").text) 
driver.close()

NoSuchElementException: Message: {"errorMessage":"Unable to find element with id 'content'","request":{"headers":{"Accept":"application/json","Accept-Encoding":"identity","Connection":"close","Content-Length":"88","Content-Type":"application/json;charset=UTF-8","Host":"127.0.0.1:62243","User-Agent":"Python http auth"},"httpVersion":"1.1","method":"POST","post":"{\"using\": \"id\", \"value\": \"content\", \"sessionId\": \"22f7fff0-9b1b-11e7-b6ff-532d6bfcaca6\"}","url":"/element","urlParsed":{"anchor":"","query":"","file":"element","directory":"/","path":"/element","relative":"/element","port":"","host":"","password":"","user":"","userInfo":"","authority":"","protocol":"","source":"/element","queryKey":{},"chunks":["element"]},"urlOriginal":"/session/22f7fff0-9b1b-11e7-b6ff-532d6bfcaca6/element"}}
Screenshot: available via screen


In [11]:
from selenium.webdriver.common.by import By 
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.PhantomJS(executable_path=phantom_path) 
driver.get("") 

try:
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "loadedButton"))) 
finally:
    print(driver.find_element_by_id("content").text)
driver.close()

Here is some important text you want to retrieve!
A button to click!


In [40]:
url = 'https://www.govtrack.us/congress/votes/115-2017/h361/export/csv'

data = pd.read_csv('https://www.govtrack.us/congress/votes/115-2017/h361/export/csv', skiprows=1)
# cr = csv.reader(response)

# for row in cr: 
#     print(row)
# cr = csv.reader(open('https://www.govtrack.us/congress/votes/115-2017/h361/export/csv',"rb"))



# for i in range(100):
#    url = 'https://www.govtrack.us/congress/votes/115-2017/h%s/export/csv' %(i+1)
#    data = r
#    temp = pd.read_csv(url, skiprows = 1)
# data

In [92]:
d = {'1' : 20, '2' :30} 
list(d.items())

In [211]:
import plotly.plotly as py

In [212]:
import plotly
plotly.tools.set_credentials_file(username='sdaza', api_key='Pafrm8TptP1ZQiC9CEva')
import numpy as np

# Add histogram data
x1 = np.random.randn(200)-2  
x2 = np.random.randn(200)  
x3 = np.random.randn(200)+2  
x4 = np.random.randn(200)+4  

# Group data together
hist_data = [x1, x2, x3, x4]

group_labels = ['Group 1', 'Group 2', 'Group 3', 'Group 4']

# Create distplot with custom bin_size
fig = ff.create_distplot(hist_data, group_labels, bin_size=.2)

# Plot!
py.iplot(fig, filename='Distplot with Multiple Datasets')

High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~sdaza/0 or inside your plot.ly account where it is named 'Distplot with Multiple Datasets'
