Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
1307 lines (1064 sloc) 33.4 KB
## pyfec.py
## Python analysis of FEC data as provided by CIR via Kaggle
## October, 2012
##
## v. 0.1
## Rik Belew (rik@electronicArtifacts.com)
## 27 Oct 12
import _mysql
import csv
import cPickle as pickle
import re
import random
import math
DataDir = '/Data/corpora/kaggle_CIR/'
## Database variables & routines
SchemaTableNames = [] # list of table names
SchemaColumns = {} # tblName -> [columnNameList]
TableLoadOrder = ('fec_itemized_cand_2012',
'fec_itemized_comm_2012',
'fec_itemized_ccl_2012',
'fec_itemized_indiv_2012',
'fec_itemized_pas_2012',
'fec_itemized_oth_2012'
)
DBSpecTbl = { \
# rootName -> (idAttr, [ (fkeyAttr,fObj,foAttr,add?) ], [keepAttr] )
"cand": ('candidate_id',[],["candidate_name", "party", "candidate_state", "candidate_office", \
"current_district", "incum_challenger_openseat", "candidate_status", \
"principle_campaign_committee_id", "city", "state", "zip", "fec_election_year"]),
"comm": ('committee_id',[],["committee_name", "treasurers_name", "city", "state", "zip", "nicar_election_year"]),
"ccl": ("linkage_id",[("candidate_id","cand","candidate_id",False), ("committee_id","comm","committee_id",False)], \
["fec_election_year"]),
"indiv": ("transaction_id", [("filer_id","comm","committee_id",False),("contributor_name","contrib","name",True)], \
["transaction_type", "nicar_date","amount","other_id", "nicar_election_year"]),
"oth": ("transaction_id", [("filer_id","comm","committee_id",False),("contributor_name","contrib","name",True)], \
["transaction_type", "nicar_date","amount","other_id", "nicar_election_year"]),
"pas": ("transaction_id", [("filer_id","comm","committee_id",False),("candidate_id","cand","candidate_id",False)],
["transaction_type", "nicar_date","amount","other_id", "nicar_election_year"])
}
class Cand():
pass
class Ccl():
pass
class Comm():
pass
class Indiv():
pass
class Oth():
pass
class Pas():
pass
class Contrib():
pass
CandTbl = {}
CclTbl = {}
CommTbl = {}
IndivTbl = {}
OthTbl = {}
PasTbl = {}
ContribTbl = {}
AllTbls = {}
AllTblNames = ['cand', 'comm', 'contrib', 'ccl', 'indiv', 'pas', 'oth']
class DBConn():
def __init__(self,dbName=''):
self.currDB = _mysql.connect(host="localhost",user="root", passwd="PWHERE", db=dbName)
def get_tableNames(self):
self.currDB.query('show tables;')
r=self.currDB.store_result()
tblNames = []
for ri in range(r.num_rows()):
f = r.fetch_row()
# (('fec_itemized_cand_2012',),)
tname = f[0][0]
tblNames.append(tname)
return tblNames
def get_colNames(self,tblName):
self.currDB.query("select column_name from information_schema.columns where table_name = '%s';" % tblName)
r=self.currDB.store_result()
colNames = []
for ri in range(r.num_rows()):
f = r.fetch_row()
# (('fec_itemized_cand_2012',),)
cname = f[0][0]
colNames.append(cname)
return colNames
def reconn(dbName):
db = DBConn(dbName)
tblNames = db.get_tableNames()
for tname in tblNames:
print '*',tname
for cname in db.get_colNames(tname):
print cname
def getRootName(tname):
# 'fec_itemized_cand_2012' -> 'cand'
return tname[13:-5]
def testCandID(rid,row,colNames):
idoff = rid[0]
idstate = rid[2:4]
iddist = rid[4:6]
off = row[colNames.index("candidate_office")]
state = row[colNames.index("state")]
dist = row[colNames.index("current_district")]
rv = idoff==off and idstate==state and iddist==dist
# if not rv:
# print 'odd CandID?!',rid,off,state,dist
return rv
def bldPyDict(db,tname,trn):
'return dictionary of _id -> tname objects'
db.currDB.query("select * from %s;" % tname)
r=db.currDB.store_result()
nrows = r.num_rows()
print 'bldPyDict: Table %s has %d rows' % (trn,nrows)
colNames = SchemaColumns[trn]
# ASSUME first attribute is UID
assert colNames[0].endswith('_id'), 'ASSUME first attribute is UID?!'
logicSpec = DBSpecTbl[trn]
# (idAttr, [ (fkeyAttr,fObj,foAttr,add?) ], [keepAttr] )
idAttr, fkeyList, keepList = logicSpec
newTbl = {}
allForeignRefTbl = {}
nRptErr = 5
nmiss=0
nadd=0
ndup=0
nnull=0
nnullfkey=0
ncull=0
ncomm=0
ncand=0
noddCand=0
# NB: silently ignore non-2012 entries
if trn=='comm' or trn=='indiv' or trn=='oth' or trn=='pas':
yrAttrib = "nicar_election_year"
else:
yrAttrib = "fec_election_year"
for ri in range(nrows):
row = r.fetch_row()[0] # ?? why first element of row?
assert len(row)==len(colNames), '#attribues != #values?!'
# NB: silently ignore non-2012 entries
idx = colNames.index(yrAttrib)
eyear = row[ idx ]
if eyear != '2012':
continue
idx = colNames.index(idAttr)
rid = row[ idx ]
rid = rid.strip()
if rid=='':
if nnull < nRptErr:
print 'bldPyDict: null id?! %s: currRow=%d' % (trn,ri)
nnull += 1
continue
if rid in newTbl:
if ndup < nRptErr:
print 'bldPyDict: dup id?! %s:%s currRow=%d prevRow=%d' % (trn,rid,ri,newTbl[rid].rowNum)
ndup += 1
continue
if trn=='cand':
match = testCandID(rid,row,colNames)
if not match:
noddCand += 1
# NB: avoid double counting PAS contributions included in other
# ASSUME pas loaded before oth
if (trn=='oth' and rid in AllTbls['pas']):
if ndup < nRptErr:
print 'bldPyDict: oth dup with pas id?! %s:%s currRow=%d' % (trn,rid,ri)
ndup += 1
continue
newObj = None # to kill pydev syntax errors
currfo0 = None
currfo1 = None
currfobj1 = None # actual object retrieved
# ASSUME 2 foreign keys ==> RELATION between foreign objects
# create newObj of type trn with id and any additional attributes
# add its ID to lists in both foreign objects
if len(fkeyList)==2:
fkTbl = {}
fka0,fo0,foa0,addP0 = fkeyList[0]
fka1,fo1,foa1,addP1 = fkeyList[1]
currForeignTbl0 = AllTbls[fo0]
currForeignTbl1 = AllTbls[fo1]
idx = colNames.index(fka0)
fkey0 = row[ idx ]
fkey0 = fkey0.strip()
if fkey0=='':
if nnullfkey < nRptErr:
print 'bldPyDict: null fkey0?! %s:%s currRow=%d' % (trn,rid,ri)
nnullfkey += 1
continue
if fkey0 in currForeignTbl0:
# cull dormant committees!
if fo0=='comm' and fkey0 in Comm2CullTbl:
ncull += 1
continue
currfo0 = fkey0
elif addP0:
if nadd < nRptErr:
print 'bldPyDict: adding new %s:%s from %s:%s' % (fo0,fkey0,trn,rid)
nadd += 1
newFObj = eval('%s()' % (fo0.capitalize()))
setattr(newFObj,"id",fkey0)
currForeignTbl0[fkey0] = newFObj
currfo0 = fkey0
else:
if nadd < nRptErr:
print 'bldPyDict: missing fkey0 %s:%s from %s:%s' % (fo0,fkey0,trn,rid)
fobj0 = None
nmiss += 1
# NB: for indiv and other, prefer other_id if it exists over free-text contrib name
# 2do: what about pas with other_id?!
if trn=='indiv' or trn=='oth':
idx = colNames.index('other_id')
fkey1 = row[ idx ]
fkey1 = fkey1.strip()
if fkey1!='':
if fkey1.startswith('C'):
if fkey1 in AllTbls['comm']:
currfo1 = fkey1
currfobj1 = AllTbls['comm'][currfo1]
ncomm += 1
elif fkey1.startswith('P') or fkey1.startswith('S') or fkey1.startswith('H'):
if fkey1 in AllTbls['cand']:
currfo1 = fkey1
currfobj1 = AllTbls['cand'][currfo1]
ncand += 1
else:
print 'bldPyDict: odd other_id?! %s in %s:%s' % (fkey1,trn,rid)
# if trn=='indiv' or trn=='oth' and haven't found valid other_id, proceed normally
if not currfo1:
idx = colNames.index(fka1)
fkey1 = row[ idx ]
fkey1 = fkey1.strip()
if fkey1=='':
if nnullfkey < nRptErr:
print 'bldPyDict: null fkey1?! %s:%s currRow=%d' % (trn,rid,ri)
nnullfkey += 1
continue
if fkey1 in currForeignTbl1:
# cull dormant committees!
if fo1=='comm' and fkey1 in Comm2CullTbl:
ncull += 1
continue
currfo1 = fkey1
currfobj1 = currForeignTbl1[currfo1]
elif addP1:
if nadd < nRptErr:
print 'bldPyDict: adding new %s:%s from %s:%s' % (fo1,fkey1,trn,rid)
nadd += 1
newFObj = eval('%s()' % (fo1.capitalize()))
setattr(newFObj,"id",fkey1)
currForeignTbl1[fkey1] = newFObj
currfo1 = fkey1
currfobj1 = currForeignTbl1[currfo1]
else:
if nmiss < nRptErr:
print 'bldPyDict: mising fkey %s:%s from %s:%s' % (fo1,fkey1,trn,rid)
nmiss += 1
## error checking will have caused some to 'continue' before this
## those with foreign keys have additional criteria
if len(fkeyList)==2 and not(currfo0 and currfo1):
continue
exec('newObj = %s()' % (trn.capitalize()))
setattr(newObj,"id",rid)
setattr(newObj,'rowNum',ri)
if len(keepList)>0:
for ka in keepList:
idx = colNames.index(ka)
val = row[ idx ]
setattr(newObj,ka,val)
if currfo0 and currfo1:
setattr(newObj,fo0,currfo0)
setattr(newObj,fo1,currfo1)
fobj0 = currForeignTbl0[currfo0]
currList = getattr(fobj0,trn,[])
currList.append(rid)
setattr(fobj0,trn,currList)
# NB, currfobj1 bound above, as might be of different types
currList = getattr(currfobj1,trn,[])
currList.append(rid)
setattr(currfobj1,trn,currList)
newTbl[rid] = newObj
print 'bldPyDict: done. N_%s=%d NDup=%d NMiss=%d NAdd=%d NCull=%d NNull=%d NNullFK=%d NCand' % (trn,len(newTbl),ndup,nmiss,nadd,ncull,nnull,nnullfkey)
if trn=='cand':
print '\tNoddCand=%d' % (noddCand)
return newTbl
Comm2CullTbl = {}
def parseAll(dbName,commCullFile=None):
db = DBConn(dbName)
global SchemaTableNames
SchemaTableNames = db.get_tableNames()
global SchemaColumns
for tname in SchemaTableNames:
rootName = getRootName(tname)
# NB rootNames are keys, tname needed by db
SchemaColumns[rootName] = db.get_colNames(tname)
global AllTbls
AllTbls['contrib'] = ContribTbl
if commCullFile:
dormantFile = DataDir+commCullFile
print ('parseAll: loading dormant committees from %s...' % dormantFile),
global Comm2CullTbl
Comm2CullTbl = {}
csvDictReader = csv.DictReader(open(dormantFile,"r"),delimiter="\t")
for i,entry in enumerate(csvDictReader):
cid = entry['CommID']
cname = entry['Comm.Name']
Comm2CullTbl[cid] = cname
Comm2CullTbl[cid]
print 'done. NDormant=%d' % (len(Comm2CullTbl))
for tname in TableLoadOrder:
rootName = getRootName(tname)
currTbl = bldPyDict(db,tname,rootName)
AllTbls[rootName] = currTbl
print 'done.'
for tblName in AllTbls.keys():
tbl = AllTbls[tblName]
pklFile = DataDir+'%sTbl.pkl' % tblName
print 'Pickling %d to %s...' % (len(tbl),pklFile)
# can't use cPickle, cuz of my special class objects (:
pstr = open(pklFile, 'w')
pickle.Pickler(pstr).dump(tbl)
pstr.close()
tbl = {}
del AllTbls[tblName]
def loadAllTbls():
global AllTbls
AllTbls = {}
for tblName in AllTblNames:
pklFile = DataDir+'%sTbl.pkl' % tblName
print ('Loading from %s...' % pklFile),
pstr = open(pklFile)
tbl = pickle.Unpickler(pstr).load()
AllTbls[tblName] = tbl
pstr.close()
print ' done. N=%d' % len(tbl)
print 'loadAllTbls: done.'
def loadOneTbl(tname):
pklFile = DataDir+'%sTbl.pkl' % tname
print ('Loading from %s...' % pklFile),
pstr = open(pklFile)
tbl = pickle.Unpickler(pstr).load()
pstr.close()
print ' done. N=%d' % len(tbl)
return tbl
## Analysis routines
def analCand():
'''handle against Type24 payments with commid_
drop NTopComm threshold
maintain committee cumms'''
global AllTbls
# 2do: collapse all these loadOneTbl() paras to function, but with global consequence!?
if 'cand' in AllTbls:
candTbl = AllTbls['cand']
else:
candTbl = loadOneTbl('cand')
AllTbls['cand'] = candTbl
if 'pas' in AllTbls:
pasTbl = AllTbls['pas']
else:
pasTbl = loadOneTbl('pas')
AllTbls['pas'] = pasTbl
outs = open(DataDir+'candSumm.csv','w')
outs.write('CandID,Name,Party,State,Office,District,Comm1,NComm,Tot,NContrib,NegTot,NNeg\n')
outs2 = open(DataDir+'cand2comm.csv','w')
outs2.write('CandID,CommID,Amt\n')
nzero = 0
ncand = 0
nedge=0
cummCommTbl = {}
# negAmtTbl = {} # typeCode -> Total
for cid,cand in candTbl.items():
tot = 0
antitot = 0
commTbl = {}
ncontrib = 0
nanti=0
comm1 = cand.principle_campaign_committee_id
if not ('pas' in dir(cand) and len(cand.pas)>0):
nzero += 1
continue
ncand += 1
ncontrib = len(cand.pas)
for contrib in cand.pas:
# ASSUME it's there
pas = pasTbl[contrib]
ptype = pas.transaction_type
amt = int(round(float(pas.amount)))
commID = pas.comm
negAmtFnd=False
# http://influenceexplorer.com/about/methodology/campaign_finance
# discovered 24 Oct 12 !
#
# In deciding what contributions to include, we attempt to follow the methodology of CRP and NIMSP:
#
# For federal contributions from individuals, we only count contributions with an FEC transaction type of 10, 11, 15, 15e, 15j or 22y.
# For federal contributions from organizations, we only count contributions with an FEC transaction type of 24k, 24r or 24z.
# We ignore all contributions with a category code of starting with 'Z', except for codes beginning with 'Z90' (candidate self-contributions), which are included.
# treat neg amounts as anti
if amt < 0:
amt = - amt
negAmtFnd=True
# NB: "against" transaction types
if negAmtFnd or ptype == '24A' or ptype == '24N':
commID += '_'
antitot += amt
nanti += 1
else:
tot += amt
if commID in commTbl:
commTbl[commID] += amt
else:
commTbl[commID] = amt
commKeys = commTbl.keys()
cname = cand.candidate_name
cname = cname.replace('"',"''")
outs.write('%s,"%s",%s,%s,%s,%s,%s,%d,%d,%d,%d,%d\n' % \
(cid,cname, cand.party, cand.candidate_state, \
cand.candidate_office, cand.current_district, comm1, \
len(commTbl),tot,ncontrib, antitot,nanti))
for ck in commKeys:
nedge+=1
outs2.write('%s,%s,%d\n' % (cid,ck,commTbl[ck]))
if ck in cummCommTbl:
cummCommTbl[ck] += commTbl[ck]
else:
cummCommTbl[ck] = commTbl[ck]
outs.close()
outs2.close()
print 'analCand: NCand=%d Nzero=%d NEdge=%d' % (ncand,nzero,nedge)
# see /Data/corpora/kaggle_CIR/logs/analCand2_121024.log
# print '# PType Amt'
# for ptype,amt in negAmtTbl.items():
# print ptype,amt
# Create committee summary, combining pos and anti contrib
combineCommTbl = {}
for k,v in cummCommTbl.items():
if k.endswith('_'):
basek = k[:-1]
anti = 1
else:
basek = k
anti = 0
currSumms = combineCommTbl.get(basek, [0,0] )
currSumms[anti] = cummCommTbl[k]
combineCommTbl[basek] = currSumms
outs3 = open(DataDir+'commCandTot.csv','w')
outs3.write('CommID,Pos,Neg,Tot\n')
commKeys = combineCommTbl.keys()
commKeys.sort()
for commID in commKeys:
pos = combineCommTbl[commID][0]
neg = combineCommTbl[commID][1]
outs3.write('%s,%d,%d,%d\n' % (commID,pos,neg,pos+neg) )
outs3.close()
SmallGivePrefix = 'otherStrata'
def analTopContrib(ntop=300):
global AllTbls
if 'contrib' in AllTbls:
contribTbl = AllTbls['contrib']
else:
contribTbl = loadOneTbl('contrib')
AllTbls['contrib'] = contribTbl
if 'indiv' in AllTbls:
indivTbl = AllTbls['indiv']
else:
indivTbl = loadOneTbl('indiv')
AllTbls['indiv'] = indivTbl
if 'oth' in AllTbls:
othTbl = AllTbls['oth']
else:
othTbl = loadOneTbl('oth')
AllTbls['oth'] = othTbl
contribList = []
allContrib = 0
for cid,contrib in contribTbl.items():
itot = 0
nindiv=0
otot = 0
nother=0
if 'indiv' in dir(contrib):
nindiv = len(contrib.indiv)
for contrib2 in contrib.indiv:
indiv = indivTbl[contrib2]
amt = int(round(float(indiv.amount)))
if amt<0:
amt = -amt
itot += amt
if 'oth' in dir(contrib):
nother= len(contrib.oth)
for contrib2 in contrib.oth:
oth = othTbl[contrib2]
amt = int(round(float(oth.amount)))
if amt<0:
amt = -amt
otot += amt
bothTot = itot+otot
allContrib += (itot+otot)
contribList.append( (cid,bothTot,nindiv,itot,nother,otot) )
ncontrib = len(contribTbl)
assert ncontrib == len(contribList), 'analTopContrib: dropped contrib?!'
print 'analTopContrib: allContrib = %e NContrib=%d ' % (allContrib,len(contribList))
print 'analTopContrib: sorting contribList...'
contribList.sort(key=(lambda x: x[1]),reverse=True)
contribRpt = DataDir+'topContrib.csv'
print 'writing to ',contribRpt
outs = open(contribRpt,'w')
outs.write('# Contrib,NIndiv,ITot,NOther,OTot,BothTot\n')
for ci, cinfo in enumerate(contribList[:ntop]):
cid,bothtot,nindiv,itot,nother,otot = cinfo
outs.write('"%s",%d,%d,%d,%d,%d\n' % (cid,nindiv,itot,nother,otot,bothtot))
outs.close()
def analContrib(computeStrata=False,outOtherComm=False,filterContrib=False):
'''v2: ASSUME contrib4comm_top and cand2comm lists already build
form cohorts of BigSpenders and then strata of others into
FracPercent bands
create pseudo contribNames for all but biggest spenders
and produce contrib,committee,amt records for graph
'''
c4cTbl = {}
c2cTbl = {}
if filterContrib:
contrib4commf = '/Data/corpora/kaggle_CIR/analData/contrib4comm_top.csv'
csvDictReader = csv.DictReader(open(contrib4commf,"r"))
for i,entry in enumerate(csvDictReader):
# CommID,CommName,Tot
cid = entry['CommID']
c4cTbl[cid] = int(entry['Tot'])
cand2commf = '/Data/corpora/kaggle_CIR/analData/cand2comm.csv'
csvDictReader = csv.DictReader(open(cand2commf,"r"))
for i,entry in enumerate(csvDictReader):
# CandID,CommID,Amt
# Source,Target,Weight
cid = entry['Target']
c2cTbl[cid] = (cid,entry['Amt'])
global AllTbls
if 'contrib' in AllTbls:
contribTbl = AllTbls['contrib']
else:
contribTbl = loadOneTbl('contrib')
AllTbls['contrib'] = contribTbl
if 'indiv' in AllTbls:
indivTbl = AllTbls['indiv']
else:
indivTbl = loadOneTbl('indiv')
AllTbls['indiv'] = indivTbl
if 'oth' in AllTbls:
othTbl = AllTbls['oth']
else:
othTbl = loadOneTbl('oth')
AllTbls['oth'] = othTbl
if 'comm' in AllTbls:
commTbl = AllTbls['comm']
else:
commTbl = loadOneTbl('comm')
AllTbls['comm'] = commTbl
NStrata = 10
# 23 Oct 12
StrataThreshVec = [1000, 2185, 3500, 6000, 10900, 25000, 48700, 80100, 144700]
# V3: provide for pre-computed StrataThreshVec
if computeStrata:
contribList = []
allContrib = 0
for cid,contrib in contribTbl.items():
bothTot = 0
if 'indiv' in dir(contrib):
for contrib2 in contrib.indiv:
# ASSUME it's there
indiv = indivTbl[contrib2]
amt = int(round(float(indiv.amount)))
# NB: what are negative numbers about?!
# but they only total $1.5e6; ignore
if amt<0:
amt = -amt
bothTot += amt
if 'oth' in dir(contrib):
for contrib2 in contrib.oth:
# ASSUME it's there
oth = othTbl[contrib2]
amt = int(round(float(oth.amount)))
# NB: what are negative numbers about?!
# but they only total $1.5e6; ignore
if amt<0:
amt = -amt
bothTot += amt
allContrib += bothTot
contribList.append( (cid,bothTot) )
ncontrib = len(contribTbl)
assert ncontrib == len(contribList), 'analContrib2: dropped contrib?!'
print 'analContrib2: allContrib = %e NContrib=%d ' % (allContrib,len(contribList))
print 'analContrib2: sorting contribList...'
contribList.sort(key=(lambda x: x[1]))
runSum = 0
stratSum = 0
si = 0
stratThreshFrac = allContrib / float(NStrata)
strataVec = [0 for i in range(NStrata)]
# computed 18 Oct 12
zeroVal = True
print '# ci,amt,stratSum,runSum'
for ci, cinfo in enumerate(contribList):
cid,amt = cinfo
if zeroVal and amt < 1:
continue
elif zeroVal:
print 'first %d zero!' % ci
zeroVal = False
stratSum += amt
runSum += amt
if stratSum >= stratThreshFrac:
strataVec[si] = amt
print '%d,%d,%d,%d' % (ci,amt,stratSum,runSum)
stratSum = 0
si += 1
if strataVec != StrataThreshVec:
print 'analContrib2: new strataVec != StrataThreshVec?!\n\tnew=%s\n\tcached=%s' % (strataVec,StrataThreshVec)
def getStrataIdx(samt,svec=StrataThreshVec):
'''return first i s.t. svec[i] > amt
last category of BigSpenders > svec[-1] return -1
'''
for i,amt in enumerate(svec):
if samt < amt:
return i
return -1
TestVec = [10,1500,2500,5001,7500,10000,20000,40000,100000,1e6]
# 23 Oct 12
TestStrataVec = [0, 1, 2, 3, 4, 4, 5, 6, 8, -1]
assert TestStrataVec == [getStrataIdx(amt) for amt in TestVec]
## End of strata code
contribRpt = DataDir+'contribSumm.csv'
print 'writing to ',contribRpt
outs = open(contribRpt,'w')
outs.write('# Contrib,NIndiv,ITot,NOther,OTot\n')
print 'analContrib2: Aggregating across contributions'
outStrataTbl = {}
# Accumulate across committees
for cid,contrib in contribTbl.items():
nicontrib = 0
itot=0
nocontrib = 0
otot=0
con4comTbl = {}
if 'indiv' in dir(contrib):
nicontrib = len(contrib.indiv)
for contrib in contrib.indiv:
indiv = indivTbl[contrib]
amt = int(round(float(indiv.amount)))
commID = indiv.comm
# NB: only output records to 'domoinant' committees wrt/ contrib4comm or comm2cand
if filterContrib and not(commID in c4cTbl or commID in c2cTbl):
if outOtherComm:
commID = 'OtherComm'
else:
continue
# NB: "against" transaction types
if indiv.transaction_type == '24A' or indiv.transaction_type == '24N':
commID = commID+'_'
if amt<0:
amt = -amt
itot += amt
if commID in con4comTbl:
con4comTbl[commID] += amt
else:
con4comTbl[commID] = amt
if 'oth' in dir(contrib):
nocontrib = len(contrib.oth)
for contrib in contrib.oth:
oth = othTbl[contrib]
amt = int(round(float(oth.amount)))
commID = oth.comm
# NB: only output records to 'domoinant' committees wrt/ contrib4comm or comm2cand
if filterContrib and not(commID in c4cTbl or commID in c2cTbl):
if outOtherComm:
commID = 'OtherComm'
else:
continue
# NB: "against" transaction types
if oth.transaction_type == '24A' or oth.transaction_type == '24N':
commID = commID+'_'
if amt<0:
amt = -amt
otot += amt
if commID in con4comTbl:
con4comTbl[commID] += amt
else:
con4comTbl[commID] = amt
outs.write('"%s",%d,%d,%d,%d\n' % (cid,nicontrib,itot,nocontrib,otot))
totContrib = itot + otot
strataIdx = getStrataIdx(totContrib)
outContrib = None
if strataIdx == -1:
outContrib = cid
# print outContrib,totContrib,len(con4comTbl)
else:
outContrib = '%s_%s' % (SmallGivePrefix,strataIdx)
if outContrib not in outStrataTbl:
outStrataTbl[outContrib] = {}
for commID in con4comTbl.keys():
amt = con4comTbl[commID]
if commID in outStrataTbl[outContrib]:
outStrataTbl[outContrib][commID] += amt
else:
outStrataTbl[outContrib][commID] = amt
outs.close()
contrib2commf = DataDir+'contrib2comm.csv'
print 'analContrib2: outputing %d contributors to %s' %(len(outStrataTbl),contrib2commf)
nedge=0
outs = open(contrib2commf,'w')
outs.write('ContribID,CommID,Amt\n')
for contrib,commTbl in outStrataTbl.items():
for commID in commTbl.keys():
amt = outStrataTbl[contrib][commID]
outs.write('"%s",%s,%d\n' % (contrib,commID,amt))
nedge += 1
outs.close()
print 'analContrib2 done. NEdge=%d' % (nedge)
def analComm():
global AllTbls
if 'comm' in AllTbls:
commTbl = AllTbls['comm']
else:
commTbl = loadOneTbl('comm')
AllTbls['comm'] = commTbl
if 'indiv' in AllTbls:
indivTbl = AllTbls['indiv']
else:
indivTbl = loadOneTbl('indiv')
AllTbls['indiv'] = indivTbl
if 'oth' in AllTbls:
othTbl = AllTbls['oth']
else:
othTbl = loadOneTbl('oth')
AllTbls['oth'] = othTbl
if 'pas' in AllTbls:
pasTbl = AllTbls['pas']
else:
pasTbl = loadOneTbl('pas')
AllTbls['pas'] = pasTbl
nindiv = 0
noth = 0
npas = 0
nzero=0
ncomm=0
nc2cedge=0
outs = open(DataDir+'commSumm.csv','w')
# HACK: bars used because commas in committee names(:
outs.write('CID,Name,NIndiv,ISum,NOther,OSum,NPas,PSum\n')
for cid,comm in commTbl.items():
indivLen=0
othLen=0
pasLen=0
isum = 0
osum = 0
psum = 0
try:
if 'indiv' in dir(comm):
indivLen=len(comm.indiv)
nindiv += 1
for indivID in comm.indiv:
indiv = indivTbl[indivID]
isum += float(indiv.amount)
if 'oth' in dir(comm):
othLen=len(comm.oth)
noth += 1
for othID in comm.oth:
oth = othTbl[othID]
osum += float(oth.amount)
nc2cedge += othLen
if 'pas' in dir(comm):
pasLen=len(comm.pas)
npas += 1
for pasID in comm.pas:
pas = pasTbl[pasID]
psum += float(pas.amount)
# HACK: bars used because commans in committee names(:
if indivLen+othLen+pasLen==0:
nzero += 1
else:
ncomm += 1
outs.write('%s,"%s",%d,%d,%d,%d,%d,%d\n' % \
(cid,comm.committee_name,indivLen,int(round(isum)),othLen,int(round(osum)),pasLen,int(round(psum))))
except Exception,e:
print 'Err? cid=%s name=%s e=%s' % (cid,comm.committee_name,e)
outs.close()
print 'analComm: NComm=%d nzero=%d NC2CEdge=%d' % (ncomm,nzero,nc2cedge)
def bldRaceTbl():
'raceID -> [(candID,state,party,ico)]'
candTbl = loadOneTbl('cand')
raceTbl = {}
for cid,cand in candTbl.items():
party = cand.party.strip()
state = cand.candidate_state.strip()
office = cand.candidate_office.strip()
district = cand.current_district.strip()
# raceID = '_'.join([state,office,district])
raceID = '%s_%s%s' % (office,state,district)
ico = cand.incum_challenger_openseat
if raceID in raceTbl:
raceTbl[raceID].append( (cid,state,party,ico) )
else:
raceTbl[raceID] = [ (cid,state,party,ico) ]
print '# RaceTbl\nRaceID,NCand'
for raceID in raceTbl.keys():
# if len(raceTbl[raceID]) != 2:
# print 'odd race ~2?!',raceID,raceTbl[raceID]
# else:
# parties = [cinfo[1] for cinfo in raceTbl[raceID] ]
# parties = parties.sort()
# if parties != ['DEM','REP']:
# print 'odd race D/R?!',raceID,raceTbl[raceID]
# else:
# nnorm += 1
print '%s,%d' % (raceID,len(raceTbl[raceID]))
# print 'bldRaceTbl: NRace=%d NNorm=%d' % (len(raceTbl),nnorm)
print 'bldRaceTbl: NRace=%d' % (len(raceTbl))
return raceTbl
def bldGraph(contribTopCommFile,candTopCommFile,candComm1File):
'''create nodes for all contrib and cand from contrib2comm and cand2comm edge sets, resp
ASSUME subsetComm: use provided files as filters: committees NOT in ANY of
these sets are dropped
include comm2comm other links only among this set
fold commID and commID_ nodes together but add "anti" edge type attribute
collect displayable node attributes
'''
## Load tables for committees to be kept
nodeTbl = {} # id -> [type, attribList, edgeList]
keepCommTbl = {}
for fi,inf in enumerate([contribTopCommFile,candTopCommFile,candComm1File]):
csvDictReader = csv.DictReader(open(DataDir+'analData/'+inf,"r"))
for i,entry in enumerate(csvDictReader):
commID = entry['CommID']
if commID not in keepCommTbl:
keepCommTbl[commID] = fi
nodeTbl[commID] = ['comm',[] ]
ndropEdge = 0
dropComm2comm = 0
## build graph associated with contrib2comm
csvDictReader = csv.DictReader(open(DataDir+'analData/'+'contrib2comm.csv',"r"))
for i,entry in enumerate(csvDictReader):
# ContribID,CommID,Amt
contribID = entry['ContribID']
# comment lines
if contribID.startswith('# '):
continue
anti = 0
commID = entry['CommID']
if commID.endswith('_'):
anti = 1
commID = commID[:-1]
if commID not in keepCommTbl:
ndropEdge += 1
continue
amt = int(entry['Amt'])
if contribID not in nodeTbl:
nodeTbl[contribID] = ['contrib',[] ]
nodeTbl[contribID][1].append( (commID,amt,anti) )
## build graph associated with cand2comm
csvDictReader = csv.DictReader(open(DataDir+'analData/'+'cand2comm.csv',"r"))
for i,entry in enumerate(csvDictReader):
# CandID,CommID,Amt
candID = entry['CandID']
# comment lines
if candID.startswith('# '):
continue
anti = 0
commID = entry['CommID']
if commID.endswith('_'):
anti = 1
commID = commID[:-1]
if commID not in keepCommTbl:
ndropEdge += 1
continue
amt = int(entry['Amt'])
if candID not in nodeTbl:
nodeTbl[candID] = ['cand',[], ]
# NB: edge directed FROM comm TO cand
nodeTbl[commID][1].append( (candID,amt,anti))
## Add comm2comm edges
global AllTbls
if 'oth' in AllTbls:
othTbl = AllTbls['oth']
else:
othTbl = loadOneTbl('oth')
AllTbls['oth'] = othTbl
c2cTbl = {}
for oid,oth in othTbl.items():
comm1ID = oth.comm
comm2ID = oth.contrib
amt = int(round(float(oth.amount)))
negAmtFnd = False
if amt < 0:
amt = - amt
negAmtFnd=True
if not(comm1ID in keepCommTbl and comm2ID in keepCommTbl):
# NB: can't count dropped comm2comm edges here(:
dropComm2comm += amt
continue
otype = oth.transaction_type
# NB: "against" transaction types
if negAmtFnd or otype == '24A' or otype == '24N':
comm1ID += '_'
c2ck = (comm1ID,comm2ID)
if c2ck in c2cTbl:
c2cTbl[c2ck] += amt
else:
c2cTbl[c2ck] = amt
for c2ck,amt in c2cTbl.items():
(comm1ID,comm2ID) = c2ck
anti = 0
if comm1ID.endswith('_'):
anti = 1
comm1ID = comm1ID[:-1]
nodeTbl[comm1ID][1].append( (comm2ID,amt,anti))
ndTypeTbl={}
for k,nd in nodeTbl.items():
ndType = nd[0]
ndTypeTbl[ndType] = ndTypeTbl.get(ndType,0)+1
print 'bldGraph: NNodes=%d NDrop=%d NC2CAmtDrop=%d' % (len(nodeTbl),ndropEdge,dropComm2comm)
for ndType in ndTypeTbl.keys():
print '\t%s %d' % (ndType,ndTypeTbl[ndType])
global AllTbls
if 'contrib' in AllTbls:
contribTbl = AllTbls['contrib']
else:
contribTbl = loadOneTbl('contrib')
AllTbls['contrib'] = contribTbl
if 'comm' in AllTbls:
commTbl = AllTbls['comm']
else:
commTbl = loadOneTbl('comm')
AllTbls['comm'] = commTbl
if 'cand' in AllTbls:
candTbl = AllTbls['cand']
else:
candTbl = loadOneTbl('cand')
AllTbls['cand'] = candTbl
candKeyTbl = {'H': [], 'P': [], 'S':[]}
conList = []
for nid,nd in nodeTbl.items():
ndType,nbrList = nd
try:
if ndType == 'cand':
assert len(nbrList)==0, "cand don't have out neighbors?!"
cand = candTbl[nid]
label = cand.candidate_name
party = cand.party
office = cand.candidate_office
nodeTbl[nid].append( (0,ndType,label,party,office) )
candKeyTbl[office].append(nid)
elif ndType == 'comm':
comm = commTbl[nid]
label = comm.committee_name
state = comm.state
zip = comm.zip
totOut = sum([amt for (id2,amt,anti) in nbrList ])
nodeTbl[nid].append( (totOut,ndType,label,state,zip) )
elif ndType == 'contrib':
totOut = sum([amt for (id2,amt,anti) in nbrList ])
label = nid
nodeTbl[nid].append( (totOut,ndType,label,None,None))
conList.append( (nid,totOut) )
except Exception, e:
print 'cant build node?! ',nid,ndType,len(nbrList)
print e
continue
# horizontally stratify node types
contribLong = 37.
candLong = 44.
commLong = 40.5
# use positive latitudess (:
NMinLat = 92.
SMaxLat = 100.
partyOffset = 0.25
npcand = len(candKeyTbl['P'])
nhcand = len(candKeyTbl['H'])
nscand = len(candKeyTbl['S'])
ncand = npcand+nhcand+nscand
candLatIncr = float(SMaxLat-NMinLat) / ncand
contribLatIncr = float(SMaxLat-NMinLat) / len(conList)
print 'bldGraph: NPres=%d NSenate=%d NHouse=%d LatIncr=%6.3f' % (npcand,nscand,nhcand,candLatIncr)
sortCandKeyTbl = {}
for office in ['P','S','H']:
nidList = candKeyTbl[office]
nidList.sort()
sortCandKeyTbl[office] = nidList
conList.sort(key=(lambda x: x[1]),reverse=True)
conList2 = [nid for nid,totOut in conList]
allCandKeys = sortCandKeyTbl['P']+sortCandKeyTbl['S']+sortCandKeyTbl['H']
nodef = DataDir + 'nodes.csv'
print 'bldGraph: outputting nodes to',nodef
outs = open(nodef,'w')
outs.write('ID,type,LogOut,Label,Attr1,Attr2,Long,Lat\n')
for nid,nd in nodeTbl.items():
if len(nd) < 3:
print 'empty node?!',nid,nd
continue
ndType,nbrList,ndAttrList = nd
(totOut,ndType,label,attr1,attr2) = ndAttrList
# log_10
try:
logOut = math.log(totOut,10)
except Exception,e:
# print 'bad totOut?!',nid,totOut,e,e.args
logOut = 0.
## longitude based on node type
if ndType=='contrib':
# jiggle contributors
long = contribLong + random.uniform(-0.5,0.5)
elif ndType=='cand':
# align DEM/REP
if attr1=='DEM':
long = candLong - partyOffset
elif attr1=='REP':
long = candLong + partyOffset
else:
long = candLong
else:
long = random.uniform(contribLong+2.,candLong-2.)
## latidude reflects candidates' race, or contrib giving rank
if ndType=='cand':
# office = attr2
idx = allCandKeys.index(nid)
lat = NMinLat + idx * candLatIncr
elif ndType=='contrib':
if nid.startswith(SmallGivePrefix):
strata = int(nid[-1])
lat = NMinLat - 1. + 0.1 *strata
else:
idx = conList2.index(nid)
lat = NMinLat + idx * contribLatIncr
else:
lat = random.uniform(NMinLat,SMaxLat)
outs.write('"%s",%s,%6.3f,"%s",%s,%s,%5.2f,%5.2f\n' % (nid,ndType,logOut,label,attr1,attr2,long,lat))
outs.close()
nedge = 0
outs = open(DataDir+'edges.csv','w')
outs.write('Source,Target,Weight,Anti\n')
for nid,nd in nodeTbl.items():
if len(nd) < 3:
print 'empty node?!',nid,nd
continue
ndType,nbrList,ndAttrList = nd
for id2,amt,anti in nbrList:
try:
logAmt = math.log(amt,10)
except Exception,e:
# print 'bad totOut?!',nid,totOut,e,e.args
logOut = 0.
outs.write('"%s","%s",%6.3f,%d\n' % (nid,id2,logAmt,anti))
nedge += 1
outs.close()
print 'bldGraph: NEdge=%d' % (nedge)
## top level routines to call
# reconn('fec')
# parseAll('fec','dormantComm.txt')
# parseAll('fec')
# loadAllTbls()
# analCand()
# bldRaceTbl()
# analComm()
# analContrib()
# analTopContrib(1000)
#comm1f = 'candComm1_uniq.csv'
#topContribf = 'comm_topContrib_1e6.csv'
#topCandf = 'comm_topCand_1e5.csv'
#bldGraph(topContribf,topCandf,comm1f)