We then parse the content of each file extracting the following information:

1. Name, e.g. RICHARD SHELBY
2. Party, e.g., Republican
3. location, e.g., Tuscaloosa, AL
4. born_in, e.g, Birmingham, AL 
5. birthdate, e.g., May 6, 1934
6. education, e.g., "attended the public schools;  B.A., University of Alabama, 1957; LL.B., University of Alabama School of Law, 1963;"
7. professional, e.g., "attorney; admitted to the Alabama Bar in 1961 and commenced practice in  Tuscaloosa; member, Alabama State Senate, 1970-78; law clerk, Supreme Court of Alabama, 1961-62; city prosecutor, Tuscaloosa, 1963-71; U.S. Magistrate, Northern District of Alabama, 1966-70; Special Assistant  Attorney General, State of Alabama, 1969-71; chairman, Legislative 
Council of the Alabama Legislature, 1977-78; former president, 
Tuscaloosa County Mental Health Association; member, Alabama Code 
Revision Committee, 1971-75; member, Phi Alpha Delta legal fraternity, 
Tuscaloosa County; Alabama and American Bar Associations; First 
Presbyterian Church of Tuscaloosa; Exchange Club; American Judicature 
Society; Alabama Law Institute;"
8. married, e.g., "the former Annette Nevin in 1960"
9. children, e.g., "Richard C., Jr., and Claude Nevin"
10. committees, e.g., "chair, Appropriations; Banking, Housing, and Urban Affairs; Environment and  Public Works; Joint Committee on the Library; Rules and Administration;  elected to the 96th Congress on November 7, 1978; reelected to the three  succeeding Congresses; elected to the U.S. Senate on November 4, 1986; reelected to each succeeding Senate term."
11. URL, e.g., http://jones.senate.gov etc.
12. Filename, e.g., CDIR-2018-10-29-STATISTICALINFORMATION-2.htm

In [1]:
import re
import calendar
import requests
import pandas as pd
from glob import glob
import re

In [2]:
data = []
for fn in glob('../data/text/*.txt'):
    m = re.match('.*?(\d+)\.txt', fn)
    if m:
        cong = int(m.group(1))
    data.append({'congress': cong, 'textfile': fn})
data

[{'congress': 102, 'textfile': '../data/text/102.txt'},
 {'congress': 98, 'textfile': '../data/text/98.txt'},
 {'congress': 103, 'textfile': '../data/text/103.txt'},
 {'congress': 104, 'textfile': '../data/text/104.txt'},
 {'congress': 97, 'textfile': '../data/text/97.txt'},
 {'congress': 100, 'textfile': '../data/text/100.txt'},
 {'congress': 101, 'textfile': '../data/text/101.txt'},
 {'congress': 99, 'textfile': '../data/text/99.txt'}]

In [3]:
df = pd.DataFrame(data)
df

Unnamed: 0,congress,textfile
0,102,../data/text/102.txt
1,98,../data/text/98.txt
2,103,../data/text/103.txt
3,104,../data/text/104.txt
4,97,../data/text/97.txt
5,100,../data/text/100.txt
6,101,../data/text/101.txt
7,99,../data/text/99.txt


In [4]:
df.sort_values('congress', inplace=True)
df

Unnamed: 0,congress,textfile
4,97,../data/text/97.txt
1,98,../data/text/98.txt
7,99,../data/text/99.txt
5,100,../data/text/100.txt
6,101,../data/text/101.txt
0,102,../data/text/102.txt
2,103,../data/text/103.txt
3,104,../data/text/104.txt


In [5]:
total = 0
bios = []
for i, r in df.iterrows():
    with open(r.textfile, encoding='latin-1') as f:
        text = f.read()
    count = 0
    for m in re.finditer('^(\s+[A-Z\s\,\.]{4,}.*)$', text, flags=re.M):
        t = m.group(1)
        for l in t.split('\n'):
            if len(l) > 50:
                if l.startswith('  ') and l[2] != ' ':
                    bios.append({'congress': r.congress, 'biography': l.strip()})
                    #print(l)
                    #print('-'*50)
                    count += 1
    print(r.congress, count)
    total += count
total

97 526
98 523
99 530
100 535
101 538
102 531
103 531
104 453


4167

In [6]:
bdf = pd.DataFrame(bios)
bdf

Unnamed: 0,congress,biography
0,97,"GEORGE HERBERT WALKER BUSH, Republican, of Hou..."
1,97,"HOWELL THOMAS HEFLIN, Democrat, of Tuscumbia, ..."
2,97,"JACK EDWARDS, Republican, of Mobile, Ala.; bor..."
3,97,"WILLIAM WUIS DICKINSON, Republican, of Montgom..."
4,97,"WILLIAM NICHOLS, Democrat, of Sylacauga, Ala.;..."
...,...,...
4162,104,"ENI F.H. FALEOMAVAEGA, Democrat, of Vailoatai ..."
4163,104,"ELEANOR HOLMES NORTON, Democrat, of Washington..."
4164,104,"ROBERT A. UNDERWOOD, Democrat, of Vona, GU; bo..."
4165,104,"CARLOS ROMERO-BARCELO, Democrat, of San Ju..."


In [7]:
df = df.join(bdf.set_index('congress'), on='congress')
df.reset_index(inplace=True, drop=True)
df

Unnamed: 0,congress,textfile,biography
0,97,../data/text/97.txt,"GEORGE HERBERT WALKER BUSH, Republican, of Hou..."
1,97,../data/text/97.txt,"HOWELL THOMAS HEFLIN, Democrat, of Tuscumbia, ..."
2,97,../data/text/97.txt,"JACK EDWARDS, Republican, of Mobile, Ala.; bor..."
3,97,../data/text/97.txt,"WILLIAM WUIS DICKINSON, Republican, of Montgom..."
4,97,../data/text/97.txt,"WILLIAM NICHOLS, Democrat, of Sylacauga, Ala.;..."
...,...,...,...
4162,104,../data/text/104.txt,"ENI F.H. FALEOMAVAEGA, Democrat, of Vailoatai ..."
4163,104,../data/text/104.txt,"ELEANOR HOLMES NORTON, Democrat, of Washington..."
4164,104,../data/text/104.txt,"ROBERT A. UNDERWOOD, Democrat, of Vona, GU; bo..."
4165,104,../data/text/104.txt,"CARLOS ROMERO-BARCELO, Democrat, of San Ju..."


In [8]:
def extract_born_in(txt):
    months = [calendar.month_name[i] for i in range(1, 13)]
    txt = txt.strip().replace('in ', '')
    m = re.match('.*?((?:%s)\s+\d{,2},?\s+\d{,4})' % '|'.join(months), txt)
    if m:
        c = m.group(1)
        b = txt.replace(c, '').strip(', ')
    else:
        a = re.split('(%s)' % '|'.join(months), txt)
        b = a[0].strip(', ')
        if len(a) > 2:
            c = ''.join(a[-2:])
        else:
            c = None
    return b, c

In [9]:
def extract_key_value(key, b, idx):
    val_start = (idx + 1)
    try:
        val_end = idx + b[idx + 1:].index(':')
    except:
        val_end = len(b)
    value = ''.join(b[val_start:val_end])
    # FIXME:
    value = value.split('elected')[0]
    #print(' -', key.upper(), value)
    return value

In [10]:
def get_married_to(value):
    m = re.match('(?:married to\s|wife[\-,]|husband,?|spouse,|married,?)(.*?)[;,]', value)
    if m:
        married = m.group(1)
        if re.search('child', married):
            #print(married)
            return None
        else:
            #print(married)
            return married
    else:
        #print(value)
        return None

In [11]:
bios = []
keys = dict()
for i, row in df.iterrows():
    data = {}
    if row.biography:
        bio = row.biography.replace('\n', '').strip()
        m = re.match('(.*?(?:, Jr\.|, DVM|, P\.E\.|, IV|, D\.V\.M\.|, M\.D\.)?)[;,]\s+(.*?(?:Democrat|Republican|Independent|Party).*?)[;,]?(?:\sof\s)?.*', bio)
        if m:
            data['name'] = m.group(1)
            data['party'] = re.split(',|;', m.group(2))[-1]
            pass
        else:
            print('WARN: Cannot parse name and party:', i, bio[:60])
            pass
        m = re.match('.*?(?:.*?(?:Democrat|Republican|Independent|Party).*?[;, ])?(?:\sof\s)?(.*?)(?:born|education|graduate|professional|attorney|b\.).*', bio, re.I)
        if m:
            data['location'] = m.group(1).replace(';', '')
            pass
        else:
            print('WARN: Cannot parse location:', i, bio[:60])
            pass
        m = re.match('.*?\s+born(.*?)(?:;|,\s+education)', bio, re.I)
        if m:
            # parse born_in and birthdate
            born_in, birthdate = extract_born_in(m.group(1))
            data['born_in'] = born_in
            data['birthdate'] = birthdate
            pass
        else:
            print('WARN: Cannot parse born:', i, bio[20:100])
            pass
        """
        education, e.g., "attended the public schools; B.A., University of Alabama, 1957; LL.B., University of Alabama School of Law, 1963;"
        professional, e.g., "attorney; admitted to the Alabama Bar in 1961 and commenced practice in Tuscaloosa; member, Alabama State Senate, 1970-78; law clerk, Supreme Court of Alabama, 1961-62; city prosecutor, Tuscaloosa, 1963-71; U.S. Magistrate, Northern District of Alabama, 1966-70; Special Assistant Attorney General, State of Alabama, 1969-71; chairman, Legislative Council of the Alabama Legislature, 1977-78; former president, Tuscaloosa County Mental Health Association; member, Alabama Code Revision Committee, 1971-75; member, Phi Alpha Delta legal fraternity, Tuscaloosa County; Alabama and American Bar Associations; First Presbyterian Church of Tuscaloosa; Exchange Club; American Judicature Society; Alabama Law Institute;"
        married, e.g., "the former Annette Nevin in 1960"
        children, e.g., "Richard C., Jr., and Claude Nevin"
        committees, e.g., "chair, Appropriations; Banking, Housing, and Urban Affairs; Environment and Public Works; Joint Committee on the Library; Rules and Administration; elected to the 96th Congress on November 7, 1978; reelected to the three succeeding Congresses; elected to the U.S. Senate on November 4, 1986; reelected to each succeeding Senate term."
        URL, e.g., http://jones.senate.gov etc.
        """
        a = re.split('(\. |,|;|:)', bio)
        b = [c.strip() for c in a]
        #print(b)
        for idx in [j for j, x in enumerate(b) if x == ":"]:
            key = b[idx - 1]
            if key not in keys:
                keys[key] = 1
            else:
                keys[key] += 1
            value = extract_key_value(key, b, idx)
            if key in ['education']:
                data[key] = value
            elif re.search('profession', key):
                data['professional'] = value
            elif key in ['married']:
                data[key] = value
            elif re.search('^child|\schild|son|daughter', key):
                data['children'] = value
            elif re.search('committee', key):
                data['committees'] = value
            elif re.search('family', key):
                data['married'] = get_married_to(value)
    #print('-'* 50)
    bios.append(data)
    #if i > 50:
    #    break

WARN: Cannot parse name and party: 17 ELDON RUDD, Re_P-11blican, of Scottsdale, Ariz.; born in Cam
WARN: Cannot parse name and party: 41 CHARLES PASHAYAN, Ja., Re blican, of Fresno, Calif.; born in
WARN: Cannot parse name and party: 129 HENRY J. HYDE, RepubliC8;fi, of Bensenville, Ill.; born in C
WARN: Cannot parse location: 131 DAN ROSTENKOWSKI, Democrat, of Chicago,  Ill.; educated  in 
WARN: Cannot parse born: 131 mocrat, of Chicago,  Ill.; educated  in  St.  John's  Mili­ tary Academy and Loy
WARN: Cannot parse born: 170 UM, Republican, of Wichita, Kans.; bom in Topeka, Kans., July 29, 1932, the daug
WARN: Cannot parse location: 210 EDWARD P. BOLAND, Democrat, of Springfield, Mass.; elected t
WARN: Cannot parse born: 210 mocrat, of Springfield, Mass.; elected to 83d and ree­ lected to 84th, 85th, 86t
WARN: Cannot parse name and party: 251 THAD COCHRAN,  Reeublican, of Jackson, Miss.; born in Pontot
WARN: Cannot parse name and party: 364 BOB McEWEN, ReJ?ublican, of Hillsboro, Ohio; 

WARN: Cannot parse location: 2794 DAN ROSTENKOWSKI, Democrat, of Chicago, IL; educated in St. 
WARN: Cannot parse born: 2794 mocrat, of Chicago, IL; educated in St. John's Military Academy and Loyola Unive
WARN: Cannot parse name and party: 2885 FREDERICK. S. UPTON, llepublican, of St. Joseph, MI; born in
WARN: Cannot parse born: 3062 crat, of Johnstown, PA; graduated, Ramsey High School, Mount Pleasant, PA; Kiski
WARN: Cannot parse born: 3106 an, of Ennis, TX; bom in Waco, TX, tember 15, 1949; attended Travis Elementary S
WARN: Cannot parse name and party: 3153 SID MORRISON, Re_publican, of Zillah, WA; born in Yakima, WA
WARN: Cannot parse born: 3178 A, Democrat, of Pago Pago, AS, graduate of Kahuku High School, Hawaii, 1962; B.A
WARN: Cannot parse name and party: 3243 Los ANGELES CoUNTY; cities of Culver  City, ,..,. Angeles Ci
WARN: Cannot parse location: 3243 Los ANGELES CoUNTY; cities of Culver  City, ,..,. Angeles Ci
WARN: Cannot parse born: 3243 cities of Culver  City, ,..,. Ang

In [12]:
bdf = pd.DataFrame(bios)
bdf

Unnamed: 0,name,party,location,born_in,birthdate,children,professional,committees,education,married
0,GEORGE HERBERT WALKER BUSH,Republican,"Houston, Tex. 43d Vice President of the Uni...","Milton, Mass.","June 12, 1924","George,Jeb,Neil,Marvin,and Dorothy;",,,,
1,HOWELL THOMAS HEFLIN,Democrat,"Tuscumbia, Ala.",_21 son of Reverend MarvR. Hefl and Louise D...,"June 19, 19",,,,,
2,JACK EDWARDS,Republican,"Mobile, Ala.","Birmingham, Ala., Sep­ tember 20, 1928",,,,,,
3,WILLIAM WUIS DICKINSON,Republican,"Montgomery, Ala.","Ope­ lika, Ala.","June 5, 1925",,,,,
4,WILLIAM NICHOLS,Democrat,"Sylacauga, Ala.","on a small farm near Becker, Miss.","October 16, 1918","M_emorie,Margaret,and Flynt;",,,,
...,...,...,...,...,...,...,...,...,...,...
4162,ENI F.H. FALEOMAVAEGA,Democrat,"Vailoatai Pago Pago, AS,",,,,,,,
4163,ELEANOR HOLMES NORTON,Democrat,"Washington, DC bom in Wuhington. DC, June 13, ...",,,,,,,
4164,ROBERT A. UNDERWOOD,Democrat,"Vona, GU","Tamuning, Guam","July 13, 1948","Sophia,Ro­",,,,
4165,CARLOS ROMERO-BARCELO,Democrat,"of San Juan, Pueno Rico","on San Juan, Puerto Rico","September 4, 1932","Car­ los,Andres,Juan Carlos,and Melinda;",,,,


In [13]:
bdf.party.unique()

array(['Republican', 'Democrat', ' Republican', nan, ' Democrat',
       'Independent', 'Conservative-Republican', ' Independent',
       ' elected Republican', '  Democrat',
       ' is chairman of the Democrat', ' Assistant Senate Republican',
       ' National Republican', '·Democrat', ' California Republican',
       '  Republican', ' ranking Republican',
       ' member: Nevada Federation of Republican',
       ' chairman of the National Republican',
       ' Outagamie County Young Republican'], dtype=object)

In [14]:
df[df.index.isin(bdf[bdf.party.isnull()].index)]

Unnamed: 0,congress,textfile,biography
17,97,../data/text/97.txt,"ELDON RUDD, Re_P-11blican, of Scottsdale, Ariz..."
41,97,../data/text/97.txt,"CHARLES PASHAYAN, Ja., Re blican, of Fresno, C..."
129,97,../data/text/97.txt,"HENRY J. HYDE, RepubliC8;fi, of Bensenville, I..."
251,97,../data/text/97.txt,"THAD COCHRAN, Reeublican, of Jackson, Miss.; ..."
364,97,../data/text/97.txt,"BOB McEWEN, ReJ?ublican, of Hillsboro, Ohio; b..."
...,...,...,...
4110,104,../data/text/104.txt,"LARRY COMBEST. Republican, of Lubbock, TX; bor..."
4120,104,../data/text/104.txt,"ORRIN G. HATCH. Republican, of Salt Lake City,..."
4125,104,../data/text/104.txt,"JOHN W. WARNER. Republican, of Alexandria, VA;..."
4142,104,../data/text/104.txt,"NORMAN D. DICKS , Dcmocnit. of Bremerton, WA; ..."


In [15]:
import operator

for k in sorted(keys.items(), key=operator.itemgetter(1), reverse=True):
    print(k)

('member', 2129)
('three children', 561)
('two children', 558)
('four children', 286)
('five children', 154)
('awards', 101)
('two daughters', 80)
('children', 57)
('former member', 54)
('chairman', 53)
('committee assignments', 49)
('two sons', 44)
('elected', 43)
('committees', 41)
('six children', 40)
('three daughters', 38)
('honorary degrees', 35)
('', 34)
('three sons', 34)
('one child', 33)
('author', 27)
('recipient', 25)
('three chil\xad dren', 24)
('board member', 23)
('two chil\xad dren', 23)
('past president', 22)
('seven children', 21)
('attended', 18)
('trustee', 15)
('two  children', 15)
('honors', 15)
('profession', 14)
('subcommittees', 14)
('board of directors', 13)
('eight children', 13)
('four chil\xad dren', 13)
('married', 13)
('Senate committees', 12)
('honors and awards', 12)
('president', 11)
('served', 11)
('served as', 10)
('one daughter', 10)
('served on', 9)
('director', 9)
('six grandchildren', 9)
('military awards', 9)
('four sons', 9)
('committee member'

In [16]:
cdf = df.join(bdf)
cdf

Unnamed: 0,congress,textfile,biography,name,party,location,born_in,birthdate,children,professional,committees,education,married
0,97,../data/text/97.txt,"GEORGE HERBERT WALKER BUSH, Republican, of Hou...",GEORGE HERBERT WALKER BUSH,Republican,"Houston, Tex. 43d Vice President of the Uni...","Milton, Mass.","June 12, 1924","George,Jeb,Neil,Marvin,and Dorothy;",,,,
1,97,../data/text/97.txt,"HOWELL THOMAS HEFLIN, Democrat, of Tuscumbia, ...",HOWELL THOMAS HEFLIN,Democrat,"Tuscumbia, Ala.",_21 son of Reverend MarvR. Hefl and Louise D...,"June 19, 19",,,,,
2,97,../data/text/97.txt,"JACK EDWARDS, Republican, of Mobile, Ala.; bor...",JACK EDWARDS,Republican,"Mobile, Ala.","Birmingham, Ala., Sep­ tember 20, 1928",,,,,,
3,97,../data/text/97.txt,"WILLIAM WUIS DICKINSON, Republican, of Montgom...",WILLIAM WUIS DICKINSON,Republican,"Montgomery, Ala.","Ope­ lika, Ala.","June 5, 1925",,,,,
4,97,../data/text/97.txt,"WILLIAM NICHOLS, Democrat, of Sylacauga, Ala.;...",WILLIAM NICHOLS,Democrat,"Sylacauga, Ala.","on a small farm near Becker, Miss.","October 16, 1918","M_emorie,Margaret,and Flynt;",,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4162,104,../data/text/104.txt,"ENI F.H. FALEOMAVAEGA, Democrat, of Vailoatai ...",ENI F.H. FALEOMAVAEGA,Democrat,"Vailoatai Pago Pago, AS,",,,,,,,
4163,104,../data/text/104.txt,"ELEANOR HOLMES NORTON, Democrat, of Washington...",ELEANOR HOLMES NORTON,Democrat,"Washington, DC bom in Wuhington. DC, June 13, ...",,,,,,,
4164,104,../data/text/104.txt,"ROBERT A. UNDERWOOD, Democrat, of Vona, GU; bo...",ROBERT A. UNDERWOOD,Democrat,"Vona, GU","Tamuning, Guam","July 13, 1948","Sophia,Ro­",,,,
4165,104,../data/text/104.txt,"CARLOS ROMERO-BARCELO, Democrat, of San Ju...",CARLOS ROMERO-BARCELO,Democrat,"of San Juan, Pueno Rico","on San Juan, Puerto Rico","September 4, 1932","Car­ los,Andres,Juan Carlos,and Melinda;",,,,


In [17]:
cdf.to_csv('../data/biocong-text-all-parsed.csv', index=False)