In [12]:
import pdfplumber
import pandas as pd
import os
import re
state_names = [
    "ALABAMA", "ALASKA", "ARIZONA", "ARKANSAS", "CALIFORNIA", "COLORADO", "CONNECTICUT", "DELAWARE", "FLORIDA", 
    "GEORGIA", "HAWAII", "IDAHO", "ILLINOIS", "INDIANA", "IOWA", "KANSAS", "KENTUCKY", "LOUISIANA", "MAINE", 
    "MARYLAND", "MASSACHUSETTS", "MICHIGAN", "MINNESOTA", "MISSISSIPPI", "MISSOURI", "MONTANA", "NEBRASKA", 
    "NEVADA", "NEW HAMPSHIRE", "NEW JERSEY", "NEW MEXICO", "NEW YORK", "NORTH CAROLINA", "NORTH DAKOTA", "OHIO", 
    "OKLAHOMA", "OREGON", "PENNSYLVANIA", "RHODE ISLAND", "SOUTH CAROLINA", "SOUTH DAKOTA", "TENNESSEE", "TEXAS", 
    "UTAH", "VERMONT", "VIRGINIA", "WASHINGTON", "WEST VIRGINIA", "WISCONSIN", "WYOMING"
]
def extract_table_from_pdf_plumber(pdf_path):
    # Open the PDF using pdfplumber
    with pdfplumber.open(pdf_path) as pdf:
        
        # Find the start and end pages of the table
        start_page = None
        end_page = None
        for i, page in enumerate(pdf.pages):
            text = page.extract_text()
            if "ALABAMA" in text :
                start_page = i
            if "WYOMING" in text:
                end_page = i
                break
        
        # If start or end page is not found, return an empty DataFrame
        if start_page is None or end_page is None:
            return pd.DataFrame()
        
        # Extract text from the identified range of pages
        text = ""
        for page_num in range(start_page, end_page + 1):
            text += pdf.pages[page_num].extract_text()

    # Split the text into lines
    lines = text.split('\n')

    # Initialize lists to store data
    seats = []
    states = []
    names = []
    lq_scores = []

    # Variables to keep track of current state
    current_state = None

    # Iterate through the lines to extract data
    for line in lines:
        print(line)
        if any(state in line for state in state_names):
            current_state = line.replace("(cont.)", "").strip()
        elif any(char.isdigit() for char in line) and "%" in line:
            parts = line.split()
            seats.append(parts[0])
            names.append(' '.join(parts[1:-1]))
            lq_scores.append(parts[-1])
            states.append(current_state)

    # Create a pandas DataFrame
    df = pd.DataFrame({
        'Seat': seats,
        'State': states,
        'Name': names,
        'LQ Score': lq_scores
    })
    #print(lq_scores)
    # Extract the year from the file name
    year = os.path.basename(pdf_path).split('.')[0]
    df['Year'] = year

    # Remove the '%' symbol, fill NaN values with 0, and then convert to integer
    df['LQ Score'] =  df['LQ Score'].str.extract('(\d+)').fillna(0).astype(int)

    # Clean up the 'Name' column by removing any unwanted characters
    df['Name'] = df['Name'].str.replace('[^a-zA-Z\s.]|X', '', regex=True).str.strip()
    df['Name'] = df['Name'].str.replace('A Newsletter for Liberal', '', regex=False).str.strip()

    return df

# Example usage remains the same as in your code.


In [13]:
# Example usage:
base_path = "/Users/sakibanwar/Library/CloudStorage/OneDrive-TheUniversityofWinchester/Learning Metrics/downloaded_pdfs"
pdf_paths = [os.path.join(base_path, f"{year}.pdf") for year in range(2010, 2021)]  # List of paths to your PDF files
dfs = []  # List to store DataFrames for each PDF

for pdf_path in pdf_paths:
    dfs.append(extract_table_from_pdf_plumber(pdf_path))

# Combine all DataFrames into one
final_df = pd.concat(dfs, ignore_index=True)
print(final_df)


111th Congress, 2nd Session ADA’s 2010 Voting Record
ADA C V R 2010
ONGRESSIONAL OTING ECORD
U. S. House of Representatives
Vote Number 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 Score
ADA’s Position Y Y Y Y Y Y Y Y Y Y Y Y Y Y Y Y Y Y N Y %
ALABAMA
4 Aderholt (R) - - - - - - - - - - - - - - - - - - - - 0%
6 Bachus, S. (R) - - - - - - - - - - - - - - - - - - - - 0%
1 Bonner (R) - - - - - - - - - - - - - - - - - - - X 0%
2 Bright (D) - - - - + - - - - - - - - + - - - - - X 10%
7 Davis, A. (D) - X X X + + + + - + + - + + + + - - - + 55%
5 Griffi th (R) - - - - - - X - X - X X - - X X - - - X 0%
3 Rogers, Mike (R) - - - - - - - - - - - - - - - - - - + - 5%
ALASKA
AL Young, D. (R) - - - - - - X X X + + - X - - - - - - X 10%
ARIZONA
6 Flake (R) - - - - - - - - - - - - - - - - + - + - 10%
2 Franks, T. (R) - - - - - - - - - - - - - - - - - - + - 5%
8 Giffords (D) + + + - + + + + - + + + + + - + + - - + 75%
7 Grijalva (D) + + + + - + + + + + X + + + + + + + + + 90%
1 Kirkpatrick (D) + 

ADA C V R 2011
ONGRESSIONAL OTING ECORD
U. S. House of Representatives
Vote Number 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 LQ
ADA Position N N N N Y N Y N Y N Y N N N N N N N N N %
ALABAMA
4 Aderholt (R) - - - - X + - - - - - - - - - - + - - - 10%
6 Bachus, S. (R) - - - - - - - - - - - - - - - - - - - - 0%
1 Bonner (R) - - - - - + - - - - - - - - - - - - - - 5%
5 Brooks (R) - - - - - + - - - - - - - - - - - - - - 5%
2 Roby (R) - - - - - - - - - - - - - - - - - - - - 0%
3 Rogers, Mike D. (R) - - - - - + - - - - - - - - - - - - - - 5%
7 Sewell (D) + - + + + + - + + + X + + + + + - + + - 75%
ALASKA
AL Young, D. (R) - + - - + + X - + X - - + - X - - - + - 30%
ARIZONA
6 Flake (R) - - - - - - - - - - - - - - - - - - - - 0%
2 Franks, T. (R) - - - - - - - - - - - - - - - - - - - - 0%
8 Giffords (D) X X X X X X X X X X X X X X X X X X X X NR
1 Gosar (R) - - - - - - - - - - - - - - - - - - - - 0%
7 Grijalva (D) + + + + + + + + + + + + + + + + + + + + 100%
4 Pastor (D) + + + + + + + + 

ADA C V R 2012
ONGRESSIONAL OTING ECORD
U. S. House of Representatives
Vote Number 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 LQ
ADA Position N Y N N N N N N Y N N N N N Y N N N N N %
ALABAMA
4 Aderholt (R) - - - - - - - - - - - - - - - - - - - - 0%
6 Bachus, S. (R) ? - - - - - - - - - - - - - - - - - - - 0%
1 Bonner (R) - - - - - - - - - - - - - ? - - - - ? - 0%
5 Brooks, M. (R) - - - - + - - - - - - - + - - - - - - - 10%
2 Roby (R) - - - - - - - - - - - - - - - - - - - - 0%
3 Rogers, Mike D. (R) - - - - - - - - - - - - - - - - - - - - 0%
7 Sewell (D) + - + + + + + + - + + ? - + ? + + - + + 70%
ALASKA
AL Young, D. (R) - - - - - - - - - - - - - - - - - ? ? - 0%
ARIZONA
8 Barber (D) I I I I I I I I I I - + - + - + + - ? + NR
6 Flake (R) - - - ? - + + - - + - - + - - - - - - - 20%
2 Franks (R) - - - - - + + - - - - - + - - - - - - - 15%
8 Giffords (D) I I I I I I I I I I I I I I I I I I I I NR
1 Gosar (R) - - - ? + + - + - - - - + - - - - - - - 20%
7 Grijalva (D) + + + + + + + + 

ada C v r 2013
ongressional oTing eCord
U. S. House of Representatives
Vote Number 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 LQ
ADA Position N Y N Y Y Y N Y N N N Y Y N N N N N Y Y %
ALABAMA
4 Aderholt (R ) - X - - - - - - - - - - - - - - X - - - 0%
6 Bachus, S. (R ) - - - - - - - - - - - - + - - - - - - - 5%
1 Bonner (R ) - - - X - - X - - - - - - I I I I I I I 0%
5 Brooks, M. (R ) - - - - - - - - - - - - - - - - - - - - 0%
1 Byrne (R ) I I I I I I I I I - I I I I I I I I I I N/A
2 Roby (R ) - - - - - - - - - - - - - - - - - - - - 0%
3 Rogers, Mike D. (R ) - - - - - - - - - - - - - - - - - - - - 0%
7 Sewell (D ) + X + + + - + + - + + - - - + + - + + + 65%
ALASKA
AL Young, D. (R) - - + X - - - + - - - - + - + - - - - - 20%
ARIZONA
2 Barber (D ) - - + - + - + + - - + - - + + - - - + + 45%
8 Franks (R ) - - - - - - - - X - - - - - - - - - - - 0%
4 Gosar (R ) - - - - + - - - X - - - + - - - - X - - 10%
3 Grijalva (D ) + + + + + + + + + + + + + + + + + + + + 100%
1 Kirkpatrick (D 

AdA C V r 2014
ongressionAl oTing eCord
U. S. House of Representatives
Vote Number 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 LQ
ADA Position N N Y N N N Y N Y Y Y N N Y N N N N N N %
ALABAMA
4 Aderholt (R) - - - - - - - - - - - - - - X - - - - - 0%
6 Bachus, S. (R) - - - - - X - - - - - - - - - - - - X - 0%
1 Bonner (R) I I I I I I I I I I I I I I I I I I I I NR
5 Brooks, M. (R) - - - - - - - - - - - - X - - - - - - - 0%
1 Byrne (R) - - - - - - - - - - - - - - - - - - - - 0%
2 Roby (R) - - - - - - - - - - - - - - - - - - - - 0%
3 Rogers, Mike D. (R) - - - - - - - - - - - - - - - - - - - - 0%
7 Sewell (D) + - + + + + + + + - + - + - + - + - + + 70%
ALASKA
AL Young, D. (R) - - - - - - - - - - + - - + - - - - - - 10%
ARIZONA
2 Barber (D) + - + - + - + + - + + - - - - - + - - + 45%
8 Franks (R) - + - - - - - - - - - - - - - - - - - - 5%
4 Gosar (R) - + X X X X X - - - - + - - - - - - - - 10%
3 Grijalva (D) + + + + + + + + + + + + + + + + + + + + 100%
1 Kirkpatrick (D) + - - - + + 

ADA C V r 2015
ongressionAl oTing eCorD
U. S. House of Representatives
Vote Number 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 LQ
ADA Position N N Y N Y N Y N N Y N Y N N N N N N N N %
ALABAMA
4 Aderholt (R) - - - - - - - + - - - - - - - - - - - - 5%
5 Brooks, M. (R) - - - - - - - + - - - - - - - - - - - - 5%
1 Byrne (R) - - - - - - - - - - - - - - - - - - - - 0%
6 Palmer (R) - - - - - - - + - - - - - - - - - - - - 5%
2 Roby (R) - - - - - - - - - - - - - - - - - - - - 0%
3 Rogers, Mike D. (R) - - - - - - - - - - - - - - - - - - - - 0%
7 Sewell (D) + + X + + X + - - + + + + + + + + - - - 65%
ALASKA
AL Young, D. (R) - - - - - - - + - - - - - - - - - - - - 5%
ARIZONA
8 Franks (R) - - - - - - - - - - - - - - - - - - - - 0%
7 Gallego, Ruben (D) + + + + + + + + + + + + + + + + + - + + 95%
4 Gosar (R) X X - X + - X + - - - - - - - - - - - - 10%
3 Grijalva (D) + + + + + + + + + + + + + + + + + + + + 100%
1 Kirkpatrick (D) + + - + + + - + - + + + + - + + + - + + 75%
2 McSally (R) - - - -

ADA c V r 2016
ongressionAl oting ecorD
U. S. House of Representatives
Vote Number 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 LQ
ADA Position N N N N N N N N N Y N Y Y Y N N N N N N %
ALABAMA
4 Aderholt (R) - - - - - - - - - - - - - - - - - - - - 0%
5 Brooks, M. (R) - - - ? - - - + - + - - - - - + - - - - 15%
1 Byrne (R) - - - - - - - - - - - - - - - - - - - - 0%
6 Palmer (R) - - - - - - - - - - - - - - - - - - - - 0%
2 Roby (R) - - - - ? - - - - - - - - - - - - - - - 0%
3 Rogers, Mike D. (R) - + - - ? - - - - - - - - - - - - - - - 5%
7 Sewell (D) + + + + + + + + + - + + + + + + + - + + 90%
ALASKA
AL Young, D. (R) - ? - - - - - - - - ? - - - - - - - - - 0%
ARIZONA
8 Franks (R) - - - - - - - - - - - - - - - + - - - - 5%
7 Gallego, Ruben (D) + + + + + + + + + - + + + + + + + + + + 95%
4 Gosar (R) - - - - - - - - - + - - - - - - - - - - 5%
3 Grijalva (D) + + + + + + + + + + + + + + + + + + ? + 95%
1 Kirkpatrick (D) + + + + ? ? + + + + + + + + + + + - ? ? 75%
2 McSally (R) - - - - 

ADA c V r 2017
ongressionAl oting ecorD
U. S. House of Representatives
Vote Number 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 LQ
ADA Position N N N N N N N N N Y N Y Y Y N N N N N N %
ALABAMA
4 Aderholt(R) - - - - - - - - - - - - - - - - - - - - 0%
5 Brooks, M. (R) - - - - ? - - - + - - - - - - - - - - - 5%
1 Byrne (R) - - - - - - - - - - - - - - - - - - - - 0%
6 Palmer (R) - - - - - - - - - - - - - - - - - - - - 0%
2 Roby (R) - - - - ? - - - - - - - - - - - - - - - 0%
3 Rogers, Mike D. (R) - + - - ? - - - - - - - - - - - - - - - 5%
7 Sewell (D) + + - + - - + + + - + + + + + + + + + + 85%
ALASKA
AL Young, D. (R) - - - - - - - - - - - - - - - - - - - - 0%
ARIZONA
8 Franks (R) | - - - - - - - - - - - - - - - - - - - 0%
7 Gallego, Ruben (D) + + + + + + + + + - + + + + + + + + + + 95%
4 Gosar (R) - - - - - - - - + ? - - - ? - - - - - - 5%
3 Grijalva (D) + + + + + + + + + + + + + + + + + + + + 100%
1 Kirkpatrick (D) + + + + ? ? + + + + + + + + + + + - ? ? 75%
2 McSally (R) - - - - -

ADA c V r 2018
ongressionAl oting ecorD
U. S. House of Representatives
Vote Number 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 LQ
ADA Position N N N N N N N N Y N N N N N N Y N N N N %
ALABAMA
1 Byrne - - - - - - - - - - - - - - - - - - - - 0%
2 Roby - - - - - - - - - - - - - - - - - - - - 0%
3 Rogers M. - - - - - - - - - - - - - - - - - - - - 0%
4 Aderholt - - - - - - X X - - - - - - - - - - - - 0%
5 Brooks M. - - - - - - - - - + + - - - - - - - - + 15%
6 Palmer - - - - - - - - - - - - - - - - - - - - 0%
7 Sewell + - + + + - - + + - + - - - + + - + - + 55%
ALASKA
AL Young, D. (R) - - - - - - - - - - - - - - - + - - - - 5%
ARIZONA
1 O'Halleran + - - + + - - + + - - - - - + + - + - - 40%
2 McSally - - - - - - - - - - - - - - - - - - - - 0%
3 Grijalva + + + + + + + + + + + + + + + + + + + + 100%
4 Gosar - - - - - - - - + + X - - - - - - - - - 10%
5 Biggs - - - - - - - + + + + - - - - - - - - + 25%
6 Schweikert - - - - - - - - - - - - - - - - - - - - 0%
7 Gallego + + + + + + - + + 

ADA c V r 2019
ongressionAl oting ecorD
U. S. House of Representatives
Vote Number 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 LQ
ADA Position N Y Y Y Y Y Y Y Y Y Y Y Y Y Y Y Y Y Y Y %
ALABAMA
1 Byrne - - - X - - - - - - - - - - - - - - - - 0%
2 Roby - - + - - - - - - - - - - - - - - - - - 5%
3 Rogers M. - - + - + - - - - - - - X - - - - X - - 10%
4 Aderholt - - X - + - - - - - - - - - - - - - - - 5%
5 Brooks M. - - - - - - - - - - - - - - - - - - - - 0%
6 Palmer - - - - - - - - - - - - - - - - - - - - 0%
7 Sewell - + + + + + + + + - + - + + + + + + + + 85%
ALASKA
AL Young, D. (R) - - + - - - - - - - - - X - X + - - - - 10%
ARIZONA
1 O'Halleran - + + + + + + + + - - - + + + + + + + + 80%
2 Kirkpatrick - + + + + + + + + + - - + + + + + + + + 85%
3 Grijalva - + + + + + + + + + + + + + + + + + + + 95%
4 Gosar - X - X - - - - - - - - - - - - - - - - 0%
5 Biggs - - - - - - - - - - - - - - - - - - - - 0%
6 Schweikert - - - - - - - - - - - - - - - - - - - - 0%
7 Gallego - + + + + + + +

     Seat      State               Name  LQ Score  Year
0       4    ALABAMA         Aderholt R         0  2010
1       6    ALABAMA        Bachus S. R         0  2010
2       1    ALABAMA           Bonner R         0  2010
3       2    ALABAMA           Bright D        10  2010
4       7    ALABAMA         Davis A. D        55  2010
...   ...        ...                ...       ...   ...
4336    5  WISCONSIN      Sensenbrenner         0  2019
4337    6  WISCONSIN           Grothman         0  2019
4338    7  WISCONSIN  Duffy I I I I I I         0  2019
4339    8  WISCONSIN          Gallagher        10  2019
4340   AL    WYOMING             Cheney         5  2019

[4341 rows x 5 columns]


Unnamed: 0,Seat,State,Name,LQ Score,Year
960,15,FLORIDA,Posey R,15,2012
961,25,FLORIDA,Rivera R,5,2012
962,16,FLORIDA,Rooney R,0,2012
963,18,FLORIDA,RosLehtinen R,5,2012
964,12,FLORIDA,Ross R,5,2012
965,2,FLORIDA,Southerland R,5,2012
966,6,FLORIDA,Stearns R,10,2012
967,20,FLORIDA,Wasserman Shultz D,90,2012
968,8,FLORIDA,Webster R,5,2012
969,22,FLORIDA,West A. R,0,2012


In [14]:
final_df.to_csv('new_file.csv')

In [14]:
final_df

Unnamed: 0,Seat,State,Name,LQ Score,Year
0,4,ALABAMA,Aderholt R,0,2010
1,6,ALABAMA,Bachus S. R,0,2010
2,1,ALABAMA,Bonner R,0,2010
3,2,ALABAMA,Bright D,10,2010
4,7,ALABAMA,Davis A. D,55,2010
...,...,...,...,...,...
4336,5,WISCONSIN,Sensenbrenner,0,2019
4337,6,WISCONSIN,Grothman,0,2019
4338,7,WISCONSIN,Duffy I I I I I I,0,2019
4339,8,WISCONSIN,Gallagher,10,2019


In [None]:
# Sample string
s2 = "6 Roskam (R) - - + - - - - - - - - - + - - - - - - - 10% 1 Rush (D) + X + + + + + + + + + + X X + + + + + + 85%"

# Splitting the string by % and then appending the % to each split part except the last one
split_parts = s2.split('%')
formatted_splits = [part.strip() + '%' for part in split_parts[:-1]] + [split_parts[-1].strip()]

# Removing any empty or whitespace-only strings from the result
formatted_splits = [part for part in formatted_splits if part.strip()]

# Printing the result
print(formatted_splits)
