In [1]:
import os
import pandas as pd
import numpy as np

## Data Cleaning
- Within some congressional sessions, the `total_votes` count is not the same on every row. At most, this affected 9% of the rows in a session, though the average was closer to 5-6% of the data being corrupted. As such, I decided to drop the observations that didn't have the correct data (i.e., whose `total_votes` differs from the session's most common value).
  - For the house: I deleted congressional sessions 98, 99, 100, and 101. They had no vote data.
  - For the senate: I deleted congressional sessions 80-100. They had no vote data.


In [2]:
# Directory holding the per-session House CSV files (e.g. house_99.csv).
house_fpath = './congress-data/house/'
# os.listdir returns every directory entry in arbitrary order, so keep only
# the CSV files (skipping entries such as .DS_Store or .ipynb_checkpoints that
# pd.read_csv cannot parse) and sort them for a reproducible processing order.
house_sessions = sorted(
    name for name in os.listdir(house_fpath) if name.lower().endswith('.csv')
)

In [3]:
# Clean every House session file in place: fill missing leadership roles and
# keep only the rows whose total_votes equals the session's most common value.
for sesh in house_sessions:
    fpath = os.path.join(house_fpath, sesh)
    df = pd.read_csv(fpath, index_col=0)

    # A missing leadership_role is stored as False rather than NaN.
    df['leadership_role'] = df['leadership_role'].replace({np.nan: False})

    # value_counts() ignores NaN and sorts by frequency (descending), so an
    # empty result means the session has no vote data at all; otherwise
    # index[0] is the most common total_votes value.
    total_vote_cts = df['total_votes'].value_counts()
    if len(total_vote_cts) == 0:
        print('Can delete: %s' % (sesh))
    else:
        most_common = total_vote_cts.index[0]
        # Filter with a boolean mask instead of df.drop(labels): dropping by
        # index label would also remove valid rows that share a label with a
        # bad row whenever the CSV's index column is not unique.
        df = df[df['total_votes'] == most_common]

    # Overwrites the source CSV with the cleaned frame.
    df.to_csv(fpath)

Can delete: house_99.csv
Can delete: house_98.csv
Can delete: house_101.csv
Can delete: house_100.csv


In [5]:
# Directory holding the per-session Senate CSV files (e.g. senate_82.csv).
senate_fpath = './congress-data/senate/'
# os.listdir returns every directory entry in arbitrary order, so keep only
# the CSV files (skipping entries such as .DS_Store or .ipynb_checkpoints that
# pd.read_csv cannot parse) and sort them for a reproducible processing order.
senate_sessions = sorted(
    name for name in os.listdir(senate_fpath) if name.lower().endswith('.csv')
)

In [6]:
# Clean every Senate session file in place: fill missing leadership roles and
# keep only the rows whose total_votes equals the session's most common value.
for sesh in senate_sessions:
    fpath = os.path.join(senate_fpath, sesh)
    df = pd.read_csv(fpath, index_col=0)

    # A missing leadership_role is stored as False rather than NaN.
    df['leadership_role'] = df['leadership_role'].replace({np.nan: False})

    # value_counts() ignores NaN and sorts by frequency (descending), so an
    # empty result means the session has no vote data at all; otherwise
    # index[0] is the most common total_votes value.
    total_vote_cts = df['total_votes'].value_counts()
    if len(total_vote_cts) == 0:
        print('Can delete: %s' % (sesh))
    else:
        most_common = total_vote_cts.index[0]
        # Filter with a boolean mask instead of df.drop(labels): dropping by
        # index label would also remove valid rows that share a label with a
        # bad row whenever the CSV's index column is not unique.
        df = df[df['total_votes'] == most_common]

    # Overwrites the source CSV with the cleaned frame.
    df.to_csv(fpath)

Can delete: senate_82.csv
Can delete: senate_96.csv
Can delete: senate_97.csv
Can delete: senate_83.csv
Can delete: senate_95.csv
Can delete: senate_81.csv
Can delete: senate_80.csv
Can delete: senate_94.csv
Can delete: senate_90.csv
Can delete: senate_84.csv
Can delete: senate_85.csv
Can delete: senate_91.csv
Can delete: senate_100.csv
Can delete: senate_87.csv
Can delete: senate_93.csv
Can delete: senate_92.csv
Can delete: senate_86.csv
Can delete: senate_88.csv
Can delete: senate_89.csv
Can delete: senate_99.csv
Can delete: senate_98.csv
