# <center> Federal Reserve Attendance  </center>
### Goal:
1. Survival analysis of the economists present at Federal Open Market Committee (FOMC) meetings
2. Text analysis of meeting minutes

In [1]:
import pandas as pd
from bs4 import BeautifulSoup as BS
from bs4 import element
import spacy
import itertools

# Folder holding the scraped minutes and the full path to the CSV inside it.
file_dir = r"C:\Users\Richy\Documents\Python Scripts\fed_proj\fed_min_project\fed_min_project\spiders"
file_name = file_dir + r"\fed_mins.csv"

# Read the two unnamed CSV columns as 'date' and 'html', then discard the row
# labelled 0 (per the original note, the CSV's first row is empty).
df = pd.read_csv(file_name, names=['date', 'html']).drop(0)

# Some rows use an en dash ('–') instead of a hyphen ('-') in the date range,
# which interferes with pd.to_datetime() later on, so normalise it here.
hyphenated_dates = df['date'].str.replace('–', '-')
df['date'] = hyphenated_dates

In [2]:
# make function to get rid of second day
# i.e. March 20-21, 2018 -> March 20, 2018
def date_simp(date):
    """Convert a meeting-date string to a timestamp for the meeting's first day.

    A range such as ``"March 20-21, 2018"`` or
    ``"January 31-February 1, 2017"`` is reduced to its first day by removing
    everything from the dash up to (but not including) the comma that follows
    it. Strings without such a "dash ... comma" range (single-day dates, ISO
    dates like ``"2018-03-20"``) are parsed unchanged.

    Parameters
    ----------
    date : str
        Meeting date text. En and em dashes are treated like a hyphen.

    Returns
    -------
    The value returned by ``pd.to_datetime`` for the simplified string.
    """
    # Treat typographic dashes like a plain hyphen so the range is detected
    # even if the caller has not normalised the text beforehand.
    cleaned = date.replace('–', '-').replace('—', '-')

    hyphen_at = cleaned.find('-')
    # Only a comma *after* the hyphen can close the day range.
    comma_at = cleaned.find(',', hyphen_at + 1) if hyphen_at != -1 else -1

    if hyphen_at != -1 and comma_at != -1:
        # Cut out "-<second day>" by position rather than by text replacement,
        # so no other part of the string can be affected.
        cleaned = cleaned[:hyphen_at] + cleaned[comma_at:]

    return pd.to_datetime(cleaned)

# Reduce every meeting date to a single timestamp, then order the meetings
# chronologically.
first_day_dates = df['date'].apply(date_simp)
df = df.assign(date=first_day_dates).sort_values(by='date')

In [3]:
# Turn each raw HTML string in the 'html' column into a BeautifulSoup object.
def _to_soup(raw_html):
    """Parse one HTML string with Python's built-in 'html.parser'."""
    return BS(raw_html, 'html.parser')


df['html'] = df['html'].apply(_to_soup)

In [4]:
# Sanity check: print the text of the first node of the first five parsed
# documents to confirm the BeautifulSoup objects are laid out consistently.
for row_pos in range(5):  # switch to range(len(df)) to check every meeting
    meeting_soup = df.iloc[row_pos, 1]
    print("FOMC notes from the meeting on {}".format(meeting_soup.contents[0].text))

FOMC notes from the meeting on January 28-29, 2014
FOMC notes from the meeting on March 18-19, 2014
FOMC notes from the meeting on April 29-30, 2014
FOMC notes from the meeting on June 17-18, 2014
FOMC notes from the meeting on July 29-30, 2014


### <center> Roll Call </center>
#### Subsetting the HTML to the names in the "Present" list

In [5]:
# Cache of loaded spaCy pipelines keyed by model name, so the model is read
# from disk once instead of once per meeting.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name='en_core_web_sm'):
    """Return the spaCy pipeline for `model_name`, loading it only on first use."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def present_list(names):
    '''
    Return the people named in the attendance section of one FOMC meeting page.

    Input:
        names: parsed HTML of a meeting (an object exposing `findChildren()`
               and `.text`, e.g. a BeautifulSoup object).
               NOTE: this object is modified in place -- tags are removed
               from it with `decompose()`.
    Output:
        List of unique name strings, in no particular order (duplicates are
        removed by passing the names through a set).
    '''
    nlp = _get_spacy_pipeline()

    # Count <strong> tags while walking the document. From the second one
    # onwards everything is removed, leaving only the text before it
    # (the original author's note: "after 2 we know we have what we want").
    strong_seen = 0
    for tag in names.findChildren():
        if tag.name == "strong":
            strong_seen += 1
            if strong_seen == 2:
                tag.parent.decompose()  # drop the second <strong> tag's parent
        if strong_seen > 1:  # drop every tag from the second <strong> onwards
            if isinstance(tag, element.Tag):
                tag.decompose()

    # Run named-entity recognition over the remaining text and keep PERSON
    # entities only.
    doc_spacy = nlp(names.text)
    people = [ent.text for ent in doc_spacy.ents if ent.label_ == 'PERSON']

    # Keep only the part of each entity before the first comma.
    people = [person.split(",", 1)[0] for person in people]
    # Discard entities that contain any digit.
    people = [person for person in people if not any(ch.isdigit() for ch in person)]

    # De-duplicate.
    return list(set(people))

In [6]:
# Extract the attendee names from every meeting's HTML into a new 'present'
# column (the original author timed this step at roughly 12-30 seconds).
attendee_lists = df['html'].apply(present_list)
df = df.assign(present=attendee_lists)

In [7]:
#df.head()

In [8]:
# Data cleaning: some of the extracted names cause problems with the dashboard,
# so they are repaired here.
#
# Each key is a name as extracted; the value lists its replacement(s). The
# first replacement overwrites the original entry in place; any further ones
# are appended to the end of that meeting's list.
NAME_FIXES = {
    'Don Kim': ('Don H. Kim',),
    'James M.': ('James M. Trevino',),
    'Gretchen C.': ('Gretchen C. Weinbach',),
    'Òscar Jordà': ('Oscar Jorda',),
    'Egon Zakrajšek': ('Egon Zakrajsek',),
    'David López-Salido': ('David Lopez-Salido',),
    'Mark E. Van Der': ('Mark E. Van Der Weide',),
    'Janet L. Yellen Christine Cumming': ('Christine Cumming', 'Janet L. Yellen'),
    'Counsel Thomas C. Baxter': ('Thomas C. Baxter',),
    'Daniel K. Tarullo Christine Cumming': ('Christine Cumming', 'Daniel K. Tarullo'),
    'Daniel K. Tarullo Marie Gooding': ('Daniel K. Tarullo', 'Marie Gooding'),
    'Jerome H. Powell Marie Gooding': ('Jerome H. Powell', 'Marie Gooding'),
    'Jane E.': ('Jane E. Ihrig',),
}

for row_pos in range(len(df)):
    # The last column holds the meeting's attendee list; it is edited in place.
    attendees = df.iloc[row_pos, -1]
    # The range is fixed before the loop, so names appended below are not revisited.
    for slot in range(len(attendees)):
        replacements = NAME_FIXES.get(attendees[slot])
        if replacements is None:
            continue
        attendees[slot] = replacements[0]
        attendees.extend(replacements[1:])

In [9]:
# Convert each meeting's attendee list to a set so duplicated names collapse.
# `set` is the only argument: a second positional argument would be taken as
# an option of Series.apply itself, not as part of the conversion.
df['present'] = df['present'].apply(set)

#### Create a running list of all names present on list

In [10]:
# Pool the attendees of every meeting into one flat list (duplicates included),
# skipping any name that contains a digit.
every_attendee = itertools.chain.from_iterable(df['present'])
complete_names = [
    person
    for person in every_attendee
    if not any(ch.isdigit() for ch in person)
]

In [11]:
# Keep one copy of each name (the resulting order is arbitrary).
distinct_names = set(complete_names)
complete_names = list(distinct_names)

distinct_total = len(complete_names)
print("Before removing redundant names there are {} unique names".format(distinct_total))

Before removing redundant names there are 331 unique names


In [12]:
# Entries that should not appear in the name lists: headings, titles, name
# fragments and merged names produced by the entity extraction.
# Every entry must be followed by a comma -- adjacent string literals are
# silently joined into a single string otherwise.
bad_words = ['PRESENT', 'Jr.', 'Mester', 'Fischer', 'Susan', 'Linda', 'James M.',
            'Jane E.', 'E. Dunn', 'Jeffrey D.', 'Frierson', 'K. Logan', 'Fabio M.',
            'Chair', 'Chair\n', 'Williams', 'Robert J.', 'Powell', 'Jerome H. Powell Marie Gooding',
            'Daniel K. Tarullo Marie Gooding']

In [13]:
# Drop the unwanted entries and put the remaining names in alphabetical order.
complete_names = sorted(
    candidate for candidate in complete_names if candidate not in bad_words
)

print("After removing redundant names there are {} unique names".format(len(complete_names)))

After removing redundant names there are 315 unique names


In [14]:
# Remove the unwanted entries from every meeting's attendee collection,
# reporting the first meeting's name count before and after as a quick check.
def _without_bad_words(attendees):
    """Return the attendees as a list, leaving out anything in `bad_words`."""
    return [person for person in attendees if person not in bad_words]


first_meeting_date = df.iloc[0, 0]
print('There are {} unique names from the {} meeting'.format(len(df.iloc[0, -1]), first_meeting_date))

df['present'] = df['present'].apply(_without_bad_words)

print('After removing bad words there are {} unique names from the {} meeting'.format(len(df.iloc[0, -1]), first_meeting_date))

There are 64 unique names from the 2014-01-28 00:00:00 meeting
After removing bad words there are 63 unique names from the 2014-01-28 00:00:00 meeting


In [15]:
# Put every meeting's attendee list in alphabetical order.
alphabetical = df['present'].apply(lambda attendees: sorted(attendees))
df = df.assign(present=alphabetical)

### <center> Attendance Dataframe </center>
#### A dataframe of all the economists who were present at FOMC meetings since 2014
The columns represent the dates of the FOMC meetings (1/2014 - 7/2019). The rows represent each of the 319 economists who have been present at an FOMC meeting since 2014.

In [16]:
# Initialise the attendance DataFrame as an empty frame; it is filled in by the next cell.
present_df = pd.DataFrame()

In [17]:
# Build the attendance matrix: one column per meeting (in `df` row order) and
# one row per entry of `complete_names`, holding 1 if that person is in the
# meeting's attendee list and 0 otherwise.
#
# The columns are collected in a list and concatenated once at the end;
# concatenating inside the loop re-copies the growing frame on every pass.
# Columns come out labelled 0..n-1 and are expected to be relabelled afterwards.
attendance_columns = []
for meeting_pos in range(len(df)):
    # The last column of `df` holds the meeting's attendees; a set gives
    # constant-time membership checks.
    attendees = set(df.iloc[meeting_pos, -1])
    attendance_columns.append(
        pd.Series(
            [1 if person in attendees else 0 for person in complete_names],
            dtype='int64',
        )
    )

# pd.concat raises on an empty list, so fall back to an empty frame.
if attendance_columns:
    present_df = pd.concat(attendance_columns, axis=1)
else:
    present_df = pd.DataFrame()

In [18]:
# Column labels: the date part of each meeting timestamp, i.e. the text before
# the space in its string form (this drops the trailing 00:00:00).
date_columns = [str(meeting_date).split(' ')[0] for meeting_date in df.iloc[:, 0]]

In [19]:
# Label the rows with the attendee names and the columns with the meeting dates.
present_df.index = complete_names
present_df.columns = date_columns

In [20]:
# Row-wise sum across all columns, stored in a new 'Total' column.
row_totals = present_df.sum(axis='columns')
present_df['Total'] = row_totals

In [21]:
# Column-wise sum over all rows, stored in a new row labelled 'Total'
# (the 'Total' column is summed along with the others).
column_totals = present_df.sum(axis=0)
present_df.loc['Total'] = column_totals

In [22]:
# Swap rows and columns of the attendance frame.
present_df = present_df.transpose()

In [23]:
# Preview the first five rows of the attendance frame.
present_df.head(5)

Unnamed: 0,A. Lee Smith,Achilles Sangster II,Alberto G. Musalem,Alexander L. Wolman,Alyssa G. Anderson,Andre Anderson,Andrea Ajello,Andrea Raffo,Andrea Tambalotti,Andreas L. Hornstein,...,William C. Dudley,William Dupor,William E. Riordan,William F. Bassett,William Nelson,William R. Nelson,William Wascher,Yuriy Kitsul,Zeynep Senyuz,Total
2014-01-28,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,1,1,0,63
2014-03-18,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,61
2014-04-29,0,0,1,0,0,0,0,0,0,1,...,1,0,0,0,1,0,1,0,0,66
2014-06-17,0,1,0,0,0,0,0,0,0,0,...,1,0,0,1,0,1,1,0,0,73
2014-07-29,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,1,0,0,69


In [24]:
# Number of names recorded for the first meeting (first row, last column of df).
first_meeting_attendees = df.iloc[0, -1]
len(first_meeting_attendees)

63

In [25]:
# Readable version of the attendance frame: 1 -> 'Present', 0 -> 'Absent',
# with the 'Total' row and the 'Total' column removed.
present_string = (
    present_df
    .replace([1,0], ['Present', 'Absent'])
    .drop('Total')
    .drop(columns='Total')
)

# Write the result out to CSV.
present_string.to_csv('present_list.csv')

### Test to check if count of names is correct

In [26]:
def name_counter(names):
    """Count the rows of the global `df` whose last column contains `names`.

    Returns a 3-tuple: the name that was passed in, the literal string
    " appears ", and the count.
    """
    name_count = sum(1 for attendees in df.iloc[:, -1] if names in attendees)
    return names, " appears ", name_count

In [27]:
# Spot check: count the rows of df whose last column contains 'Marie Gooding'.
name_counter('Marie Gooding')

('Marie Gooding', ' appears ', 3)