# P-tag Extraction

This notebook extracts the text of every `<P>` tag in the XML files in the *docs* directory and labels each paragraph with its document's publication date.

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import sklearn as skl
import gensim
import os
from bs4 import BeautifulSoup

We need to iterate through the *docs* directory and extract the data from each XML file in that directory. We will use *os* to iterate through it and extract the file names. 

In [32]:
def extract_filenames(dir):
    """Return the base names of the XML files found in ``./{dir}``.

    Only entries whose extension is exactly ``.xml`` are kept, so entries
    with another extension or none at all (hidden files, notebook
    checkpoints, notes) are not reported as documents. The extension is
    removed with ``os.path.splitext`` so that a name containing extra dots
    keeps everything before the final ``.xml``. The names are sorted because
    ``os.listdir`` returns entries in arbitrary order.

    Parameters
    ----------
    dir : str
        Directory to scan, relative to the current working directory.

    Returns
    -------
    numpy.ndarray
        Sorted array of file names with the ``.xml`` extension removed.
    """
    stems = []

    for entry in os.listdir(f'./{dir}'):
        stem, extension = os.path.splitext(entry)
        # extract_p_tags re-appends '.xml' to each name, so anything with a
        # different extension could never be opened there anyway.
        if extension == '.xml':
            stems.append(stem)

    return np.array(sorted(stems))

In [37]:
def extract_dates(csv_path='financial_services_rules.csv'):
    """Load the rule metadata table, ordered by publication date.

    Parameters
    ----------
    csv_path : str, optional
        Path of the metadata CSV. Defaults to ``financial_services_rules.csv``
        in the current working directory.

    Returns
    -------
    pandas.DataFrame
        The columns ``document_number``, ``title``, ``type``, ``abstract``,
        ``publication_date``, ``agencies`` and ``excerpts``, with
        ``publication_date`` parsed to datetimes, rows sorted by that column
        in ascending order, and a fresh 0..n-1 index.
    """
    wanted_columns = [
        'document_number',
        'title',
        'type',
        'abstract',
        'publication_date',
        'agencies',
        'excerpts',
    ]

    # document_number is read as text: extract_p_tags compares it against
    # str(filename), which can never match if pandas infers a numeric dtype.
    rules = pd.read_csv(
        csv_path,
        usecols=wanted_columns,
        dtype={'document_number': str},
    )
    rules['publication_date'] = pd.to_datetime(rules['publication_date'])

    return rules.sort_values(by='publication_date').reset_index(drop=True)

In [21]:
def extract_p_tags_from_file(filename):
    """Return the text of every ``P`` element in one XML file.

    Parameters
    ----------
    filename : str
        Path of the XML file to parse; it is read as UTF-8.

    Returns
    -------
    list of str
        The ``get_text()`` content of each ``P`` tag, in the order returned
        by ``find_all``.
    """
    # The context manager closes the handle even if reading fails; the
    # handle was previously left open for every file processed.
    with open(filename, encoding="utf8") as infile:
        contents = infile.read()

    soup = BeautifulSoup(contents, 'xml')

    return [p.get_text() for p in soup.find_all("P")]

In [54]:
def extract_p_tags(dir):
    """Extract every P-tag text from the XML files in ``./{dir}``.

    Each paragraph is paired with the publication date recorded for its
    document in the metadata table returned by ``extract_dates``. One
    ``"<name> complete"`` line is printed per processed file.

    Parameters
    ----------
    dir : str
        Directory containing the ``.xml`` files, relative to the current
        working directory.

    Returns
    -------
    pandas.DataFrame
        One row per paragraph, with columns ``P-tags`` (the paragraph text)
        and ``Timestamp`` (the publication date of its document).

    Raises
    ------
    IndexError
        If a file's name has no matching ``document_number`` in the metadata.
    """
    filenames = extract_filenames(dir)
    d_df = extract_dates()

    # Build the document_number -> publication_date lookup once instead of
    # re-filtering the whole metadata frame for every file. keep='first'
    # selects the same row the per-file `.iloc[0]` lookup used to return.
    date_by_doc = (
        d_df.drop_duplicates(subset='document_number', keep='first')
        .set_index('document_number')['publication_date']
        .to_dict()
    )

    results = []
    dates = []

    for filename in filenames:
        p_tags = extract_p_tags_from_file(f'./{dir}/{filename}.xml')

        doc_number = str(filename)
        if doc_number not in date_by_doc:
            # Same exception type as the former out-of-bounds `.iloc[0]`,
            # but the message now names the offending document.
            raise IndexError(
                f"no publication date found for document number {doc_number!r}"
            )

        results.extend(p_tags)
        # Every paragraph of a document shares that document's date.
        dates.extend([date_by_doc[doc_number]] * len(p_tags))

        print(f"{filename} complete")

    return pd.DataFrame({'P-tags': results, 'Timestamp': dates})

## Testing Section

This section of the notebook will include tests for the helper functions defined above. 

In [33]:
# Collect the document names (file names without their extension) from ./docs.
filenames = extract_filenames('docs')

In [43]:
# Spot-check the first extracted name.
filenames[0]

'01-10222'

In [38]:
# Load the rule metadata, sorted by publication date.
dates = extract_dates()

In [47]:
# Publication date recorded for the first extracted document number.
dates.loc[dates['document_number'] == str(filenames[0]), 'publication_date'].iloc[0]

Timestamp('2001-04-25 00:00:00')

In [None]:
# Run the extraction over every file in ./docs; one progress line is printed per file.
results_df = extract_p_tags('docs')

01-10222 complete
01-10398 complete
01-10407 complete
01-1073 complete
01-11005 complete
01-1114 complete
01-11333 complete
01-11572 complete
01-11607 complete
01-11608 complete
01-11609 complete
01-11610 complete
01-11861 complete
01-12084 complete
01-12131 complete
01-12278 complete
01-12388 complete
01-12489 complete
01-12689 complete
01-12696 complete
01-1305 complete
01-13316 complete
01-13373 complete
01-13526 complete
01-13586 complete
01-13723 complete
01-14218 complete
01-14529 complete
01-14830 complete
01-15137 complete
01-15272 complete
01-15455 complete
01-15724 complete
01-15869 complete
01-15978 complete
01-16018 complete
01-1614 complete
01-16328 complete
01-16329 complete
01-16330 complete
01-16501 complete
01-16869 complete
01-17000 complete
01-17171 complete
01-17302 complete
01-17425 complete
01-175 complete
01-17904 complete
01-18033 complete
01-18261 complete
01-18356 complete
01-18357 complete
01-1906 complete
01-19496 complete
01-19497 complete
01-19521 complete

04-1198 complete
04-12042 complete
04-12317 complete
04-12521 complete
04-12727 complete
04-12922 complete
04-13084 complete
04-13147 complete
04-1323 complete
04-13276 complete
04-13290 complete
04-13314 complete
04-13412 complete
04-13413 complete
04-13678 complete
04-13965 complete
04-14138 complete
04-14406 complete
04-14504 complete
04-14505 complete
04-14755 complete
04-14815 complete
04-15081 complete
04-15523 complete
04-15526 complete
04-15580 complete
04-15585 complete
04-15757 complete
04-15782 complete
04-15875 complete
04-15950 complete
04-1605 complete
04-16319 complete
04-16401 complete
04-16441 complete
04-1669 complete
04-16818 complete
04-16888 complete
04-17051 complete
04-17112 complete
04-17362 complete
04-17459 complete
04-17460 complete
04-17571 complete
04-18118 complete
04-18344 complete
04-18349 complete
04-18413 complete
04-18449 complete
04-18650 complete
04-18681 complete
04-18754 complete
04-18888 complete
04-19021 complete
04-19258 complete
04-19575 compl

2010-27036 complete
2010-27191 complete
2010-27532 complete
2010-27533 complete
2010-27538 complete
2010-27541 complete
2010-27547 complete
2010-27555 complete
2010-27657 complete
2010-28136 complete
2010-28186 complete
2010-28303 complete
2010-28327 complete
2010-2856 complete
2010-28627 complete
2010-29003 complete
2010-29006 complete
2010-29009 complete
2010-29021 complete
2010-29022 complete
2010-29023 complete
2010-29024 complete
2010-29137 complete
2010-29138 complete
2010-29277 complete
2010-29702 complete
2010-29710 complete
2010-29719 complete
2010-29831 complete
2010-29836 complete
2010-29880 complete
2010-29883 complete
2010-29956 complete
2010-29957 complete
2010-29994 complete
2010-30077 complete
2010-30078 complete
2010-30476 complete
2010-30590 complete
2010-30884 complete
2010-31014 complete
2010-31029 complete
2010-31130 complete
2010-31131 complete
2010-31133 complete
2010-31458 complete
2010-31529 complete
2010-31530 complete
2010-31578 complete
2010-31579 complete
2

2012-17603 complete
2012-17663 complete
2012-17763 complete
2012-17854 complete
2012-17860 complete
2012-17918 complete
2012-17985 complete
2012-18003 complete
2012-18382 complete
2012-18383 complete
2012-18726 complete
2012-18762 complete
2012-18827 complete
2012-19664 complete
2012-19702 complete
2012-19974 complete
2012-19977 complete
2012-20089 complete
2012-20422 complete
2012-20432 complete
2012-20508 complete
2012-20531 complete
2012-20808 complete
2012-20962 complete
2012-21153 complete
2012-21155 complete
2012-21414 complete
2012-21606 complete
2012-21681 complete
2012-21805 complete
2012-21998 complete
2012-22000 complete
2012-23688 complete
2012-24276 complete
2012-24375 complete
2012-24608 complete
2012-24952 complete
2012-24987 complete
2012-24988 complete
2012-24998 complete
2012-25116 complete
2012-25123 complete
2012-25194 complete
2012-25315 complete
2012-25495 complete


In [53]:
# Display the extracted paragraphs together with their timestamps.
results_df

Unnamed: 0,P-tags,Timestamp
0,Commodity Futures Trading Commission.,2001-04-25
1,Final rules.,2001-04-25
2,Pursuant to section 111 of the Commodity Futur...,2001-04-25
3,"June 19, 2001.",2001-04-25
4,"Lawrence B. Patent, Associate Chief Counsel, o...",2001-04-25
...,...,...
89,(i) All trades or positions of the customer wi...,2001-04-25
90,"(ii) All money, securities, or property held i...",2001-04-25
91,(d) Each futures commission merchant shall mai...,2001-04-25
92,"7 U.S.C. 1a, 2, 4a, 6c, 6d, 6g, 7, 7a, 12, 19,...",2001-04-25
