In [1]:
from get_white_house_texts import get_data_whpb
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import re
import nltk
import clean_white_house as clean_wh
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
import pprint
from __future__ import division

# Disable pandas' chained-assignment check (SettingWithCopyWarning); later cells
# add and overwrite columns on frames that were produced by boolean filtering.
pd.set_option('mode.chained_assignment', None)

In [34]:
# Load the two raw exports that still need cleaning (briefings first, then remarks).
briefings, remarks = (
    pd.read_csv(csv_path)
    for csv_path in ('briefings-for-cleaning.csv', 'remarks-for-cleaning.csv')
)

In [35]:
# (rows, columns) of the raw briefings table.
briefings.shape

(251508, 5)

In [36]:
# (rows, columns) of the raw remarks table.
remarks.shape

(157344, 5)

# Cleaning

In [37]:
# Most of the cleaning is delegated to clean_white_house.start_table_cleaning.
# Per the original note, it removes rows that hold the document type, the time,
# the end marker and similar boilerplate.
remarks, briefings = (
    clean_wh.start_table_cleaning(raw_table) for raw_table in (remarks, briefings)
)

In [38]:
# Remarks size after start_table_cleaning.
remarks.shape

(146836, 5)

In [39]:
# Briefings size after start_table_cleaning.
briefings.shape

(197821, 5)

In [40]:
# Drop rows whose text starts with a written-out date such as "January 7, 2021".
#
# The pattern is a raw string so that `\s` and `\d` reach the regex engine instead
# of being parsed as (invalid) Python string escapes, and the month alternation is
# a non-capturing group so `str.contains` does not warn about unused match groups.
# It is defined once and shared by both tables.
DATE_ROW_PATTERN = (
    r'^(?:January|February|March|April|May|June|July|August|September|October|November|December)'
    r'\s\d{1,2}.{1,2}\d{4}'
)

remarks = remarks[~remarks.text.str.contains(DATE_ROW_PATTERN)].reset_index(drop=True)

briefings = briefings[~briefings.text.str.contains(DATE_ROW_PATTERN)].reset_index(drop=True)

  
  


In [41]:
# Remarks size after removing the date rows.
remarks.shape

(146767, 5)

In [42]:
# Briefings size after removing the date rows.
briefings.shape

(197748, 5)

In [43]:
# Remarks rows containing the literal text 'A.M' (plain substring, not a regex).
remarks[remarks.text.str.contains('A.M', regex=False)]

Unnamed: 0,title,date,text,link,doc_id


In [44]:
# Remarks rows containing the literal text 'P.M' (plain substring, not a regex).
remarks[remarks.text.str.contains('P.M', regex=False)]

Unnamed: 0,title,date,text,link,doc_id
98,Remarks by Vice President Pence at a Roundtabl...,2020-12-15,1:56P.M. EST,https://www.whitehouse.gov/briefings-statement...,4


In [45]:
# Briefings rows containing the literal text 'A.M' (plain substring, not a regex).
briefings[briefings.text.str.contains('A.M', regex=False)]

Unnamed: 0,title,date,text,link,doc_id


In [46]:
# Briefings rows containing the literal text 'P.M' (plain substring, not a regex).
briefings[briefings.text.str.contains('P.M', regex=False)]

Unnamed: 0,title,date,text,link,doc_id
206,Remarks by Vice President Pence at a Roundtabl...,2020-12-15,1:56P.M. EST,https://www.whitehouse.gov/briefings-statement...,62


In [47]:
# The two 'P.M' hits are the same stray timestamp row, once per table; filter it
# out of each. Documents duplicated across the tables are handled in a later cell.
remarks_has_pm = remarks.text.str.contains('P.M', regex=False)
briefings_has_pm = briefings.text.str.contains('P.M', regex=False)
remarks = remarks[~remarks_has_pm]
briefings = briefings[~briefings_has_pm]

In [48]:
# Rows whose entire text is "Q" are rewritten to the label 'PRESS CORPS: '.
press_question_swap = {'to_replace': '^Q$', 'value': 'PRESS CORPS: ', 'regex': True}
remarks['text'] = remarks['text'].replace(**press_question_swap)
briefings['text'] = briefings['text'].replace(**press_question_swap)

In [49]:
# Tag every row with the table it came from so the two frames can be told apart
# once they are concatenated.
for source_label, source_frame in (('remarks', remarks), ('briefings', briefings)):
    source_frame['table'] = source_label

In [50]:
# Stack remarks above briefings. With keep='first', a (title, date) pair that
# occurs in both tables survives only as its first remarks row.
stacked = pd.concat([remarks, briefings])
deduplicated = stacked.drop_duplicates(subset=['title', 'date'], keep='first')
# Rows still tagged 'briefings' belong to (title, date) pairs absent from remarks.
together = deduplicated[deduplicated.table == 'briefings']

In [51]:
# Keep only the briefing documents that are not already present in remarks.
# `together` contains the surviving briefing rows; their doc_ids name the
# documents to keep.
print(f'Briefing table shape before dropping duplicates: {briefings.shape}')

briefing_keeps = together.doc_id.tolist()

# Vectorised membership test: `isin` hashes the ids once instead of scanning the
# Python list for every row, and no temporary flag column is required.
briefings = briefings[briefings.doc_id.isin(briefing_keeps)]

# The table-of-origin tag is no longer needed in either frame.
briefings = briefings.drop(columns=['table'])
remarks = remarks.drop(columns=['table'])

print(f'Briefing table shape after dropping duplicates: {briefings.shape}')

Briefing table shape before dropping duplicates: (197747, 6)
Briefing table shape after dropping duplicates: (49228, 5)


In [57]:
# Display the remarks table as it stands after de-duplication.
remarks

Unnamed: 0,title,date,text,link,doc_id
0,Remarks by Press Secretary Kayleigh McEnany,2021-01-07,James S. Brady Press Briefing Room,https://www.whitehouse.gov/briefings-statement...,0
1,Remarks by Press Secretary Kayleigh McEnany,2021-01-07,MS. MCENANY: I am here to deliver this message...,https://www.whitehouse.gov/briefings-statement...,0
2,Remarks by Press Secretary Kayleigh McEnany,2021-01-07,I stood here at this podium the day after a hi...,https://www.whitehouse.gov/briefings-statement...,0
3,Remarks by Press Secretary Kayleigh McEnany,2021-01-07,We grieve for the loss of life and those injur...,https://www.whitehouse.gov/briefings-statement...,0
4,Remarks by Press Secretary Kayleigh McEnany,2021-01-07,What we saw yesterday was a group of violent r...,https://www.whitehouse.gov/briefings-statement...,0
5,Remarks by Press Secretary Kayleigh McEnany,2021-01-07,Those who are working in this building are wor...,https://www.whitehouse.gov/briefings-statement...,0
6,Remarks by Press Secretary Kayleigh McEnany,2021-01-07,Thank you very much.,https://www.whitehouse.gov/briefings-statement...,0
7,Remarks by Vice President Pence at the Space F...,2020-12-18,"THE VICE PRESIDENT: Well, thank you all for be...",https://www.whitehouse.gov/briefings-statement...,1
8,Remarks by Vice President Pence at the Space F...,2020-12-18,Many kind things have been said about your Vic...,https://www.whitehouse.gov/briefings-statement...,1
9,Remarks by Vice President Pence at the Space F...,2020-12-18,We just left the Oval Office with the Presiden...,https://www.whitehouse.gov/briefings-statement...,1


In [62]:
# Per-row capitalisation features for remarks:
#   total_words        - number of space-separated tokens in the text
#   first_letter_up    - how many of those tokens start with an uppercase letter
#   cap_first_by_total - first_letter_up / total_words
#
# first_letter_up has to be a single count: a list of booleans cannot be divided
# by total_words. Empty tokens (produced by consecutive spaces) must not be
# indexed either; `token[:1]` is '' for an empty token and ''.isupper() is False,
# so those tokens count as not capitalised. This mirrors the briefings features
# computed later in the notebook.
remarks['total_words'] = remarks.text.map(lambda x: len(x.split(' ')))
remarks['first_letter_up'] = remarks.text.map(
    lambda x: sum(token[:1].isupper() for token in x.split(' '))
)
remarks['cap_first_by_total'] = remarks['first_letter_up'] / remarks['total_words']

In [21]:
def _text_if_no_colon(row):
    """Return the row's text when it contains no ':'; otherwise NaN."""
    return np.nan if ':' in row['text'] else row['text']


# Start the delete list with every document's first line that has no colon in it.
opening_rows = remarks.groupby('doc_id', as_index=False).first()
delete_lines = opening_rows.apply(_text_if_no_colon, axis=1).dropna().tolist()

In [22]:
# Number of distinct texts collected so far.
len(set(delete_lines))

1

In [23]:
# Second and third row (0-based positions 1 and 2) of every document.
rows_by_doc = remarks.groupby('doc_id', as_index=False)
second_line = rows_by_doc.nth(1)
third_line = rows_by_doc.nth(2)

In [24]:
# Add second lines in which every token is capitalised and no colon appears.
fully_capitalised = second_line.cap_first_by_total == 1
lacks_colon = ~second_line.text.str.contains(':')
delete_lines.extend(second_line[fully_capitalised & lacks_colon].text.unique().tolist())

In [25]:
# Distinct texts collected after adding the second-line candidates.
len(set(delete_lines))

1

In [26]:
# Add third lines that are short (< 15 tokens), colon-free and more than half
# capitalised.
is_short = third_line.total_words < 15
lacks_colon = ~third_line.text.str.contains(':')
mostly_capitalised = third_line.cap_first_by_total > .5
delete_lines.extend(third_line[is_short & lacks_colon & mostly_capitalised].text.tolist())

In [27]:
# Distinct texts collected after adding the third-line candidates.
len(set(delete_lines))

1

In [28]:
# Remove every row whose text exactly equals one of the collected lines; the same
# list is applied to briefings as well.
#
# `isin` replaces the per-row `x in delete_lines` scan of a Python list, so the
# cost no longer grows with rows x list length, and the temporary delete_bool
# column is not needed.
remarks = remarks[~remarks.text.isin(delete_lines)].reset_index(drop=True)
briefings = briefings[~briefings.text.isin(delete_lines)].reset_index(drop=True)

In [29]:
# Remarks size after the first delete pass.
remarks.shape

(76, 8)

In [30]:
# Remarks rows whose title contains 'Statement'.
remarks[remarks.title.str.contains('Statement')]

Unnamed: 0,title,date,text,link,doc_id,first_letter_up,total_words,cap_first_by_total


In [31]:
# First row of each remarks document, for another look at the opening lines.
remarks_by_doc = remarks.groupby('doc_id', as_index=False)
first_line = remarks_by_doc.first()

In [32]:
# Second pass over the opening lines: collect first lines with no colon, fewer
# than 45 tokens, more than 30% capitalised tokens and no 'hello'.
delete_lines = first_line[(~first_line.text.str.contains(':')) &
                          (first_line.total_words < 45) &
                          (first_line.cap_first_by_total > .3) &
                          (~first_line.text.str.contains('hello'))].text.tolist()

# Drop exact text matches from both tables. `isin` replaces the per-row scan of
# the Python list and removes the need for a temporary delete_bool column.
remarks = remarks[~remarks.text.isin(delete_lines)].reset_index(drop=True)
briefings = briefings[~briefings.text.isin(delete_lines)].reset_index(drop=True)

In [33]:
# Shapes after the second delete pass: remarks first, then briefings.
for cleaned_table in (remarks, briefings):
    print(cleaned_table.shape)

(75, 8)
(195117, 5)


In [None]:
# Checkpoint both tables to CSV without writing the index.
for cleaned_table, csv_name in ((remarks, 'clean-remarks-1-14.csv'),
                                (briefings, 'clean-briefings-1-14.csv')):
    cleaned_table.to_csv(csv_name, index=False)

In [153]:
# Resume from the saved checkpoint files.
remarks, briefings = (
    pd.read_csv(csv_name)
    for csv_name in ('clean-remarks-1-14.csv', 'clean-briefings-1-14.csv')
)

In [154]:
# Shapes of the reloaded tables: remarks first, then briefings.
for reloaded_table in (remarks, briefings):
    print(reloaded_table.shape)

(144623, 8)
(48898, 5)


In [155]:
# Drop rows whose title contains 'Message from', then renumber the rows.
remarks_is_message = remarks.title.str.contains('Message from')
briefings_is_message = briefings.title.str.contains('Message from')

remarks = remarks[~remarks_is_message].reset_index(drop=True)
briefings = briefings[~briefings_is_message].reset_index(drop=True)

In [156]:
# Remarks rows that contain a closing parenthesis.
# Raw string: in a normal literal '\)' is an invalid Python escape sequence.
remarks[remarks.text.str.contains(r'\)')]

Unnamed: 0,title,date,text,link,doc_id,first_letter_up,total_words,cap_first_by_total
49916,Remarks by Vice President Pence and Prime Mini...,2020-01-23,The flags are presented.),https://www.whitehouse.gov/briefings-statement...,410,1,4,0.25
113953,Remarks by President Trump at Swearing-In Cere...,2018-05-21,The Oath of Office is administered.),https://www.whitehouse.gov/briefings-statement...,1313,3,6,0.5
129829,Remarks by President Trump at Meeting with Hou...,2017-11-02,President is given a document.),https://www.whitehouse.gov/briefings-statement...,1610,1,5,0.2


In [157]:
# Drop remarks rows that contain a closing parenthesis, then renumber the rows.
# Raw string: in a normal literal '\)' is an invalid Python escape sequence.
remarks = remarks[~remarks.text.str.contains(r'\)')].reset_index(drop=True)

In [158]:
# Briefings rows that contain a closing parenthesis.
# Raw string: in a normal literal '\)' is an invalid Python escape sequence.
briefings[briefings.text.str.contains(r'\)')]

Unnamed: 0,title,date,text,link,doc_id
42200,Daily Press Briefing by Press Secretary Sean S...,2017-03-28,"A few things I want to highlight. Last night, ...",https://www.whitehouse.gov/briefings-statement...,6350


In [159]:
# Full text of the row found above (column position 2 returns the text here).
# NOTE(review): both indices are positional and depend on the current row order.
briefings.iloc[42200, 2]

'A few things I want to highlight. Last night, the President announced his intent to nominate Makan Delrahim to serve as Assistant Attorney General of the Anti-Trust Divisio) at the Department of Justice.'

In [160]:
# Repair the typo 'Divisio)' -> 'Division' in the briefing text.
# The row is located by its content instead of the hard-coded position
# (42200, 2): a positional write would silently edit whichever row sits at that
# position if any upstream filtering step changes.
typo_rows = briefings.text.str.contains('Anti-Trust Divisio)', regex=False)
briefings.loc[typo_rows, 'text'] = briefings.loc[typo_rows, 'text'].str.replace(
    'Divisio)', 'Division', regex=False
)

In [161]:
# Remarks rows that mention a written-out date such as "April 17, 1961" anywhere.
# Raw string so `\s` / `\d` are regex escapes rather than invalid Python string
# escapes; non-capturing group so `str.contains` does not emit its match-group
# warning.
remarks[remarks.text.str.contains(
    r'(?:January|February|March|April|May|June|July|August|September|October|November|December)'
    r'\s\d{1,2},? \d{4}'
)]

  """Entry point for launching an IPython kernel.


Unnamed: 0,title,date,text,link,doc_id,first_letter_up,total_words,cap_first_by_total
2017,Remarks by President Trump to Guests at the Al...,2020-10-01,Few institutions in history have done more for...,https://www.whitehouse.gov/briefings-statement...,39,13,86,0.151163
2738,Remarks by President Trump Honoring Bay of Pig...,2020-09-23,"On April 17, 1961, the 1,400 Cuban exiles of B...",https://www.whitehouse.gov/briefings-statement...,51,13,61,0.213115
3998,Remarks by President Trump at Presentation of ...,2020-09-11,MILITARY AIDE: Attention to orders. The Medal ...,https://www.whitehouse.gov/briefings-statement...,71,16,43,0.372093
7709,Remarks by Vice President Pence at a Support L...,2020-08-14,"I must tell you, the world saw the character o...",https://www.whitehouse.gov/briefings-statement...,108,10,112,0.089286
15072,Remarks by Vice President Pence at a Naturaliz...,2020-07-02,"You see, he came to this country — stepped off...",https://www.whitehouse.gov/briefings-statement...,180,13,74,0.175676
15758,The Chinese Communist Party’s Ideology and Glo...,2020-06-26,"[1] John Garnaut, “Engineers of the Soul: Ideo...",https://www.whitehouse.gov/briefings-statement...,189,8,16,0.500000
15759,The Chinese Communist Party’s Ideology and Glo...,2020-06-26,"[2] John Garnaut, “Engineers of the Soul: Ideo...",https://www.whitehouse.gov/briefings-statement...,189,8,16,0.500000
15760,The Chinese Communist Party’s Ideology and Glo...,2020-06-26,"[3] John Garnaut, “Engineers of the Soul: Ideo...",https://www.whitehouse.gov/briefings-statement...,189,8,16,0.500000
15761,The Chinese Communist Party’s Ideology and Glo...,2020-06-26,"[4] “Document 9: A ChinaFile Translation,” Nov...",https://www.whitehouse.gov/briefings-statement...,189,4,10,0.400000
15762,The Chinese Communist Party’s Ideology and Glo...,2020-06-26,"[5] “Document 9: A ChinaFile Translation,” Nov...",https://www.whitehouse.gov/briefings-statement...,189,4,10,0.400000


In [162]:
# All rows of document 189, the source of the bracketed "[n] ..." rows listed above.
remarks[remarks.doc_id == 189]

Unnamed: 0,title,date,text,link,doc_id,first_letter_up,total_words,cap_first_by_total
15705,The Chinese Communist Party’s Ideology and Glo...,2020-06-26,"Well, thank you Governor. That was an extraord...",https://www.whitehouse.gov/briefings-statement...,189,27,195,0.138462
15706,The Chinese Communist Party’s Ideology and Glo...,2020-06-26,It is wonderful to be here at the Arizona Comm...,https://www.whitehouse.gov/briefings-statement...,189,21,110,0.190909
15707,The Chinese Communist Party’s Ideology and Glo...,2020-06-26,I bring you greetings from the 45th President ...,https://www.whitehouse.gov/briefings-statement...,189,16,64,0.250000
15708,The Chinese Communist Party’s Ideology and Glo...,2020-06-26,I appreciate the kind invitation to come discu...,https://www.whitehouse.gov/briefings-statement...,189,22,88,0.250000
15709,The Chinese Communist Party’s Ideology and Glo...,2020-06-26,"America, under President Trump’s leadership, h...",https://www.whitehouse.gov/briefings-statement...,189,14,100,0.140000
15710,The Chinese Communist Party’s Ideology and Glo...,2020-06-26,It was under this premise that we welcomed Chi...,https://www.whitehouse.gov/briefings-statement...,189,12,50,0.240000
15711,The Chinese Communist Party’s Ideology and Glo...,2020-06-26,"As China grew richer and stronger, we believed...",https://www.whitehouse.gov/briefings-statement...,189,10,53,0.188679
15712,The Chinese Communist Party’s Ideology and Glo...,2020-06-26,We could not have been more wrong—and this mis...,https://www.whitehouse.gov/briefings-statement...,189,7,40,0.175000
15713,The Chinese Communist Party’s Ideology and Glo...,2020-06-26,The answer is simple: because we did not pay h...,https://www.whitehouse.gov/briefings-statement...,189,6,54,0.111111
15714,The Chinese Communist Party’s Ideology and Glo...,2020-06-26,"Let us be clear, the Chinese Communist Party i...",https://www.whitehouse.gov/briefings-statement...,189,30,101,0.297030


In [163]:
# Drop rows whose text starts with a bracketed marker such as "[1] ...".
# Raw string: '\[' and '\]' are invalid escape sequences in a normal literal.
remarks = remarks[~remarks.text.str.contains(r'^\[.+\]')].reset_index(drop=True)

In [164]:
# Briefings rows whose text starts with a bracketed marker such as "[1] ...".
# Raw string: '\[' and '\]' are invalid escape sequences in a normal literal.
briefings[briefings.text.str.contains(r'^\[.+\]')]

Unnamed: 0,title,date,text,link,doc_id


In [165]:
# Remove rows whose text is just a full stop, then renumber the rows.
not_lone_period = remarks.text != '.'
remarks = remarks[not_lone_period].reset_index(drop=True)

In [166]:
# Last row of each remarks document.
remarks_by_doc = remarks.groupby('doc_id', as_index=False)
last_line = remarks_by_doc.last()

In [167]:
# Closing lines that contain none of the listed cues (thanks, ':' or '?',
# 'So', 'God', 'you', goodbye). These are the ones worth inspecting by eye.
sign_off_cues = ['[Tt]hank', '[:?]', 'So', 'God', 'you', '[Gg]oodbye']
has_sign_off = last_line.text.str.contains('|'.join(sign_off_cues))
last_line[~has_sign_off]

Unnamed: 0,doc_id,title,date,text,link,first_letter_up,total_words,cap_first_by_total
35,35,Remarks by President Trump Before Marine One D...,2020-10-15,But we won — we won a big case — $7.5 billion.,https://www.whitehouse.gov/briefings-statement...,1,12,0.083333
63,63,Remarks by President Trump Before Marine One D...,2020-09-15,The governor of Nevada should not be in charge...,https://www.whitehouse.gov/briefings-statement...,7,71,0.098592
69,69,Remarks by President Trump in a Ceremony Recog...,2020-09-14,The brave actions and superior airmanship of t...,https://www.whitehouse.gov/briefings-statement...,16,69,0.231884
71,71,Remarks by President Trump at Presentation of ...,2020-09-11,Sergeant First Class Payne’s gallantry under f...,https://www.whitehouse.gov/briefings-statement...,12,37,0.324324
144,144,Remarks by President Trump at the Presentation...,2020-07-24,"Following his success on the track, Mr. Ryun c...",https://www.whitehouse.gov/briefings-statement...,9,50,0.18
161,161,Remarks by Vice President Pence During a Round...,2020-07-14,"With that, I’d be happy to recognize the Secre...",https://www.whitehouse.gov/briefings-statement...,6,21,0.285714
173,173,Remarks by President Trump and President López...,2020-07-08,Long live the friendship of our two nations. L...,https://www.whitehouse.gov/briefings-statement...,15,30,0.5
248,248,Remarks by President Trump at Presidential Rec...,2020-05-15,"Okay, please.",https://www.whitehouse.gov/briefings-statement...,1,2,0.5
319,319,Remarks by Vice President Pence in a Press Gaggle,2020-03-23,"President Trump and I absolutely believe, for ...",https://www.whitehouse.gov/briefings-statement...,11,53,0.207547
353,353,Remarks by President Trump After Marine One Ar...,2020-03-03,"He gave a very bad signal, in my opinion.",https://www.whitehouse.gov/briefings-statement...,1,9,0.111111


In [168]:
# Drop rows whose text contains 'WHITE HOUSE'.
mentions_white_house = remarks.text.str.contains('WHITE HOUSE')
remarks = remarks[~mentions_white_house]

In [169]:
# Drop rows whose text contains 'Cited'.
mentions_cited = remarks.text.str.contains('Cited')
remarks = remarks[~mentions_cited]

In [170]:
# Last remaining row of document 1499.
remarks[remarks.doc_id == 1499].iloc[-1]

title                 Remarks by LTG H.R. McMaster at the United Sta...
date                                                         2018-01-21
text                  I commend all of you for choosing service to o...
link                  https://www.whitehouse.gov/briefings-statement...
doc_id                                                             1499
first_letter_up                                                       4
total_words                                                          34
cap_first_by_total                                             0.117647
Name: 125234, dtype: object

In [171]:
# doc_ids of every document whose title contains 'State of the Union'.
remarks[remarks.title.str.contains('State of the Union')].doc_id.unique()

array([ 393,  988,  990, 1482, 1483])

In [172]:
# First row of each 'State of the Union' document, to compare how they open.
remarks[remarks.title.str.contains('State of the Union')].groupby('doc_id').first()

Unnamed: 0_level_0,title,date,text,link,first_letter_up,total_words,cap_first_by_total
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
393,Remarks by President Trump in State of the Uni...,2020-02-04,THE PRESIDENT: Thank you very much. Thank you....,https://www.whitehouse.gov/briefings-statement...,5,12,0.416667
988,Remarks by President Trump in State of the Uni...,2019-02-06,"THE PRESIDENT: Madam Speaker, Mr. Vice Preside...",https://www.whitehouse.gov/briefings-statement...,14,23,0.608696
990,President Donald J. Trump’s State of the Union...,2019-02-05,TO THE CONGRESS OF THE UNITED STATES:,https://www.whitehouse.gov/briefings-statement...,7,7,1.0
1482,Remarks by President Trump in State of the Uni...,2018-01-30,"THE PRESIDENT: Mr. Speaker, Mr. Vice President...",https://www.whitehouse.gov/briefings-statement...,14,21,0.666667
1483,President Donald J. Trump’s State of the Union...,2018-01-30,TO THE CONGRESS OF THE UNITED STATES:,https://www.whitehouse.gov/briefings-statement...,7,7,1.0


In [173]:
# Drop the documents titled "President Donald J. Trump’s State of the Union ..."
# (the ones that open with "TO THE CONGRESS ..." above), then renumber the rows.
is_written_sotu = remarks.title.str.contains("President Donald J. Trump’s State of the Union")
remarks = remarks[~is_written_sotu].reset_index(drop=True)

In [174]:
# Drop rows whose text contains "As prepared", then renumber the rows.
mentions_as_prepared = remarks.text.str.contains("As prepared")
remarks = remarks[~mentions_as_prepared].reset_index(drop=True)

In [175]:
def _doc_and_text_if_no_colon(row):
    """Return (doc_id, text) when the text contains no ':'; otherwise NaN."""
    return np.nan if ':' in row['text'] else (row['doc_id'], row['text'])


## Documents whose first line has no ':' do not open with a speaker label, so
## they cannot be turned into a transcript-style table and will be deleted.
remarks.groupby('doc_id', as_index=False).first().apply(
    _doc_and_text_if_no_colon, axis=1).dropna()

7       (7, And before I begin, allow me to bring gree...
30      (30, I’d like to thank Dean Godson and Policy ...
151     (151, As we see rising coronavirus cases not j...
154     (154, You know, in our first three years, Pres...
162     (162, I remember the early conversations with ...
189     (189, Well, thank you Governor. That was an ex...
220     (220, So we look forward to being there. Some ...
222     (222, I think that, before I leave, I want to ...
223     (223, Before going further on this exciting da...
267     (267, Good morning everyone. I’m Matt Pottinge...
470     (470, Thank you for the kind introduction. I a...
471     (471, Well, thank you Mort for that very kind ...
944     (945, Earlier today, Mrs. Trump participated i...
1032    (1034, Well, Kim, thanks very much, and thanks...
1081    (1084, Thank you, Ingeborg, for your kind intr...
1093    (1096, I’ve been following very closely the ev...
1205    (1208, Good morning. I’m pleased to be here to...
1330    (1333,

In [176]:
# Peek at the first rows of remarks before deleting the speaker-less documents.
remarks.head()

Unnamed: 0,title,date,text,link,doc_id,first_letter_up,total_words,cap_first_by_total
0,Remarks by Press Secretary Kayleigh McEnany,2021-01-07,MS. MCENANY: I am here to deliver this message...,https://www.whitehouse.gov/briefings-statement...,0,12,72,0.166667
1,Remarks by Press Secretary Kayleigh McEnany,2021-01-07,I stood here at this podium the day after a hi...,https://www.whitehouse.gov/briefings-statement...,0,10,66,0.151515
2,Remarks by Press Secretary Kayleigh McEnany,2021-01-07,We grieve for the loss of life and those injur...,https://www.whitehouse.gov/briefings-statement...,0,3,37,0.081081
3,Remarks by Press Secretary Kayleigh McEnany,2021-01-07,What we saw yesterday was a group of violent r...,https://www.whitehouse.gov/briefings-statement...,0,6,69,0.086957
4,Remarks by Press Secretary Kayleigh McEnany,2021-01-07,Those who are working in this building are wor...,https://www.whitehouse.gov/briefings-statement...,0,6,42,0.142857


In [177]:
# Delete every document whose first line contains no ':' (no opening speaker).
#
# The ids are selected with a vectorised mask rather than a row-wise `apply`
# that mixed ids with NaN (turning them into floats), and rows are filtered with
# `isin` instead of scanning the id list once per row.
first_rows = remarks.groupby('doc_id', as_index=False).first()
has_no_colon = ~first_rows.text.str.contains(':', regex=False)
doc_ids_delete = first_rows.loc[has_no_colon, 'doc_id'].tolist()

remarks = remarks[~remarks.doc_id.isin(doc_ids_delete)]

# The capitalisation helper columns are not needed beyond this point.
remarks = remarks.drop(columns=['first_letter_up', 'total_words', 'cap_first_by_total'])

remarks = remarks.reset_index(drop=True)

In [178]:
# Remarks size once the speaker-less documents and helper columns are gone.
remarks.shape

(143044, 5)

In [179]:
def _capitalised_token_count(text):
    """Count the space-separated tokens whose first character is uppercase."""
    # token[:1] is '' for an empty token, and ''.isupper() is False.
    return sum(1 for token in text.split(' ') if token[:1].isupper())


## Remaining briefings cleanup starts here: same capitalisation features as remarks.
briefings['total_words'] = briefings.text.map(lambda line: len(line.split(' ')))
briefings['first_letter_up'] = briefings.text.map(_capitalised_token_count)
briefings['cap_first_by_total'] = briefings['first_letter_up'] / briefings['total_words']

In [131]:
# Smallest capitalised-token count across all briefing rows.
min(briefings.first_letter_up)

0

In [180]:
# First row of each briefing document.
briefings_by_doc = briefings.groupby('doc_id', as_index=False)
first_line = briefings_by_doc.first()

In [181]:
# First lines with no ':' and under 75% capitalised tokens: the document does not
# open with a speaker, so the whole document is going to be deleted.
no_speaker = ~first_line.text.str.contains(':')
mostly_lowercase = first_line.cap_first_by_total < .75
first_line[no_speaker & mostly_lowercase]

Unnamed: 0,doc_id,title,date,text,link,total_words,first_letter_up,cap_first_by_total
3,123,Press Briefing by Vice President Pence and Mem...,2020-11-19,"I’m joined today, and you’ll hear in a few mom...",https://www.whitehouse.gov/briefings-statement...,72,15,0.208333
29,678,Remarks by President Trump on the Jobs Numbers...,2020-07-02,I’d like to just announce the spectacular news...,https://www.whitehouse.gov/briefings-statement...,53,8,0.150943
285,6201,On-the-Record Press Call on the Education Fede...,2017-04-26,"Hi, this is Rob Goad. I’m a senior Department ...",https://www.whitehouse.gov/briefings-statement...,32,9,0.28125
349,6622,Statement by Press Secretary Sean Spicer,2017-01-31,Good afternoon. I know you’re all looking forw...,https://www.whitehouse.gov/briefings-statement...,44,9,0.204545


In [182]:
# First lines with no ':' and over 75% capitalised tokens: locations and similar
# lines that can be deleted row by row.
no_speaker = ~first_line.text.str.contains(':')
mostly_capitalised = first_line.cap_first_by_total > .75
first_line[no_speaker & mostly_capitalised].text.unique()

array(['James S. Brady Press Briefing Room February 26, 2020',
       'Aboard Air Force One En Route Washington, D.C.',
       'Aboard Air Force One En Route Fargo, North Dakota',
       'Aboard Air Force One En Route Tampa, Florida',
       'En Route Dubuque, Iowa',
       'Aboard Air Force One En Route Lewisburg, West Virginia',
       'JW Marriott Singapore',
       'Aboard Air Force One En Route Dallas, Texas',
       'Aboard Air Force One En Route Nashville, Tennessee',
       'Aboard Air Force One En Route South Bend, Indiana',
       'Hilton West Palm Beach',
       'Aboard Air Force One En Route Cleveland, Ohio',
       'Aboard Air Force One En Route St. Louis, Missouri',
       'Aboard Air Force One En Route Cincinnati, Ohio',
       'En Route Pittsburgh, Pennsylvania',
       'En Route Nashville, Tennessee', 'En Route Pensacola, Florida',
       'Aboard Air Force One En Route Da Nang, Vietnam',
       'China World Hotel Beijing, China', 'En Route Beijing, China',
       'New 

In [184]:
# Apply the two findings above to briefings:
#   * documents whose colon-free first line is mostly lower-case are removed whole;
#   * colon-free, mostly capitalised first lines are removed wherever their exact
#     text occurs.
print(f'Table shape before dropping rows: {briefings.shape}')

no_speaker = ~first_line.text.str.contains(':')

docs_delete = first_line[no_speaker & (first_line.cap_first_by_total < .75)].doc_id.tolist()

# NOTE(review): a ratio of exactly .75 falls into neither bucket; confirm that
# this is intended.
lines_delete = first_line[no_speaker & (first_line.cap_first_by_total > .75)].text.tolist()

# `isin` replaces the per-row scans of two Python lists and the temporary
# doc_delete / row_delete flag columns.
keep_rows = ~briefings.doc_id.isin(docs_delete) & ~briefings.text.isin(lines_delete)
briefings = briefings[keep_rows]

# The capitalisation helper columns are not needed beyond this point.
briefings = briefings.drop(columns=['total_words', 'first_letter_up', 'cap_first_by_total'])

briefings = briefings.reset_index(drop=True)

print(f'Table shape after dropping rows: {briefings.shape}')

Table shape before dropping rows: (48898, 10)
Table shape after dropping rows: (48330, 8)


In [186]:
# Recompute each briefing's first row now that rows have been removed.
briefings_by_doc = briefings.groupby('doc_id', as_index=False)
first_line = briefings_by_doc.first()

In [191]:
# First lines that still contain no ':' after the deletions.
first_line[~first_line.text.str.contains(':')]

Unnamed: 0,doc_id,title,date,text,link,total_words,first_letter_up,cap_first_by_total
231,5652,Press Gaggle by Press Secretary Sarah Sanders ...,2017-08-22,"Aboard Air Force One En Route Yuma, Arizona",https://www.whitehouse.gov/briefings-statement...,8,8,1.0


In [196]:
# Drop every row whose text contains 'En Route', then renumber the rows.
mentions_en_route = briefings.text.str.contains('En Route')
briefings = briefings[~mentions_en_route].reset_index(drop=True)

In [197]:
# Final check: recompute the first rows and list any that still lack a ':'.
first_line = briefings.groupby('doc_id', as_index=False).first()
still_no_speaker = ~first_line.text.str.contains(':')
first_line[still_no_speaker]

Unnamed: 0,doc_id,title,date,text,link,total_words,first_letter_up,cap_first_by_total


In [198]:
# Last row of each briefing document.
briefings_by_doc = briefings.groupby('doc_id', as_index=False)
last_line = briefings_by_doc.last()

In [203]:
# Closing lines with neither a ':' nor a thank-you. They read as ordinary speech,
# so remarks and briefings are considered clean at this point.
has_colon_or_thanks = last_line.text.str.contains(':|[Tt]hank')
last_line[~has_colon_or_thanks]

Unnamed: 0,doc_id,title,date,text,link,total_words,first_letter_up,cap_first_by_total
0,60,Press Briefing by Press Secretary Kayleigh McE...,2020-12-16,Really interesting turn of events and good for...,https://www.whitehouse.gov/briefings-statement...,21,2,0.095238
4,267,Press Briefing by Press Secretary Kayleigh McE...,2020-10-01,Ideas do no target police officers. Ideas do n...,https://www.whitehouse.gov/briefings-statement...,38,9,0.236842
5,302,Press Briefing by Press Secretary Kayleigh McE...,2020-09-24,"Our police officers deserve our respect, and t...",https://www.whitehouse.gov/briefings-statement...,59,5,0.084746
22,592,Press Briefing by Press Secretary Kayleigh McE...,2020-07-21,"And Dr. Birx, for weeks, has been sending out ...",https://www.whitehouse.gov/briefings-statement...,74,9,0.121622
30,690,Press Briefing by Press Secretary Kayleigh McE...,2020-06-29,"It is inexcusable, the failed Russia reporting...",https://www.whitehouse.gov/briefings-statement...,31,13,0.419355
39,825,Press Briefing by Press Secretary Kayleigh McE...,2020-05-23,It’s a long weekend. You guys have three days ...,https://www.whitehouse.gov/briefings-statement...,43,8,0.186047
46,949,"Remarks by President Trump, Vice President Pen...",2020-04-20,So it’s a very good question. I appreciate it....,https://www.whitehouse.gov/briefings-statement...,17,4,0.235294
52,1164,Press Briefing by Vice President Pence and Mem...,2020-03-06,"So although your numbers are correct, it isn’t...",https://www.whitehouse.gov/briefings-statement...,15,2,0.133333
56,1270,Press Briefing by Acting OMB Director Russ Vought,2020-02-10,We’re going to keep proposing these types of b...,https://www.whitehouse.gov/briefings-statement...,34,2,0.058824
152,4659,Remarks by Vice President Pence in Press Gaggl...,2018-02-08,The backdrop of his discussions with North Kor...,https://www.whitehouse.gov/briefings-statement...,67,12,0.179104


In [3]:
# Reload the cleaned tables from the pickup files.
remarks, briefings = (
    pd.read_csv(csv_name) for csv_name in ('remarks-pickup.csv', 'briefings-pickup.csv')
)

In [4]:
# Parse the date column of both tables into datetimes.
for pickup_table in (remarks, briefings):
    pickup_table['date'] = pd.to_datetime(pickup_table['date'])

# Making a transcript-style table

In [5]:
# Build the transcript-style tables with the project helper
# clean_wh.create_transcript_table (remarks first, then briefings).
new_remarks, new_briefings = (
    clean_wh.create_transcript_table(cleaned_table)
    for cleaned_table in (remarks, briefings)
)

In [6]:
# First row of each document in the two transcript tables.
remarks_first, briefings_first = (
    transcript.groupby('doc_id', as_index=False).first()
    for transcript in (new_remarks, new_briefings)
)

In [7]:
# Remarks documents whose first transcript row has no speaker.
remarks_first[remarks_first.speaker.isnull()]

Unnamed: 0,doc_id,title,date,link,speaker,text,original_text
169,174,Statement from the Press Secretary Regarding S...,2020-07-08,https://www.whitehouse.gov/briefings-statement...,,"Since Day One, the Trump Administration has so...","Since Day One, the Trump Administration has so..."
174,179,Remarks by Vice President in Briefing with Gov...,2020-07-02,https://www.whitehouse.gov/briefings-statement...,,"Governor, the President wanted me to be here t...","Governor, the President wanted me to be here t..."
719,731,Remarks by White House Senior Advisor Jared Ku...,2019-07-04,https://www.whitehouse.gov/briefings-statement...,,"Thank you, very much. And thank you to everyon...","Thank you, very much. And thank you to everyon..."
1051,1067,Remarks by Vice President Pence at the 2018 AP...,2018-11-16,https://www.whitehouse.gov/briefings-statement...,,"Thank you, Scott. And thank you all for that w...","Thank you, Scott. And thank you all for that w..."
1200,1220,Remarks by Vice President Pence at Ministerial...,2018-07-26,https://www.whitehouse.gov/briefings-statement...,,"Thank you, Secretary Pompeo. To the Secretary,...","Thank you, Secretary Pompeo. To the Secretary,..."
1793,1829,Remarks by the Vice President at Grove City Co...,2017-05-20,https://www.whitehouse.gov/briefings-statement...,,"Grove City College Grove City, Pennsylvania TH...","Grove City College Grove City, Pennsylvania TH..."
1935,1975,President Trump and Prime Minister May’s Openi...,2017-01-27,https://www.whitehouse.gov/briefings-statement...,,President Donald J. Trump: “Thank you very muc...,President Donald J. Trump: “Thank you very muc...
1937,1977,The Inaugural Address,2017-01-20,https://www.whitehouse.gov/briefings-statement...,,"Chief Justice Roberts, President Carter, Presi...","Chief Justice Roberts, President Carter, Presi..."


In [8]:
# Briefing documents whose first transcript row has no speaker.
briefings_first[briefings_first.speaker.isnull()]

Unnamed: 0,doc_id,title,date,link,total_words,first_letter_up,cap_first_by_total,speaker,text,original_text


In [9]:
# Remove the remarks documents whose first transcript row has no speaker.
drop_docs = remarks_first[remarks_first.speaker.isnull()].doc_id.tolist()

# `isin` replaces the per-row `x in drop_docs` list scan and the temporary
# drop_docs flag column.
new_remarks = new_remarks[~new_remarks.doc_id.isin(drop_docs)].reset_index(drop=True)

In [10]:
# Index labels of rows whose text contains sentence-ending punctuation, one
# whitespace character, and then a run of capitals / 'c' / '.' / spaces ending in
# ':'. Presumably this is a second speaker label embedded mid-row; the rows are
# handed to expand_table_mistakes in a later cell.
# Raw string so `\s` is a regex escape rather than an invalid Python string escape.
MIDLINE_SPEAKER_PATTERN = r'[.?!]\s[A-Zc. ]+:'

remarks_rows_change = new_remarks[
    new_remarks.text.str.contains(MIDLINE_SPEAKER_PATTERN)
].index.tolist()
briefings_rows_change = new_briefings[
    new_briefings.text.str.contains(MIDLINE_SPEAKER_PATTERN)
].index.tolist()
print(len(remarks_rows_change))
print(len(briefings_rows_change))

86
76


In [14]:
# Pass each transcript table, together with its flagged row labels, to the
# project helper clean_wh.expand_table_mistakes.
remarks_final = clean_wh.expand_table_mistakes(
    df=new_remarks,
    rows_of_focus=remarks_rows_change,
)
briefings_final = clean_wh.expand_table_mistakes(
    df=new_briefings,
    rows_of_focus=briefings_rows_change,
)

In [2]:
# Reload the finished transcript tables from disk.
remarks_final, briefings_final = (
    pd.read_csv(csv_name)
    for csv_name in ('remarks-whitehouse-ready.csv', 'briefings-whitehouse-ready.csv')
)

In [3]:
# Renumber documents so ids are consecutive and unique across both tables:
# remarks get 0..n-1 in order of first appearance, and briefings continue from n.
unique_remark_ids = remarks_final.doc_id.unique().tolist()
unique_briefing_ids = briefings_final.doc_id.unique().tolist()

new_remark_ids = {old_id: new_id for new_id, old_id in enumerate(unique_remark_ids)}

# The first briefing id is simply the number of remarks documents. Using len()
# avoids recomputing max() over the whole mapping on every iteration and also
# works when the remarks table is empty (max() of an empty sequence raises).
briefing_offset = len(unique_remark_ids)
new_briefing_ids = {
    old_id: briefing_offset + position
    for position, old_id in enumerate(unique_briefing_ids)
}

# Assign the result back explicitly: `frame.col.replace(..., inplace=True)` is a
# chained in-place write that does not update the frame under pandas
# copy-on-write.
remarks_final['doc_id'] = remarks_final['doc_id'].replace(new_remark_ids)
briefings_final['doc_id'] = briefings_final['doc_id'].replace(new_briefing_ids)

In [4]:
# Fill each missing speaker with the most recent non-null speaker above it.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
# NOTE(review): the fill runs over the whole table and is not bounded by
# doc_id, so a document starting with a null speaker would inherit the last
# speaker of the previous document — confirm that case cannot occur.
remarks_final['speaker'] = remarks_final['speaker'].ffill()

briefings_final['speaker'] = briefings_final['speaker'].ffill()

In [5]:
# Run clean_wh.condense_table_by_speaker on each table (remarks first, then
# briefings). The helper is defined in clean_white_house.py, outside this
# notebook; the recorded shapes below show fewer rows and one fewer column
# after the call.
final_remarks, final_briefings = (
    clean_wh.condense_table_by_speaker(table)
    for table in (remarks_final, briefings_final)
)

In [6]:
# Report table shapes before and after condensing, one table per print call.
print(
    f'Remarks table before condensing speakers: {remarks_final.shape}',
    f'Remarks table after condensing speakers: {final_remarks.shape}',
    sep='\n',
)
print(
    f'\nBriefings table before condensing speakers: {briefings_final.shape}',
    f'Briefings table after condensing speakers: {final_briefings.shape}',
    sep='\n',
)

Remarks table before condensing speakers: (142832, 7)
Remarks table after condensing speakers: (56132, 6)

Briefings table before condensing speakers: (48409, 7)
Briefings table after condensing speakers: (30612, 6)


In [7]:
# Preview the first ten condensed remarks rows.
final_remarks.head(n=10)

Unnamed: 0,title,date,doc_id,speaker,text,link
0,Remarks by Press Secretary Kayleigh McEnany,2021-01-07,0.0,MS MCENANY,I am here to deliver this message on behalf o...,https://www.whitehouse.gov/briefings-statement...
1,Remarks by Vice President Pence at the Space F...,2020-12-18,1.0,THE VICE PRESIDENT,"Well, thank you all for being here today. To ...",https://www.whitehouse.gov/briefings-statement...
2,Remarks by Vice President Pence at a Safe and ...,2020-12-18,2.0,THE VICE PRESIDENT,"Well, good morning. And thank you all for bei...",https://www.whitehouse.gov/briefings-statement...
3,Remarks by Vice President Pence at a Life is W...,2020-12-16,3.0,THE VICE PRESIDENT,Thank you. Thank you all. Thank you all very ...,https://www.whitehouse.gov/briefings-statement...
4,Remarks by Vice President Pence at a Roundtabl...,2020-12-15,4.0,THE VICE PRESIDENT,"Well, let me — let me let — let me let people...",https://www.whitehouse.gov/briefings-statement...
5,Remarks by Vice President Pence at a Roundtabl...,2020-12-15,4.0,DR REDFIELD,"Thank you very much, Mr. Vice President. And ...",https://www.whitehouse.gov/briefings-statement...
6,Remarks by Vice President Pence at a Roundtabl...,2020-12-15,4.0,THE VICE PRESIDENT,"Well, thank you, Dr. Redfield. And before we ...",https://www.whitehouse.gov/briefings-statement...
7,Remarks by Vice President Pence at a Roundtabl...,2020-12-15,4.0,DR REDFIELD,"Yes, sir.",https://www.whitehouse.gov/briefings-statement...
8,Remarks by Vice President Pence at a Roundtabl...,2020-12-15,4.0,THE VICE PRESIDENT,— in terms of its ability to truly prevent th...,https://www.whitehouse.gov/briefings-statement...
9,Remarks by Vice President Pence at Roundtable ...,2020-12-10,5.0,THE VICE PRESIDENT,"Well, thank you very much, Governor McMaster....",https://www.whitehouse.gov/briefings-statement...


In [8]:
# Preview the first ten condensed briefings rows.
final_briefings.head(n=10)

Unnamed: 0,title,date,doc_id,speaker,text,link
0,Press Briefing by Press Secretary Kayleigh McE...,2020-12-16,1930.0,MS MCENANY,"Hello, everyone. Good afternoon. Yesterday, t...",https://www.whitehouse.gov/briefings-statement...
1,Press Briefing by Press Secretary Kayleigh McE...,2020-12-16,1930.0,PRESS CORPS,"Kayleigh, now that the Electoral College has ...",https://www.whitehouse.gov/briefings-statement...
2,Press Briefing by Press Secretary Kayleigh McE...,2020-12-16,1930.0,MS MCENANY,The President is still involved in ongoing li...,https://www.whitehouse.gov/briefings-statement...
3,Press Briefing by Press Secretary Kayleigh McE...,2020-12-16,1930.0,PRESS CORPS,"What was his reaction to Leader McConnell, to...",https://www.whitehouse.gov/briefings-statement...
4,Press Briefing by Press Secretary Kayleigh McE...,2020-12-16,1930.0,MS MCENANY,I haven’t gotten the President’s reaction to ...,https://www.whitehouse.gov/briefings-statement...
5,Press Briefing by Press Secretary Kayleigh McE...,2020-12-16,1930.0,PRESS CORPS,And what is the path forward though for litig...,https://www.whitehouse.gov/briefings-statement...
6,Press Briefing by Press Secretary Kayleigh McE...,2020-12-16,1930.0,MS MCENANY,The campaign would have more specifics for yo...,https://www.whitehouse.gov/briefings-statement...
7,Press Briefing by Press Secretary Kayleigh McE...,2020-12-16,1930.0,PRESS CORPS,Does the President plan to take the vaccine? ...,https://www.whitehouse.gov/briefings-statement...
8,Press Briefing by Press Secretary Kayleigh McE...,2020-12-16,1930.0,MS MCENANY,"So the President, currently at this moment, h...",https://www.whitehouse.gov/briefings-statement...
9,Press Briefing by Press Secretary Kayleigh McE...,2020-12-16,1930.0,PRESS CORPS,Wouldn’t him taking the vaccine set an exampl...,https://www.whitehouse.gov/briefings-statement...


In [9]:
# Stack remarks on top of briefings into one table. `ignore_index=True`
# renumbers the rows 0..n-1 during the concat, so no separate in-place
# reset_index step is needed.
whitehouse = pd.concat([final_remarks, final_briefings], ignore_index=True)

In [10]:
# Show a bounded preview of the combined table rather than the whole frame.
whitehouse.head(10)

Unnamed: 0,title,date,doc_id,speaker,text,link
0,Remarks by Press Secretary Kayleigh McEnany,2021-01-07,0.0,MS MCENANY,I am here to deliver this message on behalf o...,https://www.whitehouse.gov/briefings-statement...
1,Remarks by Vice President Pence at the Space F...,2020-12-18,1.0,THE VICE PRESIDENT,"Well, thank you all for being here today. To ...",https://www.whitehouse.gov/briefings-statement...
2,Remarks by Vice President Pence at a Safe and ...,2020-12-18,2.0,THE VICE PRESIDENT,"Well, good morning. And thank you all for bei...",https://www.whitehouse.gov/briefings-statement...
3,Remarks by Vice President Pence at a Life is W...,2020-12-16,3.0,THE VICE PRESIDENT,Thank you. Thank you all. Thank you all very ...,https://www.whitehouse.gov/briefings-statement...
4,Remarks by Vice President Pence at a Roundtabl...,2020-12-15,4.0,THE VICE PRESIDENT,"Well, let me — let me let — let me let people...",https://www.whitehouse.gov/briefings-statement...
5,Remarks by Vice President Pence at a Roundtabl...,2020-12-15,4.0,DR REDFIELD,"Thank you very much, Mr. Vice President. And ...",https://www.whitehouse.gov/briefings-statement...
6,Remarks by Vice President Pence at a Roundtabl...,2020-12-15,4.0,THE VICE PRESIDENT,"Well, thank you, Dr. Redfield. And before we ...",https://www.whitehouse.gov/briefings-statement...
7,Remarks by Vice President Pence at a Roundtabl...,2020-12-15,4.0,DR REDFIELD,"Yes, sir.",https://www.whitehouse.gov/briefings-statement...
8,Remarks by Vice President Pence at a Roundtabl...,2020-12-15,4.0,THE VICE PRESIDENT,— in terms of its ability to truly prevent th...,https://www.whitehouse.gov/briefings-statement...
9,Remarks by Vice President Pence at Roundtable ...,2020-12-10,5.0,THE VICE PRESIDENT,"Well, thank you very much, Governor McMaster....",https://www.whitehouse.gov/briefings-statement...


In [12]:
# Write the combined table to CSV without the row index column.
whitehouse.to_csv(
    path_or_buf='whitehouse-for-preprocessing.csv',
    index=False,
)