In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import re
import clean_preprocess_white_house as cpp_wh
from collections import defaultdict

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

import spacy
nlp = spacy.load('en', disable=['parser', 'ner'])

from textblob import TextBlob, Word

pd.set_option('mode.chained_assignment', None)

ImportError: cannot import name 'MapWrapper'

In [26]:
def lemmatize_with_postag_wordblob(text):
    sent = TextBlob(text)
    tag_dict = {'J':'a', 
                'N':'n', 
                'V':'v',
                'R':'r'}
    words_and_tags = [(w, tag_dict.get(pos[0], 'n') for w, pos in sent.tags)]
    lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
    return lemmatized_list

SyntaxError: invalid syntax (<ipython-input-26-061961052581>, line 7)

In [2]:
whitehouse = pd.read_csv('whitehouse-for-preprocessing.csv')
print(whitehouse.shape)

(86744, 6)


In [3]:
whitehouse.head()

Unnamed: 0,title,date,doc_id,speaker,text,link
0,Remarks by Press Secretary Kayleigh McEnany,2021-01-07,0.0,MS MCENANY,I am here to deliver this message on behalf o...,https://www.whitehouse.gov/briefings-statement...
1,Remarks by Vice President Pence at the Space F...,2020-12-18,1.0,THE VICE PRESIDENT,"Well, thank you all for being here today. To ...",https://www.whitehouse.gov/briefings-statement...
2,Remarks by Vice President Pence at a Safe and ...,2020-12-18,2.0,THE VICE PRESIDENT,"Well, good morning. And thank you all for bei...",https://www.whitehouse.gov/briefings-statement...
3,Remarks by Vice President Pence at a Life is W...,2020-12-16,3.0,THE VICE PRESIDENT,Thank you. Thank you all. Thank you all very ...,https://www.whitehouse.gov/briefings-statement...
4,Remarks by Vice President Pence at a Roundtabl...,2020-12-15,4.0,THE VICE PRESIDENT,"Well, let me — let me let — let me let people...",https://www.whitehouse.gov/briefings-statement...


In [4]:
whitehouse.date = pd.to_datetime(whitehouse.date)

In [5]:
whitehouse.isnull().sum()

title      0
date       0
doc_id     0
speaker    0
text       0
link       0
dtype: int64

# Cleaning speakers

The last bit of cleaning that I'm going to do is clean up the speakers column. There are speakers name's repeated in different forms that will be cleaned up. I also tried to organize some of the names the are "Mr/Ms/Mrs (name)" but didn't get to organize them completely for the sake of time. 

In [6]:
whitehouse.speaker = whitehouse.speaker.str.strip()
whitehouse.text = whitehouse.text.str.strip()

In [7]:
len(whitehouse.speaker.unique())

2496

In [8]:
# changing repeating names to one name so that all instances of a speaker are under one name
whitehouse.speaker = cpp_wh.conslidate_speaker_names(whitehouse.speaker)

In [9]:
whitehouse = whitehouse[whitehouse.speaker != 'CLARIFICATION']

whitehouse.reset_index(drop=True, inplace=True)

In [10]:
len(whitehouse.speaker.unique())

1378

In [11]:
whitehouse.head()

Unnamed: 0,title,date,doc_id,speaker,text,link
0,Remarks by Press Secretary Kayleigh McEnany,2021-01-07,0.0,KAYLEIGH MCENANY,I am here to deliver this message on behalf of...,https://www.whitehouse.gov/briefings-statement...
1,Remarks by Vice President Pence at the Space F...,2020-12-18,1.0,MIKE PENCE,"Well, thank you all for being here today. To A...",https://www.whitehouse.gov/briefings-statement...
2,Remarks by Vice President Pence at a Safe and ...,2020-12-18,2.0,MIKE PENCE,"Well, good morning. And thank you all for bein...",https://www.whitehouse.gov/briefings-statement...
3,Remarks by Vice President Pence at a Life is W...,2020-12-16,3.0,MIKE PENCE,Thank you. Thank you all. Thank you all very m...,https://www.whitehouse.gov/briefings-statement...
4,Remarks by Vice President Pence at a Roundtabl...,2020-12-15,4.0,MIKE PENCE,"Well, let me — let me let — let me let people ...",https://www.whitehouse.gov/briefings-statement...


In [12]:
# so got the number of speakers down by over 1000, 
# could work on making that even smaller at a later time
whitehouse.speaker.value_counts().head(15)

PRESS CORPS                        28512
DONALD TRUMP                       26190
SARAH SANDERS                       5177
SEAN SPICER                         3427
SENIOR ADMINISTRATION OFFICIALS     3416
MIKE PENCE                          2561
KAYLEIGH MCENANY                    1682
FOREIGN GOVERNMENT LEADERS          1225
AUDIENCE                            1217
MILITARY FIGURE                      956
CONGRESSPERSONS                      756
STATE SENATORS                       676
MILITARY FIGURES                     632
DOCTOR, MEDICAL OR PHD               523
FOREIGN AMBASSADORS                  421
Name: speaker, dtype: int64

In [13]:
whitehouse[whitehouse.text == ''].shape

(699, 6)

In [14]:
whitehouse.text.replace('', np.nan, inplace=True)

whitehouse.isnull().sum()

title        0
date         0
doc_id       0
speaker      0
text       699
link         0
dtype: int64

In [15]:
whitehouse.dropna(subset=['text'], inplace=True)
whitehouse.reset_index(drop=True, inplace=True)

# Preprocessing text

In [16]:
whitehouse.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86044 entries, 0 to 86043
Data columns (total 6 columns):
title      86044 non-null object
date       86044 non-null datetime64[ns]
doc_id     86044 non-null float64
speaker    86044 non-null object
text       86044 non-null object
link       86044 non-null object
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 3.9+ MB


In [17]:
whitehouse.shape

(86044, 6)

In [18]:
whitehouse.tail()

Unnamed: 0,title,date,doc_id,speaker,text,link
86039,Press Briefing by Press Secretary Sean Spicer,2017-02-01,2275.0,PRESS CORPS,The request from Democrats to investigate Flyn...,https://www.whitehouse.gov/briefings-statement...
86040,Press Briefing by Press Secretary Sean Spicer,2017-02-01,2275.0,SEAN SPICER,"Yeah, so General Flynn, like I think probably ...",https://www.whitehouse.gov/briefings-statement...
86041,Press Briefing by Press Secretary Sean Spicer,2017-02-01,2275.0,PRESS CORPS,"Thanks, Sean. General Flynn, when he was up he...",https://www.whitehouse.gov/briefings-statement...
86042,Press Briefing by Press Secretary Sean Spicer,2017-02-01,2275.0,SEAN SPICER,I think in area where there’s going to be a sh...,https://www.whitehouse.gov/briefings-statement...
86043,Statement by Press Secretary Sean Spicer,2017-01-21,2276.0,SEAN SPICER,Good evening. Thank you guys for coming. I kno...,https://www.whitehouse.gov/briefings-statement...


In [19]:
# tokenizing texts 

whitehouse['text_tokens'] = whitehouse.text.map(cpp_wh.tokenize_data)

In [20]:
whitehouse.head()

Unnamed: 0,title,date,doc_id,speaker,text,link,text_tokens
0,Remarks by Press Secretary Kayleigh McEnany,2021-01-07,0.0,KAYLEIGH MCENANY,I am here to deliver this message on behalf of...,https://www.whitehouse.gov/briefings-statement...,"[here, deliver, this, message, behalf, the, en..."
1,Remarks by Vice President Pence at the Space F...,2020-12-18,1.0,MIKE PENCE,"Well, thank you all for being here today. To A...",https://www.whitehouse.gov/briefings-statement...,"[well, thank, you, all, for, being, here, toda..."
2,Remarks by Vice President Pence at a Safe and ...,2020-12-18,2.0,MIKE PENCE,"Well, good morning. And thank you all for bein...",https://www.whitehouse.gov/briefings-statement...,"[well, good, morning, and, thank, you, all, fo..."
3,Remarks by Vice President Pence at a Life is W...,2020-12-16,3.0,MIKE PENCE,Thank you. Thank you all. Thank you all very m...,https://www.whitehouse.gov/briefings-statement...,"[thank, you, thank, you, all, thank, you, all,..."
4,Remarks by Vice President Pence at a Roundtabl...,2020-12-15,4.0,MIKE PENCE,"Well, let me — let me let — let me let people ...",https://www.whitehouse.gov/briefings-statement...,"[well, let, let, let, let, let, people, hear, ..."


In [21]:
# going to test a few different lemmatizers to see which I prefer