# Political Ads EDA

In [1]:
import pprint
import pickle
import pandas as pd
from datetime import datetime
import numpy as np
from datetime import datetime
import string
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

pd.set_option("display.max_columns", 2000)
pd.set_option("display.max_rows", 2000)

In [4]:
# Load the raw ad-airing records; the three timestamp columns are parsed
# to datetime at read time so they can be compared and subtracted later.

all_df = pd.read_csv(
    'data/political_ads.csv',
    parse_dates=['start_time', 'end_time', 'date_created'],
)

In [5]:
all_df.head(2)

Unnamed: 0,id,wp_identifier,network,location,program,program_type,start_time,end_time,archive_id,embed_url,sponsors,sponsor_types,race,cycle,subjects,candidates,type,message,date_created
0,1,5643,KLAS,"Las Vegas, NV",8 News Now at 5 PM,news,2016-09-09 00:12:59,2016-09-09 00:13:29,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32
1,2,5643,WNCN,"Raleigh-Durham-Fayetteville, NC",North Carolina News at 500PM,news,2016-09-06 21:58:25,2016-09-06 21:58:55,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32


In [6]:
# Save the frame as a pickle so the parsed datetime columns keep their dtype
# (a CSV round-trip would require parsing the date columns again).

all_df.to_pickle('data/all_political_ads.pickle')

# To load it back: var_name = pd.read_pickle('data/FILENAME.pickle')
# Only unpickle files you created yourself -- loading a pickle can execute arbitrary code.

In [7]:
all_df.dtypes

id                        int64
wp_identifier             int64
network                  object
location                 object
program                  object
program_type             object
start_time       datetime64[ns]
end_time         datetime64[ns]
archive_id               object
embed_url                object
sponsors                 object
sponsor_types            object
race                     object
cycle                   float64
subjects                 object
candidates               object
type                     object
message                  object
date_created     datetime64[ns]
dtype: object

In [8]:
all_df.shape

(364718, 19)

In [9]:
all_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 364718 entries, 0 to 364717
Data columns (total 19 columns):
id               364718 non-null int64
wp_identifier    364718 non-null int64
network          364718 non-null object
location         364693 non-null object
program          364704 non-null object
program_type     364718 non-null object
start_time       364718 non-null datetime64[ns]
end_time         364718 non-null datetime64[ns]
archive_id       364718 non-null object
embed_url        364718 non-null object
sponsors         363556 non-null object
sponsor_types    363556 non-null object
race             334846 non-null object
cycle            335617 non-null float64
subjects         343932 non-null object
candidates       339591 non-null object
type             364718 non-null object
message          364718 non-null object
date_created     364718 non-null datetime64[ns]
dtypes: datetime64[ns](3), float64(1), int64(2), object(13)
memory usage: 52.9+ MB


In [10]:
all_df.describe()

Unnamed: 0,id,wp_identifier,cycle
count,364718.0,364718.0,335617.0
mean,237070.526862,4078.856336,2016.0
std,139393.64793,2709.646714,0.0
min,1.0,232.0,2016.0
25%,114419.25,1213.0,2016.0
50%,222862.5,4685.0,2016.0
75%,372456.75,6714.0,2016.0
max,464741.0,8404.0,2016.0


In [11]:
all_df.columns

Index(['id', 'wp_identifier', 'network', 'location', 'program', 'program_type',
       'start_time', 'end_time', 'archive_id', 'embed_url', 'sponsors',
       'sponsor_types', 'race', 'cycle', 'subjects', 'candidates', 'type',
       'message', 'date_created'],
      dtype='object')

In [12]:
# number of unique items per feature
#
# "{:<20s}" left-aligns the column name in 20 characters and "{:>6.0f}"
# right-aligns the count in 6 characters, which keeps the numbers lined up.

row_template = "{:<20s} {:>6.0f}"
for column in all_df.columns:
    distinct_count = all_df[column].nunique()
    print(row_template.format(column, distinct_count))

id                   364718
wp_identifier          1304
network                 130
location                 25
program                3124
program_type              2
start_time           352660
end_time             352938
archive_id             1304
embed_url              1304
sponsors                197
sponsor_types            11
race                     28
cycle                     1
subjects                916
candidates              149
type                      2
message                   5
date_created          17705


- Judging by the unique-value counts (e.g., 28 distinct `race` values), the database contains ads for many races, not just the presidential one.

In [13]:
unique_categories = ['network',
                     'location',
                     'program_type',
                     'sponsor_types',
                     'race',
                     'candidates',
                     'type',
                     'message'
                    ]

In [14]:
# Print every distinct value of each selected column, one labelled section per column.
for category in unique_categories:
    distinct_values = all_df[category].unique()
    print(category, '\n')
    print(distinct_values)
    print('\n')

network 

['KLAS' 'WNCN' 'WFLA' 'KSNV' 'WTVT' 'WTOG' 'WLFL' 'WOIO' 'WRAL' 'WEWS'
 'WRAZ' 'KVVU' 'KCRG' 'WFTS' 'KFXA' 'KGAN' 'KTNV' 'WTXF' 'WJW' 'WKYC'
 'WCAU' 'KYW' 'WPVI' 'KUSA' 'WTMJ' 'KNTV' 'KOFY' 'CNNW' 'MSNBCW' 'KPNX'
 'WHDH' 'FOXNEWSW' 'CSPAN' 'WMUR' 'WBZ' 'WFXT' 'WCVB' 'KQED' 'FBC' 'KMTP'
 'KRON' 'KTVU' 'KSAZ' 'WISN' 'KGO' 'KNXV' 'KPHO' 'LINKTV' 'WITI' 'WDJT'
 'CNBC' 'KMGH' 'KCNC' 'WUSA' 'KDVR' 'WTTG' 'CSPAN2' 'CSPAN3' 'WJLA'
 'BLOOMBERG' 'KCSM' 'KPIX' 'WKMG' 'ALJAZAM' 'KCAU' 'COM' 'WABC' 'KDTV'
 'WNYW' 'KSTS' 'WRC' 'KTSF' 'WNBC' 'KOLO' 'WUVP' 'WCBS' 'KMEG' 'KTVN'
 'WKRC' 'WHO' 'KRNV' 'WVEC' 'WOI' 'WSPA' 'WXIX' 'WCPO' 'WLWT' 'WAVY' 'WVBT'
 'WTKR' 'WHNS' 'WYFF' 'KTIV' 'KWWL' 'KCCI' 'WLOS' 'KRXI' 'KDSM' 'KPTH'
 'KRDO' 'WPLG' 'WCNC' 'WTVJ' 'FRANCE24' 'WSVN' 'WBTV' 'WTSP' 'WIS' 'WFTV'
 'WLTX' 'WTVD' 'WOLO' 'WSET' 'WOFL' 'WSOC' 'WFOR' 'WSLS' 'WACH' 'WESH'
 'WJZY' 'WDBJ' 'WFXR' 'KKTV' 'KOAA' 'KXRM' 'KQEH' 'BBCAMERICA' 'KCNS'
 'BETW' 'BBCNEWS']


location 

['Las Vegas, NV' 'Raleigh-Du

In [15]:
all_df.head(2)

Unnamed: 0,id,wp_identifier,network,location,program,program_type,start_time,end_time,archive_id,embed_url,sponsors,sponsor_types,race,cycle,subjects,candidates,type,message,date_created
0,1,5643,KLAS,"Las Vegas, NV",8 News Now at 5 PM,news,2016-09-09 00:12:59,2016-09-09 00:13:29,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32
1,2,5643,WNCN,"Raleigh-Durham-Fayetteville, NC",North Carolina News at 500PM,news,2016-09-06 21:58:25,2016-09-06 21:58:55,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32


### Coding Plan I

- filter the data set down to ads that aired on or after 8/1/2016
- keep only ads for the presidential race

In [16]:
# using a copy to be safe

filter_df = all_df.copy()
filter_df.head(2)

Unnamed: 0,id,wp_identifier,network,location,program,program_type,start_time,end_time,archive_id,embed_url,sponsors,sponsor_types,race,cycle,subjects,candidates,type,message,date_created
0,1,5643,KLAS,"Las Vegas, NV",8 News Now at 5 PM,news,2016-09-09 00:12:59,2016-09-09 00:13:29,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32
1,2,5643,WNCN,"Raleigh-Durham-Fayetteville, NC",North Carolina News at 500PM,news,2016-09-06 21:58:25,2016-09-06 21:58:55,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32


In [17]:
# Keep only ads whose airing started on or after 8/1/2016 (drops everything earlier).

on_or_after_aug_1 = filter_df['start_time'] >= datetime(2016, 8, 1, 0, 0)
filter_df = filter_df[on_or_after_aug_1]
filter_df.head(2)

Unnamed: 0,id,wp_identifier,network,location,program,program_type,start_time,end_time,archive_id,embed_url,sponsors,sponsor_types,race,cycle,subjects,candidates,type,message,date_created
0,1,5643,KLAS,"Las Vegas, NV",8 News Now at 5 PM,news,2016-09-09 00:12:59,2016-09-09 00:13:29,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32
1,2,5643,WNCN,"Raleigh-Durham-Fayetteville, NC",North Carolina News at 500PM,news,2016-09-06 21:58:25,2016-09-06 21:58:55,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32


In [18]:
filter_df.shape

(201445, 19)

In [19]:
filter_df['start_time'].min(), filter_df['start_time'].max() 

# odd that they had ads after election day (runoff elections e.g., Louisiana) -- will filter out

(Timestamp('2016-08-01 00:35:22'), Timestamp('2016-11-29 18:44:37'))

In [20]:
# Keep only airings that started on or before election day (2016-11-08).
# A half-open bound (< midnight on 11/9) is used instead of <= 23:59:59 so that
# a timestamp carrying fractional seconds in the last second of the day cannot
# be dropped by accident.
# NOTE(review): the bound is applied to 'start_time' exactly as stored; if those
# stamps are UTC rather than local time, the cutoff is UTC midnight -- confirm.

filter_df = filter_df[filter_df['start_time'] < datetime(2016, 11, 9)]
filter_df.head(2)

Unnamed: 0,id,wp_identifier,network,location,program,program_type,start_time,end_time,archive_id,embed_url,sponsors,sponsor_types,race,cycle,subjects,candidates,type,message,date_created
0,1,5643,KLAS,"Las Vegas, NV",8 News Now at 5 PM,news,2016-09-09 00:12:59,2016-09-09 00:13:29,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32
1,2,5643,WNCN,"Raleigh-Durham-Fayetteville, NC",North Carolina News at 500PM,news,2016-09-06 21:58:25,2016-09-06 21:58:55,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32


In [21]:
filter_df.shape

(201283, 19)

In [22]:
filter_df['start_time'].min(), filter_df['start_time'].max() 

(Timestamp('2016-08-01 00:35:22'), Timestamp('2016-11-08 23:58:46'))

In [23]:
# Restrict the frame to rows tagged as the presidential race.

is_presidential = filter_df['race'].eq('PRES')
filter_df = filter_df[is_presidential]
filter_df.head(2)

Unnamed: 0,id,wp_identifier,network,location,program,program_type,start_time,end_time,archive_id,embed_url,sponsors,sponsor_types,race,cycle,subjects,candidates,type,message,date_created
0,1,5643,KLAS,"Las Vegas, NV",8 News Now at 5 PM,news,2016-09-09 00:12:59,2016-09-09 00:13:29,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32
1,2,5643,WNCN,"Raleigh-Durham-Fayetteville, NC",North Carolina News at 500PM,news,2016-09-06 21:58:25,2016-09-06 21:58:55,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32


In [24]:
# confirming that mask worked

filter_df['race'].unique()

array(['PRES'], dtype=object)

In [25]:
filter_df.shape

(85152, 19)

### Coding Plan 2

- create `pres_df` and pickle from `filter_df` copy
- new features: `ad_length`, `time_from_creation` (in days)
- see whether separate `metro_area` and `state` features can be created by splitting `location`
- `day_of_week` feature from ad air time stamp
- note potential gotcha on timestamps -- are times recorded in UTC or in the local time zone?
- groupbys

In [26]:
raw_pres_df = filter_df.copy()

In [28]:
raw_pres_df.to_pickle('data/pres_raw_data.pickle')

In [29]:
pres_df = raw_pres_df.copy()

In [30]:
pres_df.head(2)

Unnamed: 0,id,wp_identifier,network,location,program,program_type,start_time,end_time,archive_id,embed_url,sponsors,sponsor_types,race,cycle,subjects,candidates,type,message,date_created
0,1,5643,KLAS,"Las Vegas, NV",8 News Now at 5 PM,news,2016-09-09 00:12:59,2016-09-09 00:13:29,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32
1,2,5643,WNCN,"Raleigh-Durham-Fayetteville, NC",North Carolina News at 500PM,news,2016-09-06 21:58:25,2016-09-06 21:58:55,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32


In [31]:
# 'length_of_ad' feature, step 1: duration of each airing as a timedelta (end minus start)

aired_from = pres_df['start_time']
aired_until = pres_df['end_time']
pres_df['ad_length_tmp'] = aired_until - aired_from
pres_df['ad_length_tmp'].head(10)

0   00:00:30
1   00:00:30
2   00:00:30
3   00:00:30
4   00:00:30
5   00:00:30
6   00:00:30
7   00:00:30
8   00:00:30
9   00:00:30
Name: ad_length_tmp, dtype: timedelta64[ns]

In [32]:
# Express the duration as a plain number of seconds (float) for readability:
# dividing a timedelta column by a one-second timedelta yields the second count.

one_second = np.timedelta64(1, 's')
pres_df['ad_length_tmp'] = pres_df['ad_length_tmp'] / one_second
pres_df['ad_length_tmp'].head(10)

0    30.0
1    30.0
2    30.0
3    30.0
4    30.0
5    30.0
6    30.0
7    30.0
8    30.0
9    30.0
Name: ad_length_tmp, dtype: float64

In [33]:
# The seconds conversion looks right, so copy the temporary column to its permanent name 'ad_length'.
# The temporary column is dropped in the next cell (done this way to practise drop; a rename would also work).

pres_df['ad_length'] = pres_df['ad_length_tmp']

In [34]:
# Remove the temporary column now that 'ad_length' holds the same values.
pres_df = pres_df.drop('ad_length_tmp', axis=1)
pres_df.head()

Unnamed: 0,id,wp_identifier,network,location,program,program_type,start_time,end_time,archive_id,embed_url,sponsors,sponsor_types,race,cycle,subjects,candidates,type,message,date_created,ad_length
0,1,5643,KLAS,"Las Vegas, NV",8 News Now at 5 PM,news,2016-09-09 00:12:59,2016-09-09 00:13:29,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0
1,2,5643,WNCN,"Raleigh-Durham-Fayetteville, NC",North Carolina News at 500PM,news,2016-09-06 21:58:25,2016-09-06 21:58:55,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0
2,3,5643,KLAS,"Las Vegas, NV",Face the Nation,news,2016-09-04 16:24:25,2016-09-04 16:24:55,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0
3,4,5643,WFLA,"Tampa-St. Petersburg, FL",Days of Our Lives,not news,2016-08-30 17:59:20,2016-08-30 17:59:50,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0
4,5,5643,KSNV,"Las Vegas, NV",The Tonight Show Starring Jimmy Fallon,news,2016-09-06 07:02:22,2016-09-06 07:02:52,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0


In [35]:
pres_df['ad_length'].describe()

count    85152.000000
mean        33.193419
std         11.203826
min         15.000000
25%         30.000000
50%         30.000000
75%         30.000000
max        184.000000
Name: ad_length, dtype: float64

In [36]:
# 'time_from_creation' feature (candidate): gap between the airing start and 'date_created'

first_aired = pres_df['start_time']
record_created = pres_df['date_created']
pres_df['time_from_creation_tmp'] = first_aired - record_created
pres_df['time_from_creation_tmp'].head(10)

0    -4 days +09:23:27
1    -6 days +07:08:53
2    -8 days +01:34:53
3   -13 days +03:09:48
4    -7 days +16:12:50
5   -13 days +05:05:42
6   -14 days +22:32:58
7    -6 days +09:55:28
8   -13 days +07:23:27
9   -14 days +15:03:29
Name: time_from_creation_tmp, dtype: timedelta64[ns]

In [37]:
pres_df['time_from_creation_tmp'].tail(10)

364039   -21 days +17:33:53
364040   -27 days +23:02:53
364041   -20 days +00:04:06
364042   -27 days +18:48:57
364043   -27 days +07:47:33
364044   -25 days +17:18:46
364045   -21 days +18:32:59
364046   -20 days +06:41:44
364183   -21 days +05:24:29
364184   -21 days +12:40:50
Name: time_from_creation_tmp, dtype: timedelta64[ns]

In [38]:
# The head and tail of the column are all negative, i.e. those ads aired *before* their 'date_created'
# stamp, so this feature does not make sense as "time since the ad was created".
# NOTE(review): presumably 'date_created' records when the row was added to the archive rather than when
# the ad itself was produced -- confirm against the data source. Dropping the temporary column.

pres_df.drop('time_from_creation_tmp', inplace=True, axis=1)
pres_df.head()

Unnamed: 0,id,wp_identifier,network,location,program,program_type,start_time,end_time,archive_id,embed_url,sponsors,sponsor_types,race,cycle,subjects,candidates,type,message,date_created,ad_length
0,1,5643,KLAS,"Las Vegas, NV",8 News Now at 5 PM,news,2016-09-09 00:12:59,2016-09-09 00:13:29,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0
1,2,5643,WNCN,"Raleigh-Durham-Fayetteville, NC",North Carolina News at 500PM,news,2016-09-06 21:58:25,2016-09-06 21:58:55,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0
2,3,5643,KLAS,"Las Vegas, NV",Face the Nation,news,2016-09-04 16:24:25,2016-09-04 16:24:55,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0
3,4,5643,WFLA,"Tampa-St. Petersburg, FL",Days of Our Lives,not news,2016-08-30 17:59:20,2016-08-30 17:59:50,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0
4,5,5643,KSNV,"Las Vegas, NV",The Tonight Show Starring Jimmy Fallon,news,2016-09-06 07:02:22,2016-09-06 07:02:52,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0


In [39]:
# exploring the unique categories again

unique_categories = ['network',
                     'location',
                     'archive_id',
                     'program_type',
                     'sponsor_types',
                     'candidates',
                     'type',
                     'message'
                    ]

In [40]:
# Print every distinct value of each selected column of the presidential subset.
for category in unique_categories:
    distinct_values = pres_df[category].unique()
    print(category, '\n')
    print(distinct_values)
    print('\n')

network 

['KLAS' 'WNCN' 'WFLA' 'KSNV' 'WTVT' 'WTOG' 'WLFL' 'WOIO' 'WRAL' 'WEWS'
 'WRAZ' 'KVVU' 'KCRG' 'WFTS' 'KFXA' 'KGAN' 'KTNV' 'WTXF' 'WJW' 'WKYC'
 'WCAU' 'KYW' 'WPVI' 'KUSA' 'WTMJ' 'KNTV' 'KOFY' 'CNNW' 'MSNBCW' 'KPNX'
 'WHDH' 'FOXNEWSW' 'CSPAN' 'FBC' 'KMTP' 'KQED' 'KRON' 'WFXT' 'KTVU' 'KSAZ'
 'WISN' 'KGO' 'WCVB' 'LINKTV' 'KMGH' 'KCNC' 'WUSA' 'KDVR' 'WTTG' 'WMUR'
 'CSPAN2' 'CSPAN3' 'WJLA' 'BLOOMBERG' 'KCSM' 'KNXV' 'COM' 'WBZ' 'WDJT'
 'WABC' 'WNYW' 'WRC' 'KTSF' 'WNBC' 'WITI' 'KPIX' 'CNBC' 'KPHO' 'WCBS'
 'KDTV' 'KSTS' 'WUVP' 'KQEH' 'KCNS' 'KWWL' 'BETW' 'WTSP' 'BBCNEWS']


location 

['Las Vegas, NV' 'Raleigh-Durham-Fayetteville,  NC'
 'Tampa-St. Petersburg, FL' 'Cleveland, Ohio'
 'Ceder Rapids-Waterloo-Iowa City-Dublin, Iowa' 'Philadelphia, PA'
 'Denver, CO' 'Milwaukee, WI' 'San Francisco-Oakland-San Jose, CA'
 'Phoenix-Prescott, AZ' 'Boston, MA/Manchester, NH'
 'Washington, DC/Hagerstown, MD' 'New York City, NY' nan]


archive_id 

['PolAd_HillaryClinton_f1h3j' 'PolAd_DonaldTrump_42

In [41]:
# creating 'day_of_week' - name of the weekday on which the ad started airing
#
# The vectorised .dt accessor formats the whole column at once instead of
# calling a Python lambda for every row; "%A" yields the full weekday name.
# NOTE(review): the weekday is taken from 'start_time' exactly as stored. Row 0
# ("8 News Now at 5 PM") starts at 00:12:59, which suggests the stamps are UTC;
# if so, evening airings are labelled with the following weekday -- confirm
# before relying on this feature.

pres_df['day_of_week'] = pres_df['start_time'].dt.strftime("%A")
pres_df.head(10)

Unnamed: 0,id,wp_identifier,network,location,program,program_type,start_time,end_time,archive_id,embed_url,sponsors,sponsor_types,race,cycle,subjects,candidates,type,message,date_created,ad_length,day_of_week
0,1,5643,KLAS,"Las Vegas, NV",8 News Now at 5 PM,news,2016-09-09 00:12:59,2016-09-09 00:13:29,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Friday
1,2,5643,WNCN,"Raleigh-Durham-Fayetteville, NC",North Carolina News at 500PM,news,2016-09-06 21:58:25,2016-09-06 21:58:55,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Tuesday
2,3,5643,KLAS,"Las Vegas, NV",Face the Nation,news,2016-09-04 16:24:25,2016-09-04 16:24:55,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Sunday
3,4,5643,WFLA,"Tampa-St. Petersburg, FL",Days of Our Lives,not news,2016-08-30 17:59:20,2016-08-30 17:59:50,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Tuesday
4,5,5643,KSNV,"Las Vegas, NV",The Tonight Show Starring Jimmy Fallon,news,2016-09-06 07:02:22,2016-09-06 07:02:52,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Tuesday
5,6,5643,WTVT,"Tampa-St. Petersburg, FL",TMZ Live,not news,2016-08-30 19:55:14,2016-08-30 19:55:44,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Tuesday
6,7,5643,WTOG,"Tampa-St. Petersburg, FL",Justice for All With Cristina Perez,not news,2016-08-30 13:22:30,2016-08-30 13:23:00,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Tuesday
7,8,5643,WLFL,"Raleigh-Durham-Fayetteville, NC",The Flash,not news,2016-09-07 00:45:00,2016-09-07 00:45:30,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Wednesday
8,9,5643,WOIO,"Cleveland, Ohio",Cleveland 19 News at 6PM,news,2016-08-30 22:12:59,2016-08-30 22:13:29,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Tuesday
9,10,5643,WRAL,"Raleigh-Durham-Fayetteville, NC",Last Call With Carson Daly,not news,2016-08-30 05:53:01,2016-08-30 05:53:31,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Tuesday


In [42]:
# now creating 'metro_area' and 'state' features

# proof of concept on a single value: split the 'location' of the row labelled 0
# on the comma; the pieces can then be accessed separately

pres_df.loc[0, 'location'].split(',')

['Las Vegas', ' NV']

In [43]:
# going to have an issue because there are NaN's -- cannot split NaN's -- going to get an error

pres_df['location'].unique()

array(['Las Vegas, NV', 'Raleigh-Durham-Fayetteville,  NC',
       'Tampa-St. Petersburg, FL', 'Cleveland, Ohio',
       'Ceder Rapids-Waterloo-Iowa City-Dublin, Iowa', 'Philadelphia, PA',
       'Denver, CO', 'Milwaukee, WI', 'San Francisco-Oakland-San Jose, CA',
       'Phoenix-Prescott, AZ', 'Boston, MA/Manchester, NH',
       'Washington, DC/Hagerstown, MD', 'New York City, NY', nan], dtype=object)

In [44]:
# How many NaN's -- pretty good case to drop observations with location NaN's as only 25 out of 85K observations

pres_df['location'].isnull().sum(), pres_df.shape

(25, (85152, 21))

In [45]:
# Keep only the rows that have a 'location' value (removes the rows where it is NaN).

pres_df = pres_df.dropna(subset=['location'])
pres_df.shape

(85127, 21)

In [46]:
# confirming removal of NaNs in 'location'

pres_df['location'].unique()

array(['Las Vegas, NV', 'Raleigh-Durham-Fayetteville,  NC',
       'Tampa-St. Petersburg, FL', 'Cleveland, Ohio',
       'Ceder Rapids-Waterloo-Iowa City-Dublin, Iowa', 'Philadelphia, PA',
       'Denver, CO', 'Milwaukee, WI', 'San Francisco-Oakland-San Jose, CA',
       'Phoenix-Prescott, AZ', 'Boston, MA/Manchester, NH',
       'Washington, DC/Hagerstown, MD', 'New York City, NY'], dtype=object)

In [47]:
pres_df.head(2)

Unnamed: 0,id,wp_identifier,network,location,program,program_type,start_time,end_time,archive_id,embed_url,sponsors,sponsor_types,race,cycle,subjects,candidates,type,message,date_created,ad_length,day_of_week
0,1,5643,KLAS,"Las Vegas, NV",8 News Now at 5 PM,news,2016-09-09 00:12:59,2016-09-09 00:13:29,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Friday
1,2,5643,WNCN,"Raleigh-Durham-Fayetteville, NC",North Carolina News at 500PM,news,2016-09-06 21:58:25,2016-09-06 21:58:55,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Tuesday


In [48]:
# 'split_location_tmp': each 'location' string broken on ',' into a list of its parts

pres_df['split_location_tmp'] = pres_df['location'].str.split(',')
pres_df.head(10)

Unnamed: 0,id,wp_identifier,network,location,program,program_type,start_time,end_time,archive_id,embed_url,sponsors,sponsor_types,race,cycle,subjects,candidates,type,message,date_created,ad_length,day_of_week,split_location_tmp
0,1,5643,KLAS,"Las Vegas, NV",8 News Now at 5 PM,news,2016-09-09 00:12:59,2016-09-09 00:13:29,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Friday,"[Las Vegas, NV]"
1,2,5643,WNCN,"Raleigh-Durham-Fayetteville, NC",North Carolina News at 500PM,news,2016-09-06 21:58:25,2016-09-06 21:58:55,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Tuesday,"[Raleigh-Durham-Fayetteville, NC]"
2,3,5643,KLAS,"Las Vegas, NV",Face the Nation,news,2016-09-04 16:24:25,2016-09-04 16:24:55,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Sunday,"[Las Vegas, NV]"
3,4,5643,WFLA,"Tampa-St. Petersburg, FL",Days of Our Lives,not news,2016-08-30 17:59:20,2016-08-30 17:59:50,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Tuesday,"[Tampa-St. Petersburg, FL]"
4,5,5643,KSNV,"Las Vegas, NV",The Tonight Show Starring Jimmy Fallon,news,2016-09-06 07:02:22,2016-09-06 07:02:52,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Tuesday,"[Las Vegas, NV]"
5,6,5643,WTVT,"Tampa-St. Petersburg, FL",TMZ Live,not news,2016-08-30 19:55:14,2016-08-30 19:55:44,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Tuesday,"[Tampa-St. Petersburg, FL]"
6,7,5643,WTOG,"Tampa-St. Petersburg, FL",Justice for All With Cristina Perez,not news,2016-08-30 13:22:30,2016-08-30 13:23:00,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Tuesday,"[Tampa-St. Petersburg, FL]"
7,8,5643,WLFL,"Raleigh-Durham-Fayetteville, NC",The Flash,not news,2016-09-07 00:45:00,2016-09-07 00:45:30,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Wednesday,"[Raleigh-Durham-Fayetteville, NC]"
8,9,5643,WOIO,"Cleveland, Ohio",Cleveland 19 News at 6PM,news,2016-08-30 22:12:59,2016-08-30 22:13:29,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Tuesday,"[Cleveland, Ohio]"
9,10,5643,WRAL,"Raleigh-Durham-Fayetteville, NC",Last Call With Carson Daly,not news,2016-08-30 05:53:01,2016-08-30 05:53:31,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Tuesday,"[Raleigh-Durham-Fayetteville, NC]"


In [49]:
# 'metro_area': the first piece of each split location (the text before the first comma)

pres_df['metro_area'] = [parts[0] for parts in pres_df['split_location_tmp']]

In [50]:
# confirming work

pres_df['metro_area'].head()

0                      Las Vegas
1    Raleigh-Durham-Fayetteville
2                      Las Vegas
3           Tampa-St. Petersburg
4                      Las Vegas
Name: metro_area, dtype: object

In [51]:
pres_df['metro_area'].tail()

364042    Cleveland
364043    Cleveland
364044    Cleveland
364045    Cleveland
364046    Cleveland
Name: metro_area, dtype: object

In [52]:
pres_df['metro_area'].unique()

array(['Las Vegas', 'Raleigh-Durham-Fayetteville', 'Tampa-St. Petersburg',
       'Cleveland', 'Ceder Rapids-Waterloo-Iowa City-Dublin',
       'Philadelphia', 'Denver', 'Milwaukee',
       'San Francisco-Oakland-San Jose', 'Phoenix-Prescott', 'Boston',
       'Washington', 'New York City'], dtype=object)

In [53]:
pres_df['metro_area'].value_counts()

Tampa-St. Petersburg                      13248
Philadelphia                              11949
Las Vegas                                 10953
Cleveland                                 10604
San Francisco-Oakland-San Jose             7546
Raleigh-Durham-Fayetteville                6731
Ceder Rapids-Waterloo-Iowa City-Dublin     5295
Boston                                     5008
Denver                                     4710
Milwaukee                                  4097
Phoenix-Prescott                           3244
Washington                                 1695
New York City                                47
Name: metro_area, dtype: int64

In [54]:
# 'state': the second piece of each split location (the text after the first comma).
# For two-market strings such as 'Boston, MA/Manchester, NH' this is the middle
# piece (' MA/Manchester'), and every piece still carries its leading whitespace.

pres_df['state'] = [parts[1] for parts in pres_df['split_location_tmp']]

In [55]:
# confirming work

pres_df['state'].head()

0      NV
1      NC
2      NV
3      FL
4      NV
Name: state, dtype: object

In [56]:
# confirming work

pres_df['state'].tail()

364042     Ohio
364043     Ohio
364044     Ohio
364045     Ohio
364046     Ohio
Name: state, dtype: object

In [57]:
# confirming work

pres_df['state'].unique()

array([' NV', '  NC', ' FL', ' Ohio', ' Iowa', ' PA', ' CO', ' WI', ' CA',
       ' AZ', ' MA/Manchester', ' DC/Hagerstown', ' NY'], dtype=object)

In [58]:
# a little cleanup may be required here using some domain knowledge

# Boston media market most likely used to target New Hampshire voters -- convert "MA/Manchester" to New Hampshire

# DC/Hagerstown media buys used to influence the Virginia state race

# could ignore and go with current labels (as we know what they are), but going to change

# while at it, could change CA to "National" as they were network buys (San Francisco Market)

# a little uncertainty over why they were looking closely at NY -- could this be targeted at PA??

# there is a case for dropping the NY observations as they are only 47 out of 85K

# Going to do it for pandas practice


In [59]:
# Relabel media markets by the state they were bought to influence.
# Strip the stray leading whitespace first, then match each label exactly
# (Series.replace with a dict) instead of running five substring passes whose
# patterns depend on the exact leading space.
state_relabels = {
    'MA/Manchester': 'NH',   # Boston market, most likely used to reach New Hampshire voters
    'DC/Hagerstown': 'VA',   # DC/Hagerstown buys used to influence the Virginia race
    'Ohio': 'OH',
    'Iowa': 'IA',
    'CA': 'National',        # San Francisco market buys were network buys
}
pres_df['state'] = pres_df['state'].str.strip().replace(state_relabels)

In [60]:
# getting rid of unnecessary whitespace in the labels - could be a gotcha later on
# (.str.strip() is vectorized and leaves missing values as NaN instead of raising)

pres_df['state'] = pres_df['state'].str.strip()

In [61]:
# ad counts per cleaned state label
pres_df['state'].value_counts()

FL          13248
PA          11949
NV          10953
OH          10604
National     7546
NC           6731
IA           5295
NH           5008
CO           4710
WI           4097
AZ           3244
VA           1695
NY             47
Name: state, dtype: int64

In [62]:
# preview with the new 'metro_area' and 'state' columns (temporary split column still present)
pres_df.head(2)

Unnamed: 0,id,wp_identifier,network,location,program,program_type,start_time,end_time,archive_id,embed_url,sponsors,sponsor_types,race,cycle,subjects,candidates,type,message,date_created,ad_length,day_of_week,split_location_tmp,metro_area,state
0,1,5643,KLAS,"Las Vegas, NV",8 News Now at 5 PM,news,2016-09-09 00:12:59,2016-09-09 00:13:29,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Friday,"[Las Vegas, NV]",Las Vegas,NV
1,2,5643,WNCN,"Raleigh-Durham-Fayetteville, NC",North Carolina News at 500PM,news,2016-09-06 21:58:25,2016-09-06 21:58:55,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Tuesday,"[Raleigh-Durham-Fayetteville, NC]",Raleigh-Durham-Fayetteville,NC


In [63]:
# dropping the 'split_location_tmp' helper column now that 'metro_area' and 'state' exist
# (reassigning instead of inplace=True, with errors='ignore', so re-running this cell is harmless)

pres_df = pres_df.drop('split_location_tmp', axis=1, errors='ignore')
pres_df.head(2)

Unnamed: 0,id,wp_identifier,network,location,program,program_type,start_time,end_time,archive_id,embed_url,sponsors,sponsor_types,race,cycle,subjects,candidates,type,message,date_created,ad_length,day_of_week,metro_area,state
0,1,5643,KLAS,"Las Vegas, NV",8 News Now at 5 PM,news,2016-09-09 00:12:59,2016-09-09 00:13:29,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Friday,Las Vegas,NV
1,2,5643,WNCN,"Raleigh-Durham-Fayetteville, NC",North Carolina News at 500PM,news,2016-09-06 21:58:25,2016-09-06 21:58:55,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Tuesday,Raleigh-Durham-Fayetteville,NC


In [64]:
# (rows, columns) after dropping the temporary column
pres_df.shape

(85127, 23)

In [65]:
# looking at number of subjects per ad -- going to split and use same methodology as 'location' to 'metro/state'

# proof of concept: subjects are comma-separated, so split on ',' -- a bare .split() breaks on
# whitespace and would over-count multi-word subjects such as "Candidate Biography"

len(pres_df['subjects'][0].split(','))

3

In [66]:
# are there any NaN's -- counts the ads that have no subjects listed

pres_df['subjects'].isnull().sum()

591

In [67]:
# boolean mask: True where 'subjects' is missing
pres_df['subjects'].isnull()

0         False
1         False
2         False
3         False
4         False
5         False
6         False
7         False
8         False
9         False
10        False
11        False
12        False
13        False
14        False
15        False
16        False
17        False
18        False
19        False
20        False
21        False
22        False
23        False
24        False
25        False
26        False
27        False
28        False
29        False
30        False
31        False
32        False
33        False
34        False
35        False
36        False
37        False
38        False
39        False
40        False
41        False
42        False
43        False
44        False
45        False
46        False
47        False
48        False
49        False
50        False
51        False
52        False
53        False
54        False
55        False
56        False
57        False
58        False
59        False
60        False
61        False
62      

In [68]:
pres_df.loc[362865] # how to get row info by index label (.ix is deprecated and removed in newer pandas)

id                                                          462751
wp_identifier                                                 8248
network                                                       WUVP
location                                          Philadelphia, PA
program                                          Despierta America
program_type                                                  news
start_time                                     2016-10-19 12:33:03
end_time                                       2016-10-19 12:34:18
archive_id                              PolAd_HillaryClinton_ia316
embed_url        https://archive.org/embed/PolAd_HillaryClinton...
sponsors                                        America Rising PAC
sponsor_types                                            Super PAC
race                                                          PRES
cycle                                                         2016
subjects                                                      

In [69]:
pres_df['subjects'].nunique() # unique combinations of ad topics (NaN is not counted)

212

In [70]:
# summary of the 'subjects' strings: non-null count, distinct values, most common value and its frequency
pres_df['subjects'].describe()

count                                    84536
unique                                     212
top       Women, Candidate Biography, Children
freq                                      4184
Name: subjects, dtype: object

In [71]:
def subject_count(cell):
    """Count the comma-separated subjects in one 'subjects' cell.

    Parameters
    ----------
    cell : str or missing value
        A string such as "Energy, China, Jobs", or NaN/None when the ad
        has no subjects recorded.

    Returns
    -------
    int or float
        Number of non-blank subjects in the string; np.nan when `cell`
        has no `.split` method (e.g. NaN or None), so the resulting series
        stays numeric.
    """
    try:
        pieces = cell.split(',')
    except AttributeError:
        # NaN (a float) or None -- leave as missing rather than guessing 0
        return np.nan
    # skip blank tokens so "" or a trailing comma does not count as a subject
    return sum(1 for piece in pieces if piece.strip())

# could have used this pattern from asynch

#if raw_time is np.nan:
        #return np.nan -- if it's NaN, leave it
    
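# gotcha: np.nan == np.nan returns False, so an equality check never detects NaN.
# 'is np.nan' only works when the value is the very same np.nan object; pd.isnull(x)
# (or try/except, as used above) is the reliable test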

In [72]:
# per-row subject count, kept in a *_tmp column until the result has been checked
pres_df['subject_count_tmp'] = pres_df['subjects'].apply(subject_count) # can use apply instead of map
pres_df.head()

Unnamed: 0,id,wp_identifier,network,location,program,program_type,start_time,end_time,archive_id,embed_url,sponsors,sponsor_types,race,cycle,subjects,candidates,type,message,date_created,ad_length,day_of_week,metro_area,state,subject_count_tmp
0,1,5643,KLAS,"Las Vegas, NV",8 News Now at 5 PM,news,2016-09-09 00:12:59,2016-09-09 00:13:29,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Friday,Las Vegas,NV,3.0
1,2,5643,WNCN,"Raleigh-Durham-Fayetteville, NC",North Carolina News at 500PM,news,2016-09-06 21:58:25,2016-09-06 21:58:55,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Tuesday,Raleigh-Durham-Fayetteville,NC,3.0
2,3,5643,KLAS,"Las Vegas, NV",Face the Nation,news,2016-09-04 16:24:25,2016-09-04 16:24:55,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Sunday,Las Vegas,NV,3.0
3,4,5643,WFLA,"Tampa-St. Petersburg, FL",Days of Our Lives,not news,2016-08-30 17:59:20,2016-08-30 17:59:50,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Tuesday,Tampa-St. Petersburg,FL,3.0
4,5,5643,KSNV,"Las Vegas, NV",The Tonight Show Starring Jimmy Fallon,news,2016-09-06 07:02:22,2016-09-06 07:02:52,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Tuesday,Las Vegas,NV,3.0


In [73]:
pres_df['subject_count_tmp'].value_counts(dropna=False)

# this is a gotcha - value_counts drops NaN by default, so pass dropna=False to see the missing rows
# want to keep the NaN's (rather than converting them to strings) so math still works on the series

 3.0    20787
 4.0    19869
 2.0    15428
 5.0    11875
 6.0     5942
 1.0     4716
 7.0     3827
 8.0     2092
NaN       591
Name: subject_count_tmp, dtype: int64

In [74]:
# element type check -- presumably float rather than int because the column also holds NaN
type(pres_df['subject_count_tmp'][0])

numpy.float64

In [75]:
# mean and median number of subjects per ad (pandas skips the NaN rows by default)
pres_df['subject_count_tmp'].mean(), pres_df['subject_count_tmp'].median()

(3.7375910854547176, 4.0)

In [76]:
# preview: 'subject_count_tmp' is now the last column
pres_df.head(2)

Unnamed: 0,id,wp_identifier,network,location,program,program_type,start_time,end_time,archive_id,embed_url,sponsors,sponsor_types,race,cycle,subjects,candidates,type,message,date_created,ad_length,day_of_week,metro_area,state,subject_count_tmp
0,1,5643,KLAS,"Las Vegas, NV",8 News Now at 5 PM,news,2016-09-09 00:12:59,2016-09-09 00:13:29,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Friday,Las Vegas,NV,3.0
1,2,5643,WNCN,"Raleigh-Durham-Fayetteville, NC",North Carolina News at 500PM,news,2016-09-06 21:58:25,2016-09-06 21:58:55,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Tuesday,Raleigh-Durham-Fayetteville,NC,3.0


In [77]:
# (rows, columns) now that the subject count column has been added
pres_df.shape

(85127, 24)

In [78]:
# since happy with subject_count - rename the column - get rid of the tmp tag
# (rename returns a new frame here, so the result is assigned back to pres_df)

pres_df = pres_df.rename(columns={'subject_count_tmp': 'subject_count'})

In [79]:
# confirm the last column now shows up as 'subject_count'
pres_df.head(2)

Unnamed: 0,id,wp_identifier,network,location,program,program_type,start_time,end_time,archive_id,embed_url,sponsors,sponsor_types,race,cycle,subjects,candidates,type,message,date_created,ad_length,day_of_week,metro_area,state,subject_count
0,1,5643,KLAS,"Las Vegas, NV",8 News Now at 5 PM,news,2016-09-09 00:12:59,2016-09-09 00:13:29,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Friday,Las Vegas,NV,3.0
1,2,5643,WNCN,"Raleigh-Durham-Fayetteville, NC",North Carolina News at 500PM,news,2016-09-06 21:58:25,2016-09-06 21:58:55,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Tuesday,Raleigh-Durham-Fayetteville,NC,3.0


In [80]:
# can also rename columns inplace (modifies pres_df directly, nothing to assign back)
# demonstration only: the 'subject_count_test' name is renamed back to 'subject_count' two cells further down

pres_df.rename(columns={'subject_count': 'subject_count_test'}, inplace=True)

In [81]:
# the last column now shows as 'subject_count_test'
pres_df.head(2)

Unnamed: 0,id,wp_identifier,network,location,program,program_type,start_time,end_time,archive_id,embed_url,sponsors,sponsor_types,race,cycle,subjects,candidates,type,message,date_created,ad_length,day_of_week,metro_area,state,subject_count_test
0,1,5643,KLAS,"Las Vegas, NV",8 News Now at 5 PM,news,2016-09-09 00:12:59,2016-09-09 00:13:29,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Friday,Las Vegas,NV,3.0
1,2,5643,WNCN,"Raleigh-Durham-Fayetteville, NC",North Carolina News at 500PM,news,2016-09-06 21:58:25,2016-09-06 21:58:55,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Tuesday,Raleigh-Durham-Fayetteville,NC,3.0


In [82]:
# undo the demonstration rename so the column is called 'subject_count' again
pres_df.rename(columns={'subject_count_test': 'subject_count'}, inplace=True)

In [83]:
# confirm the column is back to 'subject_count'
pres_df.head(2)

Unnamed: 0,id,wp_identifier,network,location,program,program_type,start_time,end_time,archive_id,embed_url,sponsors,sponsor_types,race,cycle,subjects,candidates,type,message,date_created,ad_length,day_of_week,metro_area,state,subject_count
0,1,5643,KLAS,"Las Vegas, NV",8 News Now at 5 PM,news,2016-09-09 00:12:59,2016-09-09 00:13:29,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Friday,Las Vegas,NV,3.0
1,2,5643,WNCN,"Raleigh-Durham-Fayetteville, NC",North Carolina News at 500PM,news,2016-09-06 21:58:25,2016-09-06 21:58:55,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32,30.0,Tuesday,Raleigh-Durham-Fayetteville,NC,3.0


In [84]:
# can create a list of all topics and do a frequency count -- use a dictionary method similar to midterm

In [85]:
# pickling pres_df (a pickle retains the datetime columns as datetime objects)
# reload later with pd.read_pickle('data/pres_sorted_data.pickle')

pres_df.to_pickle('data/pres_sorted_data.pickle')