In [5]:
from google.colab import drive
from google.colab import files
# Mount Google Drive so the dataset paths under /content/drive used below resolve.
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pandas as pd
import numpy as np

## Import Impressions data: called "impressions"

In [7]:
# Common path prefix of the impression pickle shards; the loader appends "<n>.pkl" to it.
impressions_path = '/content/drive/MyDrive/Kinesso/US EDA/US Data Sources/impression_dataframes/impressions_'

In [8]:
# Reads the U.S. advertisement impression dataset into the DataFrame "impressions".
# The data is sharded as impressions_0.pkl ... impressions_400.pkl (file 0 plus
# 16 groups of 25 files). Each shard is read exactly once and all shards are
# combined with a single pd.concat: re-concatenating the growing frame inside a
# loop copies every previously loaded row again on each pass.
# Row order and the per-shard index labels are the same as before (no ignore_index).
# NOTE(review): pd.read_pickle can execute arbitrary code -- only load trusted files.
n_impression_files = 1 + 16 * 25
impression_chunks = [
    pd.read_pickle(impressions_path + str(file_no) + ".pkl")
    for file_no in range(n_impression_files)
]
impressions = pd.concat(impression_chunks)
del impression_chunks  # release the per-shard frames once they are combined

In [None]:
# Sanity check: (rows, columns) of the combined impressions frame.
impressions.shape

(20050000, 20)

In [None]:
# List the available impression fields.
impressions.columns

Index(['impression_id', 'campaign_id', 'ad_group_id', 'audience_id',
       'supply_vendor', 'user_agent', 'region', 'city', 'device_type',
       'os_family', 'os', 'browser', 'media_cost', 'ttd_cost_in_usd',
       'partner_cost_in_usd', 'advertiser_cost_in_usd', 'zip_code',
       'device_make', 'device_model', 'carrier_id'],
      dtype='object')

## Data Cleaning

Drop impressions that have no zip code and reset the row index.

In [17]:
# Keep only impressions with a known zip code, then renumber the rows 0..n-1.
has_zip = impressions['zip_code'].notna()
impressions = impressions.loc[has_zip].reset_index(drop=True)
impressions.head()

Unnamed: 0,impression_id,campaign_id,ad_group_id,audience_id,supply_vendor,user_agent,region,city,device_type,os_family,os,browser,media_cost,ttd_cost_in_usd,partner_cost_in_usd,advertiser_cost_in_usd,zip_code,device_make,device_model,carrier_id
0,feeb8dbc-212a-4fba-8f8e-322431e625bf,el02c6n,exazlvq,,Telaria,Roku/DVP-9.30 (489.30E04194A),Minnesota,Minneapolis,6,1.0,101.0,1.0,0.01877,0.01877,0.019609,0.019609,55405.0,Roku,Digital Video player,
1,b768bf1c-a433-44a1-9614-87f983118c94,pey7wjw,fcucjsu,1qh4zg7,freewheel,Roku/DVP-9.30 (509.30E04194A),California,Oceanside,6,1.0,101.0,1.0,0.0384,0.0384,0.042422,0.042422,92056.0,Roku,Digital Video player,
2,742e59e3-9283-4f32-81eb-7ccfe01e13ae,pey7wjw,fcucjsu,1qh4zg7,freewheel,Roku/DVP-9.30 (519.30E04194A),Kentucky,Lancaster,6,1.0,101.0,1.0,0.0384,0.0384,0.042422,0.042422,40444.0,Roku,Digital Video player,
3,9c298ba2-32e8-4a6e-adad-32f2f004bb95,pey7wjw,fcucjsu,1qh4zg7,freewheel,Roku/DVP-9.30 (919.30E04194A),California,Los Osos,6,1.0,101.0,1.0,0.0384,0.0384,0.042422,0.042422,93402.0,Roku,Digital Video player,
4,66cea51b-267b-4a2e-96c0-fcd276e2521c,el02c6n,exazlvq,,Telaria,Roku/DVP-9.30 (319.30E04194A),Pennsylvania,Philadelphia,6,1.0,101.0,1.0,0.01877,0.01877,0.019609,0.019609,19128.0,Roku,Digital Video player,


## Import demographic Data: called "demo"

In [9]:
# Common path prefix of the demographic CSV files; the loader appends "<n>.csv" to it.
demo_path = '/content/drive/MyDrive/Kinesso/US EDA/US Data Sources/zip_age_ethnicity_income_data/demo_data_'

In [10]:
# The column names stored in the raw CSV files are only placeholders, so every
# file is relabelled with meaningful names as it is read.
demo_columns = ['zip_code', 'age', 'ethnicity', 'income', 'count']

# Files are numbered demo_data_1.csv ... demo_data_380.csv (19 groups of 20).
# Each file is read once and everything is combined with a single pd.concat.
# Seeding the result with an empty DataFrame and re-concatenating inside a loop
# copies all earlier rows on every pass; recent pandas versions also deprecate
# the way empty frames are ignored when inferring the result dtypes.
# Row order and the per-file index labels are the same as before (no ignore_index).
n_demo_files = 19 * 20
demo_chunks = [
    pd.read_csv(demo_path + str(file_no) + '.csv').set_axis(demo_columns, axis='columns')
    for file_no in range(1, n_demo_files + 1)
]
demo = pd.concat(demo_chunks)
del demo_chunks  # release the per-file frames once they are combined

In [11]:
# Preview the first rows of the combined demographic data.
demo.head()

Unnamed: 0,zip_code,age,ethnicity,income,count
0,601,18-23,Hispanic,"$1,000 - $14,999",1
1,601,24-29,Hispanic,"$1,000 - $14,999",5
2,601,24-29,White,"$1,000 - $14,999",2
3,601,30-35,Hispanic,"$1,000 - $14,999",20
4,601,30-35,Hispanic,"$15,000 - $19,999",1


## Preprocessing for Tableau
Because the datasets are large and take a long time to process, we selected a couple of sample states to visualize first; we would like to repeat the same steps for the remaining states. These examples show the types of visualizations that could be useful for continuing to understand the distributions.



Replace the income brackets with ordered numbers 1-14 (1 = not reported; 2-14 = the brackets from lowest to highest).

In [12]:
# Ordinal encoding of the income brackets: 1 = not reported, 2..14 = the
# brackets in ascending order. All labels are recoded in one replace call.
income_codes = {
    'not_reported': 1,
    '$1,000 - $14,999': 2,
    '$15,000 - $19,999': 3,
    '$20,000 - $29,999': 4,
    '$30,000 - $39,999': 5,
    '$40,000 - $49,999': 6,
    '$50,000 - $74,999': 7,
    '$75,000 - $99,999': 8,
    '$100,000 - $124,999': 9,
    '$125,000 - $149,999': 10,
    '$150,000 - $174,999': 11,
    '$175,000 - $199,999': 12,
    '$200,000 - $249,999': 13,
    '$250,000 or greater': 14,
}
demo['income'] = demo['income'].replace(income_codes)

## Select One State from impressions: Ex. Iowa

In [13]:
# All impressions whose region is Iowa.
imp_Iowa = impressions.loc[impressions['region'].eq('Iowa')]

In [14]:
# Number of Iowa impressions per zip code, most frequent first.
is_iowa = impressions['region'].eq('Iowa')
impression_count = impressions.loc[is_iowa, ['zip_code']].value_counts()
impression_count

zip_code
50310.0     5503
52722.0     4820
52402.0     4012
50266.0     3207
50701.0     3142
            ... 
52534.0        1
52406.0        1
52359.0        1
52652.0        1
50552.0        1
Length: 820, dtype: int64

In [None]:
# Export the per-zip-code Iowa impression counts to Drive as CSV.
impression_count.to_csv('/content/drive/MyDrive/Kinesso/imp_pop_zip_iowa.csv') 

In [None]:
# Export the raw Iowa impression rows to Drive as CSV.
imp_Iowa.to_csv('/content/drive/MyDrive/Kinesso/imp_Iowa.csv')

Because Region does not exist in the demographic data, map the values from the impression dataset to the demographic dataset in order to find zip codes from that corresponding region.

In [18]:
# Build a zip code -> region lookup from the impressions.
# If a zip code is recorded under more than one region, turning the de-duplicated
# (region, zip_code) pairs straight into a dict silently keeps whichever pair
# comes last, which can file that zip code under the wrong state. Instead, each
# zip code is assigned to the region that accounts for most of its impressions.
region_counts = (
    impressions.groupby(['zip_code', 'region'], observed=True)
    .size()
    .reset_index(name='n_impressions')
)
# One row per zip code: its dominant region. The stable sort keeps ties in the
# (zip_code, region) order produced by the groupby, so the result is deterministic.
region = (
    region_counts.sort_values('n_impressions', ascending=False, kind='stable')
    .drop_duplicates(subset='zip_code')[['region', 'zip_code']]
)
map_zip_to_region = region.set_index('zip_code')['region'].to_dict()

# Zip codes that never appear in the impressions get NaN as their region.
demo['region'] = demo['zip_code'].map(map_zip_to_region)
# Preview only: the result is not assigned, so `demo` keeps its rows without a region.
demo.dropna()

Unnamed: 0,zip_code,age,ethnicity,income,count,region
19217,1001,18-23,African American,10,1,Massachusetts
19218,1001,18-23,African American,4,4,Massachusetts
19219,1001,18-23,African American,5,1,Massachusetts
19220,1001,18-23,African American,7,2,Massachusetts
19221,1001,18-23,Asian,4,1,Massachusetts
...,...,...,...,...,...,...
20902,99693,66 and above,Asian,2,2,Alaska
20903,99693,66 and above,Asian,5,2,Alaska
20904,99693,66 and above,Asian,7,1,Alaska
20905,99693,66 and above,Hispanic,2,2,Alaska


## Select One State from demographic data: Ex. Iowa
In this state, grouping by zip code and summing the counts lets us visualize the population in each zip code from this demographic data.

In [19]:
# Demographic rows mapped to Iowa, then the summed 'count' per zip code,
# exposed as the column "household_count".
iowa_rows = demo['region'].eq('Iowa')
demo_Iowa = demo.loc[iowa_rows]
demo_pop_by_zip_iowa = (
    demo_Iowa.groupby(['zip_code'])
    .agg({'count': 'sum'})
    .rename(columns={"count": "household_count"})
)

In [20]:
# Export the per-zip-code Iowa household counts to Drive as CSV.
demo_pop_by_zip_iowa.to_csv('/content/drive/MyDrive/Kinesso/demo_pop_zip_iowa.csv')

In [None]:
# Total of the 'count' column over all demographic rows mapped to Nevada.
demo_pop = demo.loc[demo['region'].eq('Nevada'), 'count'].sum()

## Find State Expected and Observed Frequencies to Visualize in Tableau
Because we noticed that Nevada has a high bias coefficient, we would like to further investigate the zip codes in Nevada and compare their frequency distributions. The preprocessed tables below therefore look only at zip codes located in Nevada. The first cell aggregates the population of each zip code as found in the demographic data and computes its expected frequency by dividing by the total Nevada population. We then compute each zip code's observed frequency in the impressions dataset by counting impressions per zip code and dividing by the total number of Nevada impressions.

In [24]:
# Nevada slice of the demographic data and its total count.
demo_nevada = demo.loc[demo['region'].eq('Nevada')]
demo_pop = demo_nevada['count'].sum()

# Summed count per zip code, as a float column named "household_count".
demo_pop_by_zip = (
    demo_nevada.groupby(['zip_code'])
    .agg({'count': 'sum'})
    .rename(columns={"count": "household_count"})
    .astype({'household_count': float})
)

# Expected share of each zip code = its household count / Nevada total.
demo_pop_by_zip['expected_zip_freq'] = demo_pop_by_zip['household_count'] / demo_pop
demo_pop_by_zip.head()

Unnamed: 0_level_0,household_count,expected_zip_freq
zip_code,Unnamed: 1_level_1,Unnamed: 2_level_1
89001,656.0,0.000311
89002,29904.0,0.014158
89003,493.0,0.000233
89005,13193.0,0.006246
89007,619.0,0.000293


In [25]:
# Impressions per Nevada zip code (most frequent first) and each zip code's
# share of all Nevada impressions.
# The Series and its index are named explicitly: the labels value_counts()
# produces changed in pandas 2.0, so renaming the columns that reset_index()
# happens to create breaks on newer versions.
nevada_zip_counts = impressions.loc[impressions['region'] == 'Nevada', 'zip_code'].value_counts()
imp_by_zip = (
    nevada_zip_counts
    .rename('impression_count')
    .rename_axis('zip_code')
    .to_frame()
)
# zip_code is stored as float in the impressions; use integer labels for the index.
imp_by_zip.index = imp_by_zip.index.astype(int)
# Same values as value_counts(normalize=True), computed from the counts already
# in the frame so no alignment between differently typed indexes is needed.
imp_by_zip['observed_zip_freq'] = imp_by_zip['impression_count'] / imp_by_zip['impression_count'].sum()
imp_by_zip.head()

Unnamed: 0_level_0,impression_count,observed_zip_freq
zip_code,Unnamed: 1_level_1,Unnamed: 2_level_1
89110,6848,0.03832
89108,5789,0.032394
89129,5730,0.032064
89031,5366,0.030027
89107,4260,0.023838


In [26]:
# Left join on the zip_code index: every zip code that has impressions is kept,
# and zip codes missing from the demographic table get NaN in its columns.
nevada_zip_freq = imp_by_zip.join(demo_pop_by_zip, how='left')
nevada_zip_freq

Unnamed: 0_level_0,impression_count,observed_zip_freq,household_count,expected_zip_freq
zip_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
89110,6848,0.038320,50480.0,0.023899
89108,5789,0.032394,,
89129,5730,0.032064,43971.0,0.020818
89031,5366,0.030027,55833.0,0.026434
89107,4260,0.023838,28729.0,0.013602
...,...,...,...,...
89822,1,0.000006,1205.0,0.000570
89126,1,0.000006,462.0,0.000219
89432,1,0.000006,1033.0,0.000489
89311,1,0.000006,125.0,0.000059


In [None]:
# Export the observed vs. expected Nevada zip-code frequencies to Drive as CSV.
nevada_zip_freq.to_csv('/content/drive/MyDrive/Kinesso/nevada_zip_freq.csv')