# Cleaning and merging of UWE/Teesside conversions datasets with IDs

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

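**Note:** the UWE and Teesside exports have a different number of 'useless header' rows before the real column header (11 for UWE, 12 for Teesside), so `skiprows` differs between the two reads below.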

In [2]:
# Need to skip the first 11 rows to get to the actual data (column header is on row 12).
# The ID columns are identifiers, not quantities: read them explicitly as strings so
# every value has the same type regardless of what pandas would infer from the file.
id_dtypes = {col: str for col in ['Advertiser ID', 'Campaign ID', 'Site ID (DCM)',
                                  'Creative ID', 'Activity ID']}
raw_data_u = pd.read_csv('Raw_data/UWE/Pivigo_UWE_Conversion.csv', skiprows=11, dtype=id_dtypes)
# Removes last row, which has the sums of the conversions (positional slice)
raw_data_u = raw_data_u.iloc[:-1]
raw_data_u.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Date,Advertiser ID,Advertiser,Campaign ID,Campaign,Site ID (DCM),Site (DCM),Creative ID,Creative,Creative Type,Placement Pixel Size,Platform Type,Activity ID,Activity,Total Conversions,Click-through Conversions,View-through Conversions
0,2018-01-01,4476036,University of the West of England (UWE),10379722,UWE UG2017 HAS Campaign (032996),1434033,The Complete University Guide,(not set),(not set),(not set),1 x 1,Mobile highend: smartphone,1821385,Course View,1.0,1.0,0.0
1,2018-01-01,4476036,University of the West of England (UWE),10379722,UWE UG2017 HAS Campaign (032996),1434033,The Complete University Guide,(not set),(not set),(not set),1 x 1,Mobile highend: smartphone,1892724,Global Floodlight,1.0,1.0,0.0
2,2018-01-01,4476036,University of the West of England (UWE),10469350,UWE UG2017 FBL Campaign (033239),1489701,What Uni,(not set),(not set),(not set),1 x 1,Desktop,1821385,Course View,1.0,1.0,0.0
3,2018-01-01,4476036,University of the West of England (UWE),10469350,UWE UG2017 FBL Campaign (033239),1489701,What Uni,(not set),(not set),(not set),1 x 1,Desktop,1892724,Global Floodlight,6.0,6.0,0.0
4,2018-01-01,4476036,University of the West of England (UWE),10496824,UWE PG2017 MSc Urban and Rural Planning (033135),1503097,Google Display Network,75480110,Tracking Ad - 1x1,Tracking,1 x 1,Desktop,1892724,Global Floodlight,1.0,1.0,0.0


In [3]:
# The Teesside export needs 12 rows skipped to reach the column header.
# The ID columns are identifiers, not quantities: read them explicitly as strings so
# every value has the same type regardless of what pandas would infer from the file.
id_dtypes = {col: str for col in ['Advertiser ID', 'Campaign ID', 'Site ID (DCM)',
                                  'Creative ID', 'Activity ID']}
raw_data_t = pd.read_csv('Raw_data/Teesside/Pivigo_Tees_Conversion.csv', skiprows=12, dtype=id_dtypes)
# Removes last row, which has the sums of the conversions (positional slice)
raw_data_t = raw_data_t.iloc[:-1]
raw_data_t.head()

Unnamed: 0,Date,Advertiser ID,Advertiser,Campaign ID,Campaign,Site ID (DCM),Site (DCM),Creative ID,Creative,Creative Type,Placement Pixel Size,Platform Type,Activity ID,Activity,Total Conversions,Click-through Conversions,View-through Conversions
0,2018-01-01,5655503,Teesside University,10037325,DART Search,1317835,DART Search : Google : 18208,(not set),(not set),(not set),1 x 1,Desktop,3586872,SSE School Pages,0.02,0.02,0.0
1,2018-01-01,5655503,Teesside University,10037325,DART Search,1317835,DART Search : Google : 18208,(not set),(not set),(not set),1 x 1,Desktop,3586873,Homepage,7.03,7.03,0.0
2,2018-01-01,5655503,Teesside University,10037325,DART Search,1317835,DART Search : Google : 18208,(not set),(not set),(not set),1 x 1,Desktop,3587251,UG UCAS Application (Exit),1.0,1.0,0.0
3,2018-01-01,5655503,Teesside University,10037325,DART Search,1317835,DART Search : Google : 18208,(not set),(not set),(not set),1 x 1,Desktop,3588414,PG Homepage,1.04,1.04,0.0
4,2018-01-01,5655503,Teesside University,10037325,DART Search,1317835,DART Search : Google : 18208,(not set),(not set),(not set),1 x 1,Desktop,3590016,UG Homepage,1.36,1.36,0.0


In [4]:
# (rows, columns) of the UWE data
raw_data_u.shape

(157712, 17)

In [5]:
# (rows, columns) of the Teesside data
raw_data_t.shape

(651571, 17)

In [6]:
# Column dtypes and non-null counts of the Teesside data
raw_data_t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 651571 entries, 0 to 651570
Data columns (total 17 columns):
Date                         651571 non-null object
Advertiser ID                651571 non-null object
Advertiser                   651571 non-null object
Campaign ID                  651571 non-null object
Campaign                     651571 non-null object
Site ID (DCM)                651571 non-null object
Site (DCM)                   651571 non-null object
Creative ID                  651571 non-null object
Creative                     651571 non-null object
Creative Type                651571 non-null object
Placement Pixel Size         651571 non-null object
Platform Type                651571 non-null object
Activity ID                  651571 non-null object
Activity                     651571 non-null object
Total Conversions            651571 non-null float64
Click-through Conversions    651571 non-null float64
View-through Conversions     651571 non-null float64
dt

In [7]:
# Column dtypes and non-null counts of the UWE data
raw_data_u.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157712 entries, 0 to 157711
Data columns (total 17 columns):
Date                         157712 non-null object
Advertiser ID                157712 non-null object
Advertiser                   157712 non-null object
Campaign ID                  157712 non-null object
Campaign                     157712 non-null object
Site ID (DCM)                157712 non-null object
Site (DCM)                   157712 non-null object
Creative ID                  157712 non-null object
Creative                     157712 non-null object
Creative Type                157712 non-null object
Placement Pixel Size         157712 non-null object
Platform Type                157712 non-null object
Activity ID                  157712 non-null object
Activity                     157712 non-null object
Total Conversions            157712 non-null float64
Click-through Conversions    157712 non-null float64
View-through Conversions     157712 non-null float64
dt

Combine the two datasets by stacking them on top of each other.

UWE is on top and Teesside is on bottom

In [8]:
# Stack UWE (top) on Teesside (bottom). Both inputs carry their own 0-based index,
# so without ignore_index=True the combined frame would contain repeated index
# labels; ignore_index=True gives it a fresh 0..n-1 index straight away.
raw_data_combine = pd.concat([raw_data_u, raw_data_t], axis=0, ignore_index=True)

In [9]:
# Display the combined frame
raw_data_combine

Unnamed: 0,Date,Advertiser ID,Advertiser,Campaign ID,Campaign,Site ID (DCM),Site (DCM),Creative ID,Creative,Creative Type,Placement Pixel Size,Platform Type,Activity ID,Activity,Total Conversions,Click-through Conversions,View-through Conversions
0,2018-01-01,4476036,University of the West of England (UWE),10379722,UWE UG2017 HAS Campaign (032996),1434033,The Complete University Guide,(not set),(not set),(not set),1 x 1,Mobile highend: smartphone,1821385,Course View,1.00,1.00,0.00
1,2018-01-01,4476036,University of the West of England (UWE),10379722,UWE UG2017 HAS Campaign (032996),1434033,The Complete University Guide,(not set),(not set),(not set),1 x 1,Mobile highend: smartphone,1892724,Global Floodlight,1.00,1.00,0.00
2,2018-01-01,4476036,University of the West of England (UWE),10469350,UWE UG2017 FBL Campaign (033239),1489701,What Uni,(not set),(not set),(not set),1 x 1,Desktop,1821385,Course View,1.00,1.00,0.00
3,2018-01-01,4476036,University of the West of England (UWE),10469350,UWE UG2017 FBL Campaign (033239),1489701,What Uni,(not set),(not set),(not set),1 x 1,Desktop,1892724,Global Floodlight,6.00,6.00,0.00
4,2018-01-01,4476036,University of the West of England (UWE),10496824,UWE PG2017 MSc Urban and Rural Planning (033135),1503097,Google Display Network,75480110,Tracking Ad - 1x1,Tracking,1 x 1,Desktop,1892724,Global Floodlight,1.00,1.00,0.00
5,2018-01-01,4476036,University of the West of England (UWE),20214872,Decision 2018 (034386),1396136,SMRS,92414748,11559-02-UG-Decision-Retargetting-E-300x250px,Display,300 x 250,Desktop,1892724,Global Floodlight,4.00,0.00,4.00
6,2018-01-01,4476036,University of the West of England (UWE),20214872,Decision 2018 (034386),1396136,SMRS,92414748,11559-02-UG-Decision-Retargetting-E-300x250px,Display,300 x 250,Mobile highend: smartphone,1821385,Course View,1.00,0.00,1.00
7,2018-01-01,4476036,University of the West of England (UWE),20214872,Decision 2018 (034386),1396136,SMRS,92414748,11559-02-UG-Decision-Retargetting-E-300x250px,Display,300 x 250,Mobile highend: smartphone,1892724,Global Floodlight,7.00,0.00,7.00
8,2018-01-01,4476036,University of the West of England (UWE),20214872,Decision 2018 (034386),1396136,SMRS,92415009,11559-02-UG-Decision-Retargetting-B-160x600px,Display,160 x 600,Desktop,1892724,Global Floodlight,3.00,0.00,3.00
9,2018-01-01,4476036,University of the West of England (UWE),20214872,Decision 2018 (034386),1396136,SMRS,92421730,11559-02-UG-Decision-Retargetting-F-320x50px,Display,320 x 50,Mobile highend: smartphone,1892724,Global Floodlight,4.00,0.00,4.00


In [10]:
# Advertiser IDs are identifiers: store them as str
raw_data_combine = raw_data_combine.astype({'Advertiser ID': str})

In [11]:
# Campaign IDs are identifiers: store them as str
raw_data_combine = raw_data_combine.astype({'Campaign ID': str})

In [12]:
# Site IDs (DCM) are identifiers: store them as str
raw_data_combine = raw_data_combine.astype({'Site ID (DCM)': str})

'Creative ID' values are already all strings, since most of them are '(not set)' (which means the creative is a hyperlink), so this column needs no conversion.

'Creative Type' are: <br>
'(not set)' which is a hyperlink <br>
'Tracking' which is social media <br>
'Display' which is an ad/banner

In [13]:
# Activity IDs are identifiers: store them as str
raw_data_combine = raw_data_combine.astype({'Activity ID': str})

In [14]:
# Distinct Advertiser IDs, in order of first appearance
pd.unique(raw_data_combine['Advertiser ID'])

array(['4476036', '5655503'], dtype=object)

## Check for inconsistencies between ID's and names

To check whether there is a 1:1 mapping between Activity and Activity ID:<br>
count the number of distinct Activity IDs for each Activity name and flag the names that have more than one

In [15]:
# True for every Activity name that is attached to more than one Activity ID
act_bool = raw_data_combine.groupby('Activity')['Activity ID'].nunique().gt(1)

In [16]:
# Activity IDs behind each Activity name flagged in act_bool
ids_per_activity = raw_data_combine.groupby('Activity')['Activity ID'].unique()
ids_per_activity.loc[act_bool]

Activity
Offline Conversion Tracking - Phone Call    [4945832, 6048352]
Prospectus Request                          [2975488, 3588126]
Name: Activity ID, dtype: object

Conclusion: two activities ('Offline Conversion Tracking - Phone Call' and 'Prospectus Request') each have 2 different Activity IDs for the same activity name (split by university)

In [17]:
# Number of rows per advertiser for the phone-call activity
is_phone_call = raw_data_combine['Activity'].eq('Offline Conversion Tracking - Phone Call')
raw_data_combine.loc[is_phone_call, 'Advertiser'].value_counts()

Teesside University                        5674
University of the West of England (UWE)    1833
Name: Advertiser, dtype: int64

Repeating the same process for the campaign and campaign ID

In [18]:
# True for every Campaign name that is attached to more than one Campaign ID
act_bool_c = raw_data_combine.groupby('Campaign')['Campaign ID'].nunique().gt(1)

In [19]:
# Number of Campaign names with more than one Campaign ID
int(act_bool_c.sum())

1

In [20]:
# Campaign IDs behind each Campaign name flagged in act_bool_c
ids_per_campaign = raw_data_combine.groupby('Campaign')['Campaign ID'].unique()
ids_per_campaign.loc[act_bool_c]

Campaign
DART Search    [8431696, 10037325]
Name: Campaign ID, dtype: object

In [21]:
# Number of rows per advertiser for the 'DART Search' campaign
is_dart_search = raw_data_combine['Campaign'].eq('DART Search')
raw_data_combine.loc[is_dart_search, 'Advertiser'].value_counts()

Teesside University                        19334
University of the West of England (UWE)     9352
Name: Advertiser, dtype: int64

Repeating the same process for the Advertiser and Advertiser ID

In [22]:
# True for every Advertiser name that is attached to more than one Advertiser ID
act_bool_a = raw_data_combine.groupby('Advertiser')['Advertiser ID'].nunique().gt(1)

In [23]:
# Number of Advertiser names with more than one Advertiser ID
int(act_bool_a.sum())

0

Repeating the same process for the Creative and Creative ID

In [24]:
# True for every Creative name that is attached to more than one Creative ID
act_bool_cre = raw_data_combine.groupby('Creative')['Creative ID'].nunique().gt(1)

In [25]:
# Number of Creative names with more than one Creative ID
int(act_bool_cre.sum())

148

In [26]:
# Creative IDs behind each Creative name flagged in act_bool_cre
ids_per_creative = raw_data_combine.groupby('Creative')['Creative ID'].unique()
ids_per_creative.loc[act_bool_cre]

Creative
010618a-Clearing-2018-BME-Burst-2-300x600px                                                  [104676165, 104692270]
010618a-Clearing-2018-Burst-2-Programmatic-Mobile-300x250                                    [104532740, 104692942]
010618a-Clearing-2018-Burst-2-Programmatic-Mobile-320x50                                     [104692009, 104532953]
010618a-Clearing-2018-Burst-2-Programmatic-Mobile-728x90                                     [104668083, 104531009]
080219-UCAS-Extra-Digital-Banners-Prospecting-v2-320x50                                      [112424195, 116439510]
120x600_open day1                                                                            [111493359, 113386409]
141118-UG-Digital-Banners-Post-November-Open-Day-300x250-A                                   [108882080, 109676858]
141118-UG-Digital-Banners-Post-November-Open-Day-300x250-B                                   [108884165, 109624969]
141118-UG-Digital-Banners-Post-November-Open-Day-TSR-300x250-A 

For Creative it is a bit more complicated: a single creative name may refer to several different Creative IDs, and this happens for many names (148 of them here).

Creative IDs split by Site ID and Advertiser ID

In [27]:
# Sites on which Creative ID 104676165 appears, with row counts
is_creative = raw_data_combine['Creative ID'].eq('104676165')
raw_data_combine.loc[is_creative, 'Site (DCM)'].value_counts()

SMRS    56
Name: Site (DCM), dtype: int64

In [28]:
# Number of distinct (non-missing) Creative IDs
len(raw_data_combine['Creative ID'].dropna().unique())

1978

# Round total conversions to nearest integer

In [29]:
# Only the total is rounded: the Click-through and View-through parts can each be
# below 0.5 on their own and would then round down individually
Tot_conv_round = raw_data_combine['Total Conversions'].round()

In [30]:
# Frequency of each rounded total-conversion value
Tot_conv_round.value_counts()

0.0        492330
1.0        163531
2.0         46157
3.0         20870
4.0         13252
5.0          8965
6.0          6775
7.0          5269
8.0          4130
9.0          3354
10.0         2813
11.0         2396
12.0         2122
13.0         1857
14.0         1585
15.0         1463
16.0         1307
17.0         1241
18.0         1134
19.0          982
20.0          922
21.0          838
22.0          752
23.0          728
24.0          621
25.0          580
26.0          564
27.0          498
28.0          498
29.0          463
            ...  
1314.0          1
5253.0          1
20595.0         1
20622.0         1
1289.0          1
1290.0          1
5163.0          1
5165.0          1
1376.0          1
1293.0          1
20691.0         1
20709.0         1
20715.0         1
5187.0          1
20749.0         1
1297.0          1
20801.0         1
1301.0          1
20837.0         1
1303.0          1
864.0           1
1305.0          1
20895.0         1
5225.0          1
5228.0    

In [31]:
# Overwrite the Total Conversions column with its rounded values
raw_data_combine = raw_data_combine.assign(**{'Total Conversions': Tot_conv_round})

In [32]:
# Frequency of each value in the Total Conversions column after the rounded values were assigned
raw_data_combine['Total Conversions'].value_counts()

0.0        492330
1.0        163531
2.0         46157
3.0         20870
4.0         13252
5.0          8965
6.0          6775
7.0          5269
8.0          4130
9.0          3354
10.0         2813
11.0         2396
12.0         2122
13.0         1857
14.0         1585
15.0         1463
16.0         1307
17.0         1241
18.0         1134
19.0          982
20.0          922
21.0          838
22.0          752
23.0          728
24.0          621
25.0          580
26.0          564
27.0          498
28.0          498
29.0          463
            ...  
1314.0          1
5253.0          1
20595.0         1
20622.0         1
1289.0          1
1290.0          1
5163.0          1
5165.0          1
1376.0          1
1293.0          1
20691.0         1
20709.0         1
20715.0         1
5187.0          1
20749.0         1
1297.0          1
20801.0         1
1301.0          1
20837.0         1
1303.0          1
864.0           1
1305.0          1
20895.0         1
5225.0          1
5228.0    

# Replace 0 x 0 with 1 x 1 in Placement Pixel Size

In [33]:
# Boolean mask of the rows whose placement size is recorded as '0 x 0'
zz_pix = raw_data_combine['Placement Pixel Size'].eq('0 x 0')

In [34]:
# Share of rows whose placement size is '0 x 0'. The mean of a boolean Series is
# the fraction of True values; this is computed vectorised instead of iterating
# over every row with the built-in sum(). float() keeps the result a plain float.
float(zz_pix.mean())

0.023800079823745216

In [35]:
# Rows flagged by zz_pix get '1 x 1'; every other row keeps its current size
raw_data_combine['Placement Pixel Size'] = np.where(
    zz_pix, '1 x 1', raw_data_combine['Placement Pixel Size'])

# Creating Merge ID

The previous version: <br>
Create a new column that combines the Advertiser ID, Campaign ID, Site ID, Creative Type, Pixel size, and Platform type into a unique ID called 'ID' in the 'clean' file <br>
<br>
The new version: <br>
Create a new column that combines the Advertiser ID, Campaign ID, Site ID, Creative ID, Creative Type, Pixel size, and Platform type into a unique ID called 'ID' in the 'clean' file. Note: Date is not part of the ID string built by the code below; it stays in its own 'Date' column <br>

Creates a string using the above columns

In [36]:
# Columns appended, in this order, after 'Advertiser ID' to form the merge ID
id_parts = ['Campaign ID',
            'Site ID (DCM)',
            'Creative ID',
            'Creative Type',
            'Placement Pixel Size',
            'Platform Type']
# Join the values row by row with '_' as separator
merge_id = raw_data_combine['Advertiser ID'].str.cat(raw_data_combine[id_parts], sep='_')

In [37]:
# Display the generated merge IDs
merge_id

0         4476036_10379722_1434033_(not set)_(not set)_1...
1         4476036_10379722_1434033_(not set)_(not set)_1...
2         4476036_10469350_1489701_(not set)_(not set)_1...
3         4476036_10469350_1489701_(not set)_(not set)_1...
4         4476036_10496824_1503097_75480110_Tracking_1 x...
5         4476036_20214872_1396136_92414748_Display_300 ...
6         4476036_20214872_1396136_92414748_Display_300 ...
7         4476036_20214872_1396136_92414748_Display_300 ...
8         4476036_20214872_1396136_92415009_Display_160 ...
9         4476036_20214872_1396136_92421730_Display_320 ...
10        4476036_20214872_1396136_92423983_Display_300 ...
11        4476036_20214872_1396136_92452574_Display_728 ...
12        4476036_20214872_1396136_92452574_Display_728 ...
13        4476036_20214872_1396136_93655123_Display_160 ...
14        4476036_20214872_1396136_93655126_Display_300 ...
15        4476036_20214872_1396136_93655126_Display_300 ...
16        4476036_20214872_1396136_93655

Create a column called 'ID' in the data frame and fill it with the merge_id values

In [38]:
# Append the merge ID as a new 'ID' column
raw_data_combine = raw_data_combine.assign(ID=merge_id)

Reset the index so that the UWE and Teesside rows share one sequential index with no repeated labels

In [39]:
# New sequential index; the previous labels are kept in a column named 'index'
raw_data_combine = raw_data_combine.reset_index(drop=False)

Delete the superfluous 'index' column created by the reset

In [40]:
# Remove the 'index' column
raw_data_combine = raw_data_combine.drop(columns='index')

In [41]:
# Display the combined frame, which now includes the 'ID' column
raw_data_combine

Unnamed: 0,Date,Advertiser ID,Advertiser,Campaign ID,Campaign,Site ID (DCM),Site (DCM),Creative ID,Creative,Creative Type,Placement Pixel Size,Platform Type,Activity ID,Activity,Total Conversions,Click-through Conversions,View-through Conversions,ID
0,2018-01-01,4476036,University of the West of England (UWE),10379722,UWE UG2017 HAS Campaign (032996),1434033,The Complete University Guide,(not set),(not set),(not set),1 x 1,Mobile highend: smartphone,1821385,Course View,1.0,1.00,0.00,4476036_10379722_1434033_(not set)_(not set)_1...
1,2018-01-01,4476036,University of the West of England (UWE),10379722,UWE UG2017 HAS Campaign (032996),1434033,The Complete University Guide,(not set),(not set),(not set),1 x 1,Mobile highend: smartphone,1892724,Global Floodlight,1.0,1.00,0.00,4476036_10379722_1434033_(not set)_(not set)_1...
2,2018-01-01,4476036,University of the West of England (UWE),10469350,UWE UG2017 FBL Campaign (033239),1489701,What Uni,(not set),(not set),(not set),1 x 1,Desktop,1821385,Course View,1.0,1.00,0.00,4476036_10469350_1489701_(not set)_(not set)_1...
3,2018-01-01,4476036,University of the West of England (UWE),10469350,UWE UG2017 FBL Campaign (033239),1489701,What Uni,(not set),(not set),(not set),1 x 1,Desktop,1892724,Global Floodlight,6.0,6.00,0.00,4476036_10469350_1489701_(not set)_(not set)_1...
4,2018-01-01,4476036,University of the West of England (UWE),10496824,UWE PG2017 MSc Urban and Rural Planning (033135),1503097,Google Display Network,75480110,Tracking Ad - 1x1,Tracking,1 x 1,Desktop,1892724,Global Floodlight,1.0,1.00,0.00,4476036_10496824_1503097_75480110_Tracking_1 x...
5,2018-01-01,4476036,University of the West of England (UWE),20214872,Decision 2018 (034386),1396136,SMRS,92414748,11559-02-UG-Decision-Retargetting-E-300x250px,Display,300 x 250,Desktop,1892724,Global Floodlight,4.0,0.00,4.00,4476036_20214872_1396136_92414748_Display_300 ...
6,2018-01-01,4476036,University of the West of England (UWE),20214872,Decision 2018 (034386),1396136,SMRS,92414748,11559-02-UG-Decision-Retargetting-E-300x250px,Display,300 x 250,Mobile highend: smartphone,1821385,Course View,1.0,0.00,1.00,4476036_20214872_1396136_92414748_Display_300 ...
7,2018-01-01,4476036,University of the West of England (UWE),20214872,Decision 2018 (034386),1396136,SMRS,92414748,11559-02-UG-Decision-Retargetting-E-300x250px,Display,300 x 250,Mobile highend: smartphone,1892724,Global Floodlight,7.0,0.00,7.00,4476036_20214872_1396136_92414748_Display_300 ...
8,2018-01-01,4476036,University of the West of England (UWE),20214872,Decision 2018 (034386),1396136,SMRS,92415009,11559-02-UG-Decision-Retargetting-B-160x600px,Display,160 x 600,Desktop,1892724,Global Floodlight,3.0,0.00,3.00,4476036_20214872_1396136_92415009_Display_160 ...
9,2018-01-01,4476036,University of the West of England (UWE),20214872,Decision 2018 (034386),1396136,SMRS,92421730,11559-02-UG-Decision-Retargetting-F-320x50px,Display,320 x 50,Mobile highend: smartphone,1892724,Global Floodlight,4.0,0.00,4.00,4476036_20214872_1396136_92421730_Display_320 ...


Output the data to csv, to the Clean_data/ directory 

In [42]:
# Write the cleaned data without the row index
clean_csv_path = 'Clean_data/Clean_conversion.csv'
raw_data_combine.to_csv(clean_csv_path, index=False)

Sanity check we can read in the file we created

There is no need to skip any header rows when reading the cleaned data in

In [43]:
# Read the cleaned file back with default settings
tmp = pd.read_csv(filepath_or_buffer='Clean_data/Clean_conversion.csv')

In [44]:
# Display the frame that was read back from the cleaned CSV
tmp

Unnamed: 0,Date,Advertiser ID,Advertiser,Campaign ID,Campaign,Site ID (DCM),Site (DCM),Creative ID,Creative,Creative Type,Placement Pixel Size,Platform Type,Activity ID,Activity,Total Conversions,Click-through Conversions,View-through Conversions,ID
0,2018-01-01,4476036,University of the West of England (UWE),10379722,UWE UG2017 HAS Campaign (032996),1434033,The Complete University Guide,(not set),(not set),(not set),1 x 1,Mobile highend: smartphone,1821385,Course View,1.0,1.00,0.00,4476036_10379722_1434033_(not set)_(not set)_1...
1,2018-01-01,4476036,University of the West of England (UWE),10379722,UWE UG2017 HAS Campaign (032996),1434033,The Complete University Guide,(not set),(not set),(not set),1 x 1,Mobile highend: smartphone,1892724,Global Floodlight,1.0,1.00,0.00,4476036_10379722_1434033_(not set)_(not set)_1...
2,2018-01-01,4476036,University of the West of England (UWE),10469350,UWE UG2017 FBL Campaign (033239),1489701,What Uni,(not set),(not set),(not set),1 x 1,Desktop,1821385,Course View,1.0,1.00,0.00,4476036_10469350_1489701_(not set)_(not set)_1...
3,2018-01-01,4476036,University of the West of England (UWE),10469350,UWE UG2017 FBL Campaign (033239),1489701,What Uni,(not set),(not set),(not set),1 x 1,Desktop,1892724,Global Floodlight,6.0,6.00,0.00,4476036_10469350_1489701_(not set)_(not set)_1...
4,2018-01-01,4476036,University of the West of England (UWE),10496824,UWE PG2017 MSc Urban and Rural Planning (033135),1503097,Google Display Network,75480110,Tracking Ad - 1x1,Tracking,1 x 1,Desktop,1892724,Global Floodlight,1.0,1.00,0.00,4476036_10496824_1503097_75480110_Tracking_1 x...
5,2018-01-01,4476036,University of the West of England (UWE),20214872,Decision 2018 (034386),1396136,SMRS,92414748,11559-02-UG-Decision-Retargetting-E-300x250px,Display,300 x 250,Desktop,1892724,Global Floodlight,4.0,0.00,4.00,4476036_20214872_1396136_92414748_Display_300 ...
6,2018-01-01,4476036,University of the West of England (UWE),20214872,Decision 2018 (034386),1396136,SMRS,92414748,11559-02-UG-Decision-Retargetting-E-300x250px,Display,300 x 250,Mobile highend: smartphone,1821385,Course View,1.0,0.00,1.00,4476036_20214872_1396136_92414748_Display_300 ...
7,2018-01-01,4476036,University of the West of England (UWE),20214872,Decision 2018 (034386),1396136,SMRS,92414748,11559-02-UG-Decision-Retargetting-E-300x250px,Display,300 x 250,Mobile highend: smartphone,1892724,Global Floodlight,7.0,0.00,7.00,4476036_20214872_1396136_92414748_Display_300 ...
8,2018-01-01,4476036,University of the West of England (UWE),20214872,Decision 2018 (034386),1396136,SMRS,92415009,11559-02-UG-Decision-Retargetting-B-160x600px,Display,160 x 600,Desktop,1892724,Global Floodlight,3.0,0.00,3.00,4476036_20214872_1396136_92415009_Display_160 ...
9,2018-01-01,4476036,University of the West of England (UWE),20214872,Decision 2018 (034386),1396136,SMRS,92421730,11559-02-UG-Decision-Retargetting-F-320x50px,Display,320 x 50,Mobile highend: smartphone,1892724,Global Floodlight,4.0,0.00,4.00,4476036_20214872_1396136_92421730_Display_320 ...


# Check effects of date aggregation

These next few lines are used to test the effect of aggregating over date

In [45]:
# Keys that identify one ID / activity combination for the date check.
# NOTE(review): 'Creative ID' is part of the merge ID built earlier but is not
# one of these grouping keys - confirm that this is intended.
date_check_keys = ['Advertiser ID',
                   'Campaign ID',
                   'Site ID (DCM)',
                   'Creative Type',
                   'Placement Pixel Size',
                   'Platform Type',
                   'Activity ID']
group_by_ID = raw_data_combine.groupby(date_check_keys)

In [46]:
# Flag every (group, date) pair whose date occurs more than once within its group
rows_per_date = group_by_ID['Date'].value_counts()
date_idx = rows_per_date.gt(1).to_numpy()

In [47]:
# Largest number of rows (ie conversions) sharing one date within the same group
group_by_ID['Date'].value_counts().to_numpy().max()

23

In [49]:
# Rate of repeated dates over total number of dates.
# The mean of the boolean flags in date_idx is that rate; it is computed
# vectorised rather than summing the flags one by one with the built-in sum().
date_idx.mean()


0.31613694441292944