# Import Packages

In [None]:
import os
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# List File Names

In [None]:
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# Read Direct ads.txt Records

Here we attempt to save a little bit of RAM by reading the file in chunks and filtering out reseller records as we go.

In [None]:
file_path = '/kaggle/input/wellknown-20210730/wk_ads_2021_07_30.csv'
# Read seller_id as text (as the sellers.json load does) so that every chunk
# yields the same dtype; per-chunk inference could otherwise turn numeric-looking
# ids into integers in some chunks and strings in others.
df_ads_chunks = pd.read_csv(file_path, chunksize=5_000_000, dtype={'seller_id': str})
# Keep only the DIRECT rows of each chunk, then concatenate once at the end.
# Concatenating inside the loop would re-copy all accumulated rows per chunk.
direct_chunks = [
    df_ads_chunk[df_ads_chunk['account_type'] == 'DIRECT']
    for df_ads_chunk in df_ads_chunks
]
# pd.concat raises on an empty list, so fall back to an empty frame.
df_direct = pd.concat(direct_chunks) if direct_chunks else pd.DataFrame()

## Drop account type column and add seller tag

In [None]:
# Separator used to join seller_id and ad_domain into a single pool key.
seller_split = '|@|'
# errors='ignore' keeps this cell re-runnable: after the first run the
# account_type column is already gone and a plain drop would raise KeyError.
df_direct = df_direct.drop(columns=['account_type'], errors='ignore')
# seller_tag identifies one seller account at one ad system.
df_direct['seller_tag'] = df_direct['seller_id'] + seller_split + df_direct['ad_domain']
# Some ads.txt files include the same account multiple times.
df_direct = df_direct.drop_duplicates()

In [None]:
# Display the de-duplicated DIRECT records, now carrying the seller_tag column.
df_direct

## Report numbers

In [None]:
# Print the number of distinct values for each key column of the DIRECT records.
unique_reports = [
    ('unique host domain: ', 'host_domain'),
    ('unique seller tag: ', 'seller_tag'),
    ('unique ad domain: ', 'ad_domain'),
]
for report_label, report_column in unique_reports:
    print(report_label, df_direct[report_column].nunique())

## Add shared direct pool size

In [None]:
# pool_size = number of df_direct rows that share a seller_tag, broadcast back
# onto every row of that group.
# NOTE(review): this equals the number of publishers only if each host_domain
# lists a given seller_tag once after de-duplication — TODO confirm.
df_direct['pool_size'] = df_direct.groupby('seller_tag')['seller_tag'].transform('size')

In [None]:
# Display the DIRECT records again, now including the pool_size column.
df_direct

# Read sellers.json Records

In [None]:
file_path = '/kaggle/input/wellknown-20210730/wk_sellers_2021_07_30.csv'
# Force the free-text seller columns to str so ids are never parsed as numbers.
seller_text_columns = {'seller_id': str, 'seller_name': str, 'seller_domain': str}
df_slr = (
    pd.read_csv(file_path, dtype=seller_text_columns)
    # Drop unused columns
    .drop(columns=['is_confidential', 'is_passthrough'])
    .assign(
        # Same key format as the seller_tag built for the ads.txt records.
        seller_tag=lambda d: d['seller_id'] + seller_split + d['ad_domain'],
        # Lowercase types
        seller_type=lambda d: d['seller_type'].str.lower(),
    )
    # There should not be duplicates, but some sellers.json files contain
    # duplicate seller_ids created in strange ways (for example, check out
    # seller_id 146595 at revcontent.com). keep=False discards every row of
    # such an ambiguous seller_tag instead of picking one arbitrarily.
    .drop_duplicates(subset=['seller_tag'], keep=False)
)

In [None]:
# Display the cleaned sellers.json records (one row per unambiguous seller_tag).
df_slr

# Create pools DataFrame and Merge with Sellers

In [None]:
# One row per pool (seller_tag) taken from the DIRECT records ...
direct_pools = df_direct.drop_duplicates(subset=['seller_tag'])[
    ['seller_id', 'ad_domain', 'seller_tag', 'pool_size']
]
# ... enriched with whatever sellers.json says about that seller_tag.
# seller_id / ad_domain are dropped on the right to avoid duplicated columns.
seller_details = df_slr.drop(columns=['seller_id', 'ad_domain'])
df_pools = direct_pools.merge(seller_details, on='seller_tag', how='left')

# Display label: the seller name when known, otherwise the raw seller id.
df_pools['seller_id_or_name'] = df_pools['seller_name'].fillna(df_pools['seller_id'])

# Set null string fields to 'unknown'
for text_column in ['seller_type', 'seller_name', 'seller_domain', 'seller_id_or_name']:
    df_pools[text_column] = df_pools[text_column].fillna('unknown')

df_pools['log_pool_size'] = np.log10(df_pools['pool_size'])

df_pools

# Record mismatch

## How many direct ads.txt records can be matched to sellers.json records?

In [None]:
# fraction of unmatched direct records
# Sum the sizes of all pools whose seller_type is 'unknown' and divide by the
# total number of DIRECT records.
is_unknown_type = df_pools['seller_type'] == 'unknown'
df_pools.loc[is_unknown_type, 'pool_size'].sum() / len(df_direct)

In [None]:
# fraction of unmatched pools
# Share of pools (rows of df_pools) whose seller_type is 'unknown'.
unknown_pools = df_pools.loc[df_pools['seller_type'] == 'unknown']
len(unknown_pools) / len(df_pools)

## How many direct ads.txt records are matched with seller_type `intermediary`?

In [None]:
# Fraction of DIRECT records that belong to pools typed 'intermediary'.
is_intermediary = df_pools['seller_type'] == 'intermediary'
df_pools.loc[is_intermediary, 'pool_size'].sum() / len(df_direct)

In [None]:
# Share of pools (rows of df_pools) typed 'intermediary'.
intermediary_pools = df_pools.loc[df_pools['seller_type'] == 'intermediary']
len(intermediary_pools) / len(df_pools)

# The size of shared direct pools

In [None]:
# Shape of the subset of pools that contain more than one DIRECT record.
is_shared_pool = df_pools['pool_size'] > 1
df_pools.loc[is_shared_pool].shape

In [None]:
def _bins_from_edges(labels, edges):
    """Pair each label with the (lower, upper) interval between consecutive edges."""
    return list(zip(labels, zip(edges[:-1], edges[1:])))


# Every bin is (name, (lower, upper)). Edges sit on half-integers so that a
# whole-number pool size always falls strictly inside a single bin.
# NOTE(review): the top edge is 50000.5, so any pool larger than 50,000 would
# fall outside every bin — confirm against the data.

# Fine-grained bins for shared pools; the singleton bin ('1', (0.5, 1.5)) is
# deliberately left out.
bins_1 = _bins_from_edges(
    ['2', '3', '4', '5', '6-10', '11-50', '51-100', '101-500',
     '501-1k', '1k-5k', '5k-10k', '10k-50k'],
    [1.5, 2.5, 3.5, 4.5, 5.5, 10.5, 50.5, 100.5,
     500.5, 1000.5, 5000.5, 10000.5, 50000.5],
)

# Coarse bins including singleton pools.
bins_2 = _bins_from_edges(
    ['1', '2-100', '101-1k', '1k+'],
    [0.5, 1.5, 100.5, 1000.5, 50000.5],
)

# Coarse bins for shared pools only.
bins_3 = _bins_from_edges(
    ['2-100', '101-1k', '1k+'],
    [1.5, 100.5, 1000.5, 50000.5],
)

# Medium-grained bins for shared pools only.
bins_4 = _bins_from_edges(
    ['2-50', '51-100', '101-500', '501-1k', '1k+'],
    [1.5, 50.5, 100.5, 500.5, 1000.5, 50000.5],
)

# Binning used by the cells below; switch to bins_2, bins_3 or bins_4 to
# change the granularity.
bins = bins_1

In [None]:
# For every bin, count the pools whose size lies strictly between the bin
# edges and add up the DIRECT records those pools contain.
pool_sizes = df_pools['pool_size']
records = []
for bin_num, (bin_name, (lower_edge, upper_edge)) in enumerate(bins):
    in_bin = (pool_sizes > lower_edge) & (pool_sizes < upper_edge)
    records.append({
        'bin_num': bin_num,
        'bin_name': bin_name,
        'num_pools': in_bin.sum(),
        'num_records': pool_sizes[in_bin].sum(),
    })
df_hist = pd.DataFrame.from_records(records)

In [None]:
# Order the bins and append running totals of pools and records.
df_hist = df_hist.sort_values('bin_num').assign(
    cumu_pools=lambda d: d['num_pools'].cumsum(),
    cumu_records=lambda d: d['num_records'].cumsum(),
)
df_hist

In [None]:
# Cumulative distribution of shared pools (pool_size > 1), ordered by size:
# rank / pool_* track the share of pools, records* the share of DIRECT records.
df_line = (
    df_pools[['pool_size']]
    .sort_values('pool_size')
    .loc[lambda d: d['pool_size'] > 1]
    .assign(
        rank=lambda d: np.arange(1, len(d) + 1),
        pool_frac=lambda d: d['rank'] / len(d),
        pool_perc=lambda d: d['pool_frac'] * 100,
        records=lambda d: d['pool_size'].cumsum(),
        # The last cumulative value is the total number of records.
        records_frac=lambda d: d['records'] / d['records'].iloc[-1],
        records_perc=lambda d: d['records_frac'] * 100,
    )
)

df_line

In [None]:
# 2x2 figure: binned bar charts on the left, cumulative curves on the right;
# the top row is about pools, the bottom row about DIRECT records.
fig = make_subplots(rows=2, cols=2, horizontal_spacing=0.1, vertical_spacing=0.05, shared_xaxes=True)
color_scale = px.colors.qualitative.D3
# Monochrome figure; use color_scale[0] / color_scale[1] for a coloured one.
bar_color = 'black'
line_color = 'black'

fig.add_trace(
    go.Bar(
        x=df_hist['bin_name'],
        y=df_hist['num_pools'],
        text=df_hist['num_pools'],
        textposition='auto',
        name='',
        marker_color=bar_color,
    ),
    row=1, col=1,
)

# We subsample the later part of the continuous line plot, as the log scale
# compresses that part: keep the first 20 rows, then every 40th row.
# The final row is always kept so that the cumulative curves reach the largest
# pool and 100% (a plain [20:-1:40] slice always drops it).
n_line_rows = df_line.shape[0]
keep_positions = list(range(min(20, n_line_rows))) + list(range(20, n_line_rows, 40))
if n_line_rows > 20 and keep_positions[-1] != n_line_rows - 1:
    keep_positions.append(n_line_rows - 1)
df_plt = df_line.iloc[keep_positions]

fig.add_trace(
    go.Scatter(
        x=df_plt['pool_size'],
        y=df_plt['pool_perc'],
        name='',
        marker_color=line_color,
    ),
    row=1, col=2,
)

fig.add_trace(
    go.Bar(
        x=df_hist['bin_name'],
        y=df_hist['num_records'],
        text=df_hist['num_records'],
        textposition='auto',
        name='',
        marker_color=bar_color,
    ),
    row=2, col=1,
)

fig.add_trace(
    go.Scatter(
        x=df_plt['pool_size'],
        y=df_plt['records_perc'],
        name='',
        marker_color=line_color,
    ),
    row=2, col=2,
)

# Only the bottom row carries x-axis titles.
x_axis_name = 'Pool Size (Number of Publishers)'
fig.update_yaxes(type="linear", title='Number of Pools', row=1, col=1)

fig.update_xaxes(type='log', row=1, col=2)
fig.update_yaxes(type="linear", title='Cumulative % of Pools', row=1, col=2)

fig.update_xaxes(title=x_axis_name, row=2, col=1)
fig.update_yaxes(type="linear", title='Number of Records', row=2, col=1)

fig.update_xaxes(type='log', title=x_axis_name, row=2, col=2)
fig.update_yaxes(type="linear", title='Cumulative % of Records', row=2, col=2)

fig.update_layout(
    showlegend=False,
    title='Shared Direct Sales Pool Sizes',
    font={'size': 14},
    height=900,
)

# Save a standalone HTML copy next to the notebook, then render inline.
fig_name = 'shared_direct_pool_size'
fig.write_html(fig_name + '.html')
fig.show()

# Vertical Runs?

## 796 - Taboola and TownNews

In [None]:
# Select the run of pools that all contain exactly 796 DIRECT records.
has_796_records = df_pools['pool_size'] == 796
df_taboola = df_pools.loc[has_796_records]
df_taboola

In [None]:
# Number of size-796 pools per ad system (ad_domain).
df_taboola['ad_domain'].value_counts()

In [None]:
# Number of size-796 pools whose seller_name contains 'TownNews'.
df_taboola['seller_name'].str.contains('TownNews').sum()

### Are the same publishers in all of these pools?

In [None]:
# Find all publishers that include a reference pool as direct.
ref_seller_tag = '1001|@|taboola.com'
ref_pubs = set(df_direct[df_direct['seller_tag'] == ref_seller_tag]['host_domain'].values)
print('Reference pool {}, {} publishers\n'.format(ref_seller_tag, len(ref_pubs)))
print('seller_tag\t\tpercent_overlap')
# Check against the first 20 pools.
for indx, row in df_taboola.head(20).iterrows():
    # Find all publishers that include this pool as direct.
    pubs = set(df_direct[df_direct['seller_tag'] == row['seller_tag']]['host_domain'].values)
    # Check this pool's publishers are the same as the reference pool.
    print('{}\t{}'.format(row['seller_tag'], 1 - len(ref_pubs-pubs) / len(ref_pubs)))

## ~4745 - Cafe Media

In [None]:
# Pools whose size is within 20 of 4745 (exclusive), smallest first.
distance_from_4745 = (df_pools['pool_size'] - 4745).abs()
df_cafe = df_pools.loc[distance_from_4745 < 20].sort_values('pool_size')
df_cafe

### Are the same publishers in all of these pools?

In [None]:
# Find all publishers that include a reference pool as direct.
ref_seller_tag = '18727|@|sekindo.com'
ref_pubs = set(df_direct[df_direct['seller_tag'] == ref_seller_tag]['host_domain'].values)
print('Reference pool {}, {} publishers\n'.format(ref_seller_tag, len(ref_pubs)))
print('seller_tag\t\tpercent_overlap')
# Check against the pools.
for indx, row in df_cafe.iterrows():
    # Find all publishers that include this pool as direct.
    pubs = set(df_direct[df_direct['seller_tag'] == row['seller_tag']]['host_domain'].values)
    # Check this pool's publishers are the same as the reference pool.
    print('{}\t{}'.format(row['seller_tag'], 1 - len(ref_pubs-pubs) / len(ref_pubs)))

# Treemaps

In [None]:
# Shared treemap configuration: colours per seller type, hover text, margins
# and a small treemap that doubles as the colour legend.
color_scale = px.colors.qualitative.D3
color_map = {
    'publisher': color_scale[0],
    'intermediary': color_scale[1],
    'both': color_scale[4],
    'unknown': color_scale[3],
}
# customdata[0..2] are seller_type, seller_name and seller_domain, in the
# order the plotting cells pass them.
hovertemplate = (
    '<b>%{label} </b> <br>    '
    'Seller Type: %{customdata[0]} <br>    '
    'Seller Name: %{customdata[1]} <br>    '
    'Seller Domain: %{customdata[2]} <br>    '
    'Pool Size: %{value} <br>    '
)

legend_name = 'seller type'
legend_strings = ['publisher', 'intermediary', 'both', 'unknown']
margin = dict(l=5, r=5, t=0, b=35)
lgnd_treemap = go.Treemap(
    labels=[legend_name] + legend_strings,
    # One parent entry per legend tile; derived from the list so the two
    # cannot drift apart if a seller type is added or removed.
    parents=[''] + [legend_name] * len(legend_strings),
    marker_colors=['lightgray'] + [color_map[el] for el in legend_strings],
    insidetextfont={'size': 16},
    outsidetextfont={"size": 16},
    name='',
)

In [None]:
def get_plot_data(df_plt, root_name='', colors=None):
    """Flatten pools into the node table used to draw a three-level treemap.

    The hierarchy is root -> ad_domain -> seller_tag. Parent nodes carry the
    sum of their children's pool sizes, so the table suits a treemap drawn
    with ``branchvalues='total'``.

    Parameters
    ----------
    df_plt : pandas.DataFrame
        One row per pool with columns ``ad_domain``, ``pool_size``,
        ``seller_tag``, ``seller_type``, ``seller_id_or_name``,
        ``seller_name`` and ``seller_domain``.
    root_name : str, default ''
        Label of the single root node.
    colors : mapping, optional
        Maps each seller type to a marker colour. Defaults to the
        notebook-level ``color_map``.

    Returns
    -------
    pandas.DataFrame
        Columns ``labels``, ``parents``, ``values``, ``marker_colors``,
        ``text``, ``seller_type``, ``seller_name`` and ``seller_domain``.
        Rows are ordered root first, then one row per ad domain, then one row
        per pool in the order of ``df_plt``.

    Raises
    ------
    KeyError
        If a ``seller_type`` value has no entry in ``colors``.
    """
    if colors is None:
        # Resolved at call time so the notebook-level palette stays the default.
        colors = color_map

    # Total pool size per ad domain: the values of the middle layer.
    per_domain = (
        df_plt.groupby('ad_domain')['pool_size'].sum().to_frame('pool_size').reset_index()
    )
    domain_names = per_domain['ad_domain'].to_list()
    n_domains = per_domain.shape[0]

    # Root and ad-domain nodes have no seller metadata of their own.
    no_metadata = [''] * (1 + n_domains)

    return pd.DataFrame({
        'labels': [root_name] + domain_names + df_plt['seller_tag'].to_list(),
        'parents': [''] + [root_name] * n_domains + df_plt['ad_domain'].to_list(),
        'values': (
            [per_domain['pool_size'].sum()]
            + per_domain['pool_size'].to_list()
            + df_plt['pool_size'].to_list()
        ),
        'marker_colors': (
            ['white']
            + ['lightgray'] * n_domains
            + [colors[seller_type] for seller_type in df_plt['seller_type']]
        ),
        'text': [root_name] + domain_names + df_plt['seller_id_or_name'].to_list(),
        'seller_type': no_metadata + df_plt['seller_type'].to_list(),
        'seller_name': no_metadata + df_plt['seller_name'].to_list(),
        'seller_domain': no_metadata + df_plt['seller_domain'].to_list(),
    })

# The Largest Shared Direct Pools

In [None]:
# Largest pools first, so that df_pools.head(n) returns the n biggest pools.
df_pools = df_pools.sort_values('pool_size', ascending=False)

In [None]:
# First 500 rows of df_pools (the 500 largest pools once df_pools has been
# sorted by descending pool_size).
df_plt = df_pools.head(500)
df_tree = get_plot_data(df_plt, root_name='root')
# Column order must match customdata[0..2] in hovertemplate.
customdata = df_tree[['seller_type', 'seller_name', 'seller_domain']].values

# Layout: a thin legend strip spanning the two middle columns, above a
# full-width treemap.
legend_row = [None, {'type': 'treemap', 'colspan': 2}, None, None]
treemap_row = [{'type': 'treemap', 'colspan': 4}, None, None, None]
fig = make_subplots(
    rows=2,
    cols=4,
    row_heights=[0.08, 0.92],
    vertical_spacing=0.02,
    specs=[legend_row, treemap_row],
)

pool_treemap = go.Treemap(
    labels=df_tree['labels'],
    parents=df_tree['parents'],
    values=df_tree['values'],
    marker_colors=df_tree['marker_colors'],
    text=df_tree['text'],
    customdata=customdata,
    # Parent values already include their children.
    branchvalues='total',
    hovertemplate=hovertemplate,
    texttemplate='%{text}',
    insidetextfont={'size': 16},
    outsidetextfont={"size": 16},
    name='',
)
fig.add_trace(lgnd_treemap, row=1, col=2)
fig.add_trace(pool_treemap, row=2, col=1)

fig.update_layout(height=1200, margin=margin)

# Render inline, then save a standalone HTML copy.
fig.show()
fig_name = 'treemap_shared_direct_top500'
fig.write_html(fig_name + '.html')

# Focus on Freewheel.tv

In [None]:
# freewheel.tv pools with more than 100 DIRECT records.
is_freewheel = df_pools['ad_domain'] == 'freewheel.tv'
is_large_pool = df_pools['pool_size'] > 100
df_plt = df_pools[is_freewheel & is_large_pool]

df_tree = get_plot_data(df_plt, root_name='root')
# Column order must match customdata[0..2] in hovertemplate.
customdata = df_tree[['seller_type', 'seller_name', 'seller_domain']].values

# Layout: a thin legend strip spanning the two middle columns, above a
# full-width treemap.
legend_row = [None, {'type': 'treemap', 'colspan': 2}, None, None]
treemap_row = [{'type': 'treemap', 'colspan': 4}, None, None, None]
fig = make_subplots(
    rows=2,
    cols=4,
    row_heights=[0.06, 0.94],
    vertical_spacing=0.01,
    specs=[legend_row, treemap_row],
)

pool_treemap = go.Treemap(
    labels=df_tree['labels'],
    parents=df_tree['parents'],
    values=df_tree['values'],
    marker_colors=df_tree['marker_colors'],
    text=df_tree['text'],
    customdata=customdata,
    # Parent values already include their children.
    branchvalues='total',
    hovertemplate=hovertemplate,
    texttemplate='%{text}',
    insidetextfont={'size': 16},
    outsidetextfont={"size": 16},
    name='',
)
fig.add_trace(lgnd_treemap, row=1, col=2)
fig.add_trace(pool_treemap, row=2, col=1)

fig.update_layout(height=1200, margin=margin)

# Render inline, then save a standalone HTML copy.
fig.show()
fig_name = 'treemap_shared_direct_freewheeltv'
fig.write_html(fig_name + '.html')

# Focus on Taboola

In [None]:
# taboola.com pools with more than 100 DIRECT records.
is_taboola = df_pools['ad_domain'] == 'taboola.com'
is_large_pool = df_pools['pool_size'] > 100
df_plt = df_pools[is_taboola & is_large_pool]

df_tree = get_plot_data(df_plt, root_name='root')
# Column order must match customdata[0..2] in hovertemplate.
customdata = df_tree[['seller_type', 'seller_name', 'seller_domain']].values

# Layout: a thin legend strip spanning the two middle columns, above a
# full-width treemap.
legend_row = [None, {'type': 'treemap', 'colspan': 2}, None, None]
treemap_row = [{'type': 'treemap', 'colspan': 4}, None, None, None]
fig = make_subplots(
    rows=2,
    cols=4,
    row_heights=[0.06, 0.94],
    vertical_spacing=0.01,
    specs=[legend_row, treemap_row],
)

pool_treemap = go.Treemap(
    labels=df_tree['labels'],
    parents=df_tree['parents'],
    values=df_tree['values'],
    marker_colors=df_tree['marker_colors'],
    text=df_tree['text'],
    customdata=customdata,
    # Parent values already include their children.
    branchvalues='total',
    hovertemplate=hovertemplate,
    texttemplate='%{text}',
    insidetextfont={'size': 16},
    outsidetextfont={"size": 16},
    name='',
)
fig.add_trace(lgnd_treemap, row=1, col=2)
fig.add_trace(pool_treemap, row=2, col=1)

fig.update_layout(height=1200, margin=margin)

# Render inline, then save a standalone HTML copy.
fig.show()
fig_name = 'treemap_shared_direct_taboola'
fig.write_html(fig_name + '.html')

# Focus on Seller Type = Publisher

In [None]:
# Pools with more than 1,000 DIRECT records whose seller_type is 'publisher'.
is_very_large_pool = df_pools['pool_size'] > 1_000
is_publisher = df_pools['seller_type'].isin(['publisher'])
df_plt = df_pools[is_very_large_pool & is_publisher]
df_tree = get_plot_data(df_plt, root_name='root')
# Column order must match customdata[0..2] in hovertemplate.
customdata = df_tree[['seller_type', 'seller_name', 'seller_domain']].values

# Layout: a thin legend strip spanning the two middle columns, above a
# full-width treemap.
legend_row = [None, {'type': 'treemap', 'colspan': 2}, None, None]
treemap_row = [{'type': 'treemap', 'colspan': 4}, None, None, None]
fig = make_subplots(
    rows=2,
    cols=4,
    row_heights=[0.08, 0.92],
    vertical_spacing=0.02,
    specs=[legend_row, treemap_row],
)

pool_treemap = go.Treemap(
    labels=df_tree['labels'],
    parents=df_tree['parents'],
    values=df_tree['values'],
    marker_colors=df_tree['marker_colors'],
    text=df_tree['text'],
    customdata=customdata,
    # Parent values already include their children.
    branchvalues='total',
    hovertemplate=hovertemplate,
    texttemplate='%{text}',
    insidetextfont={'size': 16},
    outsidetextfont={"size": 16},
    name='',
)
fig.add_trace(lgnd_treemap, row=1, col=2)
fig.add_trace(pool_treemap, row=2, col=1)

fig.update_layout(height=1200, margin=margin)

# Render inline, then save a standalone HTML copy.
fig.show()
fig_name = 'treemap_shared_direct_pub'
fig.write_html(fig_name + '.html')

# Focus on Breitbart and RT

In [None]:
# DIRECT records published by either of the two hosts of interest.
focus_hosts = ['breitbart.com', 'www.rt.com']
on_focus_hosts = df_direct['host_domain'].isin(focus_hosts)
# Keep seller tags that occur more than once across those records.
# NOTE(review): presumably this means "listed by both hosts"; that holds only
# if a host never lists the same seller_tag twice — TODO confirm.
tag_counts = df_direct[on_focus_hosts].groupby('seller_tag').size()
repeated_tags = tag_counts[tag_counts > 1].index
df_plt = df_pools[df_pools['seller_tag'].isin(repeated_tags)]

df_tree = get_plot_data(df_plt, root_name='root')
# Column order must match customdata[0..2] in hovertemplate.
customdata = df_tree[['seller_type', 'seller_name', 'seller_domain']].values

# Layout: a thin legend strip spanning the two middle columns, above a
# full-width treemap.
legend_row = [None, {'type': 'treemap', 'colspan': 2}, None, None]
treemap_row = [{'type': 'treemap', 'colspan': 4}, None, None, None]
fig = make_subplots(
    rows=2,
    cols=4,
    row_heights=[0.08, 0.92],
    vertical_spacing=0.02,
    specs=[legend_row, treemap_row],
)

pool_treemap = go.Treemap(
    labels=df_tree['labels'],
    parents=df_tree['parents'],
    values=df_tree['values'],
    marker_colors=df_tree['marker_colors'],
    text=df_tree['text'],
    customdata=customdata,
    # Parent values already include their children.
    branchvalues='total',
    hovertemplate=hovertemplate,
    texttemplate='%{text}',
    insidetextfont={'size': 16},
    outsidetextfont={"size": 16},
    name='',
)
fig.add_trace(lgnd_treemap, row=1, col=2)
fig.add_trace(pool_treemap, row=2, col=1)

fig.update_layout(height=1200, margin=margin)

# Render inline, then save a standalone HTML copy.
fig.show()
fig_name = 'treemap_shared_direct_breitbart_rt'
fig.write_html(fig_name + '.html')

In [None]:
# Display the pools behind the most recently built treemap (df_plt is
# reassigned by every plotting cell above).
df_plt