## Description:

This file contains:
* Reading in raw input data from SQL tables. Exhaustive list of features contained in this raw input data is mentioned below.

In [1]:
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.2f' % x)
import numpy as np
from google.cloud import bigquery
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')
import joblib
import warnings
warnings.filterwarnings('ignore') 
import re

import datetime
import plotly.express as px

bq_client = bigquery.Client()

In [2]:
import google.auth
from google.cloud import bigquery
from google.cloud import bigquery_storage
import time

bqclient = bigquery.Client()
bqstorageclient = bigquery_storage.BigQueryReadClient()

In [3]:
def convert_time(time):
    return datetime.datetime.fromtimestamp(time).strftime('%Y-%m-%d %H:%M:%S')

### Data from SQL Tables

In [4]:
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.sm_subs_ga`
"""

subs_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

subs_data["subscription_status"] = "subscriber"

# drop unnecessary col & rename
subs_data.drop(['user_id_uid', 'resource_id_rid'], axis=1, inplace=True)

subs_data.rename(columns={'ga_pianoId': 
                          'piano_id'}, 
                 inplace=True)

print(subs_data.shape)

print("Unique unlimited subscribers: ", 
      len(subs_data.piano_id.unique()))

print("Unique unlimited fullvids: ", 
      len(subs_data.GA_fullVisitorId.unique()), "\n")

--- 10.377596378326416 seconds ---
(6556736, 23)
Unique unlimited subscribers:  41757
Unique unlimited fullvids:  97947 



In [5]:
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.sm_nonsubs_ga`
"""

nonsubs_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

nonsubs_data["subscription_status"] = "non_subscriber"

print("Before:", nonsubs_data.shape)

print("Unique unlimited fullvids: ", 
      len(nonsubs_data.GA_fullVisitorId.unique()), 
      "\n")

# some suspicious fvids have no pianoID but their dfpZone has 'subscriber' in it - remove them

nonsubs_data.GA_dfpNewZone = nonsubs_data.GA_dfpNewZone.fillna('none')

suspicious_fvid = nonsubs_data[nonsubs_data.GA_dfpNewZone.str.contains('/subscriber/')].GA_fullVisitorId.unique()

nonsubs_data = nonsubs_data[~nonsubs_data.GA_fullVisitorId.isin(suspicious_fvid)]

print("After removing ", 
      len(suspicious_fvid), 
      "suspicious fvids:", nonsubs_data.shape)

print("Unique unlimited fullvids: ", len(nonsubs_data.GA_fullVisitorId.unique()), "\n")

--- 6.170965671539307 seconds ---
Before removing suspicious fvids: (1192679, 23)
Unique unlimited fullvids:  410000 

After removing  10 suspicious fvids: (1192343, 23)
Unique unlimited fullvids:  409990 



In [7]:
df = pd.concat([subs_data, nonsubs_data])

print("Shape: ", df.shape)

# fill na
df.GA_cmsNaturalId = df.GA_cmsNaturalId.fillna('None') 

# short list device OS
shortlisted_os = ["android", "ios", "macintosh", "windows"]

df["deviceOS"] = np.where(df["GA_deviceOperatingSystem"].isin(shortlisted_os), 
                          df["GA_deviceOperatingSystem"], 
                          "other")

print("total fvids in df: ", 
      len(df.GA_fullVisitorId.unique()), "\n")

df[['GA_fullVisitorId', 'subscription_status']].drop_duplicates(keep='first').subscription_status.value_counts()

Shape:  (7749079, 23)
total fvids in df:  507937 



non_subscriber    409990
subscriber         97947
Name: subscription_status, dtype: int64

In [8]:
df.head()

Unnamed: 0,piano_id,GA_fullVisitorId,GA_visitStartTime,GA_date,GA_pagePath,GA_dfpNewZone,GA_visitNumber,GA_pageViews,GA_scrollDepth,timeOnPage,...,GA_deviceCategory,GA_deviceBrowser,GA_country,GA_referralGroup,GA_primaryChannel,GA_primarySection,tier1,tier2,subscription_status,deviceOS
0,pniwcibyyqmkjv6,296481529627831661,1612917988,2021-02-09,/sites/danalexander/2021/01/19/trumps-cash-str...,article/masthead/default/standard,24,1,,4.0,...,mobile,safari,united states,newsletter,billionaires,none,News and Politics,Politics,subscriber,ios
1,pniztyhjwqj88as,723165197366594676,1618408034,2021-04-14,/sites/jemimamcevoy/2021/04/11/foxs-chris-wall...,article-delta-g/topline/subscriber/alx,494,1,0.0,2.0,...,desktop,chrome,united states,organic search,business,none,News and Politics,Politics,subscriber,macintosh
2,pniexquzkqmkx0o,7477328679965211630,1611066579,2021-01-19,/sites/roberthart/2021/01/19/theyre-trying-to-...,article-delta-d/topline/subscriber/alx,782,1,0.75,65.0,...,mobile,android webview,united states,organic social (forbes),business,none,News and Politics,Politics,subscriber,android
3,pnildwbfbqr5s15,4651205238181546530,1613482573,2021-02-16,/sites/jemimamcevoy/2021/02/15/house-republica...,article/topline/default/standard,60,1,0.0,180.0,...,desktop,chrome,united states,organic social (dark),business,none,News and Politics,National News,subscriber,macintosh
4,pnifksysiqlt6pc,4267761024839534792,1611170253,2021-01-20,/sites/jackbrewster/2021/01/20/we-all-got-play...,article/topline/subscriber/alx,32,1,1.0,35.0,...,desktop,chrome,(not set),organic search,business,none,News and Politics,Politics,subscriber,windows


In [10]:
target_class = df[['GA_fullVisitorId', 'subscription_status']].drop_duplicates(keep='first')

### Training Data - Features

**Numerical Features**
1. User metrics
    * unique pageviews per session (avg, median)
    * timeOnPage per session (avg, median) 
    * Avg. article views in each month <br><br>
2. Session metrics
    * bounceRate  <br><br>
3. Content metrics
    * contentViewsRate (contentViews / pageViews) <br><br>

**Categorical Features**
1. Content categories 
    * Percentage of pvs - Tier1, Tier2, Primary Channel, Primary Section
    * Avg. top per person - Tier1, Tier2<br><br>
2. Timestamp features: Percentage of pvs -- 
    * Hourly
    * Day of week
    * Month
    * Day of month
    * Minute<br><br>

3. Device OS - Percentage of pvs
4. Referral source - Percentage of pvs
5. Countries - Percentage of pvs


In [11]:
# user's per pagepath GA data
page = df.groupby(['GA_fullVisitorId', 'GA_visitStartTime', 'GA_pagePath']).agg({'GA_pageViews': 'max', 'timeOnPage': 'sum'}).reset_index()

# user's per session GA data
session = page.groupby(['GA_fullVisitorId', 'GA_visitStartTime']).agg({'GA_pageViews': 'sum', 'timeOnPage': 'mean'}).reset_index()

session.rename(columns={'GA_pageViews': 'session_pvs', 'timeOnPage': 'session_top'}, inplace=True)
session

Unnamed: 0,GA_fullVisitorId,GA_visitStartTime,session_pvs,session_top
0,10000145548747950113,1633476002,1,454.00
1,10000150578546110250,1617904679,1,2.00
2,10000150578546110250,1617927195,1,22.00
3,10000150578546110250,1631573550,1,0.00
4,10000150578546110250,1633402180,1,34.00
...,...,...,...,...
3205520,9999969986945834961,1631218067,1,12.00
3205521,9999969986945834961,1633345241,1,0.00
3205522,9999974732591335515,1633982124,1,42.00
3205523,9999975861633509300,1634616135,1,60.00


* **Feature 1: Users' Unique Pageviews in each session (avg, median)**

In [12]:
pvs = session.groupby('GA_fullVisitorId').agg({'session_pvs': ['mean', 'median']}).reset_index()

# rename cols
pvs.columns = [' '.join(col).strip() for col in pvs.columns.values]
pvs.rename(columns={'session_pvs mean':'session_pvs_mean', 'session_pvs median': 'session_pvs_median'}, inplace=True)
pvs

Unnamed: 0,GA_fullVisitorId,session_pvs_mean,session_pvs_median
0,10000145548747950113,1.00,1.00
1,10000150578546110250,1.00,1.00
2,10000154461756058018,1.00,1.00
3,10000244735129548321,1.00,1.00
4,10000256049462665870,1.00,1.00
...,...,...,...
507932,9999935617354687741,1.33,1.00
507933,9999969986945834961,1.00,1.00
507934,9999974732591335515,1.00,1.00
507935,9999975861633509300,1.00,1.00


* **Feature 2:  Users' Time on Page in each session (avg, median)**

In [13]:
top = session.groupby('GA_fullVisitorId').agg({'session_top': ['mean', 'median']}).reset_index()

# rename cols
top.columns = [' '.join(col).strip() for col in top.columns.values]
top.rename(columns={'session_top mean':'session_top_mean', 'session_top median': 'session_top_median'}, inplace=True)

top

Unnamed: 0,GA_fullVisitorId,session_top_mean,session_top_median
0,10000145548747950113,454.00,454.00
1,10000150578546110250,14.50,12.00
2,10000154461756058018,52.00,52.00
3,10000244735129548321,32.00,32.00
4,10000256049462665870,15.00,15.00
...,...,...,...
507932,9999935617354687741,44.83,63.50
507933,9999969986945834961,262.33,143.00
507934,9999974732591335515,42.00,42.00
507935,9999975861633509300,60.00,60.00


In [14]:
def calc_perc_pvs(input_df, cat_col_name, drop_cols=False):
    
    # pivot on user
    df_cat = pd.pivot_table(
        input_df, 
        values='GA_pageViews',
        columns= cat_col_name,
        index='GA_fullVisitorId',
        aggfunc='sum')\
        .reset_index()
    
    # fillna
    df_cat = df_cat.fillna(0)

    # calc user's sum pvs
    df_cat["sum"] = df_cat.sum(axis=1)

    # for all cols except fvid, calc percentage -- i.e. divide each column by sum(pvs)
    df_cat.loc[:, df_cat.columns != 'GA_fullVisitorId'] = df_cat.loc[:, df_cat.columns != 'GA_fullVisitorId'].div(df_cat["sum"], axis=0)

    # drop extra col
    df_cat.drop("sum", axis=1, inplace=True)

    return df_cat

* **Feature 3: Referral sources - Percentage of pvs**

In [15]:
ref = calc_perc_pvs(df, 'GA_referralGroup')
# rename
ref.columns = ref.columns.map(lambda x : 'rf_'+x if x !='GA_fullVisitorId' else x)
ref

GA_referralGroup,GA_fullVisitorId,rf_content aggregators,rf_direct,rf_fbia,rf_newsletter,rf_organic search,rf_organic social (dark),rf_organic social (forbes),rf_paid display,rf_paid search,rf_paid social (dark),rf_paid social (forbes),rf_paid web,rf_referral
0,10000145548747950113,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,10000150578546110250,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,10000154461756058018,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,10000244735129548321,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,10000256049462665870,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507932,9999935617354687741,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
507933,9999969986945834961,0.00,0.50,0.00,0.00,0.50,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
507934,9999974732591335515,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
507935,9999975861633509300,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


* --Cross checking calculation--

In [16]:
# choose random person
check = pd.DataFrame(df.groupby(['GA_fullVisitorId', 'subscription_status', 'GA_referralGroup']).GA_pageViews.sum()).reset_index()

check[check.GA_fullVisitorId =='6347555056260746840']

Unnamed: 0,GA_fullVisitorId,subscription_status,GA_referralGroup,GA_pageViews
450111,6347555056260746840,subscriber,direct,875
450112,6347555056260746840,subscriber,organic search,45375


In [17]:
print(round(875/(875 + 45375), 2))
print(round(45375/(875 + 45375), 2))

# calc matches
ref[ref.GA_fullVisitorId =='6347555056260746840']

0.02
0.98


GA_referralGroup,GA_fullVisitorId,rf_content aggregators,rf_direct,rf_fbia,rf_newsletter,rf_organic search,rf_organic social (dark),rf_organic social (forbes),rf_paid display,rf_paid search,rf_paid social (dark),rf_paid social (forbes),rf_paid web,rf_referral
383270,6347555056260746840,0.0,0.02,0.0,0.0,0.98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# checking - everything is summed to 1

ref.sum(axis=1).unique()

array([1., 1., 1., 1.])

* **Feature 4: Country - Percentage of pvs**

In [19]:
print("unqiue countries in df", len(df['GA_country'].unique()))

subs_top_ct = ['united states', 'russia', 'canada', 'united kingdom', 'japan']
nonsubs_top_ct = ['australia', 'india', 'singapore', 'germany', 'philippines']

shortlisted_countries = subs_top_ct + nonsubs_top_ct

# shortlist country
df["GA_country"] = np.where(df["GA_country"].isin(shortlisted_countries), 
                            df["GA_country"], 
                            "other")

unqiue countries in df 224


In [20]:
pd.DataFrame(df.groupby(["GA_country"]).GA_pageViews.sum()).reset_index().sort_values('GA_pageViews', ascending=False)

Unnamed: 0,GA_country,GA_pageViews
10,united states,6656561
5,other,382779
1,canada,183565
9,united kingdom,149874
7,russia,115013
0,australia,84921
3,india,60602
4,japan,54312
8,singapore,25933
2,germany,18208


In [21]:
# drop 'other' rows
ct_input = df[df.GA_country != 'other']

# calc perc of pvs of user in shortlisted categories
country = calc_perc_pvs(ct_input, 'GA_country')
country.columns = country.columns.map(lambda x : 'ct_'+x if x !='GA_fullVisitorId' else x)
country

GA_country,GA_fullVisitorId,ct_australia,ct_canada,ct_germany,ct_india,ct_japan,ct_philippines,ct_russia,ct_singapore,ct_united kingdom,ct_united states
0,10000145548747950113,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
1,10000150578546110250,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
2,10000154461756058018,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
3,10000244735129548321,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
4,10000256049462665870,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...
462296,9999935617354687741,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
462297,9999969986945834961,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
462298,9999974732591335515,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
462299,9999975861633509300,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00


* Needs discussion

In [None]:
test = df[df.GA_country.isin(shortlisted_countries)]
test.shape

print("total fvids in df: ", 
      len(test.GA_fullVisitorId.unique()), "\n")

test[['GA_fullVisitorId', 'subscription_status']].drop_duplicates(keep='first').subscription_status.value_counts()

In [58]:
# calc perc of pvs of user in shortlisted categories
test_country = calc_perc_pvs(test, 'GA_country')
test_country.columns = test_country.columns.map(lambda x : 'ct_'+x if x !='GA_fullVisitorId' else x)
test_country

GA_country,GA_fullVisitorId,ct_australia,ct_canada,ct_germany,ct_india,ct_japan,ct_philippines,ct_russia,ct_singapore,ct_united kingdom,ct_united states
0,10000145548747950113,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
1,10000150578546110250,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
2,10000154461756058018,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
3,10000244735129548321,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
4,10000256049462665870,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...
462296,9999935617354687741,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
462297,9999969986945834961,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
462298,9999974732591335515,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
462299,9999975861633509300,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00


* **Feature 5: Device OS - Percentage of pvs**

In [22]:
device_os = calc_perc_pvs(df, 'deviceOS')
device_os.columns = device_os.columns.map(lambda x : 'dos_'+x if x !='GA_fullVisitorId' else x)
device_os

deviceOS,GA_fullVisitorId,dos_android,dos_ios,dos_macintosh,dos_other,dos_windows
0,10000145548747950113,0.00,1.00,0.00,0.00,0.00
1,10000150578546110250,0.00,1.00,0.00,0.00,0.00
2,10000154461756058018,1.00,0.00,0.00,0.00,0.00
3,10000244735129548321,0.00,1.00,0.00,0.00,0.00
4,10000256049462665870,0.00,1.00,0.00,0.00,0.00
...,...,...,...,...,...,...
507932,9999935617354687741,1.00,0.00,0.00,0.00,0.00
507933,9999969986945834961,1.00,0.00,0.00,0.00,0.00
507934,9999974732591335515,0.00,1.00,0.00,0.00,0.00
507935,9999975861633509300,1.00,0.00,0.00,0.00,0.00


### Content categories
* Content categories = IAB Tier 1, Tier 2, PC (shortlisted), PS (shortlisted)

In [23]:
content = df.copy()

# extract the start of natid string
content["natid_start"] = content.GA_cmsNaturalId.str.split("/").str[0]

print("Shape before: ", content.shape)

# keep only blogs, slides, magazine data
content = content[(content['natid_start'].str.contains('blogandpostid')) | (content['natid_start'].str.contains('blogandslideid')) | (content['natid_start'].str.contains('magazine'))]

print("Shape after: ", content.shape)

# get month-year
content.GA_date = pd.to_datetime(content.GA_date)
content["mon_year"] = content['GA_date'].dt.to_period('M')

Shape before:  (7749079, 25)
Shape after:  (5412219, 25)


In [24]:
content[['GA_fullVisitorId', 
         'subscription_status']].drop_duplicates(keep='first').subscription_status.value_counts()

non_subscriber    409990
subscriber         90398
Name: subscription_status, dtype: int64

In [25]:
cat = content[['GA_fullVisitorId', 'tier1', 'tier2', 'GA_primaryChannel', 'GA_primarySection', 'subscription_status', 'GA_pageViews', 'timeOnPage']].copy()

print("Before - unique PC: ", len(cat.GA_primaryChannel.unique()))
print("Before - unique PS: ", len(cat.GA_primarySection.unique()), "\n")

shortlisted_channel = ['business', 'leadership', 'money', 'innovation', 'lifestyle', 'home', 
                       'billionaires', 'small business', 'consumer', 'shopping', 'industry', 'investing', 
                       'tech', 'entrepreneurs', 'newsletters', 'asia', 'opinion', 'real estate', 'lists', 
                       'forbes finds', 'under 30', 'advisor', 'video']

shortlisted_section = ['careers', 'personal finance', 'markets', 'forbeswomen', 'leadership strategy', 'healthcare', 'travel', 'sportsmoney', 'retail', 'entrepreneurs', 'science', 
                        'taxes', 'policy', 'consumer tech', 'investing', 'retirement', 'education', 'cmo network', 'real estate', 'hollywood & entertainment', 'cybersecurity', 
                        'aerospace & defense', 'diversity & inclusion', 'energy', 'food & drink', 'enterprise & cloud', 'enterprise tech', 'transportation', 'crypto & blockchain', 'games', 
                        'money & politics', 'media', 'fintech', 'venture capital', 'forbeslife', 'vices', 'manufacturing', 'small business strategy', 'hedge funds & private equity', 
                        'arts', 'ai', 'cio network', 'cars & bikes', 'banking & insurance', 'cfo network', 'spirits', 'cloud', 'dining', 'confirmation', 'wealth management']

# primary channel
cat["GA_primaryChannel"] = np.where(cat["GA_primaryChannel"].isin(shortlisted_channel), 
                                    cat["GA_primaryChannel"], "other")

# primary section 
cat["GA_primarySection"] = np.where(cat["GA_primarySection"].isin(shortlisted_section), 
                                    cat["GA_primarySection"], "other")

print("After shortlisting - unique PC: ", len(cat.GA_primaryChannel.unique()))
print("After shortlisting - unique PS: ", len(cat.GA_primarySection.unique()), "\n")

# fillna with 0
cat.timeOnPage = cat.timeOnPage.fillna(0)

print("Unique all T1s: ", len(cat.tier1.unique()))  
print("Unique all T2s: ", len(cat.tier2.unique()))  

cat.isna().sum()

Before - unique PC:  32
Before - unique PS:  120 

After shortlisting - unique PC:  23
After shortlisting - unique PS:  49 

Unique all T1s:  31
Unique all T2s:  324


GA_fullVisitorId             0
tier1                  1005613
tier2                  1359629
GA_primaryChannel            0
GA_primarySection            0
subscription_status          0
GA_pageViews                 0
timeOnPage                   0
dtype: int64

In [26]:
cat.dropna(inplace=True)

In [27]:
cat.isna().sum()

GA_fullVisitorId       0
tier1                  0
tier2                  0
GA_primaryChannel      0
GA_primarySection      0
subscription_status    0
GA_pageViews           0
timeOnPage             0
dtype: int64

* **Feature 6: Tier 1 - Percentage of pvs**

In [28]:
t1 = calc_perc_pvs(cat, 'tier1')
t1.columns = t1.columns.map(lambda x : 't1_'+x if x !='GA_fullVisitorId' else x)
t1

tier1,GA_fullVisitorId,t1_Automotive,t1_Books and Literature,t1_Business and Finance,t1_Careers,t1_Content Channel,t1_Education,t1_Events and Attractions,t1_Family and Relationships,t1_Fine Art,...,t1_Real Estate,t1_Religion & Spirituality,t1_Science,t1_Shopping,t1_Sports,t1_Style & Fashion,t1_Technology & Computing,t1_Television,t1_Travel,t1_Video Gaming
0,10000145548747950113,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00
1,10000150578546110250,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,10000154461756058018,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,10000244735129548321,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,10000256049462665870,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398443,9999714828652623441,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.50,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
398444,9999771187527491562,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
398445,9999935617354687741,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00
398446,9999969986945834961,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.33,0.00,0.00,0.00,0.67,0.00,0.00,0.00


* **Feature 7: Tier 2 - Percentage of pvs**

In [29]:
t2 = calc_perc_pvs(cat, 'tier2')
t2.columns = t2.columns.map(lambda x : 't2_'+x if x !='GA_fullVisitorId' else x)
t2

tier2,GA_fullVisitorId,t2_,t2_Action and Adventure Movies,t2_Adult Contemporary Music,t2_Adult Education,t2_Alcoholic Beverages,t2_Alternative Music,t2_American Football,t2_Amusement and Theme Parks,t2_Animation Movies,...,t2_Wellness,t2_Women's Fashion,t2_Women's Health,t2_Workshops and Classes,t2_World Cuisines,t2_World Movies,t2_Wrestling,t2_Young Adult Literature,t2_Zoos & Aquariums,t2_eSports
0,10000145548747950113,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,10000150578546110250,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,10000154461756058018,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,10000244735129548321,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,10000256049462665870,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398443,9999714828652623441,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
398444,9999771187527491562,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
398445,9999935617354687741,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
398446,9999969986945834961,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


* **Feature 8: Prim Channel- Percentage of pvs**

In [30]:
# there are rows where prim channel = other

cat[cat.GA_primaryChannel =="other"]

Unnamed: 0,GA_fullVisitorId,tier1,tier2,GA_primaryChannel,GA_primarySection,subscription_status,GA_pageViews,timeOnPage
508,7638489957633409219,Travel,Travel Type,other,other,subscriber,1,2.00
515,1668757781505543510,Travel,Travel Type,other,other,subscriber,1,9.00
560,7425471167558058039,Travel,Travel Type,other,other,subscriber,1,1.00
654,5142079099003020566,Travel,Travel Accessories,other,other,subscriber,1,38.00
1122,1821568513148974997,News and Politics,National News,other,other,subscriber,1,3.00
...,...,...,...,...,...,...,...,...
1144386,5016556404887001663,Home & Garden,Home Appliances,other,other,non_subscriber,1,2.00
1144387,1077032199174312348,Home & Garden,Home Appliances,other,other,non_subscriber,1,36.00
1144388,8408659729235350144,Home & Garden,Home Appliances,other,other,non_subscriber,1,59.00
1144389,15555251682534230992,Home & Garden,Home Appliances,other,other,non_subscriber,1,18.00


In [31]:
# check distribution of subsc/nonsubs if "other" is removed

cat[cat.GA_primaryChannel != 'other'][['GA_fullVisitorId', 'subscription_status']].drop_duplicates(keep='first').subscription_status.value_counts()

non_subscriber    300555
subscriber         86937
Name: subscription_status, dtype: int64

In [32]:
# drop 'other' rows
pc_input = cat[cat.GA_primaryChannel != 'other']

# calc perc of pvs of user in shortlisted categories
pc = calc_perc_pvs(pc_input, 'GA_primaryChannel')
pc.columns = pc.columns.map(lambda x : 'pc_'+x if x !='GA_fullVisitorId' else x)
pc

GA_primaryChannel,GA_fullVisitorId,pc_advisor,pc_asia,pc_billionaires,pc_business,pc_consumer,pc_entrepreneurs,pc_forbes finds,pc_industry,pc_innovation,...,pc_leadership,pc_lifestyle,pc_money,pc_newsletters,pc_opinion,pc_real estate,pc_shopping,pc_small business,pc_tech,pc_video
0,10000145548747950113,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,10000150578546110250,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,10000154461756058018,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,10000244735129548321,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,10000256049462665870,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
387487,9999611965095036174,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
387488,9999714828652623441,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
387489,9999771187527491562,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
387490,9999935617354687741,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


* **Feature 9: Prim Section - Percentage of pvs**

In [33]:
# drop 'other' rows
ps_input = cat[cat.GA_primarySection != 'other']

# calc perc of pvs of user in shortlisted categories
ps = calc_perc_pvs(ps_input, 'GA_primarySection')
ps.columns = ps.columns.map(lambda x : 'ps_'+x if x !='GA_fullVisitorId' else x)
ps

GA_primarySection,GA_fullVisitorId,ps_aerospace & defense,ps_ai,ps_arts,ps_banking & insurance,ps_careers,ps_cars & bikes,ps_cfo network,ps_cio network,ps_cloud,...,ps_science,ps_small business strategy,ps_spirits,ps_sportsmoney,ps_taxes,ps_transportation,ps_travel,ps_venture capital,ps_vices,ps_wealth management
0,10000145548747950113,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,10000150578546110250,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.33,0.00,0.00,0.00,0.00,0.00
2,10000154461756058018,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,10000257060996510203,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.20,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,10000310598604879673,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357030,9999600642589543142,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
357031,9999611965095036174,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
357032,9999771187527491562,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
357033,9999935617354687741,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


* **Feature 10: Tier 1 - Avg. time on page**

In [34]:
t1 = cat.pivot_table(index=['GA_fullVisitorId'], 
                     columns='tier1',
                     values=['timeOnPage', 'GA_pageViews'], 
                     aggfunc='sum', 
                     fill_value=0).reset_index()
# set aside fvids
fvids = list(t1.GA_fullVisitorId)

# calc avg. top
t1_top = t1["timeOnPage"]/t1["GA_pageViews"]
t1_top = t1_top.fillna(0)

t1_top["GA_fullVisitorId"] = fvids
t1_top

tier1,Automotive,Books and Literature,Business and Finance,Careers,Content Channel,Education,Events and Attractions,Family and Relationships,Fine Art,Food & Drink,...,Religion & Spirituality,Science,Shopping,Sports,Style & Fashion,Technology & Computing,Television,Travel,Video Gaming,GA_fullVisitorId
0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,454.00,0.00,0.00,10000145548747950113
1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000150578546110250
2,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000154461756058018
3,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000244735129548321
4,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,15.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000256049462665870
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398443,0.00,0.00,0.00,0.00,0.00,0.00,0.00,43.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9999714828652623441
398444,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10.00,9999771187527491562
398445,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,66.00,0.00,0.00,0.00,9999935617354687741
398446,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,263.50,0.00,0.00,0.00,261.75,0.00,0.00,0.00,9999969986945834961


* **Feature 11: Tier 2 - Avg. time on page**

In [35]:
t2 = cat.pivot_table(index=['GA_fullVisitorId'], 
                     columns='tier2',
                     values=['timeOnPage', 'GA_pageViews'], 
                     aggfunc='sum', 
                     fill_value=0).reset_index()
# set aside fvids
fvids = list(t2.GA_fullVisitorId)

# calc avg. top
t2_top = t2["timeOnPage"]/t2["GA_pageViews"]
t2_top = t2_top.fillna(0)

t2_top["GA_fullVisitorId"] = fvids

t2_top

tier2,Unnamed: 1,Action and Adventure Movies,Adult Contemporary Music,Adult Education,Alcoholic Beverages,Alternative Music,American Football,Amusement and Theme Parks,Animation Movies,Animation TV,...,Women's Fashion,Women's Health,Workshops and Classes,World Cuisines,World Movies,Wrestling,Young Adult Literature,Zoos & Aquariums,eSports,GA_fullVisitorId
0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000145548747950113
1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000150578546110250
2,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000154461756058018
3,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000244735129548321
4,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000256049462665870
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398443,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9999714828652623441
398444,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9999771187527491562
398445,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9999935617354687741
398446,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9999969986945834961


* **Feature 11: Avg. article views in each month**

In [36]:
user_per_mon = pd.DataFrame(content.groupby(['GA_fullVisitorId', 'mon_year']).GA_pageViews.sum()).reset_index().rename(columns= {'GA_pageViews':'total_article_views'})
user_per_mon

Unnamed: 0,GA_fullVisitorId,mon_year,total_article_views
0,10000145548747950113,2021-10,1
1,10000150578546110250,2021-04,2
2,10000150578546110250,2021-09,1
3,10000150578546110250,2021-10,1
4,10000154461756058018,2021-10,1
...,...,...,...
977887,9999969986945834961,2021-09,1
977888,9999969986945834961,2021-10,1
977889,9999974732591335515,2021-10,1
977890,9999975861633509300,2021-10,1


In [37]:
per_mon =  pd.DataFrame(user_per_mon.groupby('GA_fullVisitorId').total_article_views.mean()).reset_index() 

per_mon.GA_cmsNaturalId = per_mon.total_article_views.round() 
per_mon

Unnamed: 0,GA_fullVisitorId,total_article_views
0,10000145548747950113,1.00
1,10000150578546110250,1.33
2,10000154461756058018,1.00
3,10000244735129548321,1.00
4,10000256049462665870,1.00
...,...,...
500383,9999935617354687741,2.00
500384,9999969986945834961,1.50
500385,9999974732591335515,1.00
500386,9999975861633509300,1.00


* **Feature 12: Bounce rate** 

In [38]:
print(session.shape)
print(len(session.GA_fullVisitorId.unique()))

(3205525, 4)
507937


In [39]:
def b_rate(g):
    '''for each fvid: calculate percentage of sessions comprising only 1PV'''
    
    # count sessions w/ pv = 1
    sessions_w_1pv = g[g['session_pvs']==1].shape[0]
    
    # count total sessions
    total_sessions = g.shape[0]
    
    # calculate ratio
    return (sessions_w_1pv)/total_sessions

In [40]:
br = pd.DataFrame(session.groupby('GA_fullVisitorId').apply(lambda x: b_rate(x))).reset_index().rename(columns={0:'bounce_rate'}) # takes 3mins
br

Unnamed: 0,GA_fullVisitorId,bounce_rate
0,10000145548747950113,1.00
1,10000150578546110250,1.00
2,10000154461756058018,1.00
3,10000244735129548321,1.00
4,10000256049462665870,1.00
...,...,...
507932,9999935617354687741,0.67
507933,9999969986945834961,1.00
507934,9999974732591335515,1.00
507935,9999975861633509300,1.00


* **Feature 12: Content views rate**

In [41]:
natid_page_map = df[['GA_pagePath', 'GA_cmsNaturalId', 'publish_date']].sort_values('publish_date', ascending=False).drop_duplicates('GA_pagePath')

In [55]:
page = pd.merge(page, 
                natid_page_map, 
                how="left", 
                on="GA_pagePath")

In [43]:
def c_views_rate(g):
    '''for each fvid: calculate percentage PVs that are actually views on content pages; vs non-content pages such as the home page, channel/section landing pages, author pages, etc.'''
    
    # sum pvs on actual content for user
    content_sum_pv = g[g.GA_cmsNaturalId.str.contains("blogandpostid|blogandslideid|galleryid|video")].GA_pageViews.sum()
    
    # sum all pvs for user
    total_pv = g.GA_pageViews.sum()
    
    # calculate ratio
    return (content_sum_pv)/total_pv

In [44]:
cvr = pd.DataFrame(page.groupby('GA_fullVisitorId').apply(lambda x: c_views_rate(x))).reset_index().rename(columns={0:'content_views_rate'}) # takes 5mins
cvr

Unnamed: 0,GA_fullVisitorId,content_views_rate
0,10000145548747950113,1.00
1,10000150578546110250,1.00
2,10000154461756058018,1.00
3,10000244735129548321,1.00
4,10000256049462665870,1.00
...,...,...
507932,9999935617354687741,1.00
507933,9999969986945834961,1.00
507934,9999974732591335515,1.00
507935,9999975861633509300,1.00


### Timestamp features

In [45]:
df['session_time']=df['GA_visitStartTime'].apply(convert_time)

In [46]:
df['session_time']=pd.to_datetime(df['session_time'],errors='coerce')
df['GA_date']=pd.to_datetime(df['GA_date'],errors='coerce')

In [47]:
df['dayofweek'] = df.GA_date.dt.weekday
df['day'] = df.GA_date.dt.day
df['month'] = df.GA_date.dt.month
df['hour']=df.session_time.dt.hour
df['minute']=df.session_time.dt.minute

In [48]:
df.head()

Unnamed: 0,piano_id,GA_fullVisitorId,GA_visitStartTime,GA_date,GA_pagePath,GA_dfpNewZone,GA_visitNumber,GA_pageViews,GA_scrollDepth,timeOnPage,...,tier1,tier2,subscription_status,deviceOS,session_time,dayofweek,day,month,hour,minute
0,pniwcibyyqmkjv6,296481529627831661,1612917988,2021-02-09,/sites/danalexander/2021/01/19/trumps-cash-str...,article/masthead/default/standard,24,1,,4.0,...,News and Politics,Politics,subscriber,ios,2021-02-10 00:46:28,1,9,2,0,46
1,pniztyhjwqj88as,723165197366594676,1618408034,2021-04-14,/sites/jemimamcevoy/2021/04/11/foxs-chris-wall...,article-delta-g/topline/subscriber/alx,494,1,0.0,2.0,...,News and Politics,Politics,subscriber,macintosh,2021-04-14 13:47:14,2,14,4,13,47
2,pniexquzkqmkx0o,7477328679965211630,1611066579,2021-01-19,/sites/roberthart/2021/01/19/theyre-trying-to-...,article-delta-d/topline/subscriber/alx,782,1,0.75,65.0,...,News and Politics,Politics,subscriber,android,2021-01-19 14:29:39,1,19,1,14,29
3,pnildwbfbqr5s15,4651205238181546530,1613482573,2021-02-16,/sites/jemimamcevoy/2021/02/15/house-republica...,article/topline/default/standard,60,1,0.0,180.0,...,News and Politics,National News,subscriber,macintosh,2021-02-16 13:36:13,1,16,2,13,36
4,pnifksysiqlt6pc,4267761024839534792,1611170253,2021-01-20,/sites/jackbrewster/2021/01/20/we-all-got-play...,article/topline/subscriber/alx,32,1,1.0,35.0,...,News and Politics,Politics,subscriber,windows,2021-01-20 19:17:33,2,20,1,19,17


* **Feature 13: Day of week - Percentage of pvs**

In [49]:
dow = calc_perc_pvs(df, 'dayofweek')
dow.columns = dow.columns.map(lambda x : 'day_'+str(x) if x !='GA_fullVisitorId' else x)
dow

dayofweek,GA_fullVisitorId,day_0,day_1,day_2,day_3,day_4,day_5,day_6
0,10000145548747950113,0.00,1.00,0.00,0.00,0.00,0.00,0.00
1,10000150578546110250,0.50,0.00,0.00,0.50,0.00,0.00,0.00
2,10000154461756058018,0.00,0.00,0.00,0.00,0.00,1.00,0.00
3,10000244735129548321,0.00,1.00,0.00,0.00,0.00,0.00,0.00
4,10000256049462665870,0.00,1.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...
507932,9999935617354687741,0.00,0.00,0.00,0.25,0.00,0.00,0.75
507933,9999969986945834961,0.17,0.00,0.00,0.33,0.50,0.00,0.00
507934,9999974732591335515,1.00,0.00,0.00,0.00,0.00,0.00,0.00
507935,9999975861633509300,0.00,1.00,0.00,0.00,0.00,0.00,0.00


* **Feature 14: Hourly - Percentage of pvs**

In [50]:
hour = calc_perc_pvs(df, 'hour')
hour.columns = hour.columns.map(lambda x : 'hour_'+str(x) if x !='GA_fullVisitorId' else x)
hour

hour,GA_fullVisitorId,hour_0,hour_1,hour_2,hour_3,hour_4,hour_5,hour_6,hour_7,hour_8,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
0,10000145548747950113,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
1,10000150578546110250,0.25,0.00,0.25,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.25,0.00,0.00,0.00,0.00,0.25,0.00
2,10000154461756058018,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00
3,10000244735129548321,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,10000256049462665870,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507932,9999935617354687741,0.00,0.50,0.25,0.00,0.00,0.25,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
507933,9999969986945834961,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.17,0.00,0.00,0.17,0.17,0.17,0.00
507934,9999974732591335515,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00
507935,9999975861633509300,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


* **Feature 15: Day of month - Percentage of pvs**

In [51]:
day_of_mon = calc_perc_pvs(df, 'day')
day_of_mon.columns = day_of_mon.columns.map(lambda x : 'day_of_mon_'+str(x) if x !='GA_fullVisitorId' else x)
day_of_mon

day,GA_fullVisitorId,day_of_mon_1,day_of_mon_2,day_of_mon_3,day_of_mon_4,day_of_mon_5,day_of_mon_6,day_of_mon_7,day_of_mon_8,day_of_mon_9,...,day_of_mon_22,day_of_mon_23,day_of_mon_24,day_of_mon_25,day_of_mon_26,day_of_mon_27,day_of_mon_28,day_of_mon_29,day_of_mon_30,day_of_mon_31
0,10000145548747950113,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,10000150578546110250,0.00,0.00,0.00,0.25,0.00,0.00,0.00,0.50,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,10000154461756058018,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,10000244735129548321,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,10000256049462665870,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507932,9999935617354687741,0.00,0.00,0.50,0.25,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
507933,9999969986945834961,0.00,0.00,0.00,0.17,0.50,0.00,0.00,0.00,0.17,...,0.00,0.00,0.17,0.00,0.00,0.00,0.00,0.00,0.00,0.00
507934,9999974732591335515,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
507935,9999975861633509300,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


* **Feature 16: Per month - Percentage of pvs**
    * these features may not be present at prediction time? - discuss

In [52]:
mon = calc_perc_pvs(df, 'month')
mon.columns = mon.columns.map(lambda x : 'mon_'+str(x) if x !='GA_fullVisitorId' else x)
mon

month,GA_fullVisitorId,mon_1,mon_2,mon_3,mon_4,mon_5,mon_6,mon_7,mon_8,mon_9,mon_10
0,10000145548747950113,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
1,10000150578546110250,0.00,0.00,0.00,0.50,0.00,0.00,0.00,0.00,0.25,0.25
2,10000154461756058018,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
3,10000244735129548321,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
4,10000256049462665870,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
...,...,...,...,...,...,...,...,...,...,...,...
507932,9999935617354687741,0.00,0.25,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.75
507933,9999969986945834961,0.00,0.50,0.00,0.00,0.00,0.17,0.00,0.00,0.17,0.17
507934,9999974732591335515,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
507935,9999975861633509300,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00


* **Feature 17: Minute - Percentage of pvs or avg. top**
    * whats the hypothesis? discuss?

In [53]:
minute = df.pivot_table(index=['GA_fullVisitorId'], 
                     columns='minute',
                     values=['timeOnPage', 'GA_pageViews'], 
                     aggfunc='sum', 
                     fill_value=0).reset_index()
# set aside fvids
fvids = list(minute.GA_fullVisitorId)

# calc avg. top
minute_top = minute["timeOnPage"]/minute["GA_pageViews"]
minute_top = minute_top.fillna(0)

minute_top["GA_fullVisitorId"] = fvids

minute_top

minute,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,GA_fullVisitorId
0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000145548747950113
1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,2.00,0.00,0.00,10000150578546110250
2,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000154461756058018
3,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000244735129548321
4,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000256049462665870
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507932,0.00,0.00,0.00,0.00,0.00,71.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9999935617354687741
507933,0.00,0.00,0.00,0.00,0.00,0.00,0.00,12.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9999969986945834961
507934,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,42.00,0.00,0.00,0.00,0.00,9999974732591335515
507935,0.00,0.00,60.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9999975861633509300


In [54]:
minute = calc_perc_pvs(df, 'minute')
minute.columns = minute.columns.map(lambda x : 'minute_'+str(x) if x !='GA_fullVisitorId' else x)
minute

minute,GA_fullVisitorId,minute_0,minute_1,minute_2,minute_3,minute_4,minute_5,minute_6,minute_7,minute_8,...,minute_50,minute_51,minute_52,minute_53,minute_54,minute_55,minute_56,minute_57,minute_58,minute_59
0,10000145548747950113,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,10000150578546110250,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.25,0.00,0.00,0.00,0.00,0.25,0.00,0.00
2,10000154461756058018,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,10000244735129548321,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,10000256049462665870,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507932,9999935617354687741,0.00,0.00,0.00,0.00,0.00,0.25,0.00,0.00,0.00,...,0.00,0.25,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
507933,9999969986945834961,0.17,0.00,0.00,0.00,0.00,0.00,0.00,0.17,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
507934,9999974732591335515,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00
507935,9999975861633509300,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


### Combine all features in 1 df

In [71]:
print("Total users:", len(df.GA_fullVisitorId.unique()))

Total users: 507937


In [72]:
print("Users with all data present in following categories:")
print(pvs.shape)
print(top.shape)
print(ref.shape)
print(device_os.shape)
print(br.shape)
print(cvr.shape)
print(dow.shape)
print(hour.shape)
print(day_of_mon.shape)

Users with all data present in following categories:
(507937, 3)
(507937, 3)
(507937, 14)
(507937, 6)
(507937, 2)
(507937, 2)
(507937, 8)
(507937, 25)
(507937, 32)


In [73]:
print("Some users had data missing in following categories:")
print(country.shape)
print(t1.shape)
print(t2.shape)
print(pc.shape)
print(ps.shape)
print(t1_top.shape)
print(t2_top.shape)
print(per_mon.shape)

Some users had data missing in following categories:
(462301, 11)
(398448, 61)
(398448, 647)
(387492, 21)
(357035, 49)
(398448, 31)
(398448, 324)
(500388, 2)


In [90]:
ex = df[~df.GA_fullVisitorId.isin(country.GA_fullVisitorId)]

# ex[ex.GA_fullVisitorId =='1229229923871942198'].GA_country.unique()
ex

Unnamed: 0,piano_id,GA_fullVisitorId,GA_visitStartTime,GA_date,GA_pagePath,GA_dfpNewZone,GA_visitNumber,GA_pageViews,GA_scrollDepth,timeOnPage,...,tier1,tier2,subscription_status,deviceOS,session_time,dayofweek,day,month,hour,minute
73,pnihijrfpqjwqxw,1229229923871942198,1630927677,2021-09-06,/sites/siladityaray/2021/09/03/what-you-need-t...,article-delta-g/topline/default/standard,698,1,0.75,618.00,...,News and Politics,Politics,subscriber,windows,2021-09-06 11:27:57,0,6,9,11,27
129,pniguc5akqxivot,2275607360176688625,1616910751,2021-03-28,/sites/joewalsh/2021/03/27/us-couldve-decrease...,article/topline/default/standard,78,1,0.50,175.00,...,News and Politics,Politics,subscriber,windows,2021-03-28 05:52:31,6,28,3,5,52
199,pninisyjaqxa500,5417725234030748420,1628611333,2021-08-10,/sites/rachelsandler/2021/08/09/bill-gates-is-...,article/standard/nonsubscriber/alx,1,1,1.00,206.00,...,News and Politics,International News,subscriber,windows,2021-08-10 16:02:13,1,10,8,16,2
326,pnijhommyqy43tp,3450868243821864102,1630057800,2021-08-27,/sites/andrewsolender/2021/08/26/officer-who-s...,article/topline/subscriber/alx,232,1,0.50,27.00,...,News and Politics,,subscriber,android,2021-08-27 09:50:00,4,27,8,9,50
362,pnibhkxpiqy2umk,5244656747995337080,1629365818,2021-08-19,/sites/siladityaray/2021/08/19/death-toll-from...,article/topline/subscriber/alx,2,1,0.75,82.00,...,News and Politics,Disasters,subscriber,windows,2021-08-19 09:36:58,3,19,8,9,36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1191855,,10283789826294898413,1635581217,2021-10-30,/sites/adamminsky/2021/09/14/parents-hope-for-...,article-amp/standard/default/standard,1,1,0.50,316.00,...,Personal Finance,Personal Debt,non_subscriber,ios,2021-10-30 08:06:57,5,30,10,8,6
1191989,,167620180902688305,1633808976,2021-10-09,/sites/adamminsky/2021/09/27/if-you-are-denied...,article-amp/standard/default/standard,2,1,0.00,,...,Personal Finance,Personal Debt,non_subscriber,ios,2021-10-09 19:49:36,5,9,10,19,49
1192141,,6751045111126250805,1633105035,2021-10-01,/sites/adamminsky/2021/09/29/calls-to-cancel-s...,article-amp/standard/default/standard,1,1,0.25,85.00,...,Personal Finance,Personal Debt,non_subscriber,ios,2021-10-01 16:17:15,4,1,10,16,17
1192255,,3611738427241498001,1633120461,2021-10-01,/sites/adamminsky/2021/09/30/biden-administrat...,article-amp/standard/default/standard,60,1,0.50,321.00,...,Personal Finance,Personal Debt,non_subscriber,ios,2021-10-01 20:34:21,4,1,10,20,34


In [97]:
ex = df[~df.GA_fullVisitorId.isin(ps.GA_fullVisitorId)]

ex[ex.GA_fullVisitorId =='4099334672618494031'].GA_primaryChannel
#ex

873        leadership
913905     leadership
3794048          none
3842694          home
5134274          home
Name: GA_primaryChannel, dtype: object

In [99]:
pc[pc.GA_fullVisitorId=='4099334672618494031']

GA_primaryChannel,GA_fullVisitorId,pc_advisor,pc_asia,pc_billionaires,pc_business,pc_consumer,pc_entrepreneurs,pc_forbes finds,pc_industry,pc_innovation,...,pc_leadership,pc_lifestyle,pc_money,pc_newsletters,pc_opinion,pc_real estate,pc_shopping,pc_small business,pc_tech,pc_video


In [101]:
cat[cat.GA_fullVisitorId =='4099334672618494031'].GA_primaryChannel

Series([], Name: GA_primaryChannel, dtype: object)

In [108]:
content[content.GA_fullVisitorId =='4099334672618494031'][['GA_primaryChannel', 'natid_start', 'tier1', 'tier2']]

Unnamed: 0,GA_primaryChannel,natid_start,tier1,tier2
873,leadership,blogandpostid,,
913905,leadership,blogandpostid,Careers,


In [107]:
cat[cat.GA_fullVisitorId =='4099334672618494031'] #[['GA_primaryChannel', 'natid_start']]

Unnamed: 0,GA_fullVisitorId,tier1,tier2,GA_primaryChannel,GA_primarySection,subscription_status,GA_pageViews,timeOnPage


In [102]:
df.shape

(7749079, 30)

In [103]:
content.shape

(5412219, 26)

In [88]:
ex = df[~df.GA_fullVisitorId.isin(t1.GA_fullVisitorId)]

ex[ex.GA_fullVisitorId =='4099334672618494031']
#ex

Unnamed: 0,piano_id,GA_fullVisitorId,GA_visitStartTime,GA_date,GA_pagePath,GA_dfpNewZone,GA_visitNumber,GA_pageViews,GA_scrollDepth,timeOnPage,...,tier1,tier2,subscription_status,deviceOS,session_time,dayofweek,day,month,hour,minute
873,pnir4xddsqvkzpm,4099334672618494031,1625582052,2021-07-06,/sites/jackkelly/2021/07/05/iceland-tried-a-sh...,article/standard/subscriber/alx,2,1,,22.0,...,,,subscriber,windows,2021-07-06 14:34:12,1,6,7,14,34
913905,pnir4xddsqvkzpm,4099334672618494031,1625608328,2021-07-06,/sites/carolinecenizalevine/2021/07/05/4-ways-...,article/standard/subscriber/alx,3,1,0.75,24.0,...,Careers,,subscriber,windows,2021-07-06 21:52:08,1,6,7,21,52
3794048,pnir4xddsqvkzpm,4099334672618494031,1625167952,2021-07-01,/subscribe,none,1,1,0.75,423.0,...,,,subscriber,windows,2021-07-01 19:32:32,3,1,7,19,32
3842694,pnir4xddsqvkzpm,4099334672618494031,1625167952,2021-07-01,/,none,1,1,0.0,3.0,...,,,subscriber,windows,2021-07-01 19:32:32,3,1,7,19,32
5134274,pnir4xddsqvkzpm,4099334672618494031,1625167952,2021-07-01,/,none,1,1,0.0,3.0,...,,,subscriber,windows,2021-07-01 19:32:32,3,1,7,19,32


In [89]:
t1

Unnamed: 0_level_0,GA_fullVisitorId,GA_pageViews,GA_pageViews,GA_pageViews,GA_pageViews,GA_pageViews,GA_pageViews,GA_pageViews,GA_pageViews,GA_pageViews,...,timeOnPage,timeOnPage,timeOnPage,timeOnPage,timeOnPage,timeOnPage,timeOnPage,timeOnPage,timeOnPage,timeOnPage
tier1,Unnamed: 1_level_1,Automotive,Books and Literature,Business and Finance,Careers,Content Channel,Education,Events and Attractions,Family and Relationships,Fine Art,...,Real Estate,Religion & Spirituality,Science,Shopping,Sports,Style & Fashion,Technology & Computing,Television,Travel,Video Gaming
0,10000145548747950113,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,454,0,0
1,10000150578546110250,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,10000154461756058018,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,10000244735129548321,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,10000256049462665870,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398443,9999714828652623441,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
398444,9999771187527491562,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,10
398445,9999935617354687741,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,198,0,0,0
398446,9999969986945834961,0,0,0,0,0,0,0,0,0,...,0,0,527,0,0,0,1047,0,0,0


In [86]:
ex = df[~df.GA_fullVisitorId.isin(t1.GA_fullVisitorId)]

ex[ex.GA_fullVisitorId =='16462716003549521553']
#ex

Unnamed: 0,piano_id,GA_fullVisitorId,GA_visitStartTime,GA_date,GA_pagePath,GA_dfpNewZone,GA_visitNumber,GA_pageViews,GA_scrollDepth,timeOnPage,...,tier1,tier2,subscription_status,deviceOS,session_time,dayofweek,day,month,hour,minute
1042422,,16462716003549521553,1635708962,2021-10-31,/sites/bethbernstein/2019/03/24/six-easy-piece...,article-amp/standard/default/standard,2,1,0.75,120.0,...,,,non_subscriber,ios,2021-10-31 19:36:02,6,31,10,19,36
1190121,,16462716003549521553,1634195038,2021-10-14,/sites/adamminsky/2020/01/23/court-denies-bar-...,article-amp/standard/default/standard,1,1,0.25,29.0,...,,,non_subscriber,ios,2021-10-14 07:03:58,3,14,10,7,3


In [67]:
ex = pd.merge(ps, target_class, how="left", on="GA_fullVisitorId")
ex

Unnamed: 0,GA_fullVisitorId,ps_aerospace & defense,ps_ai,ps_arts,ps_banking & insurance,ps_careers,ps_cars & bikes,ps_cfo network,ps_cio network,ps_cloud,...,ps_small business strategy,ps_spirits,ps_sportsmoney,ps_taxes,ps_transportation,ps_travel,ps_venture capital,ps_vices,ps_wealth management,subscription_status
0,10000145548747950113,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,non_subscriber
1,10000150578546110250,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.33,0.00,0.00,0.00,0.00,0.00,non_subscriber
2,10000154461756058018,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,non_subscriber
3,10000257060996510203,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,non_subscriber
4,10000310598604879673,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,non_subscriber
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357030,9999600642589543142,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,non_subscriber
357031,9999611965095036174,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,non_subscriber
357032,9999771187527491562,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,non_subscriber
357033,9999935617354687741,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,non_subscriber


In [68]:
ex.subscription_status.value_counts()

non_subscriber    275443
subscriber         81592
Name: subscription_status, dtype: int64

In [None]:
# join target class
timeOnPage = pd.merge(timeOnPage, target_class, how="left", on = "GA_fullVisitorId")
timeOnPage