## Description:

This file contains:
* Reading in raw input data from SQL tables. Exhaustive list of features contained in this raw input data is mentioned below.

In [1]:
'''Helper'''
import pandas as pd
from functools import reduce
import numpy as np
import joblib
import datetime
import time
import re

'''GCS Utils'''
from gcs_utils import *

'''Display'''
import warnings
warnings.filterwarnings('ignore') 
pd.set_option('display.float_format', lambda x: '%.2f' % x)

'''BQ'''
import google.auth
from google.cloud import bigquery
from google.cloud import bigquery_storage

bqclient = bigquery.Client()
bqstorageclient = bigquery_storage.BigQueryReadClient()

In [2]:
def convert_time(time):
    return datetime.datetime.fromtimestamp(time).strftime('%Y-%m-%d %H:%M:%S')

### Data from SQL Tables

In [3]:
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.sm_pred_ns_pool_ga`
"""

nonsubs_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

print(nonsubs_data.shape)

--- 12.062791585922241 seconds ---
(1196951, 22)


In [4]:
print("Total fvids in df: ", len(nonsubs_data.GA_fullVisitorId.unique()))

Total fvids in df:  600000


In [5]:
df = nonsubs_data.copy()
df.isna().sum()

piano_id                    1196951
GA_fullVisitorId                  0
GA_visitStartTime                 0
GA_date                           0
GA_pagePath                       0
GA_dfpNewZone                     0
GA_visitNumber                    0
GA_pageViews                      0
GA_scrollDepth                26821
timeOnPage                   119538
GA_cmsNaturalId                   0
title                         15147
publish_date                  15147
GA_deviceOperatingSystem          0
GA_deviceCategory                 0
GA_deviceBrowser                  0
GA_country                        0
GA_referralGroup                  0
GA_primaryChannel                 0
GA_primarySection                 0
tier1                        292026
tier2                        356764
dtype: int64

In [6]:
df.GA_referralGroup.value_counts()

organic search             634258
content aggregators        288380
direct                     245002
organic social (dark)       23010
referral                     4696
organic social (forbes)       831
paid search                   577
newsletter                    159
paid web                       37
fbia                            1
Name: GA_referralGroup, dtype: int64

* **Imputation**
    * time on page = 0
    * natid = none
    * Tier1, Tier2 missing/empty = other
    * PC, PS, Country, Device OS shortlisted. Rest = other

In [7]:
# fill NAs
df.GA_cmsNaturalId = df.GA_cmsNaturalId.fillna('none') 

# fill NAs
df.timeOnPage = df.timeOnPage.fillna(0)

In [8]:
# short list prim channel. NOTE - 'none' not included below 
shortlisted_channel = ['business', 'leadership', 'money', 'innovation', 'lifestyle', 'home', 
                       'billionaires', 'small business', 'consumer', 'shopping', 'industry', 'investing', 
                       'tech', 'entrepreneurs', 'newsletters', 'asia', 'opinion', 'real estate', 'lists', 
                       'forbes finds', 'under 30', 'advisor', 'video']

# short list prim section. NOTE - 'none' not included below 
shortlisted_section = ['careers', 'personal finance', 'markets', 'forbeswomen', 'leadership strategy', 'healthcare', 'travel', 'sportsmoney', 'retail', 'entrepreneurs', 'science', 
                        'taxes', 'policy', 'consumer tech', 'investing', 'retirement', 'education', 'cmo network', 'real estate', 'hollywood & entertainment', 'cybersecurity', 
                        'aerospace & defense', 'diversity & inclusion', 'energy', 'food & drink', 'enterprise & cloud', 'enterprise tech', 'transportation', 'crypto & blockchain', 'games', 
                        'money & politics', 'media', 'fintech', 'venture capital', 'forbeslife', 'vices', 'manufacturing', 'small business strategy', 'hedge funds & private equity', 
                        'arts', 'ai', 'cio network', 'cars & bikes', 'banking & insurance', 'cfo network', 'spirits', 'cloud', 'dining', 'confirmation', 'wealth management']

shortlisted_os = ["android", "ios", "macintosh", "windows"]

shortlisted_country = ['united states', 'russia', 'canada', 'united kingdom', 'japan', 'australia', 'india', 'singapore', 'germany', 'philippines']

In [9]:
# replace NULL and empty with "other"
df.tier1 = df.tier1.replace(r'^\s*$', "other", regex=True)
df.tier1 = df.tier1.fillna("other")

# replace empty and NULL with "other"
df.tier2 = df.tier2.replace(r'^\s*$', "other", regex=True)
df.tier2 = df.tier2.fillna("other")

df["GA_primaryChannel"] = np.where(df["GA_primaryChannel"].isin(shortlisted_channel), df["GA_primaryChannel"], "other")

df["GA_primarySection"] = np.where(df["GA_primarySection"].isin(shortlisted_section), df["GA_primarySection"], "other")

df["GA_deviceOperatingSystem"] = np.where(df["GA_deviceOperatingSystem"].isin(shortlisted_os), df["GA_deviceOperatingSystem"], "other")

df["GA_country"] = np.where(df["GA_country"].isin(shortlisted_country), df["GA_country"], "other")

In [10]:
# after imputation

df.isna().sum()

piano_id                    1196951
GA_fullVisitorId                  0
GA_visitStartTime                 0
GA_date                           0
GA_pagePath                       0
GA_dfpNewZone                     0
GA_visitNumber                    0
GA_pageViews                      0
GA_scrollDepth                26821
timeOnPage                        0
GA_cmsNaturalId                   0
title                         15147
publish_date                  15147
GA_deviceOperatingSystem          0
GA_deviceCategory                 0
GA_deviceBrowser                  0
GA_country                        0
GA_referralGroup                  0
GA_primaryChannel                 0
GA_primarySection                 0
tier1                             0
tier2                             0
dtype: int64

### Training Data - Features

**Numerical Features**
1. User metrics
    * unique pageviews per session (avg, median)
    * timeOnPage per session (avg, median) <br><br>
2. Session metrics
    * bounceRate  <br><br>
3. Content metrics
    * contentViewsRate (contentViews / pageViews) <br>

**Categorical Features**
1. Content categories 
    * Sum pvs - Tier1, Tier2, Primary Channel, Primary Section
    * Avg. top - Tier1 <br><br>
2. Timestamp features: 

    * Weekday vs Weekend 
        * Sum pvs & avg. top
    * Business Hours vs Non-Business Hours *(Differentiated by US and other countries)*
        * Sum pvs & avg. top
    * Day of week
        * avg. top
    * Day of month
        * avg. top
    * Hourly *(Differentiated by US and other countries)*
        * avg. top
    * Minute *(Differentiated by US and other countries)*
        * avg. top
    * Month?? -- check with rob: may not be available at prediction time<br><br>
3. Device OS - Sum pvs
4. Referral source - Sum pvs
5. Countries - Sum pvs<br><br>

NOTE: Keeping sum as the aggregation metric for pvs in categorical features for now so that during modeling different treatments can be tried like Percentage of pvs or avg. pvs or one-hot encoding
<br>

**Extra features for v2:**
* Avg. monthly article views 
* Avg. top per person - Tier2
* Timestamp features: (Percentage of pvs) -- 
    * Hourly
    * Day of week
    * Month
    * Day of month
    * Minute  

In [12]:
# user's per pagepath GA data
page = df.groupby(['GA_fullVisitorId', 'GA_visitStartTime', 'GA_pagePath']).agg({'GA_pageViews': 'max', 'timeOnPage': 'sum'}).reset_index()

# user's per session GA data
session = page.groupby(['GA_fullVisitorId', 'GA_visitStartTime']).agg({'GA_pageViews': 'sum', 'timeOnPage': 'mean'}).reset_index()

session.rename(columns={'GA_pageViews': 'session_pvs', 'timeOnPage': 'session_top'}, inplace=True)

* **Feature 1: Users' Unique Pageviews in each session (avg, median)**

In [14]:
pvs = session.groupby('GA_fullVisitorId').agg({'session_pvs': ['mean', 'median']}).reset_index()

# rename cols
pvs.columns = [' '.join(col).strip() for col in pvs.columns.values]
pvs.rename(columns={'session_pvs mean':'session_pvs_mean', 'session_pvs median': 'session_pvs_median'}, inplace=True)
pvs

Unnamed: 0,GA_fullVisitorId,session_pvs_mean,session_pvs_median
0,10000033615091336580,1.00,1.00
1,10000055443197339256,1.00,1.00
2,10000060100047973466,1.00,1.00
3,10000103435832780646,1.00,1.00
4,10000176091294524415,1.00,1.00
...,...,...,...
599995,9999685864719983756,1.00,1.00
599996,999978743196362429,1.00,1.00
599997,9999806071018918881,1.00,1.00
599998,9999855144075277024,1.00,1.00


* **Feature 2:  Users' Time on Page in each session (avg, median)**

In [15]:
top = session.groupby('GA_fullVisitorId').agg({'session_top': ['mean', 'median']}).reset_index()

# rename cols
top.columns = [' '.join(col).strip() for col in top.columns.values]
top.rename(columns={'session_top mean':'session_top_mean', 'session_top median': 'session_top_median'}, inplace=True)

top

Unnamed: 0,GA_fullVisitorId,session_top_mean,session_top_median
0,10000033615091336580,62.00,62.00
1,10000055443197339256,13.00,13.00
2,10000060100047973466,311.00,311.00
3,10000103435832780646,124.00,124.00
4,10000176091294524415,0.00,0.00
...,...,...,...
599995,9999685864719983756,161.00,161.00
599996,999978743196362429,6.00,6.00
599997,9999806071018918881,80.00,80.00
599998,9999855144075277024,42.00,42.00


In [16]:
def sum_pvs(input_df, cat_col_name):
    
    # pivot on user
    df_cat = pd.pivot_table(
        input_df, 
        values='GA_pageViews',
        columns= cat_col_name,
        index='GA_fullVisitorId',
        aggfunc='sum')\
        .reset_index()
    
    # fillna
    df_cat = df_cat.fillna(0)

    return df_cat

In [17]:
def calc_top(input_df, cat_col_name):
    
    # pivot on user
    df_cat = pd.pivot_table(
        input_df, 
        values=['timeOnPage', 'GA_pageViews'],
        columns= cat_col_name,
        index='GA_fullVisitorId',
        aggfunc='sum')\
        .reset_index()
    
    # set aside fvids
    fvids = list(df_cat.GA_fullVisitorId)

    # calc avg. top
    df_top = df_cat["timeOnPage"]/df_cat["GA_pageViews"]
    
    # fillna
    df_top = df_top.fillna(0)

    # join fvids
    df_top["GA_fullVisitorId"] = fvids
    
    return df_top

* **Feature 3: Referral sources - sum pvs**

In [18]:
# sum pvs of user in referral categories
ref = sum_pvs(df, 'GA_referralGroup')

# rename
ref.columns = ref.columns.map(lambda x : 'rf_'+x if x !='GA_fullVisitorId' else x)

ref

GA_referralGroup,GA_fullVisitorId,rf_content aggregators,rf_direct,rf_fbia,rf_newsletter,rf_organic search,rf_organic social (dark),rf_organic social (forbes),rf_paid search,rf_paid web,rf_referral
0,10000033615091336580,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00
1,10000055443197339256,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00
2,10000060100047973466,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,10000103435832780646,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00
4,10000176091294524415,0.00,2.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...
599995,9999685864719983756,0.00,0.00,0.00,0.00,2.00,0.00,0.00,0.00,0.00,0.00
599996,999978743196362429,2.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
599997,9999806071018918881,0.00,0.00,0.00,0.00,2.00,0.00,0.00,0.00,0.00,0.00
599998,9999855144075277024,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00


* **Feature 4: Country - sum pvs**

In [19]:
subs_top_ct = ['united states', 'russia', 'canada', 'united kingdom', 'japan']
nonsubs_top_ct = ['australia', 'india', 'singapore', 'germany', 'philippines']

shortlisted_countries = subs_top_ct + nonsubs_top_ct

In [20]:
# per user, sum pvs
country = sum_pvs(df, 'GA_country')

country.columns = country.columns.map(lambda x : 'ct_'+x if x !='GA_fullVisitorId' else x)
country

GA_country,GA_fullVisitorId,ct_australia,ct_canada,ct_germany,ct_india,ct_japan,ct_other,ct_philippines,ct_russia,ct_singapore,ct_united kingdom,ct_united states
0,10000033615091336580,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,10000055443197339256,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
2,10000060100047973466,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
3,10000103435832780646,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
4,10000176091294524415,0.00,0.00,0.00,2.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...
599995,9999685864719983756,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,2.00
599996,999978743196362429,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,2.00
599997,9999806071018918881,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,2.00
599998,9999855144075277024,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00


* **Feature 5: Device OS - sum pvs**

In [21]:
# per user,sum pvs

device_os = sum_pvs(df, 'GA_deviceOperatingSystem')

device_os.columns = device_os.columns.map(lambda x : 'dos_'+x if x !='GA_fullVisitorId' else x)

device_os

GA_deviceOperatingSystem,GA_fullVisitorId,dos_android,dos_ios,dos_macintosh,dos_other,dos_windows
0,10000033615091336580,0.00,1.00,0.00,0.00,0.00
1,10000055443197339256,0.00,1.00,0.00,0.00,0.00
2,10000060100047973466,1.00,0.00,0.00,0.00,0.00
3,10000103435832780646,1.00,0.00,0.00,0.00,0.00
4,10000176091294524415,2.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...
599995,9999685864719983756,2.00,0.00,0.00,0.00,0.00
599996,999978743196362429,2.00,0.00,0.00,0.00,0.00
599997,9999806071018918881,2.00,0.00,0.00,0.00,0.00
599998,9999855144075277024,1.00,0.00,0.00,0.00,0.00


### Content categories
* Content categories = IAB Tier 1, Tier 2, PC (shortlisted), PS (shortlisted)

* **Feature 6: Tier 1 - sum pvs**

In [22]:
print("Unique Tier 1: ", len(df.tier1.unique())) 

# per user, sum pvs only in above non-null tier1s
t1 = sum_pvs(df, 'tier1')

t1.columns = t1.columns.map(lambda x : 't1_'+ str(x) +'_pvs' if x !='GA_fullVisitorId' else x)
t1

Unique Tier 1:  31


tier1,GA_fullVisitorId,t1_Automotive_pvs,t1_Books and Literature_pvs,t1_Business and Finance_pvs,t1_Careers_pvs,t1_Content Channel_pvs,t1_Education_pvs,t1_Events and Attractions_pvs,t1_Family and Relationships_pvs,t1_Fine Art_pvs,...,t1_Religion & Spirituality_pvs,t1_Science_pvs,t1_Shopping_pvs,t1_Sports_pvs,t1_Style & Fashion_pvs,t1_Technology & Computing_pvs,t1_Television_pvs,t1_Travel_pvs,t1_Video Gaming_pvs,t1_other_pvs
0,10000033615091336580,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,10000055443197339256,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,10000060100047973466,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,10000103435832780646,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,10000176091294524415,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,2.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,9999685864719983756,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,1.00
599996,999978743196362429,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,1.00
599997,9999806071018918881,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00
599998,9999855144075277024,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


* **Feature 7: Tier 2 - sum pvs**

In [23]:
print("Unique Tier 2: ", len(df.tier2.unique())) 

# per user, sum pvs only in above non-null tier1s
t2 = sum_pvs(df, 'tier2')

t2.columns = t2.columns.map(lambda x : 't2_'+ str(x) +'_pvs' if x !='GA_fullVisitorId' else x)
t2

Unique Tier 2:  309


tier2,GA_fullVisitorId,t2_Action and Adventure Movies_pvs,t2_Adult Education_pvs,t2_Alcoholic Beverages_pvs,t2_Alternative Music_pvs,t2_American Football_pvs,t2_Amusement and Theme Parks_pvs,t2_Animation Movies_pvs,t2_Animation TV_pvs,t2_Apartments_pvs,...,t2_Wellness_pvs,t2_Women's Fashion_pvs,t2_Women's Health_pvs,t2_World Cuisines_pvs,t2_World Movies_pvs,t2_Wrestling_pvs,t2_Young Adult Literature_pvs,t2_Zoos & Aquariums_pvs,t2_eSports_pvs,t2_other_pvs
0,10000033615091336580,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,10000055443197339256,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,10000060100047973466,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,10000103435832780646,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,10000176091294524415,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,2.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,9999685864719983756,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
599996,999978743196362429,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
599997,9999806071018918881,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
599998,9999855144075277024,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00


* **Feature 8: Prim Channel- sum pvs**

In [24]:
pc = sum_pvs(df, 'GA_primaryChannel')

pc.columns = pc.columns.map(lambda x : 'pc_'+ str(x) + '_pvs' if x !='GA_fullVisitorId' else x)
pc

GA_primaryChannel,GA_fullVisitorId,pc_advisor_pvs,pc_asia_pvs,pc_billionaires_pvs,pc_business_pvs,pc_consumer_pvs,pc_entrepreneurs_pvs,pc_forbes finds_pvs,pc_home_pvs,pc_industry_pvs,...,pc_lists_pvs,pc_money_pvs,pc_newsletters_pvs,pc_opinion_pvs,pc_other_pvs,pc_real estate_pvs,pc_small business_pvs,pc_tech_pvs,pc_under 30_pvs,pc_video_pvs
0,10000033615091336580,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,10000055443197339256,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,10000060100047973466,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,10000103435832780646,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,10000176091294524415,0.00,2.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,9999685864719983756,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,...,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
599996,999978743196362429,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
599997,9999806071018918881,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00
599998,9999855144075277024,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00


* **Feature 9: Prim Section - sum pvs**

In [25]:
ps = sum_pvs(df, 'GA_primarySection')

ps.columns = ps.columns.map(lambda x : 'ps_'+ str(x) + '_pvs' if x !='GA_fullVisitorId' else x)
ps

GA_primarySection,GA_fullVisitorId,ps_aerospace & defense_pvs,ps_ai_pvs,ps_arts_pvs,ps_banking & insurance_pvs,ps_careers_pvs,ps_cars & bikes_pvs,ps_cfo network_pvs,ps_cio network_pvs,ps_cloud_pvs,...,ps_science_pvs,ps_small business strategy_pvs,ps_spirits_pvs,ps_sportsmoney_pvs,ps_taxes_pvs,ps_transportation_pvs,ps_travel_pvs,ps_venture capital_pvs,ps_vices_pvs,ps_wealth management_pvs
0,10000033615091336580,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,10000055443197339256,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,10000060100047973466,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,10000103435832780646,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,10000176091294524415,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,9999685864719983756,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
599996,999978743196362429,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
599997,9999806071018918881,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00
599998,9999855144075277024,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


* **Feature 10: Tier 1 - Avg. time on page**

In [26]:
t1_top = calc_top(df, 'tier1')

# rename
t1_top.columns = t1_top.columns.map(lambda x : 't1_'+ str(x) + '_top' if x !='GA_fullVisitorId' else x)

t1_top

tier1,t1_Automotive_top,t1_Books and Literature_top,t1_Business and Finance_top,t1_Careers_top,t1_Content Channel_top,t1_Education_top,t1_Events and Attractions_top,t1_Family and Relationships_top,t1_Fine Art_top,t1_Food & Drink_top,...,t1_Science_top,t1_Shopping_top,t1_Sports_top,t1_Style & Fashion_top,t1_Technology & Computing_top,t1_Television_top,t1_Travel_top,t1_Video Gaming_top,t1_other_top,GA_fullVisitorId
0,62.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000033615091336580
1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000055443197339256
2,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000060100047973466
3,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000103435832780646
4,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000176091294524415
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,40.00,0.00,0.00,0.00,282.00,9999685864719983756
599996,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,12.00,0.00,0.00,0.00,0.00,999978743196362429
599997,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,82.00,0.00,0.00,9999806071018918881
599998,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9999855144075277024


* **Feature 11: Bounce rate** 

In [27]:
def b_rate(g):
    '''for each fvid: calculate percentage of sessions comprising only 1PV'''
    
    # count sessions w/ pv = 1
    sessions_w_1pv = g[g['session_pvs']==1].shape[0]
    
    # count total sessions
    total_sessions = g.shape[0]
    
    # calculate ratio
    return (sessions_w_1pv)/total_sessions

br = pd.DataFrame(session.groupby('GA_fullVisitorId').apply(lambda x: b_rate(x))).reset_index().rename(columns={0:'bounce_rate'}) # takes 3mins
br

Unnamed: 0,GA_fullVisitorId,bounce_rate
0,10000033615091336580,1.00
1,10000055443197339256,1.00
2,10000060100047973466,1.00
3,10000103435832780646,1.00
4,10000176091294524415,1.00
...,...,...
599995,9999685864719983756,1.00
599996,999978743196362429,1.00
599997,9999806071018918881,1.00
599998,9999855144075277024,1.00


* **Feature 12: Content views rate**

In [28]:
natid_page_map = df[['GA_pagePath', 'GA_cmsNaturalId', 'publish_date']].sort_values('publish_date', ascending=False).drop_duplicates('GA_pagePath')

# join page path with their natids
page = pd.merge(page, 
                natid_page_map, 
                how="left", 
                on="GA_pagePath")

In [29]:
def c_views_rate(g):
    '''for each fvid: calculate percentage PVs that are actually views on content pages; vs non-content pages such as the home page, channel/section landing pages, author pages, etc.'''
    
    # sum pvs on actual content for user
    content_sum_pv = g[g.GA_cmsNaturalId.str.contains("blogandpostid|blogandslideid|galleryid|video")].GA_pageViews.sum()
    
    # sum all pvs for user
    total_pv = g.GA_pageViews.sum()
    
    # calculate ratio
    return (content_sum_pv)/total_pv


cvr = pd.DataFrame(page.groupby('GA_fullVisitorId').apply(lambda x: c_views_rate(x))).reset_index().rename(columns={0:'content_views_rate'}) # takes 5mins
cvr

Unnamed: 0,GA_fullVisitorId,content_views_rate
0,10000033615091336580,1.00
1,10000055443197339256,1.00
2,10000060100047973466,1.00
3,10000103435832780646,1.00
4,10000176091294524415,1.00
...,...,...
599995,9999685864719983756,1.00
599996,999978743196362429,1.00
599997,9999806071018918881,1.00
599998,9999855144075277024,1.00


### Timestamp features

In [30]:
df['GA_date']=pd.to_datetime(df['GA_date'],errors='coerce')

df['dayofweek'] = df.GA_date.dt.day_name()

week_dict = {True: "weekday", False: "weekend"}

df['weekday'] = ((df.GA_date.dt.dayofweek)// 5 != 1).astype("category")
df['weekday'] = df['weekday'].map(week_dict)

df['day'] = df.GA_date.dt.day
df['month'] = df.GA_date.dt.month

# time
df['session_time']=df['GA_visitStartTime'].apply(convert_time)

df['session_time']=pd.to_datetime(df['session_time'],errors='coerce')

df['est_time']=df['session_time'].dt.tz_localize('UTC').dt.tz_convert('US/Eastern')

# for US, calculate hour from EST conversion. Else calculate hour from GMT
df['hour']=np.where((df['GA_country']=="united states"),
                    df.est_time.dt.hour,
                    df.session_time.dt.hour)

df['minute']=np.where((df['GA_country']=="united states"),
                    df.est_time.dt.minute,
                      df.session_time.dt.minute)

df['business_hours']= np.where((df['hour']>=8) & (df['hour']<18), 'business_hours', 'non_business_hours')

* **Feature 13:  Weekend/Weekday - Sum pvs**

In [31]:
# per user, sum pvs in categories
wk_df = sum_pvs(df, 'weekday')

wk_df.columns = wk_df.columns.map(lambda x : str(x) +'_pvs' if x !='GA_fullVisitorId' else x)
wk_df

weekday,GA_fullVisitorId,weekend_pvs,weekday_pvs
0,10000033615091336580,0,1
1,10000055443197339256,0,1
2,10000060100047973466,0,1
3,10000103435832780646,1,0
4,10000176091294524415,2,0
...,...,...,...
599995,9999685864719983756,0,2
599996,999978743196362429,0,2
599997,9999806071018918881,2,0
599998,9999855144075277024,0,1


* **Feature 14:  Weekend/Weekday - Avg. top**

In [32]:
wk_top = calc_top(df, 'weekday')

# rename
wk_top.columns = wk_top.columns.map(lambda x : str(x) + '_top' if x !='GA_fullVisitorId' else x)

wk_top

weekday,weekend_top,weekday_top,GA_fullVisitorId
0,0.00,62.00,10000033615091336580
1,0.00,13.00,10000055443197339256
2,0.00,311.00,10000060100047973466
3,124.00,0.00,10000103435832780646
4,0.00,0.00,10000176091294524415
...,...,...,...
599995,0.00,161.00,9999685864719983756
599996,0.00,6.00,999978743196362429
599997,80.00,0.00,9999806071018918881
599998,0.00,42.00,9999855144075277024


* **Feature 15: Busi vs Non-Busi hours - Sum pvs**

In [33]:
# per user, sum pvs in categories
busi_df = sum_pvs(df, 'business_hours')

busi_df.columns = busi_df.columns.map(lambda x : str(x) +'_pvs' if x !='GA_fullVisitorId' else x)
busi_df

business_hours,GA_fullVisitorId,business_hours_pvs,non_business_hours_pvs
0,10000033615091336580,0.00,1.00
1,10000055443197339256,0.00,1.00
2,10000060100047973466,0.00,1.00
3,10000103435832780646,0.00,1.00
4,10000176091294524415,2.00,0.00
...,...,...,...
599995,9999685864719983756,1.00,1.00
599996,999978743196362429,2.00,0.00
599997,9999806071018918881,1.00,1.00
599998,9999855144075277024,0.00,1.00


* **Feature 16: Busi vs Non-Busi hours - Avg. top**

In [34]:
busi_top = calc_top(df, 'business_hours')

# rename
busi_top.columns = busi_top.columns.map(lambda x : str(x) + '_top' if x !='GA_fullVisitorId' else x)

busi_top

business_hours,business_hours_top,non_business_hours_top,GA_fullVisitorId
0,0.00,62.00,10000033615091336580
1,0.00,13.00,10000055443197339256
2,0.00,311.00,10000060100047973466
3,0.00,124.00,10000103435832780646
4,0.00,0.00,10000176091294524415
...,...,...,...
599995,282.00,40.00,9999685864719983756
599996,6.00,0.00,999978743196362429
599997,82.00,78.00,9999806071018918881
599998,0.00,42.00,9999855144075277024


* **Feature 14:  Day of week - Avg. top**

In [35]:
dow_top = calc_top(df, 'dayofweek')

# rename
dow_top.columns = dow_top.columns.map(lambda x : str(x) + '_top' if x !='GA_fullVisitorId' else x)

dow_top

dayofweek,Friday_top,Monday_top,Saturday_top,Sunday_top,Thursday_top,Tuesday_top,Wednesday_top,GA_fullVisitorId
0,0.00,62.00,0.00,0.00,0.00,0.00,0.00,10000033615091336580
1,0.00,13.00,0.00,0.00,0.00,0.00,0.00,10000055443197339256
2,0.00,311.00,0.00,0.00,0.00,0.00,0.00,10000060100047973466
3,0.00,0.00,0.00,124.00,0.00,0.00,0.00,10000103435832780646
4,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000176091294524415
...,...,...,...,...,...,...,...,...
599995,282.00,0.00,0.00,0.00,0.00,0.00,40.00,9999685864719983756
599996,12.00,0.00,0.00,0.00,0.00,0.00,0.00,999978743196362429
599997,0.00,0.00,80.00,0.00,0.00,0.00,0.00,9999806071018918881
599998,0.00,0.00,0.00,0.00,42.00,0.00,0.00,9999855144075277024


* **Feature 15: Day of month - Avg. top**

In [36]:
dom_top = calc_top(df, 'day')

# rename
dom_top.columns = dom_top.columns.map(lambda x : 'day_of_mon_' + str(x) + '_top' if x !='GA_fullVisitorId' else x)

dom_top

day,day_of_mon_1_top,day_of_mon_2_top,day_of_mon_3_top,day_of_mon_4_top,day_of_mon_5_top,day_of_mon_6_top,day_of_mon_7_top,day_of_mon_8_top,day_of_mon_9_top,day_of_mon_10_top,...,day_of_mon_23_top,day_of_mon_24_top,day_of_mon_25_top,day_of_mon_26_top,day_of_mon_27_top,day_of_mon_28_top,day_of_mon_29_top,day_of_mon_30_top,day_of_mon_31_top,GA_fullVisitorId
0,62.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000033615091336580
1,13.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000055443197339256
2,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000060100047973466
3,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000103435832780646
4,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000176091294524415
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,161.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9999685864719983756
599996,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,12.00,0.00,0.00,0.00,0.00,0.00,999978743196362429
599997,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,78.00,0.00,0.00,0.00,0.00,9999806071018918881
599998,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9999855144075277024


* **Feature 16: Hourly - Avg. top**
    * Here for example when country = US, hour 9 is 9 EST and for non-US countries, hour 9 is 9 GMT

In [37]:
hour_top = calc_top(df, 'hour')

# rename
hour_top.columns = hour_top.columns.map(lambda x : 'hour_' + str(x) + '_top' if x !='GA_fullVisitorId' else x)

hour_top

hour,hour_0_top,hour_1_top,hour_2_top,hour_3_top,hour_4_top,hour_5_top,hour_6_top,hour_7_top,hour_8_top,hour_9_top,...,hour_15_top,hour_16_top,hour_17_top,hour_18_top,hour_19_top,hour_20_top,hour_21_top,hour_22_top,hour_23_top,GA_fullVisitorId
0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,62.00,10000033615091336580
1,13.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000055443197339256
2,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,311.00,0.00,0.00,0.00,0.00,10000060100047973466
3,0.00,0.00,0.00,0.00,0.00,124.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000103435832780646
4,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000176091294524415
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,40.00,0.00,0.00,0.00,0.00,9999685864719983756
599996,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,999978743196362429
599997,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,78.00,9999806071018918881
599998,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,42.00,0.00,0.00,0.00,0.00,0.00,9999855144075277024


* **Feature 17: Minute -  avg. top**

In [38]:
minute_top = calc_top(df, 'minute')

# rename
minute_top.columns = minute_top.columns.map(lambda x : 'minute_' + str(x) + '_top' if x !='GA_fullVisitorId' else x)

minute_top

minute,minute_0_top,minute_1_top,minute_2_top,minute_3_top,minute_4_top,minute_5_top,minute_6_top,minute_7_top,minute_8_top,minute_9_top,...,minute_51_top,minute_52_top,minute_53_top,minute_54_top,minute_55_top,minute_56_top,minute_57_top,minute_58_top,minute_59_top,GA_fullVisitorId
0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000033615091336580
1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,13.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000055443197339256
2,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,311.00,0.00,0.00,10000060100047973466
3,0.00,0.00,0.00,0.00,0.00,124.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000103435832780646
4,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000176091294524415
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9999685864719983756
599996,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,12.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,999978743196362429
599997,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,82.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9999806071018918881
599998,0.00,0.00,42.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9999855144075277024


### Combine all features in 1 dataframe

In [41]:
inner_join_list = [pvs, top, br, cvr, ref, country, device_os, t1, t2, pc, ps, t1_top, wk_df, wk_top, busi_df, busi_top, dow_top, dom_top, hour_top, minute_top]

final_df = reduce(lambda x, y: pd.merge(x, y, on = 'GA_fullVisitorId', how="inner"), inner_join_list)

final_df.shape

(600000, 605)

In [42]:
# clean column names - can only contain letters and underscores

col_names = final_df.columns
col_names = [re.sub('[^A-Za-z0-9]+', ' ', x.replace("&", "and").replace("'s", "")).strip().replace(" ", "_").lower() for x in col_names]
final_df.columns = col_names

In [43]:
print("Any nulls?: ", final_df.isna().sum().unique())

final_df.tail()

Any nulls?:  [0]


Unnamed: 0,ga_fullvisitorid,session_pvs_mean,session_pvs_median,session_top_mean,session_top_median,bounce_rate,content_views_rate,rf_content_aggregators,rf_direct,rf_fbia,...,minute_50_top,minute_51_top,minute_52_top,minute_53_top,minute_54_top,minute_55_top,minute_56_top,minute_57_top,minute_58_top,minute_59_top
599995,9999685864719983756,1.0,1.0,161.0,161.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
599996,999978743196362429,1.0,1.0,6.0,6.0,1.0,1.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
599997,9999806071018918881,1.0,1.0,80.0,80.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,82.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
599998,9999855144075277024,1.0,1.0,42.0,42.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
599999,9999889568795611075,1.0,1.0,13.0,13.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
print("Total users in input df:", len(df.GA_fullVisitorId.unique()))
print("Total users in train df:", len(final_df.ga_fullvisitorid.unique()))

Total users in input df: 600000
Total users in train df: 600000


In [45]:
final_df.to_csv("pred_ns_data_12012021.csv", index=False)

In [2]:
final_df = pd.read_csv("pred_ns_data_12012021.csv")

In [3]:
print("Any NAs?: ", final_df.columns[final_df.isna().any()])

print("Shape:", final_df.shape)
print()

Any NAs?:  Index([], dtype='object')
Shape: (600000, 605)



In [4]:
BUCKET_NAME = 'bi-subscription-modeling'
BUCKET_FOLDER = 'train-val-data'

# upload .csv 
file_name = 'pred_ns_data_12012021.csv'

upload_csv_file(final_df, 
                BUCKET_NAME, 
                BUCKET_FOLDER, 
                file_name) 

File uploaded to gs://bi-subscription-modeling/train-val-data/pred_ns_data_12012021.csv


In [5]:
final_df

Unnamed: 0,ga_fullvisitorid,session_pvs_mean,session_pvs_median,session_top_mean,session_top_median,bounce_rate,content_views_rate,rf_content_aggregators,rf_direct,rf_fbia,...,minute_50_top,minute_51_top,minute_52_top,minute_53_top,minute_54_top,minute_55_top,minute_56_top,minute_57_top,minute_58_top,minute_59_top
0,10000033615091337216.00,1.00,1.00,62.00,62.00,1.00,1.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,10000055443197339648.00,1.00,1.00,13.00,13.00,1.00,1.00,0.00,0.00,0.00,...,0.00,13.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,10000060100047974400.00,1.00,1.00,311.00,311.00,1.00,1.00,0.00,1.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,311.00,0.00,0.00
3,10000103435832780800.00,1.00,1.00,124.00,124.00,1.00,1.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,10000176091294523392.00,1.00,1.00,0.00,0.00,1.00,1.00,0.00,2.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,9999685864719984640.00,1.00,1.00,161.00,161.00,1.00,1.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
599996,999978743196362368.00,1.00,1.00,6.00,6.00,1.00,1.00,2.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
599997,9999806071018917888.00,1.00,1.00,80.00,80.00,1.00,1.00,0.00,0.00,0.00,...,0.00,0.00,82.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
599998,9999855144075276288.00,1.00,1.00,42.00,42.00,1.00,1.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


In [7]:
final_df[final_df.session_top_mean!=final_df.session_top_median]

Unnamed: 0,ga_fullvisitorid,session_pvs_mean,session_pvs_median,session_top_mean,session_top_median,bounce_rate,content_views_rate,rf_content_aggregators,rf_direct,rf_fbia,...,minute_50_top,minute_51_top,minute_52_top,minute_53_top,minute_54_top,minute_55_top,minute_56_top,minute_57_top,minute_58_top,minute_59_top
6,10000245216351053824.00,1.00,1.00,24.00,8.50,1.00,1.00,0.00,0.00,0.00,...,0.00,99.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
12,10000329724181737472.00,1.00,1.00,542.33,98.00,1.00,1.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
15,10000442277174104064.00,1.00,1.00,46.33,39.00,1.00,1.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
20,10000551368453156864.00,1.00,1.00,4.78,0.00,1.00,1.00,0.00,9.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
26,10000655588971749376.00,1.00,1.00,137.67,77.00,1.00,1.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599964,9998893156316645376.00,1.00,1.00,72.92,2.00,1.00,1.00,0.00,7.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,48.50,0.00,0.00,0.00
599967,9998931966311092224.00,1.00,1.00,16.33,24.00,1.00,1.00,3.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
599990,999948398189211008.00,1.00,1.00,42.00,1.00,1.00,1.00,0.00,3.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
599991,9999558994162692096.00,1.00,1.00,109.00,45.00,1.00,1.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
