In [None]:
"""Analytics Reporting API V4."""
import numpy as np

import argparse
from datetime import date, datetime

from apiclient.discovery import build
import httplib2
from oauth2client import client
from oauth2client import file
from oauth2client import tools
from tqdm import tqdm
import pylab
import matplotlib.pyplot as plt
import pandas as pd

import numpy as np


# Default dimensions and metrics requested from the Reporting API.
DIMS = [
    'ga:clientId',
    'ga:deviceCategory',
    'ga:userType',
    'ga:daysSinceLastSession',
    'ga:date',
]
METRICS = [
    'ga:hits',
    'ga:avgSessionDuration',
    'ga:pageviews',
    'ga:uniquePageviews',
]

# Reporting window (inclusive) used by every request.
START_DATE = "2020-09-01"
END_DATE = "2021-11-20"
N = 50  # Threshold of pageviews - dependent on the timeframe.

SCOPES = ['https://www.googleapis.com/auth/analytics.readonly']
CLIENT_SECRETS_PATH = ''  # Path to the client_secrets.json file.
VIEW_ID = ''  # Fill in your own Google Analytics view ID.


def initialize_analyticsreporting():
    """
    Build an authorized Analytics Reporting API v4 service object.

    Stored credentials are read from ``analyticsreporting.dat``. When there are
    none, or they are flagged invalid, the oauth2client flow created from
    ``CLIENT_SECRETS_PATH`` is run with that same storage object.

    Returns:
    The object returned by ``build('analyticsreporting', 'v4', http=...)``.
    """
    # Parse an explicit empty argument list, so the process's own argv is
    # never interpreted as oauth2client flags.
    flags = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[tools.argparser],
    ).parse_args([])

    # Flow object, only used when (re-)authentication turns out to be needed.
    flow = client.flow_from_clientsecrets(
        CLIENT_SECRETS_PATH,
        scope=SCOPES,
        message=tools.message_if_missing(CLIENT_SECRETS_PATH),
    )

    # The storage object is passed to run_flow; per the original author's note
    # this lets fresh credentials be written back to the file.
    token_store = file.Storage('analyticsreporting.dat')
    credentials = token_store.get()
    needs_auth = credentials is None or credentials.invalid
    if needs_auth:
        credentials = tools.run_flow(flow, token_store, flags)

    authorized_http = credentials.authorize(http=httplib2.Http())
    return build('analyticsreporting', 'v4', http=authorized_http)

def get_report(analytics, dims, metrics):
    """
    Run one ``reports().batchGet`` request for the configured view and dates.

    Args:
        analytics: service object exposing ``reports().batchGet(body=...)``.
        dims: dimension names, each sent as ``{'name': ...}``.
        metrics: metric expressions, each sent as ``{'expression': ...}``.

    Returns:
        Whatever ``.execute()`` returns for the single report request.
    """
    # NOTE(review): only one request is issued; no page token is read or sent,
    # so results beyond `pageSize` rows would not be fetched - confirm.
    report_request = {
        'viewId': VIEW_ID,
        'dateRanges': [{'startDate': START_DATE, 'endDate': END_DATE}],
        'dimensions': [{'name': dim} for dim in dims],
        'metrics': [{'expression': metric} for metric in metrics],
        "samplingLevel": "LARGE",
        "pageSize": 100000,
    }
    query = analytics.reports().batchGet(body={'reportRequests': [report_request]})
    return query.execute()

def report_dataframe(response, dims, metrics):
    """
    Flatten a ``batchGet`` response into a DataFrame.

    Args:
        response: dict with an optional ``'reports'`` list; each report may hold
            ``data.rows``, each row carrying ``'dimensions'`` and ``'metrics'``.
        dims: dimension names, in the order they appear in each row.
        metrics: metric names, in the order of each ``'values'`` list.

    Returns:
        DataFrame with one column per dimension/metric. Column names have
        everything up to the last ``':'`` stripped. Values are stored as received.
    """
    columns = {f"{name}": [] for name in dims + metrics}

    for report in response.get('reports', []):
        for row in report.get('data', {}).get('rows', []):
            # Dimensions are positional: the i-th value belongs to dims[i].
            for pos, dim_name in enumerate(dims):
                columns[dim_name].append(row.get('dimensions', [])[pos])

            # One metrics entry per date range; each holds a positional value list.
            for metric_block in row.get('metrics', []):
                metric_values = metric_block.get('values', [])
                for pos, metric_name in enumerate(metrics):
                    columns[metric_name].append(metric_values[pos])

    frame = pd.DataFrame(data=columns)
    frame.columns = [label.split(':')[-1] for label in frame.columns]
    return frame


def report_clientid(analytics, user_ids):
    """
    Query the User Activity endpoint once per client id.

    Args:
        analytics: service object exposing ``userActivity().search(body=...)``.
        user_ids: iterable of client ids; each is sent as a ``CLIENT_ID`` user.

    Returns:
        List with one entry per id, in input order. A failed lookup contributes
        an empty dict, so the list always has the same length as the input.
    """
    import warnings  # local import keeps this block self-contained

    report = []

    for user_id in tqdm(user_ids):
        try:
            data = analytics.userActivity().search(body={
                "viewId": VIEW_ID,
                "dateRange": {
                    "startDate": START_DATE,
                    "endDate": END_DATE
                },
                "user": {
                    "type": "CLIENT_ID",
                    "userId": user_id
                }
            }).execute()
        except Exception as exc:
            # Best effort: one failing client must not abort the whole run.
            # Catching `Exception` instead of using a bare `except` lets
            # KeyboardInterrupt/SystemExit stop the loop.
            warnings.warn(
                f"userActivity lookup failed ({type(exc).__name__}): {exc}"
            )
            data = {}

        report.append(data)

    return report


In [None]:
def event_to_df(df, value):
    """Copy every key/value pair of ``value`` into ``df`` by item assignment."""
    for column, content in value.items():
        df[column] = content
    

def report_dataframe_sessions(response):
    """
    Build a DataFrame with one row per session.

    Args:
        response: list with one dict per user. Each dict may hold a
            ``'sessions'`` list, and each session a list of ``'activities'``.
            Dicts without ``'sessions'`` (the ``{}`` placeholders that
            ``report_clientid`` stores for failed lookups) contribute no rows.

    Returns:
        DataFrame with every session field except ``'activities'``, plus a
        ``'user_id'`` column where it could be read from the first activity.
    """
    records = []
    for user in response:
        for session in user.get('sessions', []):
            # Keep all session-level fields; the nested activity list is dropped.
            record = {key: val for key, val in session.items() if key != 'activities'}

            try:
                # NOTE(review): this reads the entry at list position 17, whereas
                # report_dataframe_activities matches on `index == 18` - confirm
                # both refer to the same custom dimension.
                record['user_id'] = session['activities'][0]['customDimension'][17]['value']
            except (KeyError, IndexError, TypeError):
                # No activities, or no usable custom dimension: leave user_id unset.
                pass

            records.append(record)

    return pd.DataFrame(records)
        

def report_dataframe_activities(response):
    """
    Build a DataFrame with one row per activity (hit).

    Args:
        response: list with one dict per user. Each dict may hold a
            ``'sessions'`` list, and each session a ``'sessionId'`` and a list
            of ``'activities'``. Dicts without ``'sessions'`` (the ``{}``
            placeholders that ``report_clientid`` stores for failed lookups)
            contribute no rows.

    Returns:
        DataFrame where each custom dimension that has a value becomes a column
        keyed by its ``index`` (index 18 is renamed to ``'user_id'``). All other
        activity fields follow, and then a ``'session_id'`` column.
    """
    records = []
    for user in response:
        for session in user.get('sessions', []):
            session_id = session['sessionId']
            for activity in session['activities']:
                record = {}

                # Activities without custom dimensions are valid. The original
                # unconditional list.remove('customDimension') raised for them.
                for dimension in activity.get('customDimension', []):
                    if 'value' in dimension:
                        record[dimension['index']] = dimension['value']

                if 18 in record:
                    record['user_id'] = record.pop(18)

                for key, val in activity.items():
                    if key != 'customDimension':
                        record[key] = val

                record['session_id'] = session_id
                records.append(record)

    return pd.DataFrame(records)
            

# Calling functions

In [None]:
# Reference for Google Analytics dimensions and metrics:
# https://ga-dev-tools.web.app/dimensions-metrics-explorer/

analytics = initialize_analyticsreporting()

# Re-declare the request lists for this run: the dimensions additionally carry
# the country and custom dimension 18.
DIMS = [
    'ga:clientId',
    'ga:deviceCategory',
    'ga:userType',
    'ga:daysSinceLastSession',
    'ga:date',
    'ga:country',
    'ga:dimension18',
]
METRICS = [
    'ga:hits',
    'ga:avgSessionDuration',
    'ga:pageviews',
    'ga:uniquePageviews',
]

response_users = get_report(analytics, DIMS, METRICS)

## Users

In [None]:
# Flatten the report, then keep only rows whose custom dimension 18 is not the
# 'NA' placeholder.
users = report_dataframe(response_users, DIMS, METRICS).loc[
    lambda frame: frame['dimension18'] != 'NA'
]

In [None]:
# Summary statistics of the number of report rows per clientId.
users['clientId'].value_counts().describe()

In [None]:
# How many clients have 1, 2, 3, ... report rows (bar chart, log-scaled counts).
a = users['clientId'].value_counts().value_counts()

fig, ax = plt.subplots(1)
ax = a.plot(kind='bar', ax=ax)
ax.set_yscale('log')

plt.show()

# Target Feature Engineering

1. Check the distribution only for users with more than one visit
2. Add a column with the time difference in days between each session and the next one (or today)
3. Define "churned" based on the distribution of times between sessions.

- surprisingly high cross-device usage
- remove outliers and e.g. company visits

In [None]:
users_churn = users.copy()

# Keep only users (custom dimension 18) that appear on more than one row.
df_group = users_churn.groupby('dimension18')['date'].count()
returning_users = df_group.index[df_group > 1]

users_churn = users_churn[users_churn['dimension18'].isin(returning_users)]


In [None]:
# Parse the GA date strings (YYYYMMDD). `.assign` returns a new frame instead of
# writing into what may be a slice of `users`.
users_churn = users_churn.assign(
    date=pd.to_datetime(users_churn['date'], format='%Y%m%d')
)

today = datetime.now()

# Add one synthetic "today" row per user, so the most recent real visit also
# gets a gap (time since last seen). DataFrame.append was removed in
# pandas 2.0; pd.concat is the equivalent.
today_rows = users_churn.drop_duplicates("dimension18").assign(date=today)
users_churn = pd.concat([users_churn, today_rows], ignore_index=True)

In [None]:

users_churn['delta'] = np.nan
users_churn['latestvisit'] = 0

# For every user: order the visits newest-first and store, on each visit, the
# number of days until the chronologically next one (the synthetic "today" row
# for the newest real visit). The newest row of each user keeps delta = NaN.
#
# `.groups` yields each user's row labels in a single pass, and `.loc[label]`
# replaces the original full-frame boolean scans per user and per row. This
# relies on the frame's index being unique.
for row_labels in users_churn.groupby('dimension18').groups.values():
    visits = users_churn.loc[row_labels, 'date'].sort_values(ascending=False)

    gaps = (visits.shift(1) - visits).dt.days
    users_churn.loc[visits.index[1:], 'delta'] = gaps.to_numpy()[1:]

    # Second-newest row = the user's most recent row after the newest one.
    users_churn.loc[visits.index[1], 'latestvisit'] = 1


In [None]:
# What is a realistic return rate? Do users perhaps only come to the page for specific campaigns/events?

In [None]:
# Remove the rows whose date equals `today`.
users_churn = users_churn[users_churn['date'] != today]

In [None]:
# Peek at the first rows when ordered by user and date, both descending.
users_churn.sort_values(by=['dimension18', 'date'], ascending=[False, False]).head(3)

In [None]:
# Rows not flagged as `latestvisit`, and their day gaps.
users_churn_returning = users_churn[users_churn['latestvisit'] == 0]
returned_users_delta = users_churn_returning['delta']

In [None]:
# Frequency of each gap length, ordered by gap length.
delta_counts = returned_users_delta.value_counts()
n = delta_counts.sort_index()

In [None]:
# Bar chart of gap-length frequencies on a log-scaled y axis.
fig, ax = plt.subplots(1)
ax = n.plot(kind='bar', ax=ax)

fig.set_size_inches(20, 7)
ax.set_yscale('log')

plt.show()

In [None]:
# Summary statistics of the day gaps selected above.
returned_users_delta.describe()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Plot the normal density that has the same mean and standard deviation as the
# observed gaps between visits.
delta_describe = returned_users_delta.describe()

mean = delta_describe['mean']
std = delta_describe['std']
variance = std**2

# Cover mean +/- 4 standard deviations so the whole bell is visible.
x = np.linspace(mean - 4 * std, mean + 4 * std, 500)

# N(mean, variance) density. The exponent divides by (2 * variance); the
# original `/2*variance` multiplied by the variance instead.
f = np.exp(-np.square(x - mean) / (2 * variance)) / np.sqrt(2 * np.pi * variance)

plt.plot(x, f)
plt.show()

In [None]:
# Absolute z-score of each gap relative to the `returned_users_delta` gaps
# (population standard deviation, ddof=0).
delta_mean = returned_users_delta.mean()
delta_std = returned_users_delta.std(ddof=0)
users_churn['zscore'] = ((users_churn['delta'] - delta_mean) / delta_std).abs()

In [None]:
# A row counts as churned (1) when its z-score exceeds 3; a missing z-score
# compares as False and therefore yields 0.
users_churn['churned'] = (users_churn['zscore'] > 3).astype('int64')

In [None]:
# Replace the device category column with one indicator column per category.
one_hot = pd.get_dummies(users_churn['deviceCategory'])
users_churn = users_churn.drop(columns='deviceCategory').join(one_hot)

In [None]:
# Preview the frame after the device indicator columns were joined.
users_churn.head(3)

In [None]:
from scipy import stats

users_churn = users_churn.drop(columns=['clientId', 'delta', 'zscore', 'userType', 'daysSinceLastSession', 'country'])

cols = ['hits', 'avgSessionDuration', 'pageviews', 'uniquePageviews']

# Plain column assignment replaces the columns together with their dtype.
# `.loc[:, cols] = ...` writes in place on pandas >= 2.0, which keeps the
# original object dtype and breaks the z-score computation below.
users_churn[cols] = users_churn[cols].apply(pd.to_numeric, errors='coerce')

# A row is an inlier when all four metrics lie within 3 standard deviations.
# Assigning the filtered frame back aligns on the index, so outlier rows are
# kept but their four metrics become NaN (unchanged from the original).
# NOTE(review): with scipy's default NaN handling, a coerced NaN may blank a
# whole column - confirm the inputs are always numeric strings.
inlier_rows = (np.abs(stats.zscore(users_churn[cols])) < 3).all(axis=1)
users_churn[cols] = users_churn[cols][inlier_rows]

users_churn['sessions'] = 1

In [None]:
# Preview after dropping helper columns and converting the metrics to numbers.
users_churn.head(2)

In [None]:
# Collapse the per-visit rows into one record per user (custom dimension 18).
l = []

for user_key in np.unique(users_churn.dimension18.values):
    visits = users_churn[users_churn['dimension18'] == user_key].sort_values('date', ascending=False)

    l.append({
        'dimension18': user_key,
        # Per-visit engagement metrics are averaged.
        'hits': visits['hits'].mean(),
        'avgSessionDuration': visits['avgSessionDuration'].mean(),
        'pageviews': visits['pageviews'].mean(),
        'uniquePageviews': visits['uniquePageviews'].mean(),
        # Rows are ordered newest-first, so this is the newest row's flag.
        'churned': visits['churned'].values[0],
        'sessions': visits['sessions'].sum(),
        # Number of rows per device category.
        'desktop': visits['desktop'].sum(),
        'mobile': visits['mobile'].sum(),
        'tablet': visits['tablet'].sum(),
    })

df_users = pd.DataFrame(l)

In [None]:
# Summary statistics of the per-user table.
# Author's observation: the data is highly unbalanced, especially the target
# column `churned`.
df_users.describe()

In [None]:
import seaborn as sns

# Increase the size of the heatmap.
plt.figure(figsize=(16, 6))

# Correlate the numeric columns only: `dimension18` holds user-id strings, and
# DataFrame.corr() raises on non-numeric columns from pandas 2.0 on.
numeric_corr = df_users.select_dtypes(include='number').corr()

# Fix the colour range to [-1, 1] and annotate each cell with its value.
heatmap = sns.heatmap(numeric_corr, vmin=-1, vmax=1, annot=True)

heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12);

## Sessions and Hits

In [None]:
# Information on a session level (one User Activity request per client).
# Purposefully limited to the first 100 distinct client ids. A clientId can
# appear on several rows of `users`, so de-duplicate first; otherwise the same
# client would be requested repeatedly.
response = report_clientid(analytics, users["clientId"].drop_duplicates()[:100])

In [None]:
# Number of per-client results (failed lookups are included as empty dicts).
len(response)

In [None]:
# One row per session, and one row per activity (hit), from the same responses.
sessions = report_dataframe_sessions(response)
hits = report_dataframe_activities(response)

In [None]:
# Drop the visit-level columns from `users`.
users = users.drop(["userType", "daysSinceLastSession", "date", "clientId"], axis=1)

In [None]:
# For EVENT activities, replace the nested event dict by the label
# "<eventCategory>_<eventAction>". `update` modifies `hits` in place.
is_event = hits['activityType'] == 'EVENT'
event_labels = hits.loc[is_event, 'event'].apply(
    lambda event: event["eventCategory"] + "_" + event["eventAction"]
)
hits.update(event_labels)

In [None]:
# Fill gaps with 0, then drop hits without a user id (which now equal 0).
hits = hits.fillna(0)
hits = hits[hits['user_id'] != 0]

# Indicator-encoded copy of the hits, one column per channel grouping.
categorical_cols = ['channelGrouping']
df_h = pd.get_dummies(hits, columns=categorical_cols)


## Merge dfs

In [None]:
# Attach the per-user aggregates to every hit, then the session fields that
# share the same user id.
result = hits.merge(df_users, left_on="user_id", right_on="dimension18").drop(columns=['dimension18'])
result = result.merge(sessions, on="user_id")


In [None]:
# Row/column count after merging hits with the per-user and session frames.
result.shape

# Churnrate

In [None]:
# Columns to discard: custom-dimension indices (ints) and named fields.
tb_deleted = [
    'dataSource', 4, 34, 47, 48, 36, 33, 45, 7, 19, 20, 21, 32, 37, 41,
    60, 62, 63, 64, 65, 66, 68, 35, 'hostname', 'landingPagePath', 46, 67,
    8, 10, 42, 17, 44, 'deviceCategory', 'platform',
]
# Only drop the ones that are actually present in `result`.
cols = [label for label in result.columns if label in tb_deleted]

result = result.drop(columns=cols).rename(columns={
    23: 'user_professionalspecialtycode',
    25: 'user_professionaldesidnationcode',
    22: 'registrationdate',
})


In [None]:
# Granularity is the user level, so hit/session-level fields are dropped.
# Kept: user_professionalspecialtycode, user_professionaldesidnationcode.
result = result.drop(columns=[
    'activityTime',
    'source',
    'medium',
    'keyword',
    'event',
    'session_id',
    'activityType',
    'registrationdate',
    'sessionId',
    'pageview',
])


In [None]:

# 1 for every row whose campaign is something other than the '(not set)'
# placeholder, else 0.
result['is_campaign'] = (result['campaign'] != '(not set)').astype('int64')
result = result.drop(columns='campaign')

# Replace each categorical column by its indicator columns, which are named
# after the category values themselves.
for column in ['user_professionalspecialtycode', 'user_professionaldesidnationcode', 'channelGrouping']:
    one_hot = pd.get_dummies(result[column])
    result = result.drop(columns=column).join(one_hot)

In [None]:
# Collapse to one row per user: campaign flag -> max, profession indicators ->
# first row's value, channel indicators -> mean across the user's rows.
first_cols = ['AI', 'FMG', 'IM', 'OS', 'EE', 'GD', 'NR']
mean_cols = ['(Other)', 'Direct', 'Email', 'Referral']

aggregations = {'is_campaign': 'max'}
aggregations.update({name: 'first' for name in first_cols})
aggregations.update({name: 'mean' for name in mean_cols})

result = result[['user_id', *aggregations]].groupby(['user_id']).agg(aggregations)

In [None]:
# Join the per-user hit features (indexed by user id) with the per-user
# aggregates, then drop the now redundant key column.
result_ = result.merge(
    df_users, left_index=True, right_on="dimension18"
).drop(columns=['dimension18'])


In [None]:
# Display the final per-user table that feeds the model below.
result_

In [None]:
import xgboost

# Evaluate an XGBoost random-forest classifier with repeated stratified k-fold
# cross-validation.
from numpy import mean, std
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRFClassifier

# NOTE(review): every feature column, numeric ones included, goes through the
# one-hot encoder - confirm that this is intended.
enc = OneHotEncoder(handle_unknown='ignore')
X = enc.fit_transform(result_.drop(columns=['churned'])).toarray()
y = result_['churned']

# Model and evaluation procedure.
model = XGBRFClassifier(n_estimators=100, subsample=0.9, colsample_bynode=0.2)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# One accuracy score per fold and repeat.
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

# Report performance.
print('Mean Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# Clustering

In [None]:
from sklearn.cluster import KMeans

clusters = 3
# Fixed seed so cluster assignments are reproducible across kernel restarts
# (k-means++ initialisation is random otherwise).
model = KMeans(init = 'k-means++',
               n_clusters = clusters,
               n_init = 12,
               random_state = 1)
model.fit(X)

# One cluster label per row of X.
labels = model.labels_
print(labels[:100])

In [None]:
# `df` was never defined at notebook level. The labels were fitted on X, which
# was built from `result_`, so attach them to a copy of that frame.
df = result_.copy()
df['cluster_num'] = labels
df.head()

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D # 3d plot


# 3-D scatter of sessions / pageviews / hits per user, coloured by cluster.
fig = plt.figure(1)
plt.clf()  # start from an empty figure when the cell is re-run

# Create the 3-D axes through the figure: constructing Axes3D(fig, ...) directly
# no longer attaches the axes to the figure on current matplotlib.
ax = fig.add_axes([0, 0, .95, 1],
                  projection = '3d',
                  elev = 48,
                  azim = 134)

# Colour by the fitted labels directly, which have one entry per row of result_.
ax.scatter(result_['sessions'], result_['pageviews'], result_['hits'],
           c = labels,
           s = 200,
           cmap = 'spring',
           alpha = 0.5,
           edgecolor = 'darkgrey')

ax.set_xlabel('sessions',
              fontsize = 16)
ax.set_ylabel('pageviews',
              fontsize = 16)
ax.set_zlabel('hits',
              fontsize = 16)

plt.savefig('3d_plot.png')
plt.show()