NOTE - Run data/download_datasets.ipynb if not already done, to download all data files required to run this file

## Description:

This file contains:
* **Opt-1: train on past 7 days eligible articles - same model - predict upto 7 days including all articles repeating from past days** 
    * with HP-tuned and noise-reduction via cosine and topic_size=1 articles assigned to noise

This exp is being done for the purposes of:
* ..

In [None]:
# predicted - show topic, distinct tier describe

In [1]:
""" Helper imports"""
import joblib
import pandas as pd
import numpy as np
import time
from time import time
from nltk.tokenize import sent_tokenize
from joblib import Parallel, delayed
from sklearn.metrics.pairwise import cosine_similarity

"""Modeling imports"""
import umap
import hdbscan
from sentence_transformers import SentenceTransformer
from hdbscan import validity_index

""" Plotting imports"""
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')

# remove this later
pd.set_option('max_colwidth', 110)
import warnings
warnings.filterwarnings('ignore')

In [2]:
import sys
# add the parent directory path
sys.path.insert(0, '..')

from src.util import (c_tf_idf, 
                      extract_top_n_words_per_topic,
                      extract_topic_sizes,
                      topic_cos_sim_metrics,
                      generate_all_cossim_metrics)

In [3]:
data_path = '/home/jupyter/bi-topic-modeling/data/' 
results_path = '/home/jupyter/bi-topic-modeling/results/' 

SEED = 42

STOP_WORDS = pd.read_pickle(data_path + 'forbes_stop_words.pkl')
"q1" in STOP_WORDS

True

### --- Data ---

* Train date range Apr 26 (monday) till May 02 (sunday)
* Eligible pool = articles with pvs > 100 on a given day. In prod this will be replaced by pvs>350 summed over 7 days

In [4]:
def get_data(file_name):
    
    df = pd.read_csv(data_path + file_name)
    
    # Per verdict from EDA, exclude articles less than 100 words
    df["article_length"] = df.clean_body.str.split().apply(len)
    df = df[df["article_length"]>=100]
    print("Shape:", df.shape)
    
    df.tier1 = df.tier1.fillna('Other')
    df = df.reset_index(drop=True)
    
    return df

In [5]:
# training data set

print("--- apr 25 ---")
apr25 = get_data('processed/train_apr25_over100pvs_processed.csv')
apr25_art_embed = joblib.load(data_path + "/processed/apr25_article_embed_axis0.joblib")
print("Embeds len:", len(apr25_art_embed))

print()
print("--- apr 26 ---")
apr26 = get_data('processed/pred_apr26_over100pvs_processed.csv')
apr26_art_embed = joblib.load(data_path + "processed/apr26_article_embed_axis0.joblib")
print("Embeds len:", len(apr26_art_embed))

print()
print("--- apr 27 ---")
apr27 = get_data('processed/pred_apr27_over100pvs_processed.csv')
apr27_art_embed = joblib.load(data_path + "processed/apr27_article_embed_axis0.joblib")
print("Embeds len:", len(apr27_art_embed))

print()
print("--- apr 28 ---")
apr28 = get_data('processed/apr28_over100pvs_processed.csv')
apr28_art_embed = joblib.load(data_path + "processed/apr28_article_embed_axis0.joblib")
print("Embeds len:", len(apr28_art_embed))

print()
print("--- apr 29 ---")
apr29 = get_data('processed/apr29_over100pvs_processed.csv')
apr29_art_embed = joblib.load(data_path + "processed/apr29_article_embed_axis0.joblib")
print("Embeds len:", len(apr29_art_embed))

print()
print("--- apr 30 ---")
apr30 = get_data('processed/apr30_over100pvs_processed.csv')
apr30_art_embed = joblib.load(data_path + "processed/apr30_article_embed_axis0.joblib")
print("Embeds len:", len(apr30_art_embed))

print("--- may 01 ---")
may01 = get_data('processed/may01_over100pvs_processed.csv')
may01_art_embed = joblib.load(data_path + "processed/may01_article_embed_axis0.joblib")
print("Embeds len:", len(may01_art_embed))

print()
print("--- may 02 ---")
may02 = get_data('processed/may02_over100pvs_processed.csv')
may02_art_embed = joblib.load(data_path + "processed/may02_article_embed_axis0.joblib")
print("Embeds len:", len(may02_art_embed))

--- apr 25 ---
Shape: (3327, 11)
Embeds len: 3327

--- apr 26 ---
Shape: (4128, 11)
Embeds len: 4128

--- apr 27 ---
Shape: (4167, 11)
Embeds len: 4167

--- apr 28 ---
Shape: (4186, 9)
Embeds len: 4186

--- apr 29 ---
Shape: (4188, 9)
Embeds len: 4188

--- apr 30 ---
Shape: (3771, 9)
Embeds len: 3771
--- may 01 ---
Shape: (3044, 9)
Embeds len: 3044

--- may 02 ---
Shape: (3259, 9)
Embeds len: 3259


In [6]:
# testing data set

print()
print("--- may 03 ---")
may03 = get_data('processed/may03_over100pvs_processed.csv')
may03_art_embed = joblib.load(data_path + "processed/may03_article_embed_axis0.joblib")
print("Embeds len:", len(may03_art_embed))

print()
print("--- may 04 ---")
may04 = get_data('processed/may04_over100pvs_processed.csv')
may04_art_embed = joblib.load(data_path + "processed/may04_article_embed_axis0.joblib")
print("Embeds len:", len(may04_art_embed))

print()
print("--- may 05 ---")
may05 = get_data('processed/may05_over100pvs_processed.csv')
may05_art_embed = joblib.load(data_path + "processed/may05_article_embed_axis0.joblib")
print("Embeds len:", len(may05_art_embed))

print()
print("--- may 06 ---")
may06 = get_data('processed/may06_over100pvs_processed.csv')
may06_art_embed = joblib.load(data_path + "processed/may06_article_embed_axis0.joblib")
print("Embeds len:", len(may06_art_embed))

print()
print("--- may 07 ---")
may07 = get_data('processed/may07_over100pvs_processed.csv')
may07_art_embed = joblib.load(data_path + "processed/may07_article_embed_axis0.joblib")
print("Embeds len:", len(may07_art_embed))

print()
print("--- may 08 ---")
may08 = get_data('processed/may08_over100pvs_processed.csv')
may08_art_embed = joblib.load(data_path + "processed/may08_article_embed_axis0.joblib")
print("Embeds len:", len(may08_art_embed))

print()
print("--- may 09 ---")
may09 = get_data('processed/may09_over100pvs_processed.csv')
may09_art_embed = joblib.load(data_path + "processed/may09_article_embed_axis0.joblib")
print("Embeds len:", len(may09_art_embed))


--- may 03 ---
Shape: (4199, 9)
Embeds len: 4199

--- may 04 ---
Shape: (4223, 9)
Embeds len: 4223

--- may 05 ---
Shape: (4160, 9)
Embeds len: 4160

--- may 06 ---
Shape: (4063, 9)
Embeds len: 4063

--- may 07 ---
Shape: (3630, 9)
Embeds len: 3630

--- may 08 ---
Shape: (3025, 9)
Embeds len: 3025

--- may 09 ---
Shape: (3231, 9)
Embeds len: 3231


# Opt-1

* train on past 7 days eligible articles - same model - predict upto 7 days including all articles repeating from past days

### --- Combine train data - 7 days --- 
* apr 26 (monday) till may 2 (sunday)
* Observations on training clustering performance - same as the one in file shared for review

In [7]:
# concatenate dfs

combined_df = pd.concat([apr26, apr27, apr28, apr29, apr30, may01, may02])
# reset index
combined_df = combined_df.reset_index(drop=True)
# drop unnecessary cols
combined_df.drop(["processed", "processed_noun", "processed_noun_2"], axis = 1, inplace=True)
# fill na
combined_df.tier2 = combined_df.tier2.fillna('Other')
combined_df.total_timeonpage = combined_df.total_timeonpage.fillna(0)
print(combined_df.shape)

# concatenate embeds
combined_embeds = apr26_art_embed + apr27_art_embed + apr28_art_embed + apr29_art_embed + apr30_art_embed + may01_art_embed + may02_art_embed
print(len(combined_embeds))

(26743, 9)
26743


In [8]:
# drop duplicates from df

train_df = combined_df.drop_duplicates(subset="content_natid", keep='first')

print(train_df.shape)
print(train_df.content_natid.duplicated(keep='first').any())

train_embeds =  [combined_embeds[i] for i in list(train_df.index)]
print(len(train_embeds))

(7238, 9)
False
7238


In [9]:
train_df.reset_index(drop=True, inplace=True)

train_df = train_df[['content_natid', 'clean_body', 'tier1']]
train_df.shape

(7238, 3)

### --- HP-tuned Modeling ---

In [10]:
data = train_df.clean_body.tolist()
print(len(data))

7238


In [11]:
def evaluate_dbcv_score(embeds, pred_labels):
    return validity_index(embeds, pred_labels)

def evaluate_params(train_embeds, min_dist_params, n_neighbors_params, n_components_params, 
                    min_cluster_size_params, min_samples_params):
    
    # initialize
    best_score, best_cfg = 0, None
    
    # manually loop over parameter lists
    for mdis in min_dist_params:
        for neigh in n_neighbors_params:
            for comp in n_components_params:
                for clust in min_cluster_size_params:
                    for samp in min_samples_params:

                        # fit algo with this config
                        order = (mdis, neigh, comp, clust, samp)

                        try:
                            # reduce dimensions
                            umap_embeddings_hp = umap.UMAP(n_neighbors= neigh,
                                 min_dist = mdis,
                                 n_components = comp,
                                 random_state = SEED,
                                ).fit(train_embeds)

                            # cluster
                            cluster_hp = hdbscan.HDBSCAN(min_cluster_size = clust,
                                                  min_samples = samp,
                                                  metric= 'euclidean',                      
                                                  cluster_selection_method='eom',
                                                  prediction_data=True).fit(umap_embeddings_hp.embedding_)

                            # calculate dbcv
                            labels = cluster_hp.labels_
                            score = evaluate_dbcv_score(umap_embeddings_hp.embedding_.astype('double'), labels)

                            print('params%s score=%.3f' % (order, score))

                            # optimize dbcv - the bigger the better
                            if score > best_score:
                                best_score, best_cfg = score, order
                        except:
                            continue
    print('Best params=%s Best Score=%.3f' % ( best_cfg, best_score ))
    
    # return best configuration
    return best_cfg

In [12]:
min_dist_params = [0.0, 0.1] 
n_neighbors_params = [12, 15, 20] 
n_components_params = [2, 5] 
min_cluster_size_params = [5, 10] 
min_samples_params = [4, 5] 

In [13]:
import time

start = time.time()

best_cfg = evaluate_params(train_embeds, 
                           min_dist_params,
                           n_neighbors_params, 
                           n_components_params, 
                           min_cluster_size_params, 
                           min_samples_params)

print()
end = time.time()
print("time (mins)", round((end - start)/60, 2)) # 15mins

params(0.0, 12, 2, 5, 4) score=0.328
params(0.0, 12, 2, 5, 5) score=0.341
params(0.0, 12, 2, 10, 4) score=0.241
params(0.0, 12, 2, 10, 5) score=0.250
params(0.0, 12, 5, 5, 4) score=0.318
params(0.0, 12, 5, 5, 5) score=0.318
params(0.0, 12, 5, 10, 4) score=0.253
params(0.0, 12, 5, 10, 5) score=0.272
params(0.0, 15, 2, 5, 4) score=0.300
params(0.0, 15, 2, 5, 5) score=0.328
params(0.0, 15, 2, 10, 4) score=0.224
params(0.0, 15, 2, 10, 5) score=0.282
params(0.0, 15, 5, 5, 4) score=0.304
params(0.0, 15, 5, 5, 5) score=0.305
params(0.0, 15, 5, 10, 4) score=0.224
params(0.0, 15, 5, 10, 5) score=0.235
params(0.0, 20, 2, 5, 4) score=0.261
params(0.0, 20, 2, 5, 5) score=0.295
params(0.0, 20, 2, 10, 4) score=0.239
params(0.0, 20, 2, 10, 5) score=0.244
params(0.0, 20, 5, 5, 4) score=0.268
params(0.0, 20, 5, 5, 5) score=0.283
params(0.0, 20, 5, 10, 4) score=0.230
params(0.0, 20, 5, 10, 5) score=0.255
params(0.1, 12, 2, 5, 4) score=0.277
params(0.1, 12, 2, 5, 5) score=0.289
params(0.1, 12, 2, 10, 4) 

* Fit model with tuned parameters

In [13]:
best_cfg = (0.0, 12, 2, 5, 5)

In [14]:
print(best_cfg)

# dimensionality reduction
umap_embeddings =  umap.UMAP(min_dist = best_cfg[0],
                             n_neighbors = best_cfg[1],
                             n_components= best_cfg[2],
                             random_state=SEED,
                            ).fit(train_embeds)

# cluster
cluster = hdbscan.HDBSCAN(min_cluster_size= best_cfg[3],
                      min_samples = best_cfg[4],
                      metric= 'euclidean',                      
                      cluster_selection_method='eom',
                      prediction_data=True).fit(umap_embeddings.embedding_)

(0.0, 12, 2, 5, 5)


In [15]:
print("Shape of training embeddings:", umap_embeddings.embedding_.shape)

Shape of training embeddings: (7238, 2)


In [16]:
labels = cluster.labels_
print("Settings - mean axis=0")
validity_index(umap_embeddings.embedding_.astype('double'), labels)

Settings - mean axis=0


0.34083637100782105

In [17]:
# put topics in dataframe
docs_df = pd.DataFrame(data, columns=["Doc"])
docs_df['Topic'] = cluster.labels_
docs_df['Doc_ID'] = range(len(docs_df))

print("Distinct topics including -1 for noise", len(docs_df.Topic.unique()))
print()
docs_df

Distinct topics including -1 for noise 280



Unnamed: 0,Doc,Topic,Doc_ID
0,"10 robot vacuums that clean so you dont have to. if youre like most people, vacuuming is low on your list ...",-1,0
1,these 11 mattress toppers give you the bed of your dreams. while new pillows or a new set of sheets can do...,64,1
2,"wayfairs biggest sale of the year is officially here. thanks to the pandemic, most of us have spent the pa...",70,2
3,apple ios 14.5 released: massive iphone update with cool features & important fixes. april 28 update below...,60,3
4,apple airtag: all you need to know about the game-changing new gadget. april 28 update below. this post wa...,-1,4
...,...,...,...
7233,"the oilman, the playmate, and the tangled affairs of the billionaire marshall family. j. howard marshall i...",140,7233
7234,5 surprising foods that help you sleep. cherries: the new melatonin supplement (image via wikipedia) june ...,35,7234
7235,hacker's demo shows how easily credit cards can be read through clothes and wallets. some blank credit car...,31,7235
7236,"the five richest pastors in nigeria. london-based nigerian pastor matthew ashimolowo god is good, especial...",172,7236


In [18]:
# combine together all articles belonging to same cluster  
docs_per_topic = docs_df.groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})

# calculate countVector and tf-idf
tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(data))

# top n words, topic size
top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)
topic_sizes = extract_topic_sizes(docs_df); 

# modify top_n_words per topic dict into dataframe for presentation
topic_words_dict = {}  

for k, v in top_n_words.items():
    topic_words_dict[k] = ", ".join([elem[0] for elem in v])

pd.set_option("max_colwidth", 250)
top_n_words_df = pd.DataFrame.from_dict(topic_words_dict, 
                                        orient='index').reset_index().rename(columns={'index':'topic_num',
                                                                                      
                                                                                      0:'topic_words'}) 

In [19]:
# clean keywords

from collections import OrderedDict
# from fuzzywuzzy import process, fuzz
from itertools import chain

import time
start = time.time()

clean_keys = []
for index, row in top_n_words_df.iterrows(): 
    
    # remove STOP WORDS and remove whitespaces
    inter_list = [x.strip() for x in row["topic_words"].split(",") if x not in STOP_WORDS]
    
    # remove numbers but not alphanumerics e.g. remove 400, not ps4 
    inter_list = [x for x in inter_list if not x.isnumeric()]
    
    # remove subset-based similar words - e.g. 'pcr tests' and 'tests' then 'tests' will be removed
    inter_list = [i for i in inter_list if not any(i in x and i!=x for x in inter_list)]

    # remove empty or 1-letter words
    inter_list = [x for x in inter_list if len(x)>1]
    
    # de-duplicate
    inter_list = list(OrderedDict.fromkeys(inter_list))
    
    # keep top 6 cleaned keywords
    inter_list = inter_list[:6] 
    
    # convert into string
    inter_list = ', '.join(inter_list)
    
    clean_keys.append(inter_list)

end = time.time()
print("keyword time", (end - start)/60) 

keyword time 0.0004668235778808594


In [20]:
top_n_words_df["topic_words"] = clean_keys

In [21]:
# GENERATE TOPIC METRICS

# per topic - cos_min and cos_mean
train_cossim_df = generate_all_cossim_metrics(docs_df, train_embeds)

# per topic - tier counts
tier_metric = docs_df.join(train_df[["tier1"]])

iab_metric = tier_metric.groupby("Topic").agg({"tier1": 
                                               'nunique'}).round(2).reset_index().rename(columns = {'tier1': 
                                                                                                    'distinct_tier1s',
                                                                                                    'Topic':
                                                                                                    'topic_num'})

# combine topic, top words, cos and tier metrics
topic_info = pd.merge(pd.merge(pd.merge(top_n_words_df, 
                               topic_sizes, how="inner", left_on="topic_num", right_on = "Topic"), 
                               train_cossim_df, on = "topic_num", how="inner"), 
                      iab_metric, on = "topic_num", how="inner")

topic_info.drop('Topic', axis=1, inplace=True)

In [22]:
topic_info.head()

Unnamed: 0,topic_num,topic_words,Size,cos_min,cos_mean,distinct_tier1s
0,-1,"coffee, age, china, government, leaders, women",2357,-0.03,0.72,31
1,0,"10 states, average annual, annual wage, salaries, occupational, therapist salary",34,0.77,0.9,8
2,1,"kentucky derby, essential quality, horses, churchill downs, race, triple crown",18,0.66,0.85,4
3,2,"million viewers, 000 viewers, fox news, msnbc, cable news, prime time",7,0.95,0.97,3
4,3,"packers, quarterback, round, rodgers, nfl draft, green bay",69,0.68,0.88,3


In [23]:
topic_info[['cos_min', 'cos_mean', 'distinct_tier1s', 'Size']].describe().apply(lambda x: round(x, 3))

Unnamed: 0,cos_min,cos_mean,distinct_tier1s,Size
count,280.0,280.0,280.0,280.0
mean,0.755,0.879,4.636,25.85
std,0.091,0.034,3.003,140.567
min,-0.03,0.72,1.0,5.0
25%,0.71,0.86,3.0,8.0
50%,0.76,0.88,4.0,13.0
75%,0.81,0.9,5.0,22.0
max,0.95,0.97,31.0,2357.0


**Topics are about:**


In [24]:
# combine natural id inside docs_df

docs_df = pd.merge(docs_df, train_df, how="left", left_on="Doc", right_on="clean_body")
docs_df.drop("clean_body", axis=1, inplace=True)

In [25]:
# contains natid, article body, assigned topic

full_train_topics = pd.merge(docs_df, topic_info, how="left", left_on="Topic", right_on="topic_num")
full_train_topics.head(1)

Unnamed: 0,Doc,Topic,Doc_ID,content_natid,tier1,topic_num,topic_words,Size,cos_min,cos_mean,distinct_tier1s
0,"10 robot vacuums that clean so you dont have to. if youre like most people, vacuuming is low on your list of favorite chores. you can avoid cleaning, or enlist the aid of a robot vacuum to clean your home autonomously. the best robot vacuums are ...",-1,0,blogandpostid/blog/post/4983-5ea87d15960ddf000681c604,Shopping,-1,"coffee, age, china, government, leaders, women",2357,-0.03,0.72,31


* Training Noise

In [26]:
round(full_train_topics[full_train_topics.Topic==-1].shape[0]/full_train_topics.shape[0]*100)

33

### Opt-1 Prediction
* same model predict on 7 days with repeating articles

**Training data**

* Date range 7 days -- Apr 26 (monday) till May 02 (sunday)

* Eligible pool = articles with pvs threshold = 7,238 articles
    * **Distinct topics including -1 for noise :: 328**

**Prediction data**

* Date range -- may 3 (monday) till May 09 (friday)
* May 3 to May 9 repeating articles present
* Eligible pool = articles with pvs > 100

In [27]:
print(umap_embeddings)
print()
print(cluster)

UMAP(dens_frac=0.0, dens_lambda=0.0, min_dist=0.0, n_neighbors=12,
     random_state=42)

HDBSCAN(min_samples=5, prediction_data=True)


In [28]:
top_n_words_df.head()

Unnamed: 0,topic_num,topic_words
0,-1,"coffee, age, china, government, leaders, women"
1,0,"10 states, average annual, annual wage, salaries, occupational, therapist salary"
2,1,"kentucky derby, essential quality, horses, churchill downs, race, triple crown"
3,2,"million viewers, 000 viewers, fox news, msnbc, cable news, prime time"
4,3,"packers, quarterback, round, rodgers, nfl draft, green bay"


In [29]:
# this function needs top_n_words_df defined above. so run above cells first

def predict(umap_embeddings, cluster, embed_list, pred_df):
    
    '''
    reusable predict function - 
    1. calculates predictions on incoming data and 
    2. clustering quality metrics like cossine similarirty min, mean, topic size etc
    '''
    
    # dimensionality reduction
    pred_umap_embeddings = umap_embeddings.transform(embed_list)

    print("Training input shape:", umap_embeddings.embedding_.shape)
    print("Prediction input shape:", pred_umap_embeddings.shape) #takes a min
    print()
    
    # predict
    pred_test_labels, pred_strengths = hdbscan.approximate_predict(cluster, pred_umap_embeddings)
    
    # put topics in dataframe
    pred_docs_df = pred_df[['clean_body']]
    pred_docs_df.rename(columns={'clean_body':'Doc'}, inplace=True)
    pred_docs_df["Topic"] = pred_test_labels

    print("Distinct topics including -1 for noise", len(pred_docs_df.Topic.unique()))
    
    # combine natural id inside docs_df

    pred_docs_df = pd.merge(pred_docs_df, pred_df, how="left", left_on="Doc", right_on="clean_body")
    pred_docs_df.drop("clean_body", axis=1, inplace=True)
    
    pred_full = pd.merge(pred_docs_df, 
         top_n_words_df, 
         how="left", 
         left_on="Topic", 
         right_on = "topic_num").drop('topic_num', axis=1)
    
    return pred_full, pred_umap_embeddings, pred_strengths

In [30]:
import scipy
from sklearn.metrics.pairwise import cosine_similarity

# this function needs top_n_words_df
# uses original article embeddings (not dimensionality reduced ones) to compute topic-embeddings &
# to compute cos-sim-score b/w article and topic-embedding

def cluster_noise(pred_df, art_embed):
    
    '''
    reusable function which - 
    1. calculates an embedding representation of every trained topic  
    2. each day, assigns noise point to closest matching topic-embedding with threshold >=0.93
    '''
    
    nonnoise_df = pred_df[pred_df.Topic != -1].copy()
    noise_df = pred_df[pred_df.Topic == -1].copy()
    
    print("Non-Noise DF: ", nonnoise_df.shape)
    print("Noise DF: ", noise_df.shape)
    
    nonnoise_embeds = [art_embed[i] for i in list(nonnoise_df.index)] 
    noise_embeds = [art_embed[i] for i in list(noise_df.index)] 

    print("Non-Noise embed len: ", len(nonnoise_embeds))
    print("Noise embed len: ", len(noise_embeds))

    nonnoise_df.reset_index(drop=True, inplace=True)
    noise_df.reset_index(drop=True, inplace=True)
    
    topic_embeddings = []

    for topic_num in sorted(nonnoise_df.Topic.unique()):

        topic_group = nonnoise_df[nonnoise_df.Topic == topic_num]

        # grab all articles of given topic
        group_embeddings = [nonnoise_embeds[i] for i in list(topic_group.index)]

        # calculate average embedding for each topic
        topic_embeddings.append(np.mean(group_embeddings, axis=0))

    print("Total topics", len(topic_embeddings))
    
    closest_topic = []
    closest_cosine_score = []

    for i in range(len(noise_embeds)):
        
        # insert the noise article at position 0 of list
        topic_embeddings.insert(0, noise_embeds[i])

        # convert list to sparse matrix
        sparse_matrix= scipy.sparse.csr_matrix(topic_embeddings)

        # full matrix of cosine similarity of noise article with every topic-embeddibng
        cos_mat = cosine_similarity(sparse_matrix)
        
        # select the max cosine score of noise with topic
        closest_cosine_score.append(sorted(cos_mat[0][1:])[-1])
        
        # if max cosine score passes threshold:
        if sorted(cos_mat[0][1:])[-1] >=0.93:
            # find closest topic
            closest_topic.append(sorted(nonnoise_df.Topic.unique())[np.argmax(cos_mat[0][1:])])
        else:
            # keep as noise
            closest_topic.append(-1)

        # remove current noise article from position 0
        topic_embeddings.pop(0) # 4mins
        
    noise_df["cos_closest_topic"] = closest_topic
    noise_df["cosine_sim_score"] = closest_cosine_score

    cossin_insp = pd.merge(noise_df[['content_natid', 'cos_closest_topic', 'cosine_sim_score']], 
                           top_n_words_df, 
                           how="left", 
                           left_on="cos_closest_topic", 
                           right_on="topic_num")
    
    return cossin_insp 

In [31]:
# NOTE - saving these topic embeddings dont make a whole lot of sense. 
# Recomputing topic embeddings every day just takes 4 - 5 mins as such

In [32]:
def prep_final_pred(pred_df, clust_noise_df):
    
    print("Pred df: ", pred_df.shape)
    print("Clustered Noise df: ", clust_noise_df.shape)
    
    full_df = pd.merge(pred_df[['content_natid', 'Doc', 'Topic', 'topic_words', 'pvs', 'total_timeonpage', 'pub_date',
                                'tier1', 'tier2', 'article_length']], 
                     clust_noise_df, 
                     on="content_natid", 
                     how="left")
    
    full_df.rename(columns={
        'Topic': 'orig_pred_topic', 
        'topic_words_x': 'orig_topic_labels',
        'cos_closest_topic': 'noise_closest_topic', 
        'topic_words_y': 'noise_topic_labels'}, 
                     inplace=True)
    
    full_df['Topic'] = full_df.noise_closest_topic.fillna(full_df.orig_pred_topic)
    full_df['final_topic_labels'] = full_df.noise_topic_labels.fillna(full_df.orig_topic_labels)
    
    full_df.Topic = full_df.Topic.astype(int)
    full_df.noise_closest_topic =full_df.noise_closest_topic.astype('Int64')
    
    pred_topic_sizes = extract_topic_sizes(full_df)
    
    # find the topics with size=1
    size1_topics = pred_topic_sizes[pred_topic_sizes.Size==1].Topic.values
    
    # replace their topics to -1 i.e. assign those articles to noise
    full_df.Topic = np.where(full_df.Topic.isin(size1_topics), 
                    -1,
                    full_df.Topic)

    return full_df

In [33]:
def generate_topic_metrics(pred_docs_df, pred_umap_embeddings):
    
    # GENERATE TOPIC METRICS
    pred_topic_sizes = extract_topic_sizes(pred_docs_df)

    # per topic - cos_min and cos_mean
    pred_cossim_df = generate_all_cossim_metrics(pred_docs_df, pred_umap_embeddings)

    # per topic - tier counts
    iab_metric = pred_docs_df.groupby("Topic").agg({"tier1": 
                                                   'nunique'}).round(2).reset_index().rename(columns = {'tier1': 
                                                                                                        'distinct_tier1s',
                                                                                                        'Topic':
                                                                                                        'topic_num'})

    # combine topic, top words, cos and tier metrics
    topic_info = pd.merge(pd.merge(pred_topic_sizes, 
                                            pred_cossim_df, how="inner", left_on="Topic", right_on = "topic_num"), 
                                   iab_metric, on = "topic_num", how="inner")

    topic_info.drop('Topic', axis=1, inplace=True)

    # combine topic_info with topic names

    topic_info = pd.merge(topic_info, top_n_words_df, how="left", on="topic_num").sort_values("topic_num")
    
    return topic_info

* **Predict new points/articles - May 3**

In [34]:
start = time.time()

pred_may03, may03_pred_embeds, may03_strengths = predict(umap_embeddings, cluster, 
                                                            may03_art_embed, may03) 

end = time.time()
print()
print("time (mins): ", round((end - start)/60, 2)) 

print("Noise before:")
print(pred_may03[pred_may03.Topic==-1].shape) # earlier 310 topics

Training input shape: (7238, 2)
Prediction input shape: (4199, 2)

Distinct topics including -1 for noise 267

time (mins):  0.29
Noise before:
(1920, 11)


In [35]:
start = time.time()

may03_noise_clust = cluster_noise(pred_may03, may03_art_embed) 
may03_noise_clust.drop("topic_num", axis=1, inplace=True)

may03_full_pred = prep_final_pred(pred_may03, may03_noise_clust) 

end = time.time()
print()
print("time (mins): ", round((end - start)/60, 2)) 

print("Noise After:")
print(may03_full_pred[may03_full_pred.Topic==-1].shape)

Non-Noise DF:  (2279, 11)
Noise DF:  (1920, 11)
Non-Noise embed len:  2279
Noise embed len:  1920
Total topics 266
Pred df:  (4199, 11)
Clustered Noise df:  (1920, 4)

time (mins):  3.16
Noise After:
(1011, 15)


In [36]:
may3_topic_info = generate_topic_metrics(may03_full_pred, may03_art_embed)
may3_topic_info[['cos_min', 'cos_mean', 'distinct_tier1s', 'Size']][1:].describe().apply(lambda x: round(x, 3))

Unnamed: 0,cos_min,cos_mean,distinct_tier1s,Size
count,251.0,251.0,251.0,251.0
mean,0.796,0.889,3.793,12.701
std,0.08,0.038,2.208,9.765
min,0.56,0.74,1.0,2.0
25%,0.75,0.87,2.0,6.0
50%,0.8,0.9,3.0,11.0
75%,0.85,0.91,5.0,16.5
max,0.97,0.97,14.0,58.0


In [37]:
may03_full_pred.to_csv("may03_final_eval.csv", index=False)

In [38]:
# only 9 high pageview articles (pv>=10k) lost to noise on May 3

pvs_dist = pd.merge(may03_noise_clust, 
                    may03[['content_natid', 'pvs']], 
                    on="content_natid", how="left")

ranges = [100,1000,5000,10000, 50000,100000]
pvs_dist['content_natid'].groupby(pd.cut(pvs_dist.pvs, ranges)).count()

pvs
(100, 1000]        1734
(1000, 5000]        148
(5000, 10000]        26
(10000, 50000]        8
(50000, 100000]       1
Name: content_natid, dtype: int64

* **Predict new points/articles - May 04**

In [39]:
start = time.time()

pred_may04, may04_pred_embeds, may04_strengths = predict(umap_embeddings, cluster, 
                                                            may04_art_embed, may04) 

end = time.time()
print()
print("time (mins): ", round((end - start)/60, 2)) 

print("Noise before:")
print(pred_may04[pred_may04.Topic==-1].shape) 

Training input shape: (7238, 2)
Prediction input shape: (4223, 2)

Distinct topics including -1 for noise 271

time (mins):  0.09
Noise before:
(2000, 11)


In [40]:
start = time.time()

may04_noise_clust = cluster_noise(pred_may04, may04_art_embed) 
may04_noise_clust.drop("topic_num", axis=1, inplace=True)

may04_full_pred = prep_final_pred(pred_may04, may04_noise_clust) 

end = time.time()
print()
print("time (mins): ", round((end - start)/60, 2))

print("Noise After:")
print(may04_full_pred[may04_full_pred.Topic==-1].shape)

Non-Noise DF:  (2223, 11)
Noise DF:  (2000, 11)
Non-Noise embed len:  2223
Noise embed len:  2000
Total topics 270
Pred df:  (4223, 11)
Clustered Noise df:  (2000, 4)

time (mins):  3.38
Noise After:
(1035, 15)


In [41]:
may4_topic_info = generate_topic_metrics(may04_full_pred, may04_art_embed)
may4_topic_info[['cos_min', 'cos_mean', 'distinct_tier1s', 'Size']][1:].describe().apply(lambda x: round(x, 3))

Unnamed: 0,cos_min,cos_mean,distinct_tier1s,Size
count,254.0,254.0,254.0,254.0
mean,0.796,0.888,3.854,12.551
std,0.1,0.043,2.197,9.88
min,0.34,0.69,1.0,2.0
25%,0.75,0.87,2.0,5.0
50%,0.81,0.895,3.0,10.0
75%,0.86,0.92,5.0,17.0
max,0.97,0.97,14.0,52.0


In [50]:
may04_full_pred.to_csv("may04_final_eval.csv", index=False)

* **Predict new points/articles - May 05**

In [42]:
start = time.time()

pred_may05, may05_pred_embeds, may05_strengths = predict(umap_embeddings, cluster, 
                                                            may05_art_embed, may05) 

end = time.time()
print()
print("time (mins): ", round((end - start)/60, 2)) 

print("Noise before:")
print(pred_may05[pred_may05.Topic==-1].shape) 

Training input shape: (7238, 2)
Prediction input shape: (4160, 2)

Distinct topics including -1 for noise 267

time (mins):  0.07
Noise before:
(1912, 11)


In [43]:
start = time.time()

may05_noise_clust = cluster_noise(pred_may05, may05_art_embed) 
may05_noise_clust.drop("topic_num", axis=1, inplace=True)

may05_full_pred = prep_final_pred(pred_may05, may05_noise_clust) 

end = time.time()
print()
print("time (mins): ", round((end - start)/60, 2))

Non-Noise DF:  (2248, 11)
Noise DF:  (1912, 11)
Non-Noise embed len:  2248
Noise embed len:  1912
Total topics 266
Pred df:  (4160, 11)
Clustered Noise df:  (1912, 4)

time (mins):  3.16


In [44]:
print("Noise After:")
print(may05_full_pred[may05_full_pred.Topic==-1].shape)

may5_topic_info = generate_topic_metrics(may05_full_pred, may05_art_embed)
may5_topic_info[['cos_min', 'cos_mean', 'distinct_tier1s', 'Size']][1:].describe().apply(lambda x: round(x, 3))

Noise After:
(990, 15)


Unnamed: 0,cos_min,cos_mean,distinct_tier1s,Size
count,254.0,254.0,254.0,254.0
mean,0.797,0.888,3.843,12.48
std,0.088,0.043,2.128,9.322
min,0.43,0.6,1.0,2.0
25%,0.75,0.87,2.0,6.0
50%,0.805,0.89,3.0,10.0
75%,0.86,0.91,5.0,16.75
max,0.97,0.98,13.0,50.0


In [55]:
may05_full_pred.to_csv("may05_final_eval.csv", index=False)

* **Predict new points/articles - May 06**

In [45]:
start = time.time()

pred_may06, may06_pred_embeds, may06_strengths = predict(umap_embeddings, cluster, 
                                                            may06_art_embed, may06) 

end = time.time()
print()
print("time (mins): ", round((end - start)/60, 2)) 

print("Noise before:")
print(pred_may06[pred_may06.Topic==-1].shape) 

Training input shape: (7238, 2)
Prediction input shape: (4063, 2)

Distinct topics including -1 for noise 264

time (mins):  0.09
Noise before:
(1923, 11)


In [46]:
start = time.time()

may06_noise_clust = cluster_noise(pred_may06, may06_art_embed) 
may06_noise_clust.drop("topic_num", axis=1, inplace=True)

may06_full_pred = prep_final_pred(pred_may06, may06_noise_clust) 

end = time.time()
print()
print("time (mins): ", round((end - start)/60, 2))

print("Noise After:")
print(may06_full_pred[may06_full_pred.Topic==-1].shape)

Non-Noise DF:  (2140, 11)
Noise DF:  (1923, 11)
Non-Noise embed len:  2140
Noise embed len:  1923
Total topics 263
Pred df:  (4063, 11)
Clustered Noise df:  (1923, 4)

time (mins):  3.08
Noise After:
(1010, 15)


In [47]:
may6_topic_info = generate_topic_metrics(may06_full_pred, may06_art_embed)
may6_topic_info[['cos_min', 'cos_mean', 'distinct_tier1s', 'Size']][1:].describe().apply(lambda x: round(x, 3))

Unnamed: 0,cos_min,cos_mean,distinct_tier1s,Size
count,254.0,254.0,254.0,254.0
mean,0.793,0.887,3.815,12.02
std,0.092,0.041,2.225,9.022
min,0.37,0.73,1.0,2.0
25%,0.75,0.87,2.0,5.0
50%,0.81,0.9,3.0,10.0
75%,0.86,0.91,5.0,16.0
max,0.97,0.98,13.0,56.0


In [59]:
may06_full_pred.to_csv("may06_final_eval.csv", index=False)

* **Predict new points/articles - May 07**

In [48]:
start = time.time()

pred_may07, may07_pred_embeds, may07_strengths = predict(umap_embeddings, cluster, 
                                                            may07_art_embed, may07) 

end = time.time()
print()
print("time (mins): ", round((end - start)/60, 2)) 

print("Noise before:")
print(pred_may07[pred_may07.Topic==-1].shape) 

Training input shape: (7238, 2)
Prediction input shape: (3630, 2)

Distinct topics including -1 for noise 259

time (mins):  0.08
Noise before:
(1740, 11)


In [49]:
start = time.time()

may07_noise_clust = cluster_noise(pred_may07, may07_art_embed) 
may07_noise_clust.drop("topic_num", axis=1, inplace=True)

may07_full_pred = prep_final_pred(pred_may07, may07_noise_clust) 

end = time.time()
print()
print("time (mins): ", round((end - start)/60, 2))

print("Noise After:")
print(may07_full_pred[may07_full_pred.Topic==-1].shape)

Non-Noise DF:  (1890, 11)
Noise DF:  (1740, 11)
Non-Noise embed len:  1890
Noise embed len:  1740
Total topics 258
Pred df:  (3630, 11)
Clustered Noise df:  (1740, 4)

time (mins):  2.7
Noise After:
(929, 15)


In [50]:
may7_topic_info = generate_topic_metrics(may07_full_pred, may07_art_embed)
may7_topic_info[['cos_min', 'cos_mean', 'distinct_tier1s', 'Size']][1:].describe().apply(lambda x: round(x, 3))

Unnamed: 0,cos_min,cos_mean,distinct_tier1s,Size
count,245.0,245.0,245.0,245.0
mean,0.795,0.885,3.604,11.024
std,0.097,0.045,2.019,8.558
min,0.31,0.67,1.0,2.0
25%,0.76,0.87,2.0,5.0
50%,0.81,0.9,3.0,8.0
75%,0.86,0.91,4.0,14.0
max,0.97,0.98,13.0,43.0


In [63]:
may07_full_pred.to_csv("may07_final_eval.csv", index=False)

* **Predict new points/articles - May 08**

In [51]:
start = time.time()

pred_may08, may08_pred_embeds, may08_strengths = predict(umap_embeddings, cluster, 
                                                            may08_art_embed, may08) 

end = time.time()
print()
print("time (mins): ", round((end - start)/60, 2)) 

print("Noise before:")
print(pred_may08[pred_may08.Topic==-1].shape) 

Training input shape: (7238, 2)
Prediction input shape: (3025, 2)

Distinct topics including -1 for noise 247

time (mins):  0.07
Noise before:
(1414, 11)


In [52]:
start = time.time()

may08_noise_clust = cluster_noise(pred_may08, may08_art_embed) 
may08_noise_clust.drop("topic_num", axis=1, inplace=True)

may08_full_pred = prep_final_pred(pred_may08, may08_noise_clust) 

end = time.time()
print()
print("time (mins): ", round((end - start)/60, 2))

print("Noise After:")
print(may08_full_pred[may08_full_pred.Topic==-1].shape)

Non-Noise DF:  (1611, 11)
Noise DF:  (1414, 11)
Non-Noise embed len:  1611
Noise embed len:  1414
Total topics 246
Pred df:  (3025, 11)
Clustered Noise df:  (1414, 4)

time (mins):  2.01
Noise After:
(832, 15)


In [53]:
may8_topic_info = generate_topic_metrics(may08_full_pred, may08_art_embed)
may8_topic_info[['cos_min', 'cos_mean', 'distinct_tier1s', 'Size']][1:].describe().apply(lambda x: round(x, 3))

Unnamed: 0,cos_min,cos_mean,distinct_tier1s,Size
count,221.0,221.0,221.0,221.0
mean,0.803,0.886,3.498,9.923
std,0.092,0.043,2.006,7.53
min,0.39,0.66,1.0,2.0
25%,0.77,0.86,2.0,4.0
50%,0.81,0.89,3.0,7.0
75%,0.87,0.92,5.0,14.0
max,0.97,0.97,13.0,41.0


In [67]:
may08_full_pred.to_csv("may08_final_eval.csv", index=False)

* **Predict new points/articles - May 09**

In [54]:
start = time.time()

pred_may09, may09_pred_embeds, may09_strengths = predict(umap_embeddings, cluster, 
                                                            may09_art_embed, may09) 

end = time.time()
print()
print("time (mins): ", round((end - start)/60, 2)) 

print("Noise before:")
print(pred_may09[pred_may09.Topic==-1].shape) 



Training input shape: (7238, 2)
Prediction input shape: (3231, 2)

Distinct topics including -1 for noise 254

time (mins):  0.07
Noise before:
(1514, 11)


In [55]:
start = time.time()

may09_noise_clust = cluster_noise(pred_may09, may09_art_embed) 
may09_noise_clust.drop("topic_num", axis=1, inplace=True)

may09_full_pred = prep_final_pred(pred_may09, may09_noise_clust) 

end = time.time()
print()
print("time (mins): ", round((end - start)/60, 2))

print("Noise After:")
print(may09_full_pred[may09_full_pred.Topic==-1].shape)

Non-Noise DF:  (1717, 11)
Noise DF:  (1514, 11)
Non-Noise embed len:  1717
Noise embed len:  1514
Total topics 253
Pred df:  (3231, 11)
Clustered Noise df:  (1514, 4)

time (mins):  2.27
Noise After:
(856, 15)


In [56]:
may9_topic_info = generate_topic_metrics(may09_full_pred, may09_art_embed)
may9_topic_info[['cos_min', 'cos_mean', 'distinct_tier1s', 'Size']][1:].describe().apply(lambda x: round(x, 3))

Unnamed: 0,cos_min,cos_mean,distinct_tier1s,Size
count,228.0,228.0,228.0,228.0
mean,0.804,0.886,3.482,10.417
std,0.09,0.048,1.941,8.233
min,0.38,0.63,1.0,2.0
25%,0.76,0.87,2.0,4.0
50%,0.81,0.9,3.0,8.0
75%,0.87,0.92,5.0,14.0
max,0.97,0.97,12.0,46.0


In [73]:
may09_full_pred.to_csv("may09_final_eval.csv", index=False)

# --- EDA ---

In [57]:
print("Eligible articles:")
print("May 3 -", may03_full_pred.shape)
print("May 4 -", may04_full_pred.shape)
print("May 5 -", may05_full_pred.shape)
print("May 6 -", may06_full_pred.shape)
print("May 7 -", may07_full_pred.shape)
print("May 8 -", may08_full_pred.shape)
print("May 9 -", may09_full_pred.shape)

Eligible articles:
May 3 - (4199, 15)
May 4 - (4223, 15)
May 5 - (4160, 15)
May 6 - (4063, 15)
May 7 - (3630, 15)
May 8 - (3025, 15)
May 9 - (3231, 15)


In [59]:
print("Number of Topics assigned including noise:")
print("May 3 -",len(may03_full_pred.Topic.unique())) # 15 topics less
print("May 4 -",len(may04_full_pred.Topic.unique())) # 16 topics less
print("May 5 -",len(may05_full_pred.Topic.unique())) # 12 topics less
print("May 6 -",len(may06_full_pred.Topic.unique())) # 9 topics leass
print("May 7 -",len(may07_full_pred.Topic.unique())) # 13 topics less
print("May 8 -",len(may08_full_pred.Topic.unique())) # 25 topics less
print("May 9 -",len(may09_full_pred.Topic.unique())) # 25 topics less

Number of Topics assigned including noise:
May 3 - 252
May 4 - 255
May 5 - 255
May 6 - 255
May 7 - 246
May 8 - 222
May 9 - 229


In [60]:
print("Avg topic size:")
print("May 3 -",round(may3_topic_info.Size.mean()))
print("May 4 -",round(may4_topic_info.Size.mean()))
print("May 5 -",round(may5_topic_info.Size.mean()))
print("May 6 -",round(may6_topic_info.Size.mean()))
print("May 7 -",round(may7_topic_info.Size.mean()))
print("May 8 -",round(may8_topic_info.Size.mean()))
print("May 9 -",round(may9_topic_info.Size.mean()))

Avg topic size:
May 3 - 17
May 4 - 17
May 5 - 16
May 6 - 16
May 7 - 15
May 8 - 14
May 9 - 14


In [61]:
a = list(sorted(may03_full_pred.Topic.unique()))
b = list(sorted(may04_full_pred.Topic.unique()))
c = list(sorted(may05_full_pred.Topic.unique()))
d = list(sorted(may06_full_pred.Topic.unique()))
e = list(sorted(may07_full_pred.Topic.unique()))
f = list(sorted(may08_full_pred.Topic.unique()))
g = list(sorted(may09_full_pred.Topic.unique()))

In [84]:
# common topics - OLD

elements_in_all = list(set.intersection(*map(set, [a, b, c, d, e, f, g])))
len(elements_in_all)

214

In [62]:
# common topics

elements_in_all = list(set.intersection(*map(set, [a, b, c, d, e, f, g])))
len(elements_in_all)

183

### Noise

In [63]:
print("# of articles NOISE - ")
print("May 3 -",round(may03_full_pred[may03_full_pred.Topic==-1].shape[0]))
print("May 4 -",round(may04_full_pred[may04_full_pred.Topic==-1].shape[0]))
print("May 5 -",round(may05_full_pred[may05_full_pred.Topic==-1].shape[0]))
print("May 6 -",round(may06_full_pred[may06_full_pred.Topic==-1].shape[0]))
print("May 7 -",round(may07_full_pred[may07_full_pred.Topic==-1].shape[0]))
print("May 8 -",round(may08_full_pred[may08_full_pred.Topic==-1].shape[0]))
print("May 9 -",round(may09_full_pred[may09_full_pred.Topic==-1].shape[0]))

# of articles NOISE - 
May 3 - 1011
May 4 - 1035
May 5 - 990
May 6 - 1010
May 7 - 929
May 8 - 832
May 9 - 856


In [64]:
print("Perc of articles NOISE - ")
print("May 3 -",round(may03_full_pred[may03_full_pred.Topic==-1].shape[0]/may03_full_pred.shape[0]*100))
print("May 4 -",round(may04_full_pred[may04_full_pred.Topic==-1].shape[0]/may04_full_pred.shape[0]*100))
print("May 5 -",round(may05_full_pred[may05_full_pred.Topic==-1].shape[0]/may05_full_pred.shape[0]*100))
print("May 6 -",round(may06_full_pred[may06_full_pred.Topic==-1].shape[0]/may06_full_pred.shape[0]*100))
print("May 7 -",round(may07_full_pred[may07_full_pred.Topic==-1].shape[0]/may07_full_pred.shape[0]*100))
print("May 8 -",round(may08_full_pred[may08_full_pred.Topic==-1].shape[0]/may08_full_pred.shape[0]*100))
print("May 9 -",round(may09_full_pred[may09_full_pred.Topic==-1].shape[0]/may09_full_pred.shape[0]*100))

Perc of articles NOISE - 
May 3 - 24
May 4 - 25
May 5 - 24
May 6 - 25
May 7 - 26
May 8 - 28
May 9 - 26


**Save in BQ**

In [None]:
bq_cols = ['content_natid', 'Topic', 'final_topic_labels']
bq_may3 = may03_full_pred[bq_cols]
bq_may4 = may03_full_pred[bq_cols]
bq_may5 = may03_full_pred[bq_cols]
bq_may6 = may03_full_pred[bq_cols]
bq_may7 = may03_full_pred[bq_cols]
bq_may8 = may03_full_pred[bq_cols]
bq_may3 = may03_full_pred[bq_cols]
9
print("May 4 -", may04_full_pred.shape)
print("May 5 -", may05_full_pred.shape)
print("May 6 -", may06_full_pred.shape)
print("May 7 -", may07_full_pred.shape)
print("May 8 -", may08_full_pred.shape)
print("May 9 -", may09_full_pred.shape)

**C-level vs Non-C-level**

In [65]:
"""Google imports"""
from google.cloud import bigquery

client = bigquery.Client()

In [87]:
sql = """
   SELECT 
       * 
   FROM 
       `api-project-901373404215.lookalike.zoom_info_dm` 
   LIMIT 5000000
   """
#  WHERE date BETWEEN "2021-05-03" and "2021-05-09"

# Send the query to the api and return a df
zi_preds = client.query(sql).to_dataframe()
print("Shape: ", zi_preds.shape)

zi_preds.managementLevel.value_counts()

Shape:  (5000000, 3)


Non-DM    3236480
DM        1763520
Name: managementLevel, dtype: int64

In [69]:
sql = """
   SELECT
     GA_fullVisitorId, GA_cmsNaturalId
    FROM
      `api-project-901373404215.DataMart.v_DataMart_updated` 
   WHERE 
       GA_date BETWEEN "2021-05-03" and "2021-05-09"
   """

# Send the query to the api and return a df
ga = client.query(sql).to_dataframe()
print("Shape: ", ga.shape) #6.44

Shape:  (37768809, 2)


In [70]:
# 25MM unique fullvid in that 1 week
len(ga.GA_fullVisitorId.unique())

25169881

In [88]:
subset = ga[ga.GA_fullVisitorId.isin(zi_preds.GA_fullVisitorId)]

print(subset.shape)
print(len(subset.GA_fullVisitorId.unique()))
subset.head(2)

(984327, 2)
368856


Unnamed: 0,GA_fullVisitorId,GA_cmsNaturalId
29,9999961073753524873,blogandpostid/blog/post/50769-60934a675c40b40006893b41
38,9997538034193996163,blogandpostid/blog/post/4773-605b795bafd8e40006c32a28


In [89]:
zi_nats = pd.merge(subset, zi_preds[['GA_fullVisitorId', 'managementLevel']], on="GA_fullVisitorId", how="left")

# zi_nats = pd.merge(subset, zi_preds[['client', 'managementLevel']], left_on="GA_fullVisitorId", 
#                    right_on = 'client', how="left")

zi_nats[['GA_fullVisitorId', 'managementLevel']].drop_duplicates(keep='first').managementLevel.value_counts()

Non-DM    254503
DM        118603
Name: managementLevel, dtype: int64

In [90]:
print(zi_nats.shape)
zi_nats.head()

(1223385, 3)


Unnamed: 0,GA_fullVisitorId,GA_cmsNaturalId,managementLevel
0,9999961073753524873,blogandpostid/blog/post/50769-60934a675c40b40006893b41,DM
1,9997538034193996163,blogandpostid/blog/post/4773-605b795bafd8e40006c32a28,DM
2,9995014657333051717,blogandpostid/blog/post/1360-6064d9c005bb9a0006238fbd,Non-DM
3,9999615644474572026,blogandpostid/blog/post/1383-608ca73a171509000659ce03,DM
4,9994983057241825546,blogandpostid/blog/post/1016-15628,Non-DM


In [91]:
cols_to_keep = ['content_natid', 'Doc','Topic', 'final_topic_labels']

may3_9_topics = pd.concat([may03_full_pred[cols_to_keep], 
                           may04_full_pred[cols_to_keep], 
                           may05_full_pred[cols_to_keep], 
                           may06_full_pred[cols_to_keep], 
                           may07_full_pred[cols_to_keep], 
                           may08_full_pred[cols_to_keep], 
                           may09_full_pred[cols_to_keep]])

print("Before: ", may3_9_topics.shape)

may3_9_topics.drop_duplicates("content_natid", keep='last', inplace=True)

print("After: ", may3_9_topics.shape)

Before:  (26531, 4)
After:  (6958, 4)


In [92]:
len(zi_nats[zi_nats.GA_cmsNaturalId.isin(may3_9_topics.content_natid)].GA_fullVisitorId.unique())

297277

In [93]:
eda_df = pd.merge(zi_nats, 
                  may3_9_topics, 
                  left_on="GA_cmsNaturalId", right_on="content_natid", how="inner")
print("Before", eda_df.shape)

# remove noise articles
eda_df = eda_df[eda_df.Topic!=-1]
print("After", eda_df.shape)
print("Unique fullvids for eda: ", len(eda_df.GA_fullVisitorId.unique()))

print("Unique natids for eda: ", len(eda_df.content_natid.unique()))

Before (729427, 7)
After (543526, 7)
Unique fullvids for eda:  246229
Unique natids for eda:  4951


In [94]:
# eda_df['keywords_lst'] = eda_df.final_topic_labels.str.split(',')
# eda_df.head(1)

In [95]:
import collections

In [96]:
c_levels = eda_df[eda_df.managementLevel ==  'DM']  # 'C-level'
non_cs = eda_df[eda_df.managementLevel == 'Non-DM']  # 'Non-Clevel'

print(c_levels.shape)
print(non_cs.shape)
print(c_levels[c_levels.GA_fullVisitorId.isin(non_cs.GA_fullVisitorId)].shape)

(153448, 7)
(390078, 7)
(6915, 7)


In [97]:
c_levels.final_topic_labels.value_counts()[:10]

dogecoin, bitcoin price, ethereum, bitcoin cryptocurrency, cryptocurrency market, tesla     13138
tax hikes, tax rate, bidens, trillion, american families, families plan                     10721
app store, elefherious, fleeceware, apps, epic games, ios                                   10012
dominion, giuliani, election, lindell, trump, lawsuit                                        7328
india, deaths, doses, israel, vaccinated, vaccine                                            6281
apps, chrome, floc, facebook, users, browser                                                 5939
mars, nasa, astronauts, rocket, collins, moon                                                5706
senate, statehood, democrats, republicans, filibuster, 25th amendment                        5540
jenner, mar lago, grenell, election, wright, recall                                          4739
astrazeneca vaccine, johnson johnson, blood clots, vaccines, johnson vaccine, 19 vaccine     3494
Name: final_topic_la

In [98]:
non_cs.final_topic_labels.value_counts()[:10]

dogecoin, bitcoin price, ethereum, bitcoin cryptocurrency, cryptocurrency market, tesla         24418
tax hikes, tax rate, bidens, trillion, american families, families plan                         20776
app store, elefherious, fleeceware, apps, epic games, ios                                       19421
damage, pokemon, players, outriders, enemies, weapon                                            17907
apps, chrome, floc, facebook, users, browser                                                    17207
loan cancellation, student loans, loan forgiveness, loan borrowers, cancel student, congress    13928
dominion, giuliani, election, lindell, trump, lawsuit                                           13067
mars, nasa, astronauts, rocket, collins, moon                                                   11763
foundation, giving focus, trumps, runcie, cuomo, estimated                                      10424
india, deaths, doses, israel, vaccinated, vaccine                                 

### Training vs Prediction EDA check for my sanity 

* Non-noise, Noise ratio is almost same in totally new unseen articles as it is in seen(trained) ones

In [99]:
print(may03_full_pred.shape)
print(m3_in_train.shape)
print(m3_not_in_train.shape)

print()
m3_in_train = may03_full_pred[may03_full_pred.content_natid.isin(full_train_topics.content_natid)]
m3_not_in_train = may03_full_pred[~may03_full_pred.content_natid.isin(full_train_topics.content_natid)]

print("% Repeating from training", round(m3_in_train.shape[0]/may03_full_pred.shape[0], 2))
print("% Repeating from training", round(m3_not_in_train.shape[0]/may03_full_pred.shape[0], 2))

(4199, 15)
(3672, 15)
(527, 15)

% Repeating from training 0.87
% Repeating from training 0.13


In [100]:
print("In train - got topic", round(m3_in_train[m3_in_train.Topic!=-1].shape[0]/m3_in_train.shape[0], 2))
print("In train - got noise", round(m3_in_train[m3_in_train.Topic==-1].shape[0]/m3_in_train.shape[0], 2))

In train - got topic 0.77
In train - got noise 0.23


In [101]:
print("Not in train - got topic", round(m3_not_in_train[m3_not_in_train.Topic!=-1].shape[0]/m3_not_in_train.shape[0], 2))
print("Not in train - got noise", round(m3_not_in_train[m3_not_in_train.Topic==-1].shape[0]/m3_not_in_train.shape[0], 2))

Not in train - got topic 0.71
Not in train - got noise 0.29


In [104]:
m3_noise = may03_full_pred[may03_full_pred.Topic==-1]
print("Noise", m3_noise.shape)

# 85% of noise were also noise in training data. 15% of predicted noise was new articles
print(round(m3_in_train[m3_in_train.Topic==-1].shape[0]/m3_noise.shape[0], 2))
print(round(m3_not_in_train[m3_not_in_train.Topic==-1].shape[0]/m3_noise.shape[0], 2))

Noise (1011, 15)
0.85
0.15


In [105]:
m3_nonnoise = may03_full_pred[may03_full_pred.Topic!=-1]
print("Noise", m3_nonnoise.shape)

# 88% of non-noise were also non-noise in training data. rest 12% of non-noise were unseen articles
print(round(m3_in_train[m3_in_train.Topic!=-1].shape[0]/m3_nonnoise.shape[0], 2))
print(round(m3_not_in_train[m3_not_in_train.Topic!=-1].shape[0]/m3_nonnoise.shape[0], 2))

Noise (3188, 15)
0.88
0.12


* When both have topics
    * 90% articles get same topic in training & prediction <br><br>
    
* When either has noise
    * We are gaining topics on noise articles more than we are losing by predicting on May3 i.e. 
    * training noise, pred non-noise -gained-588. pred noise, training non-noise -lost-278

In [113]:
col_1 = ['content_natid', 'Doc', 'Topic', 'final_topic_labels']
col_2= ['content_natid', 'Doc', 'Topic', 'topic_words']

m3_train = pd.merge(may03_full_pred[col_1], full_train_topics[col_2], on="content_natid", how="inner")
m3_train.shape

(3672, 7)

In [169]:
full_train_topics[full_train_topics.Topic==-1].shape

(2357, 11)

---Both have topics---

In [139]:
print("common - both have topics", m3_train[(m3_train.Topic_x != -1) & (m3_train.Topic_y != -1)].shape)
print("common - training noise, pred noise", m3_train[(m3_train.Topic_x != -1) & (m3_train.Topic_y != -1) 
                                                      & (m3_train.Topic_x == m3_train.Topic_y)].shape, round(1989/2224, 2))

print("common - training noise, pred noise", m3_train[(m3_train.Topic_x != -1) & (m3_train.Topic_y != -1) 
                                                      & (m3_train.Topic_x != m3_train.Topic_y)].shape, round(235/2224, 2))

common - both have topics (2224, 7)
common - training noise, pred noise (1989, 7) 0.89
common - training noise, pred noise (235, 7) 0.11


---Either has noise---

In [142]:
print("either has noise",  m3_train[(m3_train.Topic_x == -1) | (m3_train.Topic_y == -1)].shape)

either has noise (1448, 7)


In [131]:
print("common - training noise", m3_train[m3_train.Topic_y==-1].shape)
print("common - training noise, pred non-noise", m3_train[(m3_train.Topic_x != -1) & (m3_train.Topic_y == -1)].shape)
print("common - training noise, pred noise", m3_train[(m3_train.Topic_x == -1) & (m3_train.Topic_y == -1)].shape)

common - training noise (1170, 7)
common - training noise, pred non-noise (588, 7)
common - training noise, pred noise (582, 7)


In [133]:
print("common - may 3 pred noise", m3_train[m3_train.Topic_x==-1].shape)
print("common - may 3 pred noise, training non-noise", m3_train[(m3_train.Topic_x == -1) & (m3_train.Topic_y != -1)].shape)
print("common - may 3 pred noise, training noise", m3_train[(m3_train.Topic_x == -1) & (m3_train.Topic_y == -1)].shape)

common - may 3 pred noise (860, 7)
common - may 3 pred noise, training non-noise (278, 7)
common - may 3 pred noise, training noise (582, 7)


### Day-over-Day EDA check for my sanity 

* When both have topics
    * 82% articles get same topic day-over-day 
    * remaining 18% either topic could be right - borderline ones it seems 
        * all this will do: change topic size or scoring<br><br>
    
* When either has noise
    * 50% both have same noise
    * rest 50% - almost same gain/loss - seems borderline ones

In [144]:
m3_4 = pd.merge(may03_full_pred[col_1], may04_full_pred[col_1], on="content_natid", how="inner")
m3_4.shape

(3422, 7)

---Both have topics---

In [148]:
print("common - both have topics", m3_4[(m3_4.Topic_x != -1) & (m3_4.Topic_y != -1)].shape)
print("common - both same topics", m3_4[(m3_4.Topic_x != -1) & (m3_4.Topic_y != -1) 
                                                      & (m3_4.Topic_x == m3_4.Topic_y)].shape, round(1949/2380, 2))

print("common - both diff topics", m3_4[(m3_4.Topic_x != -1) & (m3_4.Topic_y != -1) 
                                                      & (m3_4.Topic_x != m3_4.Topic_y)].shape, round(431/2380, 2))

common - both have topics (2380, 7)
common - both same topics (1949, 7) 0.82
common - both diff topics (431, 7) 0.18


In [155]:
m3_4[(m3_4.Topic_x != -1) & (m3_4.Topic_y != -1) & (m3_4.Topic_x != m3_4.Topic_y)][['Doc_x', 'Topic_x', 
                                                                                    'final_topic_labels_x', 'Topic_y',
                                                                                    'final_topic_labels_y']].head(2)

Unnamed: 0,Doc_x,Topic_x,final_topic_labels_x,Topic_y,final_topic_labels_y
2,the new ipad pro 2021 gets even better with tons of new features announced at wwdc. it was just over a year ago that apple unveiled its last ipad pro. the 2020 model wasnt a significant improvement over 2018sit mainly added support for the laptop...,26,"iphone xs, iphone xr, iphone plus, iphone 6s, xs max, vs iphone",68,"m1, rtx 3060, laptop, usb, intel, rx"
34,"blockchain could transform retail, from supply chain and inventory management to product provenance. cryptocurrency and blockchain have been inching closer to the mainstream, with mastercard ma ma and paypal pypl pypl announcing theyll accept tok...",216,"open innovation, digitalization, gpt, gartner, big data, superminds",244,"supply chain, multicloud, iot, tiktok, hyperautomation, analytics"


---Either has noise---

In [161]:
print("either have noise", m3_4[(m3_4.Topic_x==-1) | (m3_4.Topic_y==-1)].shape)
print("both noise", m3_4[(m3_4.Topic_x == -1) & (m3_4.Topic_y == -1)].shape)
print("may3 noise, may 4 non-noise", m3_4[(m3_4.Topic_x != -1) & (m3_4.Topic_y == -1)].shape)
print("may3 non-noise, may 4 noise", m3_4[(m3_4.Topic_x == -1) & (m3_4.Topic_y != -1)].shape)
print(m3_4[(m3_4.Topic_x != -1) & 
           (m3_4.Topic_y == -1)].content_natid.isin(m3_4[(m3_4.Topic_x == -1) & 
                                                         (m3_4.Topic_y != -1)].content_natid).unique())

either have noise (1042, 7)
both noise (565, 7)
may3 noise, may 4 non-noise (239, 7)
may3 non-noise, may 4 noise (238, 7)
[False]


In [164]:
m3_9 = pd.merge(may03_full_pred[col_1], may09_full_pred[col_1], on="content_natid", how="inner")
m3_9.shape

(2249, 7)

In [165]:
print("common - both have topics", m3_9[(m3_9.Topic_x != -1) & (m3_9.Topic_y != -1)].shape)
print("common - both same topics", m3_9[(m3_9.Topic_x != -1) & (m3_9.Topic_y != -1) 
                                                      & (m3_9.Topic_x == m3_9.Topic_y)].shape, round(1949/2380, 2))

print("common - both diff topics", m3_9[(m3_9.Topic_x != -1) & (m3_9.Topic_y != -1) 
                                                      & (m3_9.Topic_x != m3_9.Topic_y)].shape, round(431/2380, 2))

common - both have topics (1549, 7)
common - both same topics (1281, 7) 0.82
common - both diff topics (268, 7) 0.18


In [168]:
print("either have noise", m3_9[(m3_9.Topic_x==-1) | (m3_9.Topic_y==-1)].shape)
print("both noise", m3_9[(m3_9.Topic_x == -1) & (m3_9.Topic_y == -1)].shape)
print("may3 noise, may 9 non-noise", m3_9[(m3_9.Topic_x != -1) & (m3_9.Topic_y == -1)].shape)
print("may3 non-noise, may 9 noise", m3_9[(m3_9.Topic_x == -1) & (m3_9.Topic_y != -1)].shape)
print(m3_9[(m3_9.Topic_x != -1) & 
           (m3_9.Topic_y == -1)].content_natid.isin(m3_9[(m3_9.Topic_x == -1) & 
                                                         (m3_9.Topic_y != -1)].content_natid).unique())

either have noise (700, 7)
both noise (373, 7)
may3 noise, may 9 non-noise (177, 7)
may3 non-noise, may 9 noise (150, 7)
[False]


In [101]:
def replicate_prod_deduplication(df_list, raw_df):
    
    """
    merge earlier days dfs and use that to deduplicate current day's articles 
    
    this function replicates a DB where unique articles and their topic assignments are stored. 
    
    scope of look back period - past 7 days
    """

    concat_dfs = pd.concat(df_list)
    concat_dfs.drop_duplicates("content_natid", keep="first", inplace=True)
    
    updated_df = raw_df[~((raw_df.content_natid.isin(full_train_topics.content_natid_x)) | (raw_df.content_natid.isin(concat_dfs.content_natid)))]
    updated_df.reset_index(drop=True, inplace=True)
    
    return updated_df

In [102]:
print(full_train_topics.shape)

(7238, 13)


In [111]:
opt2_may5 = replicate_prod_deduplication([may03_full_pred, may04_full_pred], 
                                         may05_full_pred) 

print(opt2_may5.shape)

print(opt2_may5[opt2_may5.orig_pred_topic==-1].shape)
print("actual prediction noise", opt2_may5[opt2_may5.orig_pred_topic==-1].shape[0]/opt2_may5.shape[0])

print()
print("Reduced down to:")
print(opt2_may5[opt2_may5.final_topics==-1].shape)
print("actual prediction noise", opt2_may5[opt2_may5.final_topics==-1].shape[0]/opt2_may5.shape[0])

(450, 9)
(237, 9)
actual prediction noise 0.5266666666666666

Reduced down to:
(127, 9)
actual prediction noise 0.2822222222222222


In [112]:
opt2_may6 = replicate_prod_deduplication([may03_full_pred, may04_full_pred, may05_full_pred], 
                                         may06_full_pred) 

print(opt2_may6.shape)

print(opt2_may6[opt2_may6.orig_pred_topic==-1].shape)
print("actual prediction noise", opt2_may6[opt2_may6.orig_pred_topic==-1].shape[0]/opt2_may6.shape[0])

print()
print("Reduced down to:")
print(opt2_may6[opt2_may6.final_topics==-1].shape)
print("actual prediction noise", opt2_may6[opt2_may6.final_topics==-1].shape[0]/opt2_may6.shape[0])

(452, 9)
(246, 9)
actual prediction noise 0.5442477876106194

Reduced down to:
(145, 9)
actual prediction noise 0.32079646017699115
