In [None]:
Assuming URL slugs have some semantic content, can we harvest that?

In [38]:
import fasttext
import os
import pandas as pd
import re
import sys
from typing import List, Tuple


import tldextract

In [5]:
FILES = {
    'wiki.en.bin':                             '/home/pmccarthy/projects/audiencefinder/data/wiki.en.bin',          
}

In [3]:
JOBNAME = 'MLE-4610 - Full URLs in Fasttext'

In [4]:
# os.environ['SPARK_HOME'] = '/home/pmccarthy/nas/opt/spark'
# os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3.6'
# os.environ['PYSPARK_PYTHON'] = './ip_env/ip_env/bin/python'

os.environ['SPARK_HOME'] = '/home/pmccarthy/nas/opt/spark-2.4.7-bin-hadoop2.7'
os.environ['PYSPARK_PYTHON'] = './py37minimal_env/py37minimal/bin/python'

dist_archives = 'hdfs:///user/pmccarthy/conda/py37minimal.zip#py37minimal_env'

# dist_archives = 'hdfs:///user/pmccarthy/conda/ip_env.zip#ip_env'

sys.path.insert(0,os.path.join(os.environ['SPARK_HOME'],'python','lib','py4j-src.zip'))
sys.path.append(os.path.join(os.environ['SPARK_HOME'],'python'))

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import window as W


spark = (SparkSession
         .builder
         .enableHiveSupport()
         .appName(f"{os.uname()[1].split('.')[0]}: {JOBNAME}")
         .config('spark.master','yarn')
         .config('spark.yarn.deployMode','client')
         .config('spark.executor.cores',1)
         .config('spark.executor.memory','2g')
         .config('spark.driver.memory','5g')
         .config('spark.dynamicAllocation.maxExecutors',1500)
         .config('spark.yarn.dist.archives', dist_archives)
         .config('spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version','2')
         .config('spark.hadoop.parquet.enable.summary-metadata','false')
         .config('spark.sql.parquet.mergeSchema','false')
         .config('spark.sql.parquet.filterPushdown','true')
         .config('spark.sql.hive.metastorePartitionPruning','true')
         .getOrCreate())

In [6]:
ft_model = fasttext.load_model(FILES['wiki.en.bin'])

In [12]:
def partitions_to_df(partition_list: List[Tuple[str]]) -> pd.DataFrame:
    # extract column names from first row which looks like    
    #
    # [('offerid=6007/levelid=40/hit_date=20210213',),
    
    first_row_first_tuple = partition_list[0][0]
    
    colnames = [x.split('=')[0] for x in first_row_first_tuple.split('/')]
    
    # keep every other ID, which contain values
    odd_ids = list(range(1,2*len(colnames),2))
    
    df = (
        pd.DataFrame(
            # make all delimiters into '/' and then split on it
            map(lambda x: x[0].replace("=","/").split('/'), partition_list),
        )[odd_ids]
    )
    
    df.columns = colnames
    
    return df

In [20]:
parts = spark.sql("show partitions stash_dice.content").toPandas()

partitions_df = partitions_to_df(
    [(x,) for x in parts['partition'].tolist()])

In [24]:
N_LESS_1_DATE = sorted(partitions_df['hit_date_utc'].unique().tolist())[-2]

In [33]:
some_urls_df = (
    spark.table('stash_dice.content')
    .filter(F.col('hit_date_utc')==N_LESS_1_DATE)
    .select('url_host','url_path')
).show(n=40,truncate=False)

+------------------------+----------------------------------------------------------------------------------------------------------------+
|url_host                |url_path                                                                                                        |
+------------------------+----------------------------------------------------------------------------------------------------------------+
|en.365economist.com     |/30-brilliant-solutions-that-will-make-the-best-of-your-limit                                                   |
|mail.yahoo.com          |                                                                                                                |
|www.bostonherald.com    |                                                                                                                |
|finance.yahoo.com       |                                                                                                                |
|finance.yahoo.com  

In [109]:
spark.stop()

In [91]:
paths = """
+------------------------+----------------------------------------------------------------------------------------------------------------+
|url_host                |url_path                                                                                                        |
+------------------------+----------------------------------------------------------------------------------------------------------------+
|en.365economist.com     |/30-brilliant-solutions-that-will-make-the-best-of-your-limit                                                   |
|mail.yahoo.com          |                                                                                                                |
|www.bostonherald.com    |                                                                                                                |
|finance.yahoo.com       |                                                                                                                |
|finance.yahoo.com       |/quote/MRNA                                                                                                     |
|www.scarymommy.com      |/having-baby-during-pandemic-emotional                                                                          |
|www.minq.com            |/lifestyle/2563101/people-share-the-time-they-unapologetically-stood-up-to-their-entitled-in-laws               |
|en.kueez.com            |/50-of-the-most-epic-and-hilarious-sports-bloopers-of-all-tim                                                   |
|www.minq.com            |/lifestyle/2563101/people-share-the-time-they-unapologetically-stood-up-to-their-entitled-in-laws               |
|mobile.ghanaweb.com     |/GhanaHomePage/NewsArchive                                                                                      |
|mobile.ghanaweb.com     |/GhanaHomePage/NewsArchive                                                                                      |
|www.sparknotes.com      |/lit/flies/section2                                                                                             |
|mail.yahoo.com          |                                                                                                                |
|en.365economist.com     |/30-brilliant-solutions-that-will-make-the-best-of-your-limit                                                   |
|outlook.live.com        |                                                                                                                |
|brobible.com            |/sports/article/vivian-flores-butterfly-424-catfishing-allegations                                              |
|www.space.com           |/space-race.html                                                                                                |
|www.yahoo.com           |                                                                                                                |
|myhealthgazette.com     |/45-funny-air-travel-facts                                                                                      |
|medical-news.org        |/30-popular-foods-children-shouldnt-be-fed/12938/10                                                             |
|www.drivepedia.com      |/trending/finds-mysterious-internet-fb/6                                                                        |
|www.cbc.ca              |/news/technology/retail-chemicals-report-1.5976620                                                              |
|en.wackojaco.com        |/40-places-around-the-world-that-are-forbidden-to-travel-to                                                     |
|weather.com             |/weather/tenday/l/ff548795d9b9049e515ce9e8eee210ad30b0bcbf50339f14198db3910ffb06ce                              |
|word.tips               |/unscramble/eaofsod                                                                                             |
|mail.yahoo.com          |                                                                                                                |
|mail.yahoo.com          |                                                                                                                |
|www.wsfa.com            |/2021/04/19/killed-injured-sunday-montgomery-shooting                                                           |
|www.fox23.com           |/news/video-several-teachers-staff-members-resigning-catoosa-public-schools/525e9b3e-2c15-4495-8db2-72b80cb0bedb|
|www.theprimarymarket.com|/view/brilliant-japanese-design-tpm                                                                             |
|www.habsetlnh.com       |/Alex-Romanov-sonne-les-cloches-a-Jujhar-Khaira-39233                                                           |
|clashofclans.fandom.com |/wiki/O.T.T.O                                                                                                   |
|www.ranker.com          |/list/colonial-mash-v2/mel-judson                                                                               |
|comicbook.com           |/tv-shows/news/ginny-and-georgia-season-2-netflix-renewed-streaming                                             |
|finance.yahoo.com       |/quote/DOGE-USD                                                                                                 |
|mail.yahoo.com          |                                                                                                                |
|mail.yahoo.com          |                                                                                                                |
|mail.yahoo.com          |                                                                                                                |
|www.blackpeoplemeet.com |/v3/profile                                                                                                     |
|weather.com             |/weather/hourbyhour/l/c44f2773cda02fec98442b446569b1a196b72bcb73ca2acdcf864db6e4c2fc18                          |
+------------------------+----------------------------------------------------------------------------------------------------------------+
"""

In [106]:
for line in paths.split('\n')[4:-2]:
    
    pipe_split = line.split('|')
    domain = tldextract.extract(pipe_split[1]).domain
    line_path = pipe_split[2]
    
    sanitized = re.sub('\s+',' ',
           re.sub('[-|\./{0-9}]',' ',
                  domain+' '+line_path
            )
          )
    print(domain+' '+line_path)
    print(sanitized)
    print("\n".join([str(x) for x in ft_model.get_nearest_neighbors(sanitized)]))
    print('===============================================')

365economist /30-brilliant-solutions-that-will-make-the-best-of-your-limit                                                   
 economist brilliant solutions that will make the best of your limit 
(0.843499481678009, 'newcenturyinfusionsolutions')
(0.8428196907043457, 'completedrugtestingsolutions')
(0.8423277735710144, 'externalurlsmatchingthislistwillbeblockedwhenaddedtoapage')
(0.8409328460693359, 'thedietsolutionprogram')
(0.8395974040031433, 'hauntinginconnecticutultimatefansite')
(0.8382377624511719, 'americanbeautytools')
(0.8352340459823608, 'thevirtualsexreview')
(0.8342839479446411, 'alfawebsolutions')
(0.8342727422714233, 'americanswhotellthetruth')
(0.8329175114631653, 'chevycitationforever')
yahoo                                                                                                                 
yahoo 
(0.7627455592155457, 'yahoot')
(0.7393791079521179, 'yahood')
(0.7232840061187744, 'yahoofs')
(0.7071269750595093, 'yahoo/ap')
(0.7052602171897888, 'yahoo')
(0.7

In [107]:
for line in paths.split('\n')[4:-2]:
    
    pipe_split = line.split('|')
    domain = tldextract.extract(pipe_split[1]).domain
    line_path = pipe_split[2]
    
    sanitized = re.sub('\s+','_',
           re.sub('[-|\./{0-9}]',' ',
                  domain+' '+line_path
            )
          )
    print(domain+' '+line_path)
    print(sanitized)
    print("\n".join([str(x) for x in ft_model.get_nearest_neighbors(sanitized)]))
    print('===============================================')

365economist /30-brilliant-solutions-that-will-make-the-best-of-your-limit                                                   
_economist_brilliant_solutions_that_will_make_the_best_of_your_limit_
(0.930489182472229, '_the_birds_the_wizard_and_the_vanishing_cabinet_rachezee_speaks_part_')
(0.9297161102294922, 'com/david_houle/howard_bloom_on_the_future_of_energy_turn_poisons_into_pleasure_and_excrement_into_energy')
(0.9279316663742065, 'the_scientific_consensus_holds_that_currently_marketed_gm_food_poses_no_greater_risk_than_conventional_food')
(0.9266883134841919, 'candaules,_king_of_lydia,_shews_his_wife_by_stealth_to_gyges,_one_of_his_ministers,_as_she_goes_to_bed')
(0.924230694770813, '_lb_meat_beef_modified_in_fat_content_with_tomato_and_or_catsup_on_bun')
(0.9204339385032654, 'com/big_science_gambles/suit_alleges_cern_in_violation_of_human_rights')
(0.9203583002090454, 'interpretive_work,_newfoundland_rangers_monthly_reports,_port_hope_simpson,_analysis_and_interpretation_')
(0.9