## config

In [32]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MultiLabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
import matplotlib.pyplot as plt
import re

from sklearn.feature_extraction.text import CountVectorizer

import json
import glob

import pickle

In [2]:
# Show floats with four decimal places in every DataFrame/Series repr.
pd.options.display.float_format = '{:.4f}'.format

## from EDA

In [3]:
# Load every processed channel dump into one flat list of records.
# sorted(): glob.glob returns paths in arbitrary, filesystem-dependent order, so
# sorting keeps the row order (and therefore the index) reproducible across runs.
all_data = []
for file in sorted(glob.glob("../data/processed/*.json")):
    print(file)
    # Be explicit about the encoding: without it open() uses the locale default,
    # which is not guaranteed to be UTF-8 (notably on Windows).
    with open(file, "r", encoding="utf-8") as f:
        all_data.extend(json.load(f))

df = pd.DataFrame(all_data)
# Keep only rows with at least 8 non-null fields, then renumber the index from 0.
df = df.dropna(thresh=8).reset_index(drop=True)

../data/processed\channels0_99.json
../data/processed\channels100_.json
../data/processed\channels3425_.json
../data/processed\channels6727_.json
../data/processed\channelsa10034_.json
../data/processed\channelsa13313_.json
../data/processed\channels_missing.json


In [4]:
# Parse created_date as ISO 8601 timestamps, overwriting the column on df.
# NOTE(review): format="ISO8601" needs pandas >= 2.0 -- confirm the environment pins it.
df["created_date"] = pd.to_datetime(df["created_date"], format="ISO8601")

In [5]:
# Column groups, one list per preprocessing treatment.
time_cols = ["created_date"]
num_cols = ["avg_duration_seconds", "avg_seconds_between_uploads"]
basic_cat_cols = ["country", "most_common_video_genre"]
multi_cat_cols = ["category", "all_video_genres"]
text_cols = ["description", "aggregated_tags", "recent_video_titles"]
drop_cols = ["channel_id", "channel_name", "defaultLanguage"]

# Sanity check (displayed as the cell output): the groups together must name
# exactly the columns present in df -- nothing missing, nothing extra.
declared_cols = {
    *time_cols,
    *num_cols,
    *basic_cat_cols,
    *multi_cat_cols,
    *text_cols,
    *drop_cols,
}
declared_cols == set(df.columns)

True

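TODO: investigate the imputation step below (missing `avg_seconds_between_uploads` values are filled with the column maximum) and how it interacts with the later scaling.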

In [6]:
# Missing upload gaps are replaced with the largest gap observed in the column.
longest_gap = df["avg_seconds_between_uploads"].max()
df["avg_seconds_between_uploads"] = df["avg_seconds_between_uploads"].fillna(longest_gap)

In [7]:
class DateTimeToPosix(BaseEstimator, TransformerMixin):
    """Convert a single datetime column to whole seconds since the Unix epoch.

    The transformer is stateless: ``fit`` learns nothing. ``transform`` reads only
    the first column of ``X`` and returns an array of shape ``(n_samples, 1)``.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self`` so the class fits the transformer protocol."""
        return self

    def transform(self, X, y=None):
        """Return POSIX seconds for the first column of ``X`` as an ``(n, 1)`` array.

        Timezone-aware values are converted to UTC and naive values are read as
        UTC. Missing timestamps come out as NaN (the result is then float) rather
        than being silently turned into a huge negative integer.
        """
        timestamps = pd.to_datetime(X.iloc[:, 0], utc=True)
        epoch = pd.Timestamp("1970-01-01", tz="UTC")
        # Subtracting the epoch and floor-dividing by one second is independent of
        # the column's storage resolution (ns/us/ms/s) and of the platform's
        # default integer width, unlike ``.astype(int) // 10**9``.
        seconds = (timestamps - epoch) // pd.Timedelta(seconds=1)
        return seconds.to_numpy().reshape(-1, 1)

In [8]:
# MultiLabelBinarizer is built for targets: its fit/transform/fit_transform accept a
# single argument (y). ColumnTransformer calls fit_transform(X, y), so MLB cannot be
# plugged in directly; this wrapper gives it the usual (X, y=None) signature.
class MultiLabelBinarizerTransformer(BaseEstimator, TransformerMixin):
    """Multi-hot encode one column whose cells are collections of labels.

    Only the first column of ``X`` is used. Cells that are not a list, tuple or
    set (for example NaN/None for a missing value) are treated as "no labels"
    and encode to an all-zero row instead of raising inside MultiLabelBinarizer.
    """

    def __init__(self):
        self.mlb = MultiLabelBinarizer()

    @staticmethod
    def _label_sets(X):
        """Return the first column of ``X`` as a list of label collections."""
        return [
            cell if isinstance(cell, (list, tuple, set, frozenset)) else []
            for cell in X.iloc[:, 0]
        ]

    def fit(self, X, y=None):
        """Learn the label vocabulary from the first column of ``X``."""
        self.mlb.fit(self._label_sets(X))
        return self

    def transform(self, X, y=None):
        """Return the multi-hot matrix for the first column of ``X``."""
        return self.mlb.transform(self._label_sets(X))

    def get_feature_names_out(self, input_features=None):
        """Return the learned labels, one per output column, in column order."""
        return np.asarray(self.mlb.classes_, dtype=object)


## text embeddings experimentation

### nltk/spacy playing

In [9]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import spacy

In [10]:
# Load the "en_core_web_md" spaCy pipeline and run it on a single word.
nlp = spacy.load("en_core_web_md")
doc = nlp("pineapple") 
# Peek at the first 10 components of the resulting document vector.
doc.vector[:10]

array([-0.63358,  0.12266,  0.47232, -0.22974, -0.26307,  0.56499,
       -0.72338,  0.16736,  0.4203 ,  0.93788], dtype=float32)

In [15]:
# Toy paragraph for comparing naive "." splitting with NLTK's sentence tokenizer.
# NOTE(review): the "...in Italy." and "...in Canada." literals have no trailing
# space, so after implicit concatenation the next sentence follows the period
# directly ("Italy.Dr."). Confirm whether that is intentional for the demo.
text = (
    "UBC is one of the well known universities in British Columbia. "
    "UBC CS teaching team is truly multicultural!! "
    "Dr. Toti completed her Ph.D. in Italy."
    "Dr. Moosvi, Dr. Kolhatkar, and Dr. Ola completed theirs in Canada."
    "Dr. Heeren and Dr. L√©cuyer completed theirs in the U.S."
)
# Naive approach: split on every period.
print(text.split("."))

# NLTK sentence tokenizer on the same text.
sent_tokenized = sent_tokenize(text)
print(sent_tokenized)

['UBC is one of the well known universities in British Columbia', ' UBC CS teaching team is truly multicultural!! Dr', ' Toti completed her Ph', 'D', ' in Italy', 'Dr', ' Moosvi, Dr', ' Kolhatkar, and Dr', ' Ola completed theirs in Canada', 'Dr', ' Heeren and Dr', ' L√©cuyer completed theirs in the U', 'S', '']
['UBC is one of the well known universities in British Columbia.', 'UBC CS teaching team is truly multicultural!!', 'Dr. Toti completed her Ph.D. in Italy.Dr.', 'Moosvi, Dr. Kolhatkar, and Dr. Ola completed theirs in Canada.Dr.', 'Heeren and Dr. L√©cuyer completed theirs in the U.S.']


In [16]:
# Compare plain whitespace splitting with NLTK's word tokenizer, sentence by sentence.
print("Splitting on whitespace: ", [sent.split() for sent in sent_tokenized])
# The second line uses word_tokenize, so it gets its own label (it previously
# repeated the whitespace label, which made the two outputs indistinguishable).
print("Tokenizing with word_tokenize: ", [word_tokenize(sent) for sent in sent_tokenized])

Splitting on whitespace:  [['UBC', 'is', 'one', 'of', 'the', 'well', 'known', 'universities', 'in', 'British', 'Columbia.'], ['UBC', 'CS', 'teaching', 'team', 'is', 'truly', 'multicultural!!'], ['Dr.', 'Toti', 'completed', 'her', 'Ph.D.', 'in', 'Italy.Dr.'], ['Moosvi,', 'Dr.', 'Kolhatkar,', 'and', 'Dr.', 'Ola', 'completed', 'theirs', 'in', 'Canada.Dr.'], ['Heeren', 'and', 'Dr.', 'L√©cuyer', 'completed', 'theirs', 'in', 'the', 'U.S.']]
Splitting on whitespace:  [['UBC', 'is', 'one', 'of', 'the', 'well', 'known', 'universities', 'in', 'British', 'Columbia', '.'], ['UBC', 'CS', 'teaching', 'team', 'is', 'truly', 'multicultural', '!', '!'], ['Dr.', 'Toti', 'completed', 'her', 'Ph.D.', 'in', 'Italy.Dr', '.'], ['Moosvi', ',', 'Dr.', 'Kolhatkar', ',', 'and', 'Dr.', 'Ola', 'completed', 'theirs', 'in', 'Canada.Dr', '.'], ['Heeren', 'and', 'Dr.', 'L√©cuyer', 'completed', 'theirs', 'in', 'the', 'U.S', '.']]


### embedding/dataset pruning

#### list -> free text

In [44]:
def _join_if_list(value):
    """Join a list into one space-separated string; return anything else unchanged."""
    if isinstance(value, list):
        return " ".join(value)
    return value


# Flatten the list-valued text columns into free text, one column at a time.
for text_col in ("aggregated_tags", "recent_video_titles"):
    df[text_col] = df[text_col].apply(_join_if_list)

In [48]:
# Preview the first rows after the list columns were flattened to text.
df.head()

Unnamed: 0,channel_id,channel_name,description,country,defaultLanguage,created_date,category,aggregated_tags,most_common_video_genre,all_video_genres,avg_duration_seconds,avg_seconds_between_uploads,recent_video_titles
0,UCOmHUn--16B90oW2L6FRR3A,BLACKPINK,BLACKPINK Official YouTube Channel\nÎ∏îÎûôÌïëÌÅ¨ Í≥µÏãù Ïú†Ìäú...,KR,,2016-06-29 03:15:23+00:00,"[Electronic music, Pop music, Music of Asia, M...",JISOO YG JENNIE You & Me LISA JENNIE Ïú†Ïï§ÎØ∏ ÏôÄÏù¥ÏßÄ Î∏î...,Music,[Music],211.6,1017992.89,BLACKPINK - WORLD TOUR [DEADLINE] IN HONG KONG...
1,UC3IZKseVpdzPSBaWxBxundA,HYBE LABELS,Welcome to the official YouTube channel of HYB...,KR,,2008-06-04 08:23:22+00:00,"[Pop music, Music, Music of Asia]",ÌïòÏù¥Î∏å ÌïòÏù¥Î∏åÎ†àÏù¥Î∏îÏ¶à HYBE LABELS HYBE,Music,[Music],89.6,131588.11,SANTOS BRAVOS ‚ÄúKAWASAKI (&TEAM Remix)‚Äù Lyric V...
2,UCF1JIbMUs6uqoZEY1Haw0GQ,Shemaroo,"Welcome to ShemarooEnt, one of the finest dest...",IN,,2007-09-01 11:44:51+00:00,"[Film, Entertainment]",salman khan movies ramcharana moves Mega Power...,Entertainment,[Entertainment],5336.1,45200.0,Mega Power Star Ram Charan üëë | Zanjeer (4K Act...
3,UCYiGq8XF7YQD00x7wAd62Zg,JuegaGerman,Lento pero seguro.,CL,,2013-05-19 00:09:13+00:00,"[Action game, Video game culture, Action-adven...",revenia juega german juego de miedo click to c...,Gaming,[Gaming],2046.9,280466.0,Fotos Tomadas En El Momento PERFECTO üì∏ Trabajo...
4,UC4NALVCmcmL5ntpV0thoH6w,LooLoo Kids - Nursery Rhymes and Children's Songs,LooLoo Kidsüíñ is an educational YouTube channel...,US,en,2014-08-05 20:15:33+00:00,"[Entertainment, Music, Film]",kids videos children songs farm song nursery r...,Music,[Music],148.5,181623.89,Old Macdonald Had a Farm Song + Johny Johny Ye...


In [51]:
# Number of channels per country value, most frequent first.
df["country"].value_counts()

country
US    4930
IN    2307
GB    1667
CA    1160
AU     958
      ... 
LA       1
OM       1
HN       1
ME       1
UM       1
Name: count, Length: 109, dtype: int64

In [50]:
# Country codes treated as English-speaking when filtering channels by country.
english_speaking = "US GB CA AU IE NZ SG ZA NG GH JM VI BM AG".split()

In [52]:
# Strategy 1: keep channels whose country is in the english_speaking list.
# Rows with a missing country are excluded too, since isin() is False for NaN.
df_english_country = df[df["country"].isin(english_speaking)]

In [57]:
# Characters accepted by is_english, written with explicit escapes so the pattern
# does not depend on how this file (or an export of it) is encoded.
_ALLOWED_TEXT_RE = re.compile(
    "["
    r"\x00-\x7F"              # ASCII
    r"\u00A9\u00AE"           # copyright and registered signs
    r"\u00C0-\u024F"          # accented Latin letters, Latin Extended-A/B
    r"\u2000-\u206F"          # General Punctuation (curly quotes, dashes, ellipsis, bullet)
    r"\u2100-\u214F"          # Letterlike Symbols (includes the trademark sign)
    r"\u2190-\u21FF"          # Arrows
    r"\u2600-\u27BF"          # Miscellaneous Symbols and Dingbats
    r"\U0001F300-\U0001FAFF"  # emoji blocks
    "]*"
)


def is_english(text):
    """Return True when ``text`` uses only Latin-script characters, symbols and emoji.

    This is a character-set check, not language detection: any text written
    with Latin letters (including accented ones) passes. Non-string values,
    such as NaN for a missing field, are treated as acceptable and return True.
    """
    if not isinstance(text, str):
        return True
    return _ALLOWED_TEXT_RE.fullmatch(text) is not None

In [58]:
# Strategy 2: keep a row only if all three text fields pass the character check.
text_checks = [
    df[text_col].apply(is_english)
    for text_col in ("description", "aggregated_tags", "recent_video_titles")
]
mask = text_checks[0] & text_checks[1] & text_checks[2]

df_english_regex = df[mask]

In [59]:
# Compare how many rows each of the two filtering strategies keeps.
print(f"rows after eliminating by country: {len(df_english_country)}")
print(f"rows after  eliminating by regex: {len(df_english_regex)}")

rows after eliminating by country: 8861
rows after  eliminating by regex: 9759


In [60]:
# Preview the rows kept by the character-based filter (original index labels are retained).
df_english_regex.head()

Unnamed: 0,channel_id,channel_name,description,country,defaultLanguage,created_date,category,aggregated_tags,most_common_video_genre,all_video_genres,avg_duration_seconds,avg_seconds_between_uploads,recent_video_titles
6,UCX6OQ3DkcsbYNE6H8uQQuVA,MrBeast,SUBSCRIBE FOR A COOKIE!\nNew MrBeast or MrBeas...,US,en,2012-02-20 00:43:50+00:00,"[Entertainment, Lifestyle (sociology)]",,Entertainment,[Entertainment],303.8,384399.78,"Every Step You Take, Win $1,000 Surprising My ..."
8,UCIwFjwMjI0y7PDBVEO9-bkQ,Justin Bieber,,CA,,2007-01-15 21:17:27+00:00,"[Soul music, Music, Pop music, Electronic musi...",sorry purpose believe anyone beauty and a beat...,Music,[Music],179.1,307323.22,Justin Bieber - BAD HONEY Justin Bieber - SPEE...
12,UC56gTxNs4f9xZ7Pa2i5xNzg,Sony Music India,Welcome to Sony Music India's official YouTube...,IN,,2009-09-02 23:16:22+00:00,"[Soul music, Pop music, Music, Music of Asia, ...",panghat song best of ajay atul marathi songs j...,Music,[Music],298.1,19199.33,"Roohi - Audio Jukebox | Nadiyon Paar, Panghat ..."
13,UC0C-w0YjGpqDXGB8IHb662A,Ed Sheeran,Ed Sheeran x\n\nPlay - Out Now\n,,,2006-08-08 13:59:50+00:00,"[Pop music, Music]",ed sheeran live performance perth ed sheeran t...,Music,[Music],55.8,554464.78,There‚Äôs a new mayor in town Playing a throwbac...
16,UC9CoOnJkIBMdeijd9qYoT_g,Ariana Grande,eternal sunshine deluxe ‚ô° üì¶ out now\n,,,2007-01-22 01:53:12+00:00,"[Music, Pop music, Electronic music]",ariana grande brighter days ahead behind the s...,Music,[Music],233.5,1427546.22,Ariana Grande - dandelion Ariana Grande - eter...


### putting it together

In [None]:
# Preprocessing for the channel features; each entry is (transformer, columns).
ct = make_column_transformer(
    (DateTimeToPosix(), time_cols),
    (StandardScaler(), num_cols),
    # TODO: almost certainly a better way to impute countries
    (make_pipeline(SimpleImputer(strategy="most_frequent"), OneHotEncoder()), basic_cat_cols),
    # One binarizer per multi-label column: the wrapper only reads the first
    # column it is handed, so the two columns cannot share a single entry.
    (MultiLabelBinarizerTransformer(), ["category"]),
    (MultiLabelBinarizerTransformer(), ["all_video_genres"]),
    # TODO: swap "drop" for the chosen text vectorizer/embedding step. The earlier
    # bare `TODO` placeholder was an undefined name, and the comma after its tuple
    # was missing, so the cell could not run.
    ("drop", text_cols),
    ("drop", drop_cols),
)