In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MultiLabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
import matplotlib.pyplot as plt

import json
import glob

import pickle

In [2]:
# Render floats with four decimal places in every DataFrame/Series display.
pd.options.display.float_format = "{:.4f}".format

In [3]:
# Location of the processed channel dumps; each file holds a JSON list of records.
DATA_GLOB = "../data/processed/*.json"
# Rows with fewer than this many non-null fields are discarded.
MIN_NON_NULL_FIELDS = 8

all_data = []
# glob returns matches in arbitrary, OS-dependent order; sorting keeps the row
# order (and therefore the index assigned below) reproducible across machines.
for path in sorted(glob.glob(DATA_GLOB)):
    print(path)
    # JSON is UTF-8; without an explicit encoding the platform default is used,
    # which can corrupt or fail on non-ASCII text.
    with open(path, "r", encoding="utf-8") as fh:
        all_data.extend(json.load(fh))

df = (
    pd.DataFrame(all_data)
    .dropna(thresh=MIN_NON_NULL_FIELDS)  # keep rows with at least this many non-null values
    .reset_index(drop=True)              # renumber 0..n-1 after dropping rows
)

../data/processed\channels0_99.json
../data/processed\channels100_.json
../data/processed\channels3425_.json
../data/processed\channels6727_.json
../data/processed\channelsa10034_.json
../data/processed\channelsa13313_.json
../data/processed\channels_missing.json


In [4]:
# Parse the ISO-8601 creation timestamps and store the parsed values back in the same column.
created_raw = df["created_date"]
df["created_date"] = pd.to_datetime(created_raw, format="ISO8601")

In [5]:
# Column groups referenced when the preprocessing transformers are assembled.
time_cols = ["created_date"]
num_cols = [
    "avg_duration_seconds",
    "avg_seconds_between_uploads",
]
basic_cat_cols = [
    "country",
    "most_common_video_genre",
]
multi_cat_cols = [
    "category",
    "all_video_genres",
]
text_cols = [
    "description",
    "aggregated_tags",
    "recent_video_titles",
]
drop_cols = [
    "channel_id",
    "channel_name",
    "defaultLanguage",
]

# Sanity check shown as the cell output: together the groups name exactly the columns of df.
grouped_cols = {
    *time_cols,
    *num_cols,
    *basic_cat_cols,
    *multi_cat_cols,
    *text_cols,
    *drop_cols,
}
grouped_cols == set(df.columns)

True

TODO: investigate the scaling step below.

In [6]:
# Fill missing upload gaps with the largest gap observed in the column.
upload_gap_col = "avg_seconds_between_uploads"
longest_gap = df[upload_gap_col].max()
df[upload_gap_col] = df[upload_gap_col].fillna(longest_gap)

In [7]:
class DateTimeToPosix(BaseEstimator, TransformerMixin):
    """Convert datetime-like columns to POSIX timestamps (whole seconds since 1970-01-01 UTC).

    The transformer is stateless. Every column of the input is converted, so a
    single-column input yields an ``(n_samples, 1)`` array.

    Timezone-aware values are converted to UTC; timezone-naive values are
    interpreted as UTC. Missing timestamps (``NaT``) become ``NaN``, in which
    case that column is returned as float rather than integer.
    """

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` so the transformer can be used in pipelines."""
        return self

    def transform(self, X, y=None):
        """Return an ``(n_samples, n_columns)`` array of epoch seconds.

        Parameters
        ----------
        X : DataFrame or 2-D array-like of datetime-like values.
        y : ignored.
        """
        frame = pd.DataFrame(X)
        if frame.shape[1] == 0:
            # np.column_stack cannot stack an empty list of columns.
            return np.empty((len(frame), 0))

        epoch = pd.Timestamp("1970-01-01", tz="UTC")
        one_second = pd.Timedelta(seconds=1)
        # Timedelta arithmetic is independent of the stored datetime resolution and
        # of the platform's C integer width, unlike ``astype(int) // 10**9``.
        seconds = [
            ((pd.to_datetime(frame.iloc[:, idx], utc=True) - epoch) // one_second).to_numpy()
            for idx in range(frame.shape[1])
        ]
        return np.column_stack(seconds)

In [8]:
# MultiLabelBinarizer is a label transformer: its fit/transform take only the label
# sets (`y`), so the (X, y) calls made by ColumnTransformer/Pipeline fail on it.
# This wrapper exposes the feature-transformer signature so it can be used there.
class MultiLabelBinarizerTransformer(BaseEstimator, TransformerMixin):
    """Multi-hot encode one column whose cells are collections of labels.

    Only the first column of the input is used, so give each multi-label column
    its own instance. Missing cells (``None`` / ``NaN`` / ``pd.NA``) are treated
    as "no labels" and encode to an all-zero row.
    """

    def __init__(self):
        self.mlb = MultiLabelBinarizer()

    @staticmethod
    def _label_sets(X):
        """Return the first column of ``X`` as a list of label collections, with missing cells as ``[]``."""
        return [
            []
            if labels is None
            or labels is pd.NA
            or (isinstance(labels, float) and np.isnan(labels))
            else labels
            for labels in X.iloc[:, 0]
        ]

    def fit(self, X, y=None):
        """Learn the set of labels present in the first column of ``X``; returns ``self``."""
        self.mlb.fit(self._label_sets(X))
        return self

    def transform(self, X, y=None):
        """Return the multi-hot indicator matrix for the first column of ``X``."""
        return self.mlb.transform(self._label_sets(X))

    def get_feature_names_out(self, input_features=None):
        """Return one output name per learned label, in the binarizer's class order."""
        return np.asarray([str(label) for label in self.mlb.classes_], dtype=object)


In [None]:
# Preprocessing: one transformer per column group, combined into a single ColumnTransformer.
ct = make_column_transformer(
    # Epoch seconds are on the order of 1e9, so standardise them like the other numeric features.
    (make_pipeline(DateTimeToPosix(), StandardScaler()), time_cols),
    (StandardScaler(), num_cols),
    # TODO: almost certainly a better way to impute countries
    (make_pipeline(SimpleImputer(strategy="most_frequent"), OneHotEncoder()), basic_cat_cols),
    # MultiLabelBinarizerTransformer reads only the first column it receives,
    # so each multi-label column gets its own instance.
    (MultiLabelBinarizerTransformer(), ["category"]),
    (MultiLabelBinarizerTransformer(), ["all_video_genres"]),
    # TODO: choose a text vectoriser for text_cols; dropped until then so this cell runs.
    ("drop", text_cols),
    ("drop", drop_cols),
)

In [9]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,channel_id,channel_name,description,country,defaultLanguage,created_date,category,aggregated_tags,most_common_video_genre,all_video_genres,avg_duration_seconds,avg_seconds_between_uploads,recent_video_titles
0,UCOmHUn--16B90oW2L6FRR3A,BLACKPINK,BLACKPINK Official YouTube Channel\nÎ∏îÎûôÌïëÌÅ¨ Í≥µÏãù Ïú†Ìäú...,KR,,2016-06-29 03:15:23+00:00,"[Electronic music, Pop music, Music of Asia, M...","[JISOO, YG, JENNIE You & Me, LISA, JENNIE Ïú†Ïï§ÎØ∏,...",Music,[Music],211.6,1017992.89,[BLACKPINK - WORLD TOUR [DEADLINE] IN HONG KON...
1,UC3IZKseVpdzPSBaWxBxundA,HYBE LABELS,Welcome to the official YouTube channel of HYB...,KR,,2008-06-04 08:23:22+00:00,"[Pop music, Music, Music of Asia]","[ÌïòÏù¥Î∏å, ÌïòÏù¥Î∏åÎ†àÏù¥Î∏îÏ¶à, HYBE LABELS, HYBE]",Music,[Music],89.6,131588.11,[SANTOS BRAVOS ‚ÄúKAWASAKI (&TEAM Remix)‚Äù Lyric ...
2,UCF1JIbMUs6uqoZEY1Haw0GQ,Shemaroo,"Welcome to ShemarooEnt, one of the finest dest...",IN,,2007-09-01 11:44:51+00:00,"[Film, Entertainment]","[salman khan movies, ramcharana moves, Mega Po...",Entertainment,[Entertainment],5336.1,45200.0,[Mega Power Star Ram Charan üëë | Zanjeer (4K Ac...
3,UCYiGq8XF7YQD00x7wAd62Zg,JuegaGerman,Lento pero seguro.,CL,,2013-05-19 00:09:13+00:00,"[Action game, Video game culture, Action-adven...","[revenia, juega german, juego de miedo, click ...",Gaming,[Gaming],2046.9,280466.0,"[Fotos Tomadas En El Momento PERFECTO üì∏, Traba..."
4,UC4NALVCmcmL5ntpV0thoH6w,LooLoo Kids - Nursery Rhymes and Children's Songs,LooLoo Kidsüíñ is an educational YouTube channel...,US,en,2014-08-05 20:15:33+00:00,"[Entertainment, Music, Film]","[kids videos, children songs, farm song nurser...",Music,[Music],148.5,181623.89,[Old Macdonald Had a Farm Song + Johny Johny Y...
