In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder, MultiLabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    mean_squared_error, mean_absolute_error, r2_score,
    roc_auc_score, roc_curve
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
pd.set_option('display.float_format', '{:.4f}'.format)

import json
import glob

In [6]:
# Load every processed JSON export and combine the records into one DataFrame.
# sorted() fixes the load order: glob.glob() returns matches in arbitrary,
# filesystem-dependent order, which would make the row order vary between runs.
all_data = []
for file in sorted(glob.glob("../data/processed/*.json")):
    print(file)
    # Decode explicitly as UTF-8. Without `encoding`, open() uses the locale's
    # preferred encoding, which can garble or reject non-ASCII text on
    # platforms whose default is not UTF-8.
    with open(file, "r", encoding="utf-8") as f:
        # presumably each file holds a JSON array of channel records — TODO confirm
        all_data.extend(json.load(f))

df = pd.DataFrame(all_data)
# thresh=8 keeps only the rows that have at least 8 non-missing values.
df = df.dropna(thresh=8)
# Renumber rows 0..n-1 after the drop and discard the old index.
df = df.reset_index(drop=True)

../data/processed\channels0_99.json
../data/processed\channels100_.json
../data/processed\channels3425_.json
../data/processed\channels6727_.json
../data/processed\channelsa10034_.json
../data/processed\channelsa13313_.json
../data/processed\channels_missing.json


In [9]:
# Parse the ISO 8601 strings in `created_date` into pandas datetimes.
df = df.assign(
    created_date=pd.to_datetime(df["created_date"], format="ISO8601")
)

In [12]:
# Show the first 20 rows of the frame as this cell's output.
df.iloc[:20]

Unnamed: 0,channel_id,channel_name,description,country,defaultLanguage,created_date,category,aggregated_tags,most_common_video_genre,all_video_genres,avg_duration_seconds,avg_seconds_between_uploads,recent_video_titles
0,UCOmHUn--16B90oW2L6FRR3A,BLACKPINK,BLACKPINK Official YouTube Channel\nÎ∏îÎûôÌïëÌÅ¨ Í≥µÏãù Ïú†Ìäú...,KR,,2016-06-29 03:15:23+00:00,"[Electronic music, Pop music, Music of Asia, M...","[JISOO, YG, JENNIE You & Me, LISA, JENNIE Ïú†Ïï§ÎØ∏,...",Music,[Music],211.6,1017992.89,[BLACKPINK - WORLD TOUR [DEADLINE] IN HONG KON...
1,UC3IZKseVpdzPSBaWxBxundA,HYBE LABELS,Welcome to the official YouTube channel of HYB...,KR,,2008-06-04 08:23:22+00:00,"[Pop music, Music, Music of Asia]","[ÌïòÏù¥Î∏å, ÌïòÏù¥Î∏åÎ†àÏù¥Î∏îÏ¶à, HYBE LABELS, HYBE]",Music,[Music],89.6,131588.11,[SANTOS BRAVOS ‚ÄúKAWASAKI (&TEAM Remix)‚Äù Lyric ...
2,UCF1JIbMUs6uqoZEY1Haw0GQ,Shemaroo,"Welcome to ShemarooEnt, one of the finest dest...",IN,,2007-09-01 11:44:51+00:00,"[Film, Entertainment]","[salman khan movies, ramcharana moves, Mega Po...",Entertainment,[Entertainment],5336.1,45200.0,[Mega Power Star Ram Charan üëë | Zanjeer (4K Ac...
3,UCYiGq8XF7YQD00x7wAd62Zg,JuegaGerman,Lento pero seguro.,CL,,2013-05-19 00:09:13+00:00,"[Action game, Video game culture, Action-adven...","[revenia, juega german, juego de miedo, click ...",Gaming,[Gaming],2046.9,280466.0,"[Fotos Tomadas En El Momento PERFECTO üì∏, Traba..."
4,UC4NALVCmcmL5ntpV0thoH6w,LooLoo Kids - Nursery Rhymes and Children's Songs,LooLoo Kidsüíñ is an educational YouTube channel...,US,en,2014-08-05 20:15:33+00:00,"[Entertainment, Music, Film]","[kids videos, children songs, farm song nurser...",Music,[Music],148.5,181623.89,[Old Macdonald Had a Farm Song + Johny Johny Y...
5,UCJrDMFOdv1I2k8n9oK_V21w,Tips Official,The proud history of Tips Music Limited (Forme...,IN,,2007-05-22 10:13:28+00:00,"[Film, Music, Music of Asia, Pop music]","[Itna Main Chahoon Tujhe Koi Kisi Ko Na Chahe,...",Music,[Music],2059.1,16523.78,[90's Evergreen Songs | 90's Blockbuster Songs...
6,UCX6OQ3DkcsbYNE6H8uQQuVA,MrBeast,SUBSCRIBE FOR A COOKIE!\nNew MrBeast or MrBeas...,US,en,2012-02-20 00:43:50+00:00,"[Entertainment, Lifestyle (sociology)]",[],Entertainment,[Entertainment],303.8,384399.78,"[Every Step You Take, Win $1,000, Surprising M..."
7,UCK1i2UviaXLUNrZlAFpw_jA,El Reino Infantil,El Reino Infantil es la comunidad digital para...,AR,,2011-06-02 16:20:07+00:00,"[Film, Music, Entertainment]","[mes de la amistad, bebe looloo kids, youtube,...",Music,"[Entertainment, Music]",466.4,86800.0,[Cinco Patitos ü™ø | FAMILIA BLU üíô Canci√≥n Infan...
8,UCIwFjwMjI0y7PDBVEO9-bkQ,Justin Bieber,,CA,,2007-01-15 21:17:27+00:00,"[Soul music, Music, Pop music, Electronic musi...","[sorry, purpose, believe, anyone, beauty and a...",Music,[Music],179.1,307323.22,"[Justin Bieber - BAD HONEY, Justin Bieber - SP..."
9,UCpEhnqL0y41EpW2TvWAHD7Q,SET India,Sony Entertainment Television is one of the le...,IN,,2006-09-20 22:24:59+00:00,"[Music of Asia, Entertainment, Television prog...","[cid, new business ideas, business deals, Kuna...",Entertainment,[Entertainment],2678.6,4802.44,[Wheel Of Fortune with Akshay Kumar | Mon-Fri ...


In [11]:
# Print a summary of the frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15743 entries, 0 to 15742
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype              
---  ------                       --------------  -----              
 0   channel_id                   15743 non-null  object             
 1   channel_name                 15743 non-null  object             
 2   description                  15743 non-null  object             
 3   country                      13977 non-null  object             
 4   defaultLanguage              1725 non-null   object             
 5   created_date                 15743 non-null  datetime64[ns, UTC]
 6   category                     15743 non-null  object             
 7   aggregated_tags              15743 non-null  object             
 8   most_common_video_genre      15743 non-null  object             
 9   all_video_genres             15743 non-null  object             
 10  avg_duration_seconds         15743 non-null  f

In [None]:
# Column groups used to organise the frame's columns.
# num_cols / cat_cols start empty; presumably the numeric and categorical
# feature names are filled in later — TODO confirm.
num_cols = []
cat_cols = []
# Columns to drop. Each name is its own list element and is spelled exactly as
# in the frame (`defaultLanguage`, not `default_Language`), so the list can be
# passed straight to DataFrame.drop(columns=...).
drop_cols = ["channel_id", "channel_name", "defaultLanguage"]