In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    mean_squared_error, mean_absolute_error, r2_score,
    roc_auc_score, roc_curve
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
# Show floats with four decimal places wherever pandas renders them.
pd.options.display.float_format = '{:.4f}'.format

In [7]:
# Load the raw channel export; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv(filepath_or_buffer="../data/raw/youtube_channel_info_v2.csv")

# Preview the first five rows as the cell's rich output.
df.head(n=5)

Unnamed: 0,channel_name,channel_id,view_count,category,country,defaultLanguage,subscriber_count,created_date,description,custom_url,thumbnail,video_count,videos_last_30_days,views_last_30_days,uploads_playlist_id,last_10_video_ids
0,BLACKPINK,UCOmHUn--16B90oW2L6FRR3A,39962585446,"Music of Asia, Pop music, Music, Electronic music",KR,,99000000,2016-06-29T03:15:23Z,BLACKPINK Official YouTube Channel 블랙핑크 공식 유튜브...,@blackpink,https://yt3.ggpht.com/U3VrCkKjzTpQ3VYv4SCPjNfD...,636,1,3256869,UUOmHUn--16B90oW2L6FRR3A,"W-9-keHNwV8,pAeqY-TEZc0,WB_AOdAEuBM,zTnAvaoHR4..."
1,HYBE LABELS,UC3IZKseVpdzPSBaWxBxundA,41604896923,"Hip hop music, Pop music, Music, Music of Asia",KR,,78700000,2008-06-04T08:23:22Z,Welcome to the official YouTube channel of HYB...,@hybelabels,https://yt3.ggpht.com/ytc/AIdro_l8g0yRFG8xoe_q...,2817,79,46074833,UU3IZKseVpdzPSBaWxBxundA,"NPqiynCpqgI,nbOuR-KYZy8,xf8N3JYEhV4,iIWKCYP2zY..."
2,BILLIE EILISH,UCVNE660NcgYzi18LwwUZb7Q,14316364,,,,82300,2019-01-18T05:14:32Z,,@billieeilish8477,https://yt3.ggpht.com/ytc/AIdro_n3NfvUAVtcUJvW...,1,0,0,UUVNE660NcgYzi18LwwUZb7Q,C9odQay_d6s
3,Shemaroo,UCF1JIbMUs6uqoZEY1Haw0GQ,27011004224,"Entertainment, Film",IN,,60400000,2007-09-01T11:44:51Z,"Welcome to ShemarooEnt, one of the finest dest...",@shemaroo,https://yt3.ggpht.com/ytc/AIdro_mhdF4RNZM9OHP-...,13374,63,10560897,UUF1JIbMUs6uqoZEY1Haw0GQ,"KPYx-APpvo4,HscCQYIX4fg,TP64Sacd5Nw,G8a5vnD2A0..."
4,JuegaGerman,UCYiGq8XF7YQD00x7wAd62Zg,17573028501,"Role-playing video game, Action game, Video ga...",CL,,54200000,2013-05-19T00:09:13Z,Lento pero seguro.,@juegagerman,https://yt3.ggpht.com/vOsrLzWD4z1dbr470nEXydi3...,2368,12,22980501,UUYiGq8XF7YQD00x7wAd62Zg,"JjWksJW1Z0M,_NhOyRPXA5s,EYXTxxjU0Ks,5eAyv6b84p..."


In [8]:
# Print a structural summary of the frame: index range, and for each column
# its non-null count and dtype (useful for spotting sparsely populated columns).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15830 entries, 0 to 15829
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   channel_name         15830 non-null  object
 1   channel_id           15830 non-null  object
 2   view_count           15830 non-null  int64 
 3   category             15722 non-null  object
 4   country              14030 non-null  object
 5   defaultLanguage      1725 non-null   object
 6   subscriber_count     15830 non-null  int64 
 7   created_date         15830 non-null  object
 8   description          14929 non-null  object
 9   custom_url           15808 non-null  object
 10  thumbnail            15830 non-null  object
 11  video_count          15830 non-null  int64 
 12  videos_last_30_days  15830 non-null  int64 
 13  views_last_30_days   15830 non-null  int64 
 14  uploads_playlist_id  15830 non-null  object
 15  last_10_video_ids    15774 non-null  object
dtypes: int64(5), object(11)

In [13]:
# Relabel the `views_last_30_days` column; every other column is untouched.
# `rename` ignores labels that are absent by default, so re-running this cell
# after the column has already been renamed is a no-op.
# NOTE(review): the new label implies the figure covers the last 30 videos
# rather than the last 30 days -- confirm against the data source.
df = df.rename(
    {"views_last_30_days": "views_last_30_videos"},
    axis="columns",
)

In [12]:
# Peek at the first five values of the thumbnail column.
df["thumbnail"].head(5)

0    https://yt3.ggpht.com/U3VrCkKjzTpQ3VYv4SCPjNfD...
1    https://yt3.ggpht.com/ytc/AIdro_l8g0yRFG8xoe_q...
2    https://yt3.ggpht.com/ytc/AIdro_n3NfvUAVtcUJvW...
3    https://yt3.ggpht.com/ytc/AIdro_mhdF4RNZM9OHP-...
4    https://yt3.ggpht.com/vOsrLzWD4z1dbr470nEXydi3...
Name: thumbnail, dtype: object

In [None]:
# Peek at the first five values of the uploads-playlist identifier column.
df["uploads_playlist_id"].head(5)

0    UUOmHUn--16B90oW2L6FRR3A
1    UU3IZKseVpdzPSBaWxBxundA
2    UUVNE660NcgYzi18LwwUZb7Q
3    UUF1JIbMUs6uqoZEY1Haw0GQ
4    UUYiGq8XF7YQD00x7wAd62Zg
Name: uploads_playlist_id, dtype: object

In [19]:
# Show the untruncated `last_10_video_ids` value for the first row
# (the frame preview elides long strings).
df["last_10_video_ids"].iloc[0]

'W-9-keHNwV8,pAeqY-TEZc0,WB_AOdAEuBM,zTnAvaoHR4I,kAD9usNFh40,_MffnMh2tas,TL4KxShGOZo,7QILfdCO6C4,sOrVRJNMJn8,P169hsXjYQs'

In [20]:
# Display the frame's current column labels.
df.columns

Index(['channel_name', 'channel_id', 'view_count', 'category', 'country',
       'defaultLanguage', 'subscriber_count', 'created_date', 'description',
       'custom_url', 'thumbnail', 'video_count', 'videos_last_30_days',
       'views_last_30_videos', 'uploads_playlist_id', 'last_10_video_ids'],
      dtype='object')