## COMP-248: Project in Data Science
### Author: Tamanda Mdyanyama

In [1]:
# The objective of this project is to carry out a data science project end to end, from data collection to analysis.
# It analyzes YouTube channels and videos in the art niche to identify what makes a video successful.

In [2]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## I. Data Collection

In [3]:
# Setting up
import os

from googleapiclient.discovery import build
from IPython.display import JSON     #to neatly display the json response

# SECURITY: the API key must never be written into the notebook, because notebooks
# (and their saved outputs) are shared and versioned. The key is read from the
# YOUTUBE_API_KEY environment variable instead. Any key that was previously
# committed in this cell should be revoked and regenerated.
api_key = os.environ.get("YOUTUBE_API_KEY")
if not api_key:
    raise RuntimeError(
        "YOUTUBE_API_KEY is not set. Export your YouTube Data API key as the "
        "YOUTUBE_API_KEY environment variable before running this notebook."
    )

api_service_name = "youtube"
api_version = "v3"

# Create the API client used by every request function below
youtube = build(
    api_service_name, api_version, developerKey = api_key
)

In [4]:
# Making a request

# Function to get channel statistics
def get_channel_stats(youtube, channel_ids):
    """Fetch summary statistics for a collection of YouTube channels.

    Parameters
    ----------
    youtube
        API client exposing ``channels().list(part=..., id=...)`` whose result
        has an ``execute()`` method returning the response dict.
    channel_ids
        Iterable of channel id strings.

    Returns
    -------
    pandas.DataFrame
        One row per channel returned by the API, with the columns
        ``channelName``, ``subscribers``, ``totalViews``, ``totalVideos`` and
        ``playlistId``. Values are stored exactly as the API returns them (no
        type conversion); a statistic missing from the response becomes None.
        The columns are present even when no channel is returned.
    """
    columns = ['channelName', 'subscribers', 'totalViews', 'totalVideos', 'playlistId']
    channel_ids = list(channel_ids)
    all_data = []  # one dict per channel

    # Request the ids in batches of 50, the same batch size get_video_details
    # uses, instead of sending an arbitrarily long id list in a single call.
    batch_size = 50
    for start in range(0, len(channel_ids), batch_size):
        request = youtube.channels().list(
            part="snippet,contentDetails,statistics",
            id=','.join(channel_ids[start:start + batch_size]),
        )
        response = request.execute()

        # 'items' may be absent when nothing matched, so do not index it directly.
        for item in response.get('items', []):
            # NOTE(review): individual statistics may be missing for some
            # channels (e.g. hidden subscriber counts) - confirm against the API docs.
            stats = item.get('statistics') or {}
            all_data.append({
                'channelName': item['snippet']['title'],
                'subscribers': stats.get('subscriberCount'),
                'totalViews': stats.get('viewCount'),
                'totalVideos': stats.get('videoCount'),
                'playlistId': item['contentDetails']['relatedPlaylists']['uploads'],
            })

    return pd.DataFrame(all_data, columns=columns)

In [5]:
# Art channels to analyse, keyed by a human-readable label (label -> channel id).
art_channels = {
    'GawxArt': 'UCFDxyA1H3VEN0VQwfMe2VMQ',
    'doodlelifebyTammy': 'UCQ6cvTxLbPbGHtT5mo7PClA',
    'JoshArt': 'UCVlbtV-0IzNltDFmSsRxbrQ',
    'Uncomfy': 'UCQuRK-VFyCjf3VCqZkCDveA',
    'Fructus Illustrations': 'UCnGP1UqWcHDuA8XGkRPB8mQ',
    'brokendraw': 'UC2nUVVyQcQP0Y_DyyVTltng',
    'SamDoesArts': 'UCNNOvB507MRfny7Jcv8MmOw',
    'linh truong': 'UCDivkE8Ckgk14fu1XChEp7w',
    'Apple Cheeks': 'UCQ9XXjX0eNReH28Ass8E-hw',
    'Brett Park': 'UCLQ2ZDS28QVRLjPaaMEtgOw',
}
# dicts keep insertion order, so the ids come out in the order listed above
channel_ids = list(art_channels.values())

# Fetch one row of statistics per channel and display the result
channel_stats = get_channel_stats(youtube, channel_ids)
channel_stats

Unnamed: 0,channelName,subscribers,totalViews,totalVideos,playlistId
0,Apple Cheeks,222000,16775213,219,UUQ9XXjX0eNReH28Ass8E-hw
1,Brett Park,371000,228490154,510,UULQ2ZDS28QVRLjPaaMEtgOw
2,SamDoesArts,1750000,156509558,251,UUNNOvB507MRfny7Jcv8MmOw
3,Fructus Illustrations,75100,3365203,92,UUnGP1UqWcHDuA8XGkRPB8mQ
4,doodlelifebyTammy,598,129328,119,UUQ6cvTxLbPbGHtT5mo7PClA
5,Gawx Art,2440000,140470282,132,UUFDxyA1H3VEN0VQwfMe2VMQ
6,brokendraw,56500,1380059,22,UU2nUVVyQcQP0Y_DyyVTltng
7,Josh Art,694000,42097114,161,UUVlbtV-0IzNltDFmSsRxbrQ
8,Uncomfy,526000,49165305,205,UUQuRK-VFyCjf3VCqZkCDveA
9,linh truong,1240000,69001674,194,UUDivkE8Ckgk14fu1XChEp7w


In [6]:
# Function to get the video ids contained in a playlist
def get_video_ids(youtube, playlist_id, max_videos=50):
    """Return the ids of the videos in a playlist, in the order the API lists them.

    Parameters
    ----------
    youtube
        API client exposing ``playlistItems().list(...)`` whose result has an
        ``execute()`` method returning the response dict.
    playlist_id
        Id of the playlist to read (e.g. a channel's uploads playlist).
    max_videos
        Upper bound on the number of ids returned. The default of 50 matches
        the previous single-request behaviour; pass ``None`` to follow
        ``nextPageToken`` until the whole playlist has been read.

    Returns
    -------
    list of str
        Video ids; empty when the playlist has no items.
    """
    video_ids = []  # ids collected so far, across pages
    page_token = None

    while max_videos is None or len(video_ids) < max_videos:
        # Ask only for what is still needed, capped at 50 per request
        # (the page size the original request used).
        page_size = 50 if max_videos is None else min(50, max_videos - len(video_ids))
        params = {
            'part': "snippet,contentDetails",
            'playlistId': playlist_id,
            'maxResults': page_size,
        }
        if page_token:
            params['pageToken'] = page_token

        response = youtube.playlistItems().list(**params).execute()

        # 'items' may be absent for an empty playlist, so do not index it directly.
        for item in response.get('items', []):
            video_ids.append(item['contentDetails']['videoId'])

        page_token = response.get('nextPageToken')
        if not page_token:
            break  # last page reached

    return video_ids if max_videos is None else video_ids[:max_videos]

In [7]:
# Uploads-playlist id of every channel, in the row order of channel_stats.
playlist_ids = list(channel_stats["playlistId"])
playlist_ids

['UUQ9XXjX0eNReH28Ass8E-hw',
 'UULQ2ZDS28QVRLjPaaMEtgOw',
 'UUNNOvB507MRfny7Jcv8MmOw',
 'UUnGP1UqWcHDuA8XGkRPB8mQ',
 'UUQ6cvTxLbPbGHtT5mo7PClA',
 'UUFDxyA1H3VEN0VQwfMe2VMQ',
 'UU2nUVVyQcQP0Y_DyyVTltng',
 'UUVlbtV-0IzNltDFmSsRxbrQ',
 'UUQuRK-VFyCjf3VCqZkCDveA',
 'UUDivkE8Ckgk14fu1XChEp7w']

In [8]:
# One list of video ids per uploads playlist, in playlist_ids order.
allvideo_ids = [get_video_ids(youtube, uploads_id) for uploads_id in playlist_ids]

In [9]:
# Flatten the per-channel lists of video ids into one flat list of ids.
# A plain comprehension keeps the ids as Python str (np.concatenate returned
# numpy.str_ values) and also works when no playlists were collected.
# The isinstance check makes the cell safe to re-run: an entry that is already
# a single id string is kept whole instead of being iterated over.
allvideo_ids = [
    video_id
    for entry in allvideo_ids
    for video_id in ([entry] if isinstance(entry, str) else entry)
]
allvideo_ids

['ieE5Mt3qF5Q',
 'Z7n1VQjWZOY',
 'ZVgNPKu4S3g',
 'T1Fs1x99qXM',
 'cCl64M45eaQ',
 'FCVoD3Xq41c',
 'EvFGcdGk1yc',
 '470NaMmNkUM',
 'JAYbbr_Eg_0',
 'KwuD8FNSN8U',
 'L5KCV3YIfoQ',
 'ZA3GBcDrTyc',
 'qwR045e5Rds',
 'NmlKBFUTfoo',
 '2uc_Dr3XBNk',
 'xl6Q5N6owaw',
 '9VOzKjv94Ps',
 '6ZbYIIku8Qw',
 '7yndCF3NG5Y',
 'WiOyRRPSAS8',
 '0mImNUCQ3DU',
 'AB87IbHOljI',
 'cpWRexIDxR8',
 'XTL8gdMcUIY',
 'xVSyy0dKgFk',
 'qDkFkILYhsw',
 'cWN9SFRW1kg',
 'vdGdo5V7Xro',
 'gCb8quJtpj8',
 'RPSG59M8MIE',
 '3xe9SvXrGIs',
 'IMXjeFBP_yU',
 'OQlZ8yVmjdY',
 'Q9oSgOGRLYg',
 'AyGxwbKurnA',
 'OO6lPk7lcU8',
 '6nd6Uzrqw10',
 '2oMIldQvAa0',
 'guqmAo_qY4Y',
 '_JGK64wvsdk',
 'RTZ3WkWUA8M',
 '2haxGl76MmE',
 '-_tySV4lmUI',
 'HQORiWCWlHY',
 'E2FREPQpIqc',
 'PIbORjQ0kFM',
 '0a6wYhRUbqg',
 'l2VWmbYYL08',
 'Ast1UBlBBtE',
 'AEwbs7_u8Nw',
 'Vw__dKGj6KQ',
 'vFysH070rBw',
 'H0r2x4eMq_U',
 '1K80EqGuI2A',
 'SvwGPLYQ_34',
 '3n5rHdq23Cg',
 'jhZ9vXvrVuQ',
 'XbYXT5FHHW8',
 'XbjXbx_GkmY',
 'tl8J51cKo6g',
 'n-KCY1WnyV8',
 'cjlkBdvIWKE',
 'B8pOt2

In [10]:
def get_video_details(youtube, video_ids):
    """Fetch selected metadata and statistics for a list of videos.

    Parameters
    ----------
    youtube
        API client exposing ``videos().list(part=..., id=...)`` whose result
        has an ``execute()`` method returning the response dict.
    video_ids
        Iterable of video id strings; requested in batches of 50.

    Returns
    -------
    pandas.DataFrame
        One row per video returned by the API with the columns ``video_id``,
        ``channelTitle``, ``title``, ``description``, ``tags``, ``categoryId``,
        ``publishedAt``, ``viewCount``, ``likeCount``, ``commentCount`` and
        ``duration``. Values are stored as returned by the API; a field that is
        missing from the response is recorded as None.
    """
    # Fields to copy out of each section of a video resource (loop-invariant).
    stats_to_keep = {'snippet':['channelTitle','title','description','tags','categoryId','publishedAt'],
                     'statistics':['viewCount','likeCount','commentCount'],
                     'contentDetails':['duration'],
                     }
    columns = ['video_id'] + [field for fields in stats_to_keep.values() for field in fields]

    video_ids = list(video_ids)
    all_video_info = []

    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part= "snippet,contentDetails,statistics",
            id = ','.join(video_ids[i:i+50])
        )
        response = request.execute()

        # 'items' may be absent when nothing matched, so do not index it directly.
        for video in response.get('items', []):
            video_info = {'video_id': video['id']}  # dictionary for all the video data

            for section, fields in stats_to_keep.items():
                # Explicit lookups with a None default replace the former bare
                # `except:`, which also hid unrelated errors and interrupts.
                section_data = video.get(section) or {}
                for field in fields:
                    video_info[field] = section_data.get(field)

            all_video_info.append(video_info)

    return pd.DataFrame(all_video_info, columns=columns)

In [11]:
# Fetch the details of every collected video id into one DataFrame and display it.
videos_df = get_video_details(youtube, allvideo_ids)
videos_df

Unnamed: 0,video_id,channelTitle,title,description,tags,categoryId,publishedAt,viewCount,likeCount,commentCount,duration
0,ieE5Mt3qF5Q,Apple Cheeks,Last Days In My Office 🎀 5 years running my sm...,"hello~✸\nAs I prepare for my move, I wanted to...","[studio vlog, apple cheeks, tiffany tan, small...",22,2025-04-27T14:08:47Z,39380,2161,85,PT34M43S
1,Z7n1VQjWZOY,Apple Cheeks,Maybe you’ll make better art if you limit your...,,,22,2025-04-14T16:13:40Z,16924,1561,8,PT1M1S
2,ZVgNPKu4S3g,Apple Cheeks,Remember to Play in Your Sketchbook! #sketchbook,,,22,2025-04-09T17:24:58Z,15523,1340,13,PT1M4S
3,T1Fs1x99qXM,Apple Cheeks,"Solo Trip to SF 🌉🌼 visiting the MOMA, work tri...",Head to https://squarespace.com/applecheeks to...,"[artist diaries, artist vlog, san francisco, t...",22,2025-04-08T16:02:18Z,25142,1707,101,PT28M22S
4,cCl64M45eaQ,Apple Cheeks,Why you should draw something more than once #...,,,22,2025-03-31T16:08:11Z,19093,1198,12,PT1M9S
...,...,...,...,...,...,...,...,...,...,...,...
468,Dy06oEg-70E,linh truong,"solo traveling in japan vlog 🎏 tokyo cafes, sh...",thank you so much to All Nippon Airways for cr...,"[linh truong japan, linh truong vlog, linh tru...",19,2024-05-25T22:30:12Z,294609,13470,271,PT26M8S
469,mo3jJCasYnQ,linh truong,my BIGGEST nightmare as a commuter…,,,26,2024-05-20T13:52:58Z,33335,3087,20,PT29S
470,Sg7SWtwkAI8,linh truong,COLLEGE STUDENTS! this is your chance to win u...,,,26,2024-05-15T20:42:30Z,250562,2621,8,PT58S
471,lJCOOQZvba4,linh truong,"the nikon coolpix sq, aka the coolest digi cam...",,,26,2024-04-27T18:49:32Z,85486,12522,36,PT29S


## II. Data Preprocessing and Cleaning

### Channel_stats cleaning

In [12]:
# Check the dtype of each channel_stats column before converting the counts to numbers
channel_stats.dtypes

channelName    object
subscribers    object
totalViews     object
totalVideos    object
playlistId     object
dtype: object

In [13]:
# The count columns are stored as object dtype; cast each of them to a numeric dtype.
for count_col in ('subscribers', 'totalViews', 'totalVideos'):
    channel_stats[count_col] = pd.to_numeric(channel_stats[count_col])
channel_stats.dtypes

channelName    object
subscribers     int64
totalViews      int64
totalVideos     int64
playlistId     object
dtype: object

In [14]:
# Check for null values: one boolean per column, True if that column has at least one null
channel_stats.isnull().any()

channelName    False
subscribers    False
totalViews     False
totalVideos    False
playlistId     False
dtype: bool

In [15]:
# TODO: consider adding derived columns here (which columns would be useful to have?)

### videos_df cleaning

In [16]:
# Check if null values exist: one boolean per column, True if that column has at least one null
videos_df.isnull().any()

video_id        False
channelTitle    False
title           False
description     False
tags             True
categoryId      False
publishedAt     False
viewCount       False
likeCount        True
commentCount    False
duration        False
dtype: bool

In [17]:
# Check the dtype of each videos_df column before converting the counts to numbers
videos_df.dtypes

video_id        object
channelTitle    object
title           object
description     object
tags            object
categoryId      object
publishedAt     object
viewCount       object
likeCount       object
commentCount    object
duration        object
dtype: object

In [18]:
# The count columns are stored as object dtype; cast each of them to a numeric dtype.
for count_col in ('viewCount', 'likeCount', 'commentCount'):
    videos_df[count_col] = pd.to_numeric(videos_df[count_col])

In [19]:
# format duration column using regular expressions
import re

def duration_in_sec(duration):
    """Convert an ISO-8601 style duration string (e.g. ``'PT34M43S'``) to seconds.

    Parameters
    ----------
    duration : str or None
        Duration text made of ``<number><unit>`` parts, where the unit is
        ``W`` (weeks), ``D`` (days), ``H`` (hours), ``M`` (minutes) or
        ``S`` (seconds). Missing parts count as 0.

    Returns
    -------
    float
        Total number of seconds, or NaN when ``duration`` is None (a video
        whose duration was missing), so the row can be treated as missing
        data instead of raising.
    """
    if duration is None:
        return float('nan')

    # Seconds contributed by one unit of each component. Days and weeks are
    # included so durations of 24 hours or more are not undercounted.
    unit_seconds = (('W', 604800), ('D', 86400), ('H', 3600), ('M', 60), ('S', 1))

    total = 0
    for unit, factor in unit_seconds:
        # Regular expression to extract the number in front of this unit letter
        match = re.search(r'(\d+)' + unit, duration)
        if match:
            total += int(match.group(1)) * factor

    # Return the total seconds as float
    return float(total)

In [20]:
# Convert every duration string to seconds, then show both columns side by side.
videos_df['duration_in_sec'] = videos_df['duration'].apply(duration_in_sec)
videos_df.loc[:, ['duration_in_sec', 'duration']]

Unnamed: 0,duration_in_sec,duration
0,2083.0,PT34M43S
1,61.0,PT1M1S
2,64.0,PT1M4S
3,1702.0,PT28M22S
4,69.0,PT1M9S
...,...,...
468,1568.0,PT26M8S
469,29.0,PT29S
470,58.0,PT58S
471,29.0,PT29S


In [21]:
# format publishedAt date
# No `!pip install` here: python-dateutil is already installed as a pandas dependency.
from dateutil import parser  # no longer used by this cell; kept in case later cells rely on `parser`

# pd.to_datetime converts the whole column at once and accepts a column that is
# already datetime, so re-running this cell does not fail the way a per-row
# parser.parse on already-parsed values does.
videos_df['publishedAt'] = pd.to_datetime(videos_df['publishedAt'], utc=True)
# day_name() returns English weekday names regardless of the system locale
videos_df['publishDay'] = videos_df['publishedAt'].dt.day_name()



In [22]:
# Inspect the weekday name derived from each video's publishedAt timestamp
videos_df['publishDay']

0         Sunday
1         Monday
2      Wednesday
3        Tuesday
4         Monday
         ...    
468     Saturday
469       Monday
470    Wednesday
471     Saturday
472       Monday
Name: publishDay, Length: 473, dtype: object

In [23]:
# Number of tags per video; a video whose tags value is None counts as 0 tags.
def _count_tags(tags):
    """Return len(tags), or 0 when tags is None."""
    if tags is None:
        return 0
    return len(tags)

videos_df['tagCount'] = videos_df['tags'].apply(_count_tags)
videos_df['tagCount']

0      10
1       0
2       0
3      26
4       0
       ..
468     4
469     0
470     0
471     0
472     0
Name: tagCount, Length: 473, dtype: int64

In [24]:
# Inspect the cleaned videos frame, now including duration_in_sec, publishDay and tagCount
videos_df

Unnamed: 0,video_id,channelTitle,title,description,tags,categoryId,publishedAt,viewCount,likeCount,commentCount,duration,duration_in_sec,publishDay,tagCount
0,ieE5Mt3qF5Q,Apple Cheeks,Last Days In My Office 🎀 5 years running my sm...,"hello~✸\nAs I prepare for my move, I wanted to...","[studio vlog, apple cheeks, tiffany tan, small...",22,2025-04-27 14:08:47+00:00,39380,2161.0,85,PT34M43S,2083.0,Sunday,10
1,Z7n1VQjWZOY,Apple Cheeks,Maybe you’ll make better art if you limit your...,,,22,2025-04-14 16:13:40+00:00,16924,1561.0,8,PT1M1S,61.0,Monday,0
2,ZVgNPKu4S3g,Apple Cheeks,Remember to Play in Your Sketchbook! #sketchbook,,,22,2025-04-09 17:24:58+00:00,15523,1340.0,13,PT1M4S,64.0,Wednesday,0
3,T1Fs1x99qXM,Apple Cheeks,"Solo Trip to SF 🌉🌼 visiting the MOMA, work tri...",Head to https://squarespace.com/applecheeks to...,"[artist diaries, artist vlog, san francisco, t...",22,2025-04-08 16:02:18+00:00,25142,1707.0,101,PT28M22S,1702.0,Tuesday,26
4,cCl64M45eaQ,Apple Cheeks,Why you should draw something more than once #...,,,22,2025-03-31 16:08:11+00:00,19093,1198.0,12,PT1M9S,69.0,Monday,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
468,Dy06oEg-70E,linh truong,"solo traveling in japan vlog 🎏 tokyo cafes, sh...",thank you so much to All Nippon Airways for cr...,"[linh truong japan, linh truong vlog, linh tru...",19,2024-05-25 22:30:12+00:00,294609,13470.0,271,PT26M8S,1568.0,Saturday,4
469,mo3jJCasYnQ,linh truong,my BIGGEST nightmare as a commuter…,,,26,2024-05-20 13:52:58+00:00,33335,3087.0,20,PT29S,29.0,Monday,0
470,Sg7SWtwkAI8,linh truong,COLLEGE STUDENTS! this is your chance to win u...,,,26,2024-05-15 20:42:30+00:00,250562,2621.0,8,PT58S,58.0,Wednesday,0
471,lJCOOQZvba4,linh truong,"the nikon coolpix sq, aka the coolest digi cam...",,,26,2024-04-27 18:49:32+00:00,85486,12522.0,36,PT29S,29.0,Saturday,0


In [25]:
# Inspect the cleaned channel statistics frame (count columns converted to numbers)
channel_stats

Unnamed: 0,channelName,subscribers,totalViews,totalVideos,playlistId
0,Apple Cheeks,222000,16775213,219,UUQ9XXjX0eNReH28Ass8E-hw
1,Brett Park,371000,228490154,510,UULQ2ZDS28QVRLjPaaMEtgOw
2,SamDoesArts,1750000,156509558,251,UUNNOvB507MRfny7Jcv8MmOw
3,Fructus Illustrations,75100,3365203,92,UUnGP1UqWcHDuA8XGkRPB8mQ
4,doodlelifebyTammy,598,129328,119,UUQ6cvTxLbPbGHtT5mo7PClA
5,Gawx Art,2440000,140470282,132,UUFDxyA1H3VEN0VQwfMe2VMQ
6,brokendraw,56500,1380059,22,UU2nUVVyQcQP0Y_DyyVTltng
7,Josh Art,694000,42097114,161,UUVlbtV-0IzNltDFmSsRxbrQ
8,Uncomfy,526000,49165305,205,UUQuRK-VFyCjf3VCqZkCDveA
9,linh truong,1240000,69001674,194,UUDivkE8Ckgk14fu1XChEp7w


## III. Exporting the Datasets

In [26]:
# Write both cleaned frames to CSV files in the current working directory.
# NOTE(review): videos_dataset.csv is written without the row index, while
# channel_dataset.csv keeps it (index = True), so the channel file gets an extra
# unnamed index column when read back - confirm this difference is intended.
videos_df.to_csv('videos_dataset.csv',index=False)
channel_stats.to_csv('channel_dataset.csv', index = True)