# Drake vs. Kendrick Lamar - YouTube
> This notebook downloads, processes and analyzes videos posted to YouTube by rappers Drake and Kendrick Lamar. It also collects statistics, such as views, for each video in an effort to gauge who's winning the diss track battle.

#### Load Python tools and Jupyter config

In [1]:
import os
import json
import requests
import pandas as pd
import jupyter_black
import altair as alt
from tqdm.notebook import tqdm
from datetime import datetime
import altair_stiles as altstiles
from googleapiclient.discovery import build

In [2]:
# Turn on the jupyter_black extension for this notebook session.
jupyter_black.load()

# Raise pandas' display limits so wide and long frames are shown in full.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

# Register the "stiles" Altair theme and make it the active one.
alt.themes.register("stiles", altstiles.theme)
alt.themes.enable("stiles")

ThemeRegistry.enable('stiles')

In [3]:
# Date stamp (YYYYMMDD) for this run, taken from the machine's local clock.
today = pd.Timestamp("today").strftime("%Y%m%d")

# Current instant as a timezone-aware US/Eastern timestamp, used as the
# reference point for "time since publication". It is built timezone-aware from
# the start: localizing a naive UTC clock reading as Eastern would keep the UTC
# wall-clock digits and mislabel the instant by the UTC offset (4-5 hours).
now = pd.Timestamp.now(tz="US/Eastern")

---

## YouTube

#### Initialize YouTube API

In [4]:
# The API key comes from the YOUTUBE_KEY environment variable (None if unset).
api_key = os.environ.get("YOUTUBE_KEY")

# Client for version "v3" of the "youtube" service, using that key.
youtube = build(serviceName="youtube", version="v3", developerKey=api_key)

#### Function to get videos from a channel

In [5]:
def get_channel_videos(channel_id):
    """Return every item of a channel's "uploads" playlist.

    Uses the module-level ``youtube`` client. The channel's uploads playlist
    ID is looked up first, then the playlist is read page by page (50 items
    per request) until the API stops returning a ``nextPageToken``.

    Args:
        channel_id: YouTube channel ID to look up.

    Returns:
        A list of the raw playlist-item dicts (``snippet`` part) across all
        pages, in the order the API returned them.

    Note:
        The first entry of the channel response is indexed directly, with no
        check that the lookup returned a channel.
    """
    channel_response = (
        youtube.channels().list(id=channel_id, part="contentDetails").execute()
    )
    channel_details = channel_response["items"][0]["contentDetails"]
    uploads_playlist_id = channel_details["relatedPlaylists"]["uploads"]

    collected = []
    page_token = None  # None asks the API for the first page

    while True:
        page_request = youtube.playlistItems().list(
            playlistId=uploads_playlist_id,
            part="snippet",
            maxResults=50,
            pageToken=page_token,
        )
        page = page_request.execute()
        collected.extend(page["items"])

        # A missing nextPageToken means this was the last page.
        page_token = page.get("nextPageToken")
        if page_token is None:
            break

    return collected

#### Fetch each artist's videos by YouTube channel ID

In [6]:
# https://www.youtube.com/@kendricklamar
# Channel IDs behind each artist's channel page.
KENDRICK_CHANNEL_ID = "UC3lBXcrKFnFAFkfVk5WuKcQ"  # https://www.youtube.com/@kendricklamar
DRAKE_CHANNEL_ID = "UCByOQJjav0CUDwxCk-jVNRQ"  # https://www.youtube.com/@DrakeOfficial

# Pull the full upload list for each channel.
kendrick_videos = get_channel_videos(KENDRICK_CHANNEL_ID)
drake_videos = get_channel_videos(DRAKE_CHANNEL_ID)

#### Create dataframe with video lists

In [7]:
# Reduce each playlist item to the snippet fields used downstream: title,
# publish timestamp, video ID and channel name. The snippet's description is
# not collected.
video_data = [
    {
        "title": snippet["title"],
        "published_at": snippet["publishedAt"],
        "video_id": snippet["resourceId"]["videoId"],
        "channel": snippet["channelTitle"],
    }
    for snippet in (item["snippet"] for item in kendrick_videos + drake_videos)
]

In [8]:
# One row per playlist item, with columns title, published_at, video_id, channel.
df_videos = pd.DataFrame(video_data)

In [9]:
# Per-artist slices of the video list, each re-indexed from zero.
# NOTE(review): neither frame appears to be referenced again in the visible cells.
is_drake = df_videos["channel"] == "Drake"
is_kendrick = df_videos["channel"] == "Kendrick Lamar"

d = df_videos[is_drake].reset_index(drop=True)
k = df_videos[is_kendrick].reset_index(drop=True)

In [10]:
# Plain Python list of every video ID, in dataframe order.
vid_list = list(df_videos["video_id"])

In [11]:
def get_video_statistics(video_id, api_key):
    """Fetch the ``statistics`` part for one video from the YouTube Data API.

    Args:
        video_id: ID of the video to look up.
        api_key: API key, sent as the ``key`` query parameter.

    Returns:
        The ``statistics`` mapping of the first item in the response, or
        ``None`` when the response contains no items.

    Raises:
        requests.exceptions.Timeout: If the API does not respond within
            30 seconds.
    """
    response = requests.get(
        "https://www.googleapis.com/youtube/v3/videos",
        # Let requests build and escape the query string.
        params={"part": "statistics", "id": video_id, "key": api_key},
        # Without a timeout a stalled connection would block forever.
        timeout=30,
    )
    items = response.json().get("items")
    return items[0]["statistics"] if items else None

In [12]:
dfs = []

# Fetch statistics one video at a time (with a progress bar) and turn each
# result into a single-row frame tagged with its video ID.
for video_id in tqdm(vid_list):
    stats_row = pd.DataFrame(get_video_statistics(video_id, api_key), index=[0])
    dfs.append(stats_row.assign(video_id=video_id))

  0%|          | 0/302 [00:00<?, ?it/s]

In [13]:
# Stack the single-row frames into one table with a fresh 0..n-1 index.
stats = pd.concat(dfs, ignore_index=True)

In [14]:
# Preview the first rows of the statistics table.
stats.head()

Unnamed: 0,viewCount,likeCount,favoriteCount,commentCount,video_id
0,24409336,1664927,0,219754,T6eK-2OQtew
1,19205512,1260128,0,148694,2QiFl9Dc7D0
2,17452147,1110894,0,125952,NPqDIwWMtxg
3,24304072,821873,0,14714,5GhhVHpPR_M
4,31062920,649907,0,18158,toBTPGfurLc


---

## Merge

#### Combine video list with statistics

In [15]:
# Join the video list to its statistics on video_id, then discard the
# favoriteCount column.
merged = pd.merge(df_videos, stats, on="video_id")
combined_df = merged.drop(columns="favoriteCount")

In [16]:
# Parse the publish timestamps once and reuse the result.
# NOTE(review): tz_convert needs timezone-aware values, so this assumes the
# published_at strings carry a UTC offset — confirm against the API output.
published = pd.to_datetime(combined_df["published_at"])

combined_df["pub_datetime"] = published.dt.tz_convert("US/Eastern")
combined_df["published_at"] = published

# Split the Eastern timestamp into separate date and time-of-day columns.
eastern = combined_df["pub_datetime"].dt
combined_df["pub_date"] = eastern.date
combined_df["pub_time"] = eastern.time

#### Calculate hours and days since publication

In [17]:
# Time from publication to `now`, expressed in hours and rounded to a whole number.
elapsed_seconds = (now - combined_df["pub_datetime"]).dt.total_seconds()
combined_df["hours_since_pub"] = (elapsed_seconds / 3600).round()

In [18]:
# Time from publication to `now`, expressed in days and rounded to a whole number.
seconds_per_day = 3600 * 24
elapsed_seconds = (now - combined_df["pub_datetime"]).dt.total_seconds()
combined_df["days_since_pub"] = (elapsed_seconds / seconds_per_day).round()

In [19]:
# Statistic columns that must be present for a video to be kept.
val_cols = ["viewCount", "likeCount", "commentCount"]

In [20]:
# Keep only videos that have a value for every metric in val_cols; copy so
# later column assignments act on an independent frame.
has_all_metrics = combined_df[val_cols].notna().all(axis=1)
combined_df = combined_df[has_all_metrics].copy()

In [21]:
# days_since_pub is rounded to a whole number, so a video published within
# roughly the last half day has 0 days and would get an infinite per-day rate.
# Treat every video as at least one day old when computing rates.
days_live = combined_df["days_since_pub"].clip(lower=1)

# Per-day rate for each metric, rounded to a whole number.
for metric_col, rate_col in (
    ("viewCount", "views_per_day"),
    ("likeCount", "likes_per_day"),
    ("commentCount", "comments_per_day"),
):
    combined_df[rate_col] = (combined_df[metric_col].astype(int) / days_live).round()

In [22]:
# Final table: drop the raw timestamp columns, rank videos by daily views
# (highest first) and renumber the rows from zero.
without_timestamps = combined_df.drop(columns=["published_at", "pub_datetime"])
df = without_timestamps.sort_values(by="views_per_day", ascending=False).reset_index(
    drop=True
)

In [23]:
# Store the three count columns as integers.
df = df.astype({"viewCount": int, "likeCount": int, "commentCount": int})

In [28]:
# Scatter plot of every video: daily views on x, age in days on a log-scaled y,
# coloured by channel and sized by daily views.
chart_columns = [
    "title",
    "channel",
    "viewCount",
    "likeCount",
    "commentCount",
    "views_per_day",
    "comments_per_day",
    "days_since_pub",
]

# NOTE(review): "2s" is used as the axis number format; if "two significant
# digits with an SI suffix" was intended, the d3 spelling is ".2s" — confirm
# against the rendered axis.
views_axis = alt.X(
    "views_per_day",
    title="Views per day",
    axis=alt.Axis(format="2s", tickCount=10),
)
age_axis = alt.Y(
    "days_since_pub",
    title="Days since published on YouTube",
    axis=alt.Axis(grid=False),
).scale(type="log")

# Left as the cell's final expression so the notebook renders the chart.
(
    alt.Chart(df[chart_columns], padding={"left": 5})
    .mark_circle()
    .encode(
        x=views_axis,
        y=age_axis,
        color="channel",
        size="views_per_day",
        tooltip=["title", "channel", "views_per_day", "days_since_pub"],
    )
    .properties(width=900, height=500)
)

In [25]:
# Preview the top rows of the ranked table (highest views_per_day first).
df.head()

Unnamed: 0,title,video_id,channel,viewCount,likeCount,commentCount,pub_date,pub_time,hours_since_pub,days_since_pub,views_per_day,likes_per_day,comments_per_day
0,Not Like Us,T6eK-2OQtew,Kendrick Lamar,24409336,1664927,219754,2024-05-04,19:50:43,80.0,3.0,8136445.0,554976.0,73251.0
1,THE HEART PART 6 - DRAKE,HJeY-FXidDQ,Drake,14047182,653035,180041,2024-05-05,21:19:17,54.0,2.0,7023591.0,326518.0,90020.0
2,meet the grahams,2QiFl9Dc7D0,Kendrick Lamar,19205512,1260128,148694,2024-05-03,23:58:26,100.0,4.0,4801378.0,315032.0,37174.0
3,DRAKE - FAMILY MATTERS,ZkXG3ZrXlbc,Drake,17398174,720349,125432,2024-05-03,23:06:01,100.0,4.0,4349544.0,180087.0,31358.0
4,euphoria,NPqDIwWMtxg,Kendrick Lamar,17452147,1110894,125952,2024-04-30,11:23:30,184.0,8.0,2181518.0,138862.0,15744.0


---

## Exports

#### JSON

In [26]:
# Make sure the output folder exists so the export does not fail when the
# directory has not been created yet.
os.makedirs("data/processed", exist_ok=True)

# One JSON object per video, pretty-printed.
df.to_json(
    "data/processed/drake_vs_kendrick_lamar_youtube.json",
    indent=4,
    orient="records",
)

#### CSV

In [27]:
# Make sure the output folder exists so the export does not fail when the
# directory has not been created yet.
os.makedirs("data/processed", exist_ok=True)

# Same table as CSV, without the row index.
df.to_csv("data/processed/drake_vs_kendrick_lamar_youtube.csv", index=False)