# YouTube Ranking Analysis

In [None]:
import glob
import json
import math
import os
import pickle
import time

import dateutil.parser
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from IPython.display import IFrame
from matplotlib import pyplot as plt
from scipy import stats
from scipy.stats import linregress

In [None]:
sns.set(style="darkgrid")

In [None]:
# Prefer the YOUTUBE_API_KEY environment variable so the real key never has to
# be written into the notebook; the placeholder remains the fallback value.
api_key = os.environ.get("YOUTUBE_API_KEY", "YOUR YOUTUBE API KEY")

In [None]:
def fetch_videos(query):
    """
    Search the Youtube API for videos matching `query`.

    query = The search string passed to the API as `q`
    Requests up to 50 results of type `video` with region code and
    relevance language set to `de`.
    Returns a list of Youtube video IDs. If the response does not have the
    expected structure (e.g. an error payload without `items`), the response
    is printed and an empty list is returned.
    """
    api_params = {
        'q': query,
        'maxResults': 50,
        'key': api_key,
        'part':'id',
        'type':'video',
        'regionCode':'de',
        'relevanceLanguage':'de'
    }
    api_url = 'https://www.googleapis.com/youtube/v3/search'
    r = requests.get(api_url, params=api_params)
    videos = r.json()
    try:
        ret = [item['id']['videoId'] for item in videos['items']]
    except (KeyError, TypeError):
        # Only the lookup failures caused by an unexpected payload are handled
        # here; anything else (e.g. KeyboardInterrupt) propagates normally.
        ret = []
        print(videos)
    return ret

In [None]:
video_ids = fetch_videos("smx münchen")

In [None]:
video_ids[0:4]

In [None]:
video_id = 'yayI5-kPTkg'
url = "https://www.youtube.com/embed/"+video_id+"?rel=0&amp;controls=0&amp;showinfo=0"
IFrame(url, width=560, height=315)

In [None]:
def fetch_video_details(video_id):
    """
    Fetch `snippet` and `statistics` for one or more videos from the Youtube API.

    video_id = A single video ID string, or a list/tuple of IDs (joined with
               commas into a single request)
    Returns the decoded JSON response
    """
    # isinstance also accepts list subclasses and tuples, unlike `type(x) is list`
    if isinstance(video_id, (list, tuple)):
        video_id = ','.join(video_id)
    api_params = {'id': video_id, 'key': api_key, 'part':'snippet,statistics'}
    api_url = 'https://www.googleapis.com/youtube/v3/videos'
    r = requests.get(api_url, params=api_params)
    return r.json()

In [None]:
fetch_video_details('yayI5-kPTkg')

In [None]:
def fetch_and_store(query):
    """
    Fetch videos for `query` and store the JSON results in folder youtube/results/

    query = The search string; it is also used verbatim as the file name
            (`youtube/results/<query>.json`)
    The folder is created if it does not exist yet.
    Returns nothing
    """
    results_dir = "youtube/results"
    video_ids = fetch_videos(query)
    j = fetch_video_details(video_ids)
    # Without this, the first run on a fresh checkout fails with FileNotFoundError
    os.makedirs(results_dir, exist_ok=True)
    filename = results_dir + "/" + query + ".json"
    with open(filename, "w", encoding="utf-8") as json_out:
        json.dump(j, json_out)

In [1]:
queries = ["put your", "queries here"]

In [None]:
# Uncomment to retrieve results from the API
# Not needed for this tutorial
#for q in queries:
#    print(q)
#    fetch_and_store(q)
#    time.sleep(3)

## Read ranking data

In [None]:
# Build one row per video from every stored API response. A video's rank is
# its 1-based position within the `items` list of its response file.
data = []
yt_files = glob.glob("youtube/results/*.json")
for yt in yt_files:
    # The context manager closes each file instead of leaking the handle
    with open(yt, 'r', encoding='utf-8') as json_in:
        j = json.load(json_in)
    data.extend( [ {
        'query':yt,  # path of the result file, not the bare query string
        'rank':i+1,
        'id': item['id'],
        'published':item['snippet']['publishedAt'],
        'title':item['snippet']['title'],
        'views':int(item['statistics']['viewCount']),
        # likes, comments and tags may be absent from an item; fall back to 0 / ''
        'likes':int(item['statistics'].get('likeCount',0)),
        'comments':int(item['statistics'].get('commentCount',0)),
        'tags':item.get('snippet',{}).get('tags','')
    } for i, item in enumerate(j['items'])] )
df = pd.DataFrame(data)

In [None]:
df.sample(n=5)

## views

In [None]:
plt.figure(figsize=(15,8))
sns.lineplot(x="rank", y="views", data=df);

In [None]:
linregress(df['rank'],df['views'])

## Mean and Outliers

In [None]:
def create_mean_data(df, key_col, mean_col, start_index, size, algo='mean', ignore_zero=True, min_values=0):
    """
    Aggregate the values of `mean_col` per key of `key_col`.

    df          = The DataFrame holding both columns
    key_col     = The key to group the values
    mean_col    = The column with the values to build the mean of
    start_index = The lowest value of key_col
    size        = The keys start_index .. start_index + size - 1 are processed
    algo        = Calculate 'median'; any other value calculates the mean
    ignore_zero = Should values that are not greater than zero be ignored?
    min_values  = Keys with fewer collected values than this get the value 0
    Returns a DF with one row per key and the columns 'position' and
    `mean_col`. A key without any values yields NaN (unless min_values turns
    it into 0). Rows whose key is not one of the processed keys are skipped.
    """
    positions = range(start_index, start_index + size)
    # One list of collected values per processed key
    buckets = {position: [] for position in positions}
    # A frame without rows may also lack the columns, so do not touch them then
    if len(df):
        for key, value in zip(df[key_col], df[mean_col]):
            if ignore_zero and not value > 0:
                continue
            # Membership test skips out-of-range and non-integer keys alike
            if key in buckets:
                buckets[key].append(value)
    aggregate = np.median if algo == 'median' else np.mean
    rows = []
    for position in positions:
        bucket = buckets[position]
        if len(bucket) < min_values:
            result = 0
        elif not bucket:
            # Same NaN result as aggregating an empty list, minus the RuntimeWarning
            result = np.nan
        else:
            result = aggregate(bucket)
        rows.append({
            'position': position,
            mean_col: result
        })
    return pd.DataFrame(rows)

In [None]:
mean_df = create_mean_data(df, 'rank', 'views', 1, 50, algo="median")
mean_df.head()

In [None]:
plt.figure(figsize=(15,8))
sns.lineplot(x="position", y="views", data=mean_df)

In [None]:
linregress(mean_df['position'],mean_df['views'])

## likes

In [None]:
plt.figure(figsize=(15,8))
sns.lineplot(x="rank", y="likes", data=df);

In [None]:
linregress(df['rank'],df['likes'])

In [None]:
mean_df = create_mean_data(df, 'rank', 'likes', 1, 50, algo="median")
mean_df.head()

In [None]:
plt.figure(figsize=(15,8))
sns.lineplot(x="position", y="likes", data=mean_df)

In [None]:
linregress(mean_df['position'],mean_df['likes'])

## comments

In [None]:
mean_df = create_mean_data(df, 'rank', 'comments', 1, 50, algo="median")
mean_df.head()

In [None]:
plt.figure(figsize=(15,8))
sns.lineplot(x="position", y="comments", data=mean_df)

In [None]:
linregress(mean_df['position'],mean_df['comments'])

## age

In [None]:
def get_date_diff(d1, d2):
    """
    Absolute distance between two timestamps, measured in days.

    d1, d2 = Date strings in ISO 8601 format, e.g. "2019-03-25T16:00:00Z"
    Returns the difference as a float; the argument order does not matter
    """
    seconds_per_day = 86400
    first = dateutil.parser.parse(d1)
    second = dateutil.parser.parse(d2)
    elapsed = abs(first - second)
    return elapsed.total_seconds() / seconds_per_day

In [None]:
get_date_diff('2019-03-25T16:00:00Z','2019-03-25T04:00:00Z')

In [None]:
# Timestamp used as "now" when computing each video's age
fetch_date = '2019-03-25T16:00:00Z'
# One row per video: rank within its result file, age in days, and title
data = []
yt_files = glob.glob("youtube/results/*.json")
for yt in yt_files:
    # The context manager closes each file instead of leaking the handle
    with open(yt, 'r', encoding='utf-8') as json_in:
        j = json.load(json_in)
    data.extend( [ {
        'rank':i+1,
        'ago':get_date_diff(item['snippet']['publishedAt'],fetch_date),
        'title':item['snippet']['title'],
    } for i, item in enumerate(j['items'])] )
df = pd.DataFrame(data)

In [None]:
df.sample(n=5)

In [None]:
plt.figure(figsize=(15,8))
sns.lineplot(x="rank", y="ago", data=df);

In [None]:
linregress(df['rank'],df['ago'])

In [None]:
mean_df = create_mean_data(df, 'rank', 'ago', 1, 50, algo="mean")
mean_df.head()

In [None]:
plt.figure(figsize=(15,8))
sns.lineplot(x="position", y="ago", data=mean_df)

In [None]:
linregress(mean_df['position'],mean_df['ago'])

## Channels

In [None]:
# Timestamp used as "now" when computing each video's age
fetch_date = '2019-03-25T16:00:00Z'
# One row per video: rank within its result file, channel ID, age in days,
# title and tags
data = []
yt_files = glob.glob("youtube/results/*.json")
for yt in yt_files:
    # The context manager closes each file instead of leaking the handle
    with open(yt, 'r', encoding='utf-8') as json_in:
        j = json.load(json_in)
    data.extend( [ {
        'rank':i+1,
        'channel':item['snippet'].get('channelId',''),
        'ago':get_date_diff(item['snippet']['publishedAt'],fetch_date),
        'title':item['snippet']['title'],
        'tags':item.get('snippet',{}).get('tags','')
    } for i, item in enumerate(j['items'])] )
df = pd.DataFrame(data)

In [None]:
df.sample(n=5)

In [None]:
def fetch_channel_details(channel_id):
    """
    Fetch `snippet` and `statistics` for one or more channels from the Youtube API.

    channel_id = A single channel ID string, or a list/tuple of IDs (joined
                 with commas into a single request)
    Returns the decoded JSON response
    """
    # isinstance also accepts list subclasses and tuples, unlike `type(x) is list`
    if isinstance(channel_id, (list, tuple)):
        channel_id = ','.join(channel_id)
    api_params = {'id': channel_id, 'key': api_key, 'part':'snippet,statistics'}
    api_url = 'https://www.googleapis.com/youtube/v3/channels'
    r = requests.get(api_url, params=api_params)
    return r.json()

In [None]:
fetch_channel_details('UClEYc6k6bQTC6wO7YSZ3Y2A')

In [None]:
# Map every channel ID found in df to its own empty placeholder dict
channels = {row["channel"]: {} for _, row in df.iterrows()}

In [None]:
# Uncomment to fetch channel data via Youtube API
# Update channel statistics stored in channels
#i = 0
#for ch_id in channels:
#    if channels[ch_id] == {}:
#        print(i, ch_id)
#        j = fetch_channel_details(ch_id)
#        channels[ch_id] = j['items'][0]['statistics']
#        i = i + 1
#        if i >= 300:
#            break
#        time.sleep(2)
#
#with open('youtube/channels.pickle3', 'wb') as f:
#    pickle.dump(channels, f)

In [None]:
# Load the channel statistics cache from youtube/channels.pickle3, replacing
# the placeholder dict built above.
# NOTE(review): unpickling can execute arbitrary code - only load a pickle
# file that you created yourself.
with open('youtube/channels.pickle3', 'rb') as f:
    channels = pickle.load(f)

In [None]:
len([k for k in channels if channels[k] != {}])

In [None]:
channels['UClEYc6k6bQTC6wO7YSZ3Y2A']

In [None]:
fetch_date = '2019-03-25T16:00:00Z'
# One row per video, enriched with the statistics of its channel
data = []
yt_files = glob.glob("youtube/results/*.json")
for yt in yt_files:
    # The context manager closes each file instead of leaking the handle
    with open(yt, 'r', encoding='utf-8') as json_in:
        j = json.load(json_in)
    for i, item in enumerate(j['items']):
        channel_id = item['snippet'].get('channelId','')
        # Channels that are unknown or still hold the empty placeholder count
        # as 0 instead of raising, matching the likeCount/commentCount defaults
        channel_stats = channels.get(channel_id) or {}
        data.append({
            'rank':i+1,
            'channel':channel_id,
            'subscribers':int(channel_stats.get('subscriberCount',0)),
            'views':int(channel_stats.get('viewCount',0)),
            'videos':int(channel_stats.get('videoCount',0)),
            'title':item['snippet']['title'],
        })
df = pd.DataFrame(data)
df.sample(n=5)

### Channel Subscribers

In [None]:
plt.figure(figsize=(15,8))
sns.lineplot(x="rank", y="subscribers", data=df);

In [None]:
linregress(df['rank'],df['subscribers'])

In [None]:
mean_df = create_mean_data(df, 'rank', 'subscribers', 1, 50, algo="median")
mean_df.head()

In [None]:
plt.figure(figsize=(15,8))
sns.lineplot(x="position", y="subscribers", data=mean_df)

In [None]:
linregress(mean_df['position'],mean_df['subscribers'])

### Channel Views

In [None]:
mean_df = create_mean_data(df, 'rank', 'views', 1, 50, algo="median")
mean_df.head()

In [None]:
plt.figure(figsize=(15,8))
sns.lineplot(x="position", y="views", data=mean_df)

In [None]:
linregress(mean_df['position'],mean_df['views'])

### Channel Videos

In [None]:
mean_df = create_mean_data(df, 'rank', 'videos', 1, 50, algo="median")
mean_df.head()

In [None]:
plt.figure(figsize=(15,8))
sns.lineplot(x="position", y="videos", data=mean_df)

In [None]:
linregress(mean_df['position'],mean_df['videos'])

## Title and Tags

In [None]:
def text_score(query, text):
    """
    Rate how well `text` (e.g. a title or a tag) matches `query`.

    The comparison is case-insensitive. The score is 4 if the whole query
    occurs in the text, plus 1 for each whitespace-separated query word
    that occurs in the text (substring match).
    """
    needle = query.lower()
    haystack = text.lower()
    exact_bonus = 4 if needle in haystack else 0
    word_hits = sum(1 for word in needle.split() if word in haystack)
    return exact_bonus + word_hits

In [None]:
text_score('großer arber', 'Skiing Großer Arber GoPro 2017')

In [None]:
fetch_date = '2019-03-25T16:00:00Z'
# One row per video with the text scores of its title and of its best tag
data = []
yt_files = glob.glob("youtube/results/*.json")
for yt in yt_files:
    # The context manager closes each file instead of leaking the handle
    with open(yt, 'r', encoding='utf-8') as json_in:
        j = json.load(json_in)
    # The file name without folder and extension is the query; os.path copes
    # with OS-specific separators, unlike replacing a literal path prefix
    query = os.path.splitext(os.path.basename(yt))[0]
    data.extend( [
        {
         'rank':i+1,
         'query':query,
         'title':item['snippet']['title'],
         'titleScore':text_score(query,item['snippet']['title']),
         # default=0 covers videos without any tags
         'tagScore':max([text_score(query,tag) for tag in item['snippet'].get('tags','')], default=0)
        } for i, item in enumerate(j['items'])] )
df = pd.DataFrame(data)
df.sample(n=5)

In [None]:
mean_df = create_mean_data(df, 'rank', 'titleScore', 1, 50, ignore_zero=False)
mean_df.head()

In [None]:
plt.figure(figsize=(15,8))
sns.lineplot(x="position", y="titleScore", data=mean_df)

In [None]:
linregress(mean_df['position'],mean_df['titleScore'])

In [None]:
mean_df = create_mean_data(df, 'rank', 'tagScore', 1, 50, ignore_zero=False)
mean_df.head()

In [None]:
plt.figure(figsize=(15,8))
sns.lineplot(x="position", y="tagScore", data=mean_df)

In [None]:
linregress(mean_df['position'],mean_df['tagScore'])

# Summary

In [None]:
# Correlation value per examined ranking factor, given column-wise; entries
# at the same index of both lists belong together.
summary = pd.DataFrame({
    'name': [
        'Video Views',
        'Video Likes',
        'Video Kommentare',
        'Channel Views',
        'Video Alter',
        'Channel Subscriber',
        'Channel Anz. Videos',
        'Video Titel Score',
        'Video Tag Score',
    ],
    'correlation': [0.80, 0.77, 0.55, 0.49, 0.35, 0.31, 0.33, -0.10, -0.11],
})

In [None]:
plt.figure(figsize=(15,8))
sns.barplot(x="correlation", y="name", data=summary, palette="Blues_d")