In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
import collections
from sklearn import preprocessing
import spacy

In [None]:
# Load both Google Trends exports in one pass. header=1 asks pandas to take
# the column labels from row index 1 instead of the first row of each file.
hololive_us, hololive_katakana_jp = (
    pd.read_csv(csv_path, header=1)
    for csv_path in ('hololive_us.csv', 'hololive_katakana_jp.csv')
)

# Quick structural check of the US frame, then preview its first rows.
hololive_us.info()
hololive_us.head()

In [None]:
def clean_google_trends(df):
    """
    Prepare a Google Trends export for plotting.

    Every column other than 'Week' is converted to a numeric dtype, with the
    '<1' placeholder mapped to 1 first. 'Week' is parsed as a '%Y-%m-%d'
    date. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Week' column of 'YYYY-MM-DD' strings plus one or more
        value columns holding numbers, numeric strings, or '<1'.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy of ``df``.

    Raises
    ------
    KeyError
        If ``df`` has no 'Week' column.
    ValueError
        If a week cannot be parsed with '%Y-%m-%d' or a value is not numeric.
    """
    cleaned = df.copy()
    cleaned['Week'] = pd.to_datetime(cleaned['Week'], format='%Y-%m-%d')

    for column in cleaned.columns.drop('Week'):
        # Swapping only '<1' and '0' would leave every other entry of a string
        # column (e.g. '45') as text, so map the placeholder and then convert
        # the whole column to numbers.
        values = cleaned[column].map(lambda value: 1 if value == '<1' else value)
        cleaned[column] = pd.to_numeric(values)

    return cleaned

# Run the same cleaning helper over the US and the Japanese (katakana) exports.
clean_us, clean_jp = (
    clean_google_trends(raw_frame)
    for raw_frame in (hololive_us, hololive_katakana_jp)
)

In [None]:
# Weekly Google Trends values for the US term and the Japanese katakana term.
# Each series is drawn against the 'Week' column of its own frame so the two
# lines stay correct even if the exports do not cover identical weeks.
google_plot = plt.plot(clean_us['Week'], clean_us['hololive: (United States)'], color='red', label='US')
google_plot = plt.plot(clean_jp['Week'], clean_jp['ホロライブ: (Japan)'], color='skyblue', label='JP (katakana)')
plt.xlabel('Week')
plt.ylabel('Google Trends interest')
plt.legend(loc='upper left')
plt.title('Hololive US & Japan Youtube Popularity (Japanese katakana)')
plt.show()

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

def scrape_playlist(playlist_link, driver_location, file_name):
    """
    Scrape video titles, account names and thumbnail URLs from a YouTube
    playlist page using Selenium, and save them to ``{file_name}.csv``.

    Must use the msedge driver from selenium and pass its local file path as
    driver_location.

    Parameters
    ----------
    playlist_link : str
        URL of the playlist page to open.
    driver_location : str
        Local path of the Edge driver executable.
    file_name : str
        Output file name without the '.csv' extension.

    Returns
    -------
    pandas.DataFrame
        One row per scraped entry with the columns 'title', 'account' and
        'thumbnail'.

    Raises
    ------
    ValueError
        If the video count on the page contains no digits, or if the three
        scraped lists differ in length (raised by pandas when building the
        frame).
    """
    # By-based locators replace the find_element(s)_by_* helpers, which newer
    # Selenium 4 releases no longer provide. Imported here so the function
    # does not depend on another cell having imported it.
    from selenium.webdriver.common.by import By

    # NOTE(review): passing the driver path positionally is only accepted by
    # older Selenium releases - confirm against the installed version.
    driver = webdriver.Edge(driver_location)
    try:
        driver.get(playlist_link)
        driver.maximize_window()
        body = driver.find_element(By.CSS_SELECTOR, 'body')

        ## Get playlist length to determine how many times need to scroll before getting to bottom of page
        ## Approx. 100 videos per load
        count_element = driver.find_element(By.XPATH, '//*[@id="stats"]/yt-formatted-string[1]/span[1]')
        # int() needs the element's text, not the element itself; keep only the
        # digits so a separator such as '1,234' does not break the conversion.
        video_count = int(''.join(ch for ch in count_element.text if ch.isdigit()))
        scrolls = video_count // 100 + 1

        ## Scroll to bottom of page
        for i in range(scrolls):
            body.send_keys(Keys.END)
            time.sleep(3)  # give the next batch of entries time to load
            print(f'[{i/scrolls}] Loading...')

        ## Scrape titles, channels, thumbnails
        # Text and attributes are read while the browser is still open; the
        # elements cannot be queried once the driver has quit.
        print('Getting titles...')
        titles = [element.text for element in driver.find_elements(By.ID, 'video-title')]
        print('Getting accounts...')
        channels = [element.text for element in driver.find_elements(By.XPATH, '//*[@id="text"]/a')]
        # The first match is discarded - presumably it is not a video's channel
        # link (e.g. the playlist owner); TODO confirm against the page.
        channels = channels[1:]
        print('Getting thumbnails')
        thumbnails = [element.get_attribute('src') for element in driver.find_elements(By.ID, 'img')]
    finally:
        # Always close the browser, including when a lookup above fails.
        driver.quit()

    # Named columns; 'title' and 'account' are the names the analysis cells
    # of this notebook read back from the saved CSV.
    df = pd.DataFrame({'title': titles, 'account': channels, 'thumbnail': thumbnails})
    df.to_csv(f'{file_name}.csv')
    print(f'{file_name}.csv saved')

    return df

In [None]:
# Load the saved playlist data; the first CSV column is used as the row index.
df = pd.read_csv('vtuber_playlist.csv', index_col = 0)
# Preview the first rows.
df.head()

In [None]:
# Count the titles recorded for each account, then rank the accounts from
# most to fewest titles.
grp_by = df.groupby('account')[['title']].count()
grp_by_sorted = grp_by.sort_values(by='title', ascending=False)
grp_by_sorted.head()

In [None]:
# Leave out the first row of the ranking (the account with the highest count)
# and plot how the remaining per-account counts are distributed.
grp_by_selected = grp_by_sorted.iloc[1:]
sns.displot(
    data=grp_by_selected,
    stat='percent',
    binwidth=1,
)
plt.title('Count of Repeated Accounts Distribution')

In [None]:
# Character length of every title and account name, stored as new columns.
df['title char count'] = df['title'].map(len)
df['account char count'] = df['account'].map(len)
df.head()

In [None]:
sns.set_style('darkgrid')

# Title lengths in bins of 5 characters, with the mean marked in red.
sns.displot(data=df['title char count'], binwidth=5, color='skyblue')
plt.axvline(df['title char count'].mean(), color='red')

# Account-name lengths with default binning, mean marked in red.
sns.displot(data=df['account char count'], color='coral')
plt.axvline(df['account char count'].mean(), color='red')

In [None]:
# Scale both length columns by their maximum so they share a common range.
title_char = df['title char count']
account_char = df['account char count']

# normalize() works on rows, so each column is passed as a single-row matrix
# and the result comes back with shape (1, n).
title_char_norm = preprocessing.normalize([title_char], norm='max')
account_char_norm = preprocessing.normalize([account_char], norm='max')

# Put the two scaled rows side by side as columns of one frame.
norm = np.column_stack((title_char_norm[0], account_char_norm[0]))
char_norm = pd.DataFrame(norm, columns=['title', 'account'])
char_norm.head()

In [None]:
# Distributions of the max-normalised title and account lengths,
# one figure per column.
sns.set_style('darkgrid')
sns.displot(data=char_norm['title'], color='skyblue')
sns.displot(data=char_norm['account'], color='coral')

In [None]:
# Scatter of title length against account-name length, one point per row of df.
sns.relplot(data=df, x='title char count', y='account char count')

In [None]:
def clean_text(text):
    """
    Function to clean an individual string.

    Brackets ('【', '】', '[', ']') become spaces, '/' becomes ' and ', and the
    result is lower-cased. Brackets are replaced by a space rather than
    removed so that words on either side of one are not fused together once
    the text is split into tokens.

    Parameters
    ----------
    text : str
        Raw text, e.g. a video title.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    brackets_to_space = str.maketrans({'【': ' ', '】': ' ', '[': ' ', ']': ' '})
    return text.translate(brackets_to_space).replace('/', ' and ').lower()

def filter_text(text, stop_words=None):
    """
    Function to get rid of stopwords in a list of words.

    Parameters
    ----------
    text : iterable of str
        Words to filter.
    stop_words : iterable of str, optional
        Words to drop. Defaults to NLTK's English stopword list.

    Returns
    -------
    list of str
        The words of ``text`` that are not stopwords, in their original order.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once instead of re-reading the stopword list for every
    # word; a set also makes each membership test constant-time.
    stop_words = set(stop_words)
    return [word for word in text if word not in stop_words]

def filter_clean_text(text, stop_words=None):
    """
    Combine text cleaning and filtering in one step.

    The text is lower-cased, brackets ('【', '】', '[', ']') become spaces and
    '/' becomes ' and '. The result is split on whitespace, and every token
    that is a stopword or one of 'eng', 'english', 'hololive' is dropped.

    Parameters
    ----------
    text : str
        Raw text, e.g. a video title.
    stop_words : iterable of str, optional
        Words to drop. Defaults to NLTK's English stopword list.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Whole tokens are dropped instead of deleting substrings, so a word that
    # merely contains 'eng' (e.g. 'challenge') is left intact.
    ignored = set(stop_words) | {'eng', 'english', 'hololive'}

    # Lower-case first so 'ENG' and 'Hololive' match the ignored words, and
    # keep working on the same cleaned string through every step.
    brackets_to_space = str.maketrans({'【': ' ', '】': ' ', '[': ' ', ']': ' '})
    cleaned = text.lower().translate(brackets_to_space).replace('/', ' and ')

    return [word for word in cleaned.split() if word not in ignored]

# Join every title into one string, clean it, and split it into tokens.
title_text_cleaned = clean_text(' '.join(df['title'])).split()
# The same tokens with stopwords removed.
title_text_filtered = filter_text(title_text_cleaned)

In [None]:
# The ten most frequent title tokens, first for the cleaned tokens and then
# for the tokens left after stopword removal.
common_before = collections.Counter(title_text_cleaned).most_common(10)
common_stopped = collections.Counter(title_text_filtered).most_common(10)
(common_before, common_stopped)

In [None]:
# Tokenise each title on its own: one list of filtered tokens per title.
list(map(filter_clean_text, df['title']))

In [None]:
# Exploratory cell.
titles = df['title']

# Word frequencies of the single title stored under key 2.
asdf = collections.Counter(titles[2].split())

# Per-account title counts, highest first. This rebinds grp_by, which an
# earlier cell assigned to the unsorted counts.
grp_by = (
    df[['account', 'title']]
    .groupby('account')
    .count()
    .sort_values('title', ascending=False)
)

# All titles joined and split on whitespace, without any cleaning.
' '.join(df['title']).split()