# YouTube Trend Analysis
##### Team Members: Simon, Frank and David

In [1]:
%matplotlib notebook
%matplotlib inline

In [2]:
# Load packages
from matplotlib import pyplot as plt
from scipy.stats import linregress
import numpy as np
from sklearn import datasets
import pandas as pd
import functools as ft
import seaborn as sns
import re  
from scipy.stats import logistic
import calendar
import random
import json

In [3]:
# Load data 
youtube_data = pd.read_csv('YouTube_Data/US_youtube_trending_data.csv')
lookup = pd.read_csv('YouTube_Data/category_ids.csv')

# Data Cleaning and Exploration
#### Pandas Process

* Parse YouTube date strings into datetime format
* Calculate the lag time for each video to trend
* Replace category ID integers with category names, based on the YouTube category dictionary
* Sort by likes, most to least
* Drop duplicates
* Group by date and calculate the number of trending videos published
* Group by category and calculate the number of videos, like %, and dislike %
* Group by month to see if there are trending differences month-to-month (there are not)
* Analyze the lag time for a video to trend

In [4]:
# Function to parse dates (there is probably a built-in function, but this is what I was able to figure out)

def youtube_date_parse(df, series_to_parse, new_name_date, new_name_time, delimiting_character, drop_character):
    """Split a combined date/time string column into a date column and a time column.

    Every value of ``df[series_to_parse]`` is split once on
    ``delimiting_character``. The part before the delimiter is converted with
    ``pd.to_datetime`` and stored in ``df[new_name_date]``; the part after it,
    with any trailing ``drop_character`` characters stripped, is stored as text
    in ``df[new_name_time]``. The source column is then dropped.

    ``df`` is modified in place, so callers may ignore the return value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to parse; mutated in place.
    series_to_parse : str
        Name of the string column to split.
    new_name_date : str
        Name of the datetime column to create.
    new_name_time : str
        Name of the time-string column to create.
    delimiting_character : str
        Separator between the date part and the time part.
    drop_character : str
        Character(s) stripped from the right end of the time part.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, also when the column had already been parsed.

    Raises
    ------
    KeyError
        If ``series_to_parse`` is absent and ``new_name_date`` does not exist
        either, i.e. the column is genuinely missing rather than already parsed.
    """
    if series_to_parse not in df.columns:
        if new_name_date in df.columns:
            # Re-running the cell: the work is already done, so this is a no-op.
            print(f"Column '{series_to_parse}' is already parsed; nothing to do.")
            return df
        raise KeyError(f"Column '{series_to_parse}' not found in DataFrame")

    # Compute everything before touching df, so a parsing failure cannot leave
    # the frame half-converted.
    parts = df[series_to_parse].str.split(delimiting_character, n=1, expand=True)
    parsed_dates = pd.to_datetime(parts[0])
    if 1 in parts.columns:
        # .str.rstrip passes missing values through instead of raising.
        time_strings = parts[1].str.rstrip(drop_character)
    else:
        # No value contained the delimiter, so there is no time part at all.
        time_strings = pd.Series([None] * len(df), index=df.index, dtype=object)

    df[new_name_date] = parsed_dates
    df[new_name_time] = time_strings
    df.drop(columns=[series_to_parse], inplace=True)
    return df

In [5]:
########### Clean up data: drop columns, parse dates, replace category numbers with category names, etc. #####

# Drop unneeded columns
youtube_data = youtube_data.drop(
    columns=['video_id', 'channelId', 'thumbnail_link', 'comments_disabled', 'ratings_disabled']
)

# Convert date strings to datetime objects.
# youtube_date_parse modifies the frame in place, so its return value is not needed here.
youtube_date_parse(youtube_data, 'publishedAt', 'date_published', 'time_published', 'T', 'Z')
youtube_date_parse(youtube_data, 'trending_date', 'date_trending', 'time_trending', 'T', 'Z')
youtube_data = youtube_data.drop(columns=['time_trending'])

# The datetime functions are great, and can be used to group (see below).
# To make life easier for folks, month names go in their own columns
# (note the .dt.month accessor that works on datetime columns).
# The result is assigned back to the column: calling .replace(..., inplace=True)
# on a column selection acts on a temporary object and does not reliably update
# the frame (it is a no-op under pandas Copy-on-Write).
# Months outside this mapping are left as their integer month number.
month_names = {8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov'}
youtube_data['month_published'] = youtube_data['date_published'].dt.month.replace(month_names)
youtube_data['month_trending'] = youtube_data['date_trending'].dt.month.replace(month_names)

# Calculate lag time between posting and trending. You can do arithmetic with dates in datetime
youtube_data['lag'] = youtube_data['date_trending'] - youtube_data['date_published']

# Convert category IDs to category names (IDs missing from the lookup table are left unchanged)
category_names = dict(zip(lookup['categoryId'], lookup['category']))
youtube_data['categoryId'] = youtube_data['categoryId'].replace(category_names)

# Rename columns
new_names = {'channelTitle':'channel', 'categoryId':'category', 'view_count':'views', 'comment_count':'comments'}
youtube_data = youtube_data.rename(columns=new_names)

# Sort by likes, most liked first
youtube_data = youtube_data.sort_values('likes', ascending=False)

# Reorder columns for readability
new_order = ['channel', 'title', 'category', 'views', 'likes', 'dislikes', 'comments',
             'date_published', 'date_trending', 'lag', 'month_published', 'month_trending',
             'time_published', 'tags', 'description']
youtube_data = youtube_data[new_order]

# Get column names so they are handy
cols = youtube_data.columns

# Keep only the most popular posting of duplicate videos: the frame is sorted by
# likes (descending), so keep='first' retains the most-liked row for each title.
# Rolling the stats for duplicates together would be tricky, because the
# duplicate rows have different dates.
youtube_data = youtube_data.drop_duplicates(subset='title', keep='first')
youtube_data.head()



Unnamed: 0,channel,title,category,views,likes,dislikes,comments,date_published,date_trending,lag,month_published,month_trending,time_published,tags,description
3358,Big Hit Labels,BTS (방탄소년단) 'Dynamite' Official MV,Music,232649205,15735533,714194,6065230,2020-08-21,2020-08-28,7 days,Aug,Aug,03:58:10,BIGHIT|빅히트|방탄소년단|BTS|BANGTAN|방탄,BTS (방탄소년단) 'Dynamite' Official MVCredits:Dire...
4980,BLACKPINK,BLACKPINK - 'Ice Cream (with Selena Gomez)' M/V,Music,184778248,11795670,879354,2735997,2020-08-28,2020-09-05,8 days,Aug,Sept,04:00:11,YG Entertainment|YG|와이지|K-pop|BLACKPINK|블랙핑크|블...,BLACKPINK - ‘Ice Cream (with Selena Gomez)’Com...
11764,BLACKPINK,BLACKPINK – ‘Lovesick Girls’ M/V,Music,140685439,9217876,127308,1507605,2020-10-02,2020-10-09,7 days,Oct,Oct,04:00:13,YG Entertainment|YG|와이지|K-pop|BLACKPINK|블랙핑크|블...,BLACKPINK – ‘Lovesick Girls’영원한 밤창문 없는 방에 우릴 가...
2762,Big Hit Labels,BTS (방탄소년단) 'Dynamite' Official Teaser,Music,62496726,6178664,158845,992356,2020-08-18,2020-08-25,7 days,Aug,Aug,15:00:02,BIGHIT|빅히트|방탄소년단|BTS|BANGTAN|방탄,BTS (방탄소년단) 'Dynamite' Official TeaserBTS (방탄소...
3992,Big Hit Labels,BTS (방탄소년단) 'Dynamite' Official MV (B-side),Music,45596902,5951286,97683,382374,2020-08-24,2020-08-31,7 days,Aug,Aug,15:00:01,BIGHIT|빅히트|방탄소년단|BTS|BANGTAN|방탄,BTS (방탄소년단) 'Dynamite' Official MV (B-side)Cre...


In [11]:
# Count how many videos in the dataset were published on each date.
# Remember, the YouTube data set is a subset of trending videos in the US, not all videos.
pub_by_date = (
    youtube_data
    .groupby('date_published')['title']
    .count()
    .to_frame(name='number_published')
)
pub_by_date

Unnamed: 0_level_0,number_published
date_published,Unnamed: 1_level_1
2020-08-03,1
2020-08-05,4
2020-08-06,22
2020-08-07,34
2020-08-08,29
...,...
2020-10-31,32
2020-11-01,27
2020-11-02,37
2020-11-03,10


In [10]:
# Plot the number of videos published each day.
# Requires pub_by_date from the preceding cell, so run the cells top to bottom.
# Possible follow-up: check whether there are trends by day of the week.
plt.style.use('ggplot')
fig, ax = plt.subplots()
ax.plot(pub_by_date['number_published'])
ax.set_title('Trending videos by publication date')
ax.set_xlabel('Date published')
ax.set_ylabel('Number of videos published')
ax.tick_params(axis='x', rotation=75)
plt.show()

NameError: name 'pub_by_date' is not defined

In [9]:
# Categories looks at total published, likes, and dislikes.
# Only the news and politics have significant dislikes


In [8]:
# Plot number of vids in each category
# Someone could add the x labels.  I'm bad at it.
 


In [6]:
# See if there are any trends by month.
# There are not.



In [7]:
#See how long it takes a video to trend.
# We have a single outlier at 30 days, which we could chop off in subsequent work.


In [None]:
# Produce the most popular videos for a user





Month for most liked videos (Aug, Sept, Oct, Nov)?aug
How many videos to see?3358
Most popular videos:
Series([], Name: title, dtype: object)


## Plotting Process with Matplotlib/Pandas:
* Using the previously created pandas DataFrame, created additional layers to parse specific data per month.
* This included trending categories and user statistics.
* Added supplemental groupby functions to organize this data.
* Created bar and scatter graphs using pyplot.
* The heatmap was created with a correlation function and seaborn.

In [None]:
#Correlation between the scrubbed YouTube data


In [None]:
#Correlation heatmap using Seaborn



In [None]:
#Extract Channel Views by Trending Month


In [None]:
#Classify Top Trending Channels by Month for August


In [None]:
#Classify Top Trending Channels by Month for September


In [None]:
#Classify Top Trending Channels by Month for October


In [None]:
#Classify Top Trending Channels by Month for November


In [None]:
#Graph August


In [None]:
#Graph September



In [None]:
#Graph October



In [None]:
#Graph November



In [None]:
#Prep
#create columns of percentage of likes and dislikes


In [None]:
#Prep
#df of the top 100


In [None]:
#Scatter Plots of likes and dislikes on a date from all four months based on views
#not 100% convinced scatter plots work for this
#August starts on the 13th, full month of Sept and Oct, Nov is only 1-5th

#input_date = input(f'Enter a date to see what was trending that day example month/day/year')
#date=df[df['date_trending']==input_date]

#subplots initializer


#subplot 1
#initialize

#filter by specific day

#subplot (rows, columns, locate)


#subplot 2
#initialize

#filter by specific day

#subplot (rows, columns, locate)



#subplot 3
#initialize

#filter by specific day

#subplot (rows, columns, locate)


#subplot 4
#initialize

#filter by specific day

#subplot (rows, columns, locate)
  



In [None]:
#scatter plots of likes, dislikes and comments per category based on views
#top 8 categories
#subplots initializer
 


#subplot 2
 


#subplot 3
#initialize

#subplot 4
#initialize



#subplot 5
#initialize


#subplot 6
#initialize



#subplot 7
#initialize
 


#subplot 8
#initialize
 




In [None]:
#Discord analysis: Shows reactions based on percent likes/dislikes if a key term is found in titles
#subplots initializer


In [None]:
#User input key term search to see percentage likes and dislikes for a key term
#subplots initializer


In [None]:
#subplots initializer


In [None]:
#Scatter of reactions of top 100 views and percent of likes and dislikes
#subplots initializer


In [None]:
#export clean data


# EXPLORING YOUTUBE API:
* Collect data sample from YouTube API that contains top 50 most viewed YouTube videos in a month of 2019 and 2020.
* Also specifically look for top 50 Quarantine Challenges by view count during strict COVID Lockdown in US.



In [None]:
# Dependencies


# API Exploration Process:
#### There are three kinds of data searches used:
  * YouTube search, to get items with snippet descriptions
  * YouTube video, to get video statistics
  * YouTube channel, to get channel statistics
  * The results of all three searches are linked by the unique video_id and channel_id fields.

### Relevant videos on youtube in April 2019:

In [None]:
#look through the YouTube API and get the top 50 relevant videos published in April 2019


# Data Cleaning Process:
   #### Once we pulled the necessary data:
   * Stored the collected API data in a local folder so there is no need to run the API every time
   * Created dataframes
   * Merged data sets based on the video ID and channel ID fields
   * Sorted the data based on view count, number of likes and dislikes
   * Renamed and rearranged columns for better readability
   * Dropped irrelevant columns

In [None]:
#


### Relevant videos on youtube in April 2020:

In [None]:
#look through the YouTube API and get the top 50 relevant videos published in April 2020



# Data Cleaning:
* Create dataframes
* Rename and rearrange for better readability
* Merge and drop irrelevant columns

In [None]:
#df_video_ids, unpacks the dictionary stored in "id" and creates a data frame


In [None]:
#create df


In [None]:
# Start with an empty string.
# NOTE(review): presumably an accumulator for the word-cloud text; nothing in
# the visible cells appends to it — confirm against the rest of the cell.
comment_words = '' 
# Build a set from STOPWORDS.
# NOTE(review): STOPWORDS is not imported in any visible cell — presumably
# wordcloud.STOPWORDS; confirm the import exists before running this cell.
stopwords = set(STOPWORDS) 


# Top 50 Quarantine Challenges:
* During strict lockdown(03/15/2020 through 06/15/2020)

In [None]:
#youtube, looking through youtube api


# Data Cleaning:
* Create dataframes
* Rename and rearrange for better readability
* Drop irrelevant columns

In [None]:
#df_video_ids, unpacks the dictionary stored in "id" and creates a data frame


In [None]:
#create a df


In [None]:
# Bind top_viewed to the same object as top10_qc (plain assignment, no copy).
# NOTE(review): top10_qc is not defined in any visible cell — presumably the
# top-10 quarantine-challenge frame built in an earlier cell; confirm.
top_viewed = top10_qc
# Top quarantine challenge videos by view count


In [None]:
#create wordcloud 


In [None]:
#export clean data
