In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from wordcloud import WordCloud

import spacy
from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy import displacy 

In [4]:
# Source dataset for the whole notebook.
# NOTE(review): this is an absolute, machine-specific path — confirm it exists
# on the machine running the notebook.
DATA_PATH = '/home/fagabby/working/YoutubeProject/project/db/01-2022_10-2022.parquet'

df = pd.read_parquet(DATA_PATH)
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2376 entries, 0 to 2375
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   videoId       2376 non-null   object 
 1   categoryId    2376 non-null   object 
 2   category      2376 non-null   object 
 3   title         2376 non-null   object 
 4   viewCount     2375 non-null   float64
 5   likeCount     2342 non-null   float64
 6   commentCount  2336 non-null   float64
 7   publishedAt   2376 non-null   object 
 8   channelId     2376 non-null   object 
 9   description   2376 non-null   object 
 10  channelTitle  2376 non-null   object 
 11  regionCode    2376 non-null   object 
dtypes: float64(3), object(9)
memory usage: 241.3+ KB


In [None]:
# Mean view count of the videos in each category (indexed by category).
viewCount_avg = df.groupby('category')['viewCount'].agg('mean')
# Start the per-category summary table with that single column.
df_smm = viewCount_avg.to_frame('avg_view')

In [None]:
# Move 'category' out of the index into a regular column.
df_smm = df_smm.reset_index()

In [None]:
# Number of videos in each category.
video_counts = df.groupby('category').videoId.count()
# The values are attached by position, so this relies on df_smm rows being in
# the same category order as the groupby result.
df_smm = df_smm.assign(video_cnts=video_counts.tolist())

In [None]:
# Mean like count of the videos in each category.
like_cnts = df.groupby('category').likeCount.mean()
# The values are attached by position, so this relies on df_smm rows being in
# the same category order as the groupby result.
df_smm = df_smm.assign(avg_likes=like_cnts.tolist())

In [None]:
# Mean comment count of the videos in each category.
cmmnt_cnts = df.groupby('category').commentCount.mean()
# The values are attached by position, so this relies on df_smm rows being in
# the same category order as the groupby result.
df_smm = df_smm.assign(avg_cmmnts=cmmnt_cnts.tolist())

In [None]:
# Bars are ordered from the category with the most videos to the fewest.
categories_by_video_cnts = df_smm.sort_values(by='video_cnts', ascending=False)['category']
sns.barplot(x='category', y='video_cnts', data=df_smm, order=categories_by_video_cnts)
plt.xticks(rotation=90)  # category names are long; rotate so they do not overlap
plt.title('Total uploaded number of videos per category')
plt.show()

In [None]:
# Bars are ordered from the highest average view count to the lowest.
categories_by_avg_view = df_smm.sort_values(by='avg_view', ascending=False)['category']
sns.barplot(x='category', y='avg_view', data=df_smm, order=categories_by_avg_view)
plt.xticks(rotation=90)  # category names are long; rotate so they do not overlap
plt.title('Average number of views per category')
plt.show()

In [None]:
# Bars are ordered from the highest average like count to the lowest.
categories_by_avg_likes = df_smm.sort_values(by='avg_likes', ascending=False)['category']
sns.barplot(x='category', y='avg_likes', data=df_smm, order=categories_by_avg_likes)
plt.xticks(rotation=90)  # category names are long; rotate so they do not overlap
plt.title('Average number of likes per category')
plt.show()

In [None]:
# Bars are ordered from the highest average comment count to the lowest.
categories_by_avg_cmmnts = df_smm.sort_values(by='avg_cmmnts', ascending=False)['category']
sns.barplot(x='category', y='avg_cmmnts', data=df_smm, order=categories_by_avg_cmmnts)
plt.xticks(rotation=90)  # category names are long; rotate so they do not overlap
plt.title('Average number of cmmnts per category')
plt.show()

In [None]:
# Rank the categories on every metric column; rank 1 is the largest value.
category_ranks = df_smm.set_index('category').rank(ascending=False)
# Bring 'category' back as a column and display the table as the cell output.
category_ranks.reset_index()

<font size=2>This table shows the ranking of categories on each metric. Comedy has the highest average views, average likes, and average comments, even though the total number of comedy videos is small. The Entertainment category has the highest number of uploaded videos. For someone who wants to make product-review videos, I would suggest choosing a category that has fewer uploaded videos (less competition) but relatively high comments, likes, and views (viewers like to watch and interact with the YouTuber).</font>

### Group by publishing month

In [None]:
# Parse the publication timestamps and keep only the calendar month number.
published_index = pd.DatetimeIndex(df['publishedAt'])
df['month'] = published_index.month

In [None]:
# Per-month averages of views, likes and comments.
#
# All three columns are attached while the frame is still indexed by month, so
# pandas aligns every value with the month it belongs to. Assigning the
# month-indexed series after `reset_index()` would align them against the new
# 0..N-1 row index instead, shifting likes/comments by one month and leaving
# the first row NaN.
monthly = df.groupby('month')
view_cnts = monthly['viewCount'].mean()
df_month = view_cnts.to_frame(name='avg_view')
df_month['avg_likes'] = monthly['likeCount'].mean()
df_month['avg_cmmnts'] = monthly['commentCount'].mean()
# Only now turn 'month' into a regular column for plotting.
df_month = df_month.reset_index()
df_month.tail(2)

In [None]:
# `markers=True` only affects lines split by a `style` variable, and none is
# used here, so it draws no markers. Passing the matplotlib `marker` keyword
# marks every monthly data point.
sns.lineplot(x='month', y='avg_view', data=df_month, marker='o')
plt.title('monthly view counts per video')
plt.show()

<font size=2>It seems that February has extremely high views, while after August the view counts steadily decrease.</font>

In [None]:
# `markers=True` only affects lines split by a `style` variable, and none is
# used here, so it draws no markers. Passing the matplotlib `marker` keyword
# marks every monthly data point.
sns.lineplot(x='month', y='avg_likes', data=df_month, marker='o')
plt.title('monthly like counts per video')
plt.show()

In [None]:
# `markers=True` only affects lines split by a `style` variable, and none is
# used here, so it draws no markers. Passing the matplotlib `marker` keyword
# marks every monthly data point.
sns.lineplot(x='month', y='avg_cmmnts', data=df_month, marker='o')
plt.title('monthly commenting counts per video')
plt.show()

<font size=2>Judging from the "monthly view counts per video" plot, fewer people watch videos in September and October, but commenting is more active in those months.</font>

### NLP

In [None]:
# Titles of the videos in the "Autos & Vehicles" category.
# reset_index() keeps the original row labels in an 'index' column.
df_auto = df.loc[df['category'] == 'Autos & Vehicles'].reset_index()
auto_title_list = df_auto['title'].tolist()

# Titles of the videos in the "Science & Technology" category.
df_tech = df.loc[df['category'] == 'Science & Technology'].reset_index()
tech_title_list = df_tech['title'].tolist()


In [None]:
# Fetch the NLTK resources this notebook relies on.
# 'stopwords' backs stopwords.words('english') used for the stop-word filter.
nltk.download('stopwords')
# presumably needed by word_tokenize (imported at the top) — TODO confirm it is still used
nltk.download('punkt')
# presumably needed by pos_tag (imported at the top) — TODO confirm it is still used
nltk.download('averaged_perceptron_tagger')
# NOTE(review): no visible cell references the tag-set documentation — confirm this is needed
nltk.download('tagsets')
# 'words' backs nltk.corpus.words.words(), the English vocabulary list.
nltk.download('words')

In [None]:
# One tokenizer serves both categories: a token is a run of at least two
# word characters, so punctuation and single characters are dropped.
tokenizer = nltk.RegexpTokenizer(r"\w{2,}")

# All titles of a category are joined into one string and tokenized together.
auto_words, tech_words = (
    tokenizer.tokenize(' '.join(titles))
    for titles in (auto_title_list, tech_title_list)
)

# NLTK corpus containing English words (a plain list).
english_words = nltk.corpus.words.words()
# English stop words, as a set.
stop_words = set(stopwords.words('english'))
# Additional words to drop from the title word lists.
extra_words = ['shorts', 'review', '2022', 'india', 'hindi']

In [None]:
# `english_words` is a list, so `w in english_words` is a linear scan for every
# token; a set gives the same membership answers in constant time.
english_vocab = set(english_words)
# Stop words and the extra words are both matched against the lowercased token.
excluded_words = set(stop_words).union(extra_words)


def _keep_title_word(word):
    """Return True if ``word`` should stay in the cleaned title word list.

    A word is kept when its lowercase form is neither a stop word nor one of
    the extra words, and the word is a known English word. The vocabulary
    check accepts the word as written or in lowercase, so capitalized title
    words are treated the same way the stop-word and extra-word filters
    already treat them.
    """
    lowered = word.lower()
    if lowered in excluded_words:
        return False
    return word in english_vocab or lowered in english_vocab


auto_clean_words = [w for w in auto_words if _keep_title_word(w)]
tech_clean_words = [w for w in tech_words if _keep_title_word(w)]

In [None]:
# Case-insensitive frequency of the cleaned auto-title words, drawn as a word cloud.
auto_fd = nltk.FreqDist(map(str.lower, auto_clean_words))
wc = WordCloud().generate_from_frequencies(auto_fd)
plt.figure(figsize=(12, 7))
plt.imshow(wc, interpolation='bilinear')

In [None]:
# Case-insensitive frequency of the cleaned tech-title words, drawn as a word cloud.
tech_fd = nltk.FreqDist(map(str.lower, tech_clean_words))
wc = WordCloud().generate_from_frequencies(tech_fd)
plt.figure(figsize=(12, 7))
plt.imshow(wc, interpolation='bilinear')

In [None]:
import re

def clean(text_list):
    """Normalize a list of strings for downstream NLP.

    Each string has every non-word character replaced by a space, runs of
    whitespace collapsed to a single space, leading/trailing whitespace
    removed, and is lowercased.

    Args:
        text_list: iterable of strings.

    Returns:
        A new list with one cleaned string per input string, in the same order.
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    non_word = re.compile(r'\W')
    return [' '.join(non_word.sub(' ', text).split()).lower() for text in text_list]

# Cleaned (lowercased, punctuation-free) titles for both categories.
auto_title_clean, tech_title_clean = map(clean, (auto_title_list, tech_title_list))

<font size=5>Auto Brand Extraction</font>

In [None]:
# Load the small English spaCy pipeline, then inspect how it parses one
# cleaned auto title.
nlp = spacy.load('en_core_web_sm', disable=['textcat'])
auto_doc = nlp(auto_title_clean[3])
displacy.render(auto_doc, style='dep', jupyter=True)
# One line per token: text, dependency label, part of speech, entity type.
arrow = "-->"
for token in auto_doc:
    print(token.text, arrow, token.dep_, arrow, token.pos_, arrow, token.ent_type_)

In [None]:
# Token patterns handed to the spaCy Matcher by find_names.
# NOTE(review): each inner list is its own one-token pattern, so these act as
# three alternatives rather than one "PROPN? DATE PROPN?" sequence — confirm
# that is the intent.
pattern = [
    [{'POS': 'PROPN', 'OP': '?'}],   # an optional proper noun
    [{'ENT_TYPE': 'DATE'}],          # a token tagged as a DATE entity
    [{'POS': 'PROPN', 'OP': '?'}],   # an optional proper noun
]

def find_names(text):
    """Return the spans of ``text`` matched by the module-level ``pattern``.

    The text is parsed with the global ``nlp`` pipeline and a fresh Matcher
    is built on every call.

    Args:
        text: the string to parse.

    Returns:
        A list of spaCy spans, one per match, in the order the matcher
        reports them.
    """
    parsed = nlp(text)
    span_matcher = Matcher(nlp.vocab)
    span_matcher.add("matching_1", pattern)
    # Each match is (match_id, start, end); only the token range is needed.
    return [parsed[start:end] for _match_id, start, end in span_matcher(parsed)]

In [None]:
# Inspect how the pipeline parses another cleaned auto title.
# The `nlp` pipeline loaded earlier in the notebook is reused; reloading the
# same model here only repeats the load without changing the result.
auto_doc = nlp(auto_title_clean[6])
displacy.render(auto_doc, style='dep', jupyter=True)
# One line per token: text, dependency label, part of speech, entity type.
for tok in auto_doc:
    print(tok.text, "-->", tok.dep_, "-->", tok.pos_, "-->", tok.ent_type_)

In [None]:
# Text of every span that find_names extracts from the cleaned auto titles
# (span.text converts the spaCy span to a plain str).
title_names = [
    span.text
    for title in auto_title_clean
    for span in find_names(title)
]

In [None]:
# Word cloud of the names extracted from the auto titles.
auto_title_words = tokenizer.tokenize(' '.join(title_names))
# Stored under its own name: writing these auto-title frequencies into
# `tech_fd` would overwrite the tech word distribution with auto data.
auto_brand_fd = nltk.FreqDist(auto_title_words)
wc = WordCloud().generate_from_frequencies(auto_brand_fd)
plt.figure(figsize=[12, 7])
plt.imshow(wc, interpolation='bilinear')

<font size='3'>Popular auto brands are Toyota, BMW, Hyundai, Suzuki, Ford, Audi, Jeep, Tesla, Subaru, Nissan.</font>

<font size=5>Tech Brand Extraction</font>

In [None]:
# Inspect how the pipeline parses one cleaned tech title.
# The `nlp` pipeline loaded earlier in the notebook is reused; reloading the
# same model here only repeats the load without changing the result.
tech_doc = nlp(tech_title_clean[1])
displacy.render(tech_doc, style='dep', jupyter=True)
# One line per token: text, dependency label, part of speech, entity type.
for tok in tech_doc:
    print(tok.text, "-->", tok.dep_, "-->", tok.pos_, "-->", tok.ent_type_)

In [None]:
# Inspect the token attributes of another cleaned tech title (text only; the
# dependency rendering is left disabled for this one).
# The `nlp` pipeline loaded earlier in the notebook is reused; reloading the
# same model here only repeats the load without changing the result.
tech_doc = nlp(tech_title_clean[16])
# displacy.render(tech_doc, style='dep', jupyter=True)
# One line per token: text, dependency label, part of speech, entity type.
for tok in tech_doc:
    print(tok.text, "-->", tok.dep_, "-->", tok.pos_, "-->", tok.ent_type_)

In [None]:
# Token patterns handed to the spaCy Matcher for the tech titles.
# NOTE(review): each inner list is its own one-token pattern, so these act as
# five alternatives rather than one five-token sequence — confirm that is the
# intent.
pattern_tech = [
    # optional auxiliary
    [{'DEP': 'aux', 'OP': '?'}],
    # exactly one adjective used as a compound modifier
    [{'DEP': 'compound', 'POS': 'ADJ', 'OP': '{1}'}],
    # optional proper-noun subject tagged as an organisation
    [{
        'DEP': 'nsubj',
        'POS': 'PROPN',
        'ENT_TYPE': {'IN': ['ORG']},
        'OP': '?',
    }],
    # root token (proper noun, verb or noun) other than "review"
    [{
        'DEP': 'ROOT',
        'POS': {'IN': ['PROPN', 'VERB', 'NOUN']},
        'LOWER': {'NOT_IN': ['review']},
    }],
    # proper-noun compound that is not an organisation or a number entity
    [{
        'DEP': 'compound',
        'POS': 'PROPN',
        'ENT_TYPE': {'NOT_IN': ['ORG', 'CARDINAL', 'ORDINAL']},
    }],
]

def find_names(text):
    """Return the spans of ``text`` matched by the module-level ``pattern_tech``.

    NOTE(review): this rebinds ``find_names``, which is defined earlier in
    the notebook with ``pattern``; from this cell on, calls use the tech
    patterns.

    The text is parsed with the global ``nlp`` pipeline and a fresh Matcher
    is built on every call.

    Args:
        text: the string to parse.

    Returns:
        A list of spaCy spans, one per match, in the order the matcher
        reports them.
    """
    parsed = nlp(text)
    span_matcher = Matcher(nlp.vocab)
    span_matcher.add("matching_tech", pattern_tech)
    # Each match is (match_id, start, end); only the token range is needed.
    return [parsed[start:end] for _match_id, start, end in span_matcher(parsed)]

In [None]:
# Text of every span that find_names extracts from the cleaned tech titles
# (span.text converts the spaCy span to a plain str).
tech_title_names = [
    span.text
    for title in tech_title_clean
    for span in find_names(title)
]

In [None]:
# Word cloud of the names extracted from the tech titles.
tech_title_words = tokenizer.tokenize(' '.join(tech_title_names))
tech_fd = nltk.FreqDist(tech_title_words)
wc = WordCloud().generate_from_frequencies(tech_fd)
plt.figure(figsize=(12, 7))
plt.imshow(wc, interpolation='bilinear')

In [None]:
<font size='3'>Popular tech brands and products are Samsung Galaxy, iPhone, Pixel, OnePlus, MacBook, Vivo, Xiaomi, Motorola.</font>