# Importing libraries

In [None]:
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import re
import isodate
import seaborn as sns
from pymystem3 import Mystem
from wordcloud import *
from nltk.probability import FreqDist
import re
from nltk.corpus import stopwords

In [None]:
# Drop the 'Unnamed: 0' column from every country frame
# (presumably a leftover index column from a CSV export — confirm against the loading code).
del df_ukr['Unnamed: 0']
del df_usa['Unnamed: 0']
del df_blr['Unnamed: 0']
del df_rus['Unnamed: 0']
# Only the Russian frame additionally has its 'video_id' column removed here.
del df_rus['video_id']

In [None]:
# All four country frames; the cleaning and analysis loops iterate over this list.
df_list = [df_rus, df_ukr, df_blr, df_usa]
# The three frames that go through the Russian-language lemmatization pipeline (USA excluded).
df_list_rus = [df_rus, df_ukr, df_blr]

# Data Cleaning

Some videos have no description or tags, so we fill the missing values with empty strings

In [None]:
def fill_empty_decription_and_tags(df):
    """Replace missing descriptions and tags in *df* with empty strings.

    The 'video_description' and 'video_tags' columns are overwritten in
    place; the function returns None.
    """
    for column in ("video_description", "video_tags"):
        df[column] = df[column].fillna(value="")

In [None]:
# Apply the empty-string fill to every country frame (each frame is modified in place).
for df in df_list:
    fill_empty_decription_and_tags(df = df)

The duration column holds ISO 8601 duration strings, which are not suitable for analysis, so we convert it to a float number of minutes

In [None]:
def transform_duration(df):
  """Convert the 'duration' column of *df* to a number of minutes, in place.

  Every value is parsed with ``isodate.parse_duration`` and the parsed
  column is then divided by a one-minute ``timedelta64``. Returns None.
  """
  parsed = [isodate.parse_duration(raw) for raw in df['duration']]
  df['duration'] = parsed
  one_minute = np.timedelta64(1, 'm')
  df['duration'] = df['duration'] / one_minute

In [None]:
# Convert the duration column of every country frame to minutes (in place).
# NOTE: re-running this cell would feed the already-converted numbers back
# into isodate.parse_duration, so it is meant to run once per loaded frame.
for df in df_list:
  transform_duration(df)

To analyze how views and titles are related, we add a title-length feature

In [None]:
def add_title_length_column(df):
  """Add a 'TL' column to *df* holding the length of each video title.

  The frame is modified in place; the function returns None.
  """
  title_lengths = df["video_title"].apply(len)
  df['TL'] = title_lengths

In [None]:
# Add the 'TL' (title length) column to every country frame.
for df in df_list:
  add_title_length_column(df)

df_blr contains 3 records with NaN values, so we drop them

In [None]:
# Remove every row of the Belarusian frame that contains at least one NaN (in place).
df_blr.dropna(inplace = True)

In [None]:
# Sanity check: print the per-column count of missing values for each frame,
# separated by a dashed line.
for df in df_list:
  print(25 * '-')
  print(df.isnull().sum())

## **Data Analysis**

In [None]:
def get_heatmaps(df, country):
  """Draw and save an annotated correlation heatmap for *df*.

  Args:
    df: DataFrame whose numeric/boolean columns are correlated pairwise.
    country: Label used as the plot title and as the prefix of the
      output file name (``<country>heatmap.png``).
  """
  # Select the numeric/boolean columns explicitly: since pandas 2.0
  # DataFrame.corr() no longer drops text columns silently and raises
  # on them instead.
  numeric_df = df.select_dtypes(include=['number', 'bool'])
  fig, ax = plt.subplots(figsize=(11, 10))
  # Bind the heatmap to the axes created above rather than relying on
  # pyplot's implicit "current axes".
  sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm_r', ax=ax)
  ax.set_title(country)
  fig.savefig(country + "heatmap.png")

In [None]:
def get_popular_vidz_df(df):
  """Return the rows of *df* that have at least one million views.

  The threshold is inclusive (``video_view_count >= 1e6``); *df* itself
  is left untouched.
  """
  has_million_views = df['video_view_count'] >= 1e6
  return df[has_million_views]

In [None]:
def get_less_popular_vidz_df(df):
  """Return the rows of *df* that have fewer than one million views.

  The comparison is strict (``video_view_count < 1e6``); *df* itself is
  left untouched.
  """
  below_million_views = df['video_view_count'] < 1e6
  return df[below_million_views]

In [None]:
# Million-view subset of each country frame, in the order RUS, UKR, BLR, USA.
df_popular_list = [
    get_popular_vidz_df(frame) for frame in (df_rus, df_ukr, df_blr, df_usa)
]
df_rus_mln, df_ukr_mln, df_blr_mln, df_usa_mln = df_popular_list

In [None]:
# Below-one-million-view subset of each country frame, in the order RUS, UKR, BLR, USA.
df_less_list = [
    get_less_popular_vidz_df(frame) for frame in (df_rus, df_ukr, df_blr, df_usa)
]
df_rus_less, df_ukr_less, df_blr_less, df_usa_less = df_less_list

In [None]:
# For each million-view subset, print its (rows, columns) shape and the
# number of distinct channel titles it contains.
for df in df_popular_list:
  print(df.shape, len(df['channel_title'].unique()))

In [None]:
# Same report for the below-one-million subsets: shape and number of
# distinct channel titles.
for df in df_less_list:
  print(df.shape, len(df['channel_title'].unique()))

In [None]:
# The label doubles as the output file-name prefix, so the two calls need
# distinct labels; with 'США' for both, the second PNG overwrote the first.
get_heatmaps(df_usa_mln, 'США(млн)')
get_heatmaps(df_usa, 'США')

In [None]:
def describe_durations(df):
  """Return the summary statistics of the 'duration' column of *df*.

  The statistics come from ``DataFrame.describe()`` applied to the whole
  frame, from which the 'duration' column is then picked.
  """
  summary = df.describe()
  return summary['duration']

In [None]:
def percentage_of_million_views(df):
  """Return the percentage of videos in *df* with at least one million views.

  The threshold is inclusive (``>= 1e6``) so the figure agrees with the
  subset selected by ``get_popular_vidz_df``; the earlier strict ``>``
  left videos with exactly one million views out of the percentage.

  Args:
    df: DataFrame with a 'video_view_count' column.

  Returns:
    ``100 * popular / total``, where both counts skip rows whose view
    count is missing (``Series.count`` ignores NaN).
  """
  views = df['video_view_count']
  popular_count = views[views >= 1e6].count()
  return 100 * popular_count / views.count()

In [None]:
# Print, for every country frame, the percentage of videos counted as million-view videos.
for df in df_list:
  print(percentage_of_million_views(df))

In [None]:
def describe_LR(df):
    """Return the summary statistics of the 'LR' column of *df*.

    The statistics are taken from ``DataFrame.describe()`` computed on
    the whole frame.
    """
    summary = df.describe()
    return summary['LR']

In [None]:
def describe_TL(df):
    """Return the summary statistics of the 'TL' (title length) column of *df*.

    The statistics are taken from ``DataFrame.describe()`` computed on
    the whole frame.
    """
    summary = df.describe()
    return summary['TL']

In [None]:
def describe_DR(df):
    """Return the summary statistics of the 'DR' column of *df*.

    The statistics are taken from ``DataFrame.describe()`` computed on
    the whole frame.
    """
    summary = df.describe()
    return summary['DR']

In [None]:
def describe_VR(df):
    """Return the summary statistics of the 'VR' column of *df*.

    The statistics are taken from ``DataFrame.describe()`` computed on
    the whole frame.
    """
    summary = df.describe()
    return summary['VR']

In [None]:
# Print the 'LR' summary statistics for each million-view subset,
# followed by a row of asterisks as a separator.
for df in df_popular_list:
  print(describe_LR(df))
  print('*' * 30)

In [None]:
# Print the 'LR' summary statistics for each below-one-million subset,
# followed by a row of asterisks as a separator.
for df in df_less_list:
  print(describe_LR(df))
  print('*' * 30)

## **Distribution of amount of videos by title length in the country**

In [None]:
def title_length_bar(df, country):
  """Plot a histogram of the 'TL' column of *df* and save it to disk.

  *country* is used as the plot title and as the prefix of the output
  file name (``<country>title.png``).
  """
  # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
  # consider migrating to sns.histplot once the seaborn version is pinned.
  histogram_style = {'alpha': 1}
  fig, ax = plt.subplots()
  sns.distplot(df["TL"], kde=False, rug=False, color='r', hist_kws=histogram_style, ax=ax)
  ax.set_xlabel("Title length")
  ax.set_ylabel("Number of videos")
  ax.set_xticks(range(0, 110, 10))
  ax.set_title(country)
  fig.savefig(country + 'title.png')

In [None]:
# Title-length histograms for Belarus: the million-view subset first, then the full frame.
# The labels differ, so the two PNG files do not overwrite each other.
title_length_bar(df_blr_mln, 'Беларусь(млн)')
title_length_bar(df_blr, 'Беларусь')

## **Distribution of amount of videos by video duration in the country**

In [None]:
def duration_bar(df, country):
  """Plot a histogram of the 'duration' column of *df* and save it to disk.

  *country* is used as the plot title and as the prefix of the output
  file name (``<country>duration.png``).
  """
  # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
  # consider migrating to sns.histplot once the seaborn version is pinned.
  histogram_style = {'alpha': 1}
  fig, ax = plt.subplots()
  sns.distplot(df["duration"], kde=False, rug=False, color='r', hist_kws=histogram_style, ax=ax)
  ax.set_xlabel("Duration")
  ax.set_ylabel("Number of videos")
  ax.set_title(country)
  fig.savefig(country + 'duration.png')

In [None]:
# Duration histograms for Russia: the million-view subset first, then the full frame.
duration_bar(df_rus_mln, 'Россия(млн)')
duration_bar(df_rus, 'Россия')

# **Distribution of the number of videos by weekday of publishing in the country**

In [None]:
def weekday_bar(df, country):
  """Plot the number of videos per publishing weekday and save the figure.

  Args:
    df: DataFrame with a 'weekday' column.
    country: Prefix of the output file name (``<country>.png``).
  """
  counts = df['weekday'].value_counts()
  # Build the plotting frame explicitly instead of reset_index().rename():
  # the column names produced by value_counts() changed in pandas 2.0, so
  # renaming "index"/"weekday" no longer yields the two columns plotted below.
  cdf = pd.DataFrame({
      "Weekday of publishing": counts.index,
      "Number of videos": counts.values,
  })
  fig, ax = plt.subplots()
  # Draw on the axes created above rather than on pyplot's implicit current axes.
  sns.barplot(x="Weekday of publishing", y="Number of videos", data=cdf,
              palette=sns.color_palette('tab10'), ax=ax)
  fig.savefig(country + ".png")

In [None]:
# The frame plotted is the USA one, so label (and name the file) accordingly
# instead of "BLR".
weekday_bar(df = df_usa, country = "USA")

In [None]:
# Million-view USA subset: use its own label so the file is neither
# mislabelled as "BLR" nor written over the plot of the full USA frame.
weekday_bar(df = df_usa_mln, country = "USA_mln")

# **Distribution of amount of videos by hour of publishing in the country**

In [None]:
# Add a 'publishing_hour' column holding characters 11-12 of each publish_date string
# (presumably the HH part of an ISO-like timestamp — confirm against the data).
for df in df_list:
  df['publishing_hour'] = df['publish_date'].apply(lambda x: x[11:13])

In [None]:
# The million-view subsets were built before 'publishing_hour' existed, so the
# column is added to them separately, using the same slice of publish_date.
for df in df_popular_list:
  df['publishing_hour'] = df['publish_date'].apply(lambda x: x[11:13])

In [None]:
def hour_bar(df, country):
  """Plot the number of videos per publishing hour and save the figure.

  Args:
    df: DataFrame with a 'publishing_hour' column.
    country: Prefix of the output file name (``<country>hour.png``).
  """
  counts = df['publishing_hour'].value_counts()
  # Build the plotting frame explicitly instead of reset_index().rename():
  # the column names produced by value_counts() changed in pandas 2.0, so
  # renaming "index"/"publishing_hour" no longer yields the columns plotted below.
  cdf = pd.DataFrame({
      "Hours of publishing": counts.index,
      "Number of videos": counts.values,
  })
  fig, ax = plt.subplots()
  sns.barplot(x="Hours of publishing", y="Number of videos", data=cdf,
              palette=sns.color_palette('tab10'), ax=ax)
  ax.set(xlabel="Publishing Hour", ylabel="No. of videos")
  fig.savefig(country + "hour.png")

In [None]:
# Plot the Belarusian frame under the 'BLR' label; passing df_rus here saved
# Russian data into the file named for Belarus.
hour_bar(df = df_blr, country = 'BLR')

In [None]:
# 'df_ris' was a typo for df_rus and raised NameError.
hour_bar(df = df_rus, country = 'RUS')

# **Distribution of amount of videos by categories in the country**

In [None]:
# Mapping from numeric video category id to its human-readable name.
# The key order of the original literal is kept (2 precedes 1).
topics_dict = {
    2: "Autos & Vehicles",
    1: "Film & Animation",
    10: "Music",
    15: "Pets & Animals",
    17: "Sports",
    18: "Short Movies",
    19: "Travel & Events",
    20: "Gaming",
    21: "Videoblogging",
    22: "People & Blogs",
    23: "Comedy",
    24: "Entertainment",
    25: "News & Politics",
    26: "Howto & Style",
    27: "Education",
    28: "Science & Technology",
    29: "Nonprofits & Activism",
    30: "Movies",
    31: "Anime/Animation",
    32: "Action/Adventure",
    33: "Classics",
    34: "Comedy",
    35: "Documentary",
    36: "Drama",
    37: "Family",
    38: "Foreign",
    39: "Horror",
    40: "Sci-Fi/Fantasy",
    41: "Thriller",
    42: "Shorts",
    43: "Shows",
    44: "Trailers",
}

In [None]:
def topic_bar(df, country):
    """Plot the number of videos per category for *df* and save the figure.

    Args:
        df: DataFrame with a 'video_category' column.
        country: Text appended to the plot title and used as the prefix of
            the output file name (``<country>categories.png``).
    """
    topic = df['video_category'].value_counts()
    # Use a dedicated figure/axes so repeated calls never draw on top of a
    # previous plot that is still pyplot's "current" figure.
    fig, ax = plt.subplots()
    topic.plot(kind='bar', label='Количество роликов данной категории', ax=ax)
    # Labels are applied after plotting: pandas may set the x-label from the
    # index name while drawing, which would replace a label set beforehand.
    ax.set_title('Распределение видео по темам в ' + str(country))
    ax.set_xlabel('Темы')
    ax.set_ylabel('Количество роликов')
    ax.legend(fontsize=10)
    fig.savefig(country + 'categories.png')

In [None]:
# Category distribution for the million-view Belarusian subset.
topic_bar(country = 'Беларуси(млн)', df = df_blr_mln)

## Using CAPS words

In [None]:
def is_text_with_caps(text):
    """Return True when at least one whitespace-separated word of *text* is upper-case.

    A word counts when ``str.isupper()`` is true for it; an empty text
    yields False.
    """
    return any(word.isupper() for word in text.split())

In [None]:
# Flag, for every country frame, the titles that contain at least one upper-case word.
for df in df_list:
    df["caps"] = df["video_title"].apply(is_text_with_caps)

In [None]:
# The million-view subsets were built before 'caps' existed, so the flag is
# computed for them separately.
for df in df_popular_list:
    df["caps"] = df["video_title"].apply(is_text_with_caps)

In [None]:
def caps_pie(df, country):
  """Draw a pie chart of titles without / with upper-case words and save it.

  Args:
    df: DataFrame with a boolean 'caps' column.
    country: Plot title and prefix of the output file name
      (``<country>pie.png``).
  """
  caps_dict = df["caps"].value_counts().to_dict()
  # value_counts() omits values that never occur, so fall back to 0 instead
  # of raising KeyError when every title (or no title) contains a caps word.
  sizes = [caps_dict.get(False, 0), caps_dict.get(True, 0)]
  fig, ax = plt.subplots()
  ax.pie(sizes, labels=['Нет', 'Да'], colors=['b', 'y'], startangle=60)
  ax.axis('equal')
  ax.set_title(country)
  fig.savefig(country + 'pie.png')

In [None]:
def views_in_caps(df):
  """Return the share of all views that belongs to videos flagged in 'caps'.

  The result is the sum of 'video_view_count' over rows where 'caps' is
  True, divided by the sum over all rows.
  """
  views = df['video_view_count']
  caps_views = views[df['caps'] == True]
  return caps_views.sum() / views.sum()

In [None]:
# Share of views collected by caps-titled videos, per country frame.
for df in df_list:
  print(views_in_caps(df))

In [None]:
# Same share, restricted to the million-view subsets.
for df in df_popular_list:
  print(views_in_caps(df))

# Most common words in video descriptions, titles and tags (Russia, Ukraine, Belarus)

Copy the titles, descriptions and tags into new columns so that we can process them without altering the originals

In [None]:
# Create the '*_lemmatized' working columns as copies of the raw text columns;
# the lemmatization pipeline overwrites these copies and leaves the raw columns untouched.
for df in df_list_rus:
  df['video_title_lemmatized'] = df.video_title
  df['video_description_lemmatized'] = df.video_description
  df['video_tags_lemmatized'] = df.video_tags

Lemmatizing 

In [None]:
# Matches runs of Cyrillic letters. 'Ё'/'ё' are listed explicitly because they
# lie outside the contiguous А-я code-point range; without them a word such as
# "ещё" was truncated and "ёлка" lost its first letter.
regex = re.compile("[А-Яа-яЁё]+")
# NLTK's Russian stop-word list extended with a few project-specific tokens.
mystopwords = stopwords.words('russian') + [
    'это', 'наш', 'тыс', 'млн', 'млрд', 'также', 'т', 'д',
]
# Lemmas removed after lemmatization, in addition to the stop-words above.
mystoplemmas = [
    'который', 'прошлый', 'сей', 'свой', 'наш', 'мочь', 'такой',
]


# Shared Mystem instance used as the default analyzer by lemmatize().
m = Mystem()

def words_only(text, regex=regex):
    """Return the substrings of *text* matched by *regex*, joined by single spaces.

    *regex* defaults to the module-level compiled pattern.
    """
    matches = regex.findall(text)
    return " ".join(matches)

def remove_stopwords(text, mystopwords = mystopwords):
    """Return *text* without the whitespace-separated tokens listed in *mystopwords*.

    Args:
        text: String to filter.
        mystopwords: Collection of tokens to drop; defaults to the
            module-level stop-word list.

    Returns:
        The remaining tokens joined by single spaces, or "" when *text*
        cannot be tokenized (e.g. a non-string value such as NaN).
    """
    try:
        return " ".join(token for token in text.split() if token not in mystopwords)
    except (AttributeError, TypeError):
        # Best-effort fallback for non-text input. Only the errors such input
        # produces are caught, so KeyboardInterrupt and genuine bugs are no
        # longer swallowed as they were by the bare ``except``.
        return ""

def lemmatize(text, mystem=m):
    """Return *text* with every word replaced by its lemma.

    Args:
        text: String to lemmatize.
        mystem: Analyzer exposing ``lemmatize(text)``; defaults to the
            module-level Mystem instance.

    Returns:
        The concatenated lemmatizer output with surrounding whitespace
        stripped, or a single space when lemmatization fails.
    """
    try:
        # Use the analyzer that was passed in; the parameter used to be
        # ignored in favour of the global instance.
        return "".join(mystem.lemmatize(text)).strip()
    except Exception:
        # Best-effort: one bad record must not abort a whole column. Catching
        # Exception rather than using a bare ``except`` keeps Ctrl-C working
        # during long runs. The " " fallback value is kept for compatibility.
        return " "
    
def remove_stoplemmas(text, mystoplemmas = mystoplemmas):
    """Return *text* without the whitespace-separated tokens listed in *mystoplemmas*.

    Args:
        text: String of lemmas to filter.
        mystoplemmas: Collection of lemmas to drop; defaults to the
            module-level stop-lemma list.

    Returns:
        The remaining tokens joined by single spaces, or "" when *text*
        cannot be tokenized (e.g. a non-string value such as NaN).
    """
    try:
        return " ".join(token for token in text.split() if token not in mystoplemmas)
    except (AttributeError, TypeError):
        # Best-effort fallback for non-text input. Only the errors such input
        # produces are caught, so KeyboardInterrupt and genuine bugs are no
        # longer swallowed as they were by the bare ``except``.
        return ""

# Run each working text column through the same pipeline: lemmatize,
# lower-case, keep only regex-matched words, then drop stop-words and
# stop-lemmas. The result replaces the working column.
for df in df_list_rus:
  for column in ('video_title_lemmatized',
                 'video_description_lemmatized',
                 'video_tags_lemmatized'):
    processed = df[column].apply(lemmatize)
    processed = processed.str.lower()
    processed = processed.apply(words_only)
    processed = processed.apply(remove_stopwords)
    processed = processed.apply(remove_stoplemmas)
    df[column] = processed

In [None]:
def creating_frequency_dict(df):
  """Count lemma frequencies in the tags, titles and descriptions of *df*.

  Each '*_lemmatized' column is split on whitespace and all resulting
  tokens are counted with ``FreqDist``.

  Returns:
    A tuple ``(fd_tags, fd_titles, fd_descriptions)``.
  """
  def count_tokens(column):
    # Tokens are fed in row order, matching a row-by-row accumulation.
    return FreqDist(token for text in df[column] for token in text.split())

  fd_tags = count_tokens('video_tags_lemmatized')
  fd_titles = count_tokens('video_title_lemmatized')
  fd_descriptions = count_tokens('video_description_lemmatized')
  return fd_tags, fd_titles, fd_descriptions

In [None]:
# Re-select the million-view Belarusian rows from df_blr here: the earlier
# df_blr_mln subset was created before the '*_lemmatized' columns were added,
# so passing it raised KeyError when the notebook is run top to bottom.
fd_tags, fd_titles, fd_descriptions  = creating_frequency_dict(df = get_popular_vidz_df(df_blr))

In [None]:
def show_word_cloud_tags():
  """Render a word cloud of the 100 most frequent tag lemmas.

  Reads the module-level ``fd_tags`` distribution, saves the image as
  'tags_blr.png' and prints the 10 most common entries.
  """
  top_tags = dict(fd_tags.most_common(100))
  cloud = WordCloud(background_color='white')
  cloud.generate_from_frequencies(top_tags)
  plt.figure()
  plt.imshow(cloud, interpolation = 'bilinear')
  plt.axis('off')
  plt.savefig('tags_blr.png')
  print(fd_tags.most_common(10))

In [None]:
def show_word_cloud_titles():
  """Render a word cloud of the 100 most frequent title lemmas.

  Reads the module-level ``fd_titles`` distribution, saves the image as
  'titles_blr.png' and prints the 10 most common entries.
  """
  top_titles = dict(fd_titles.most_common(100))
  cloud = WordCloud(background_color='white')
  cloud.generate_from_frequencies(top_titles)
  plt.figure()
  plt.imshow(cloud, interpolation = 'bilinear')
  plt.axis('off')
  plt.savefig('titles_blr.png')
  print(fd_titles.most_common(10))

In [None]:
def show_word_cloud_descriptions():
  """Render a word cloud of the 100 most frequent description lemmas.

  Reads the module-level ``fd_descriptions`` distribution, saves the image
  as 'descriptions_blr.png' and prints the 10 most common entries.
  """
  top_descriptions = dict(fd_descriptions.most_common(100))
  cloud = WordCloud(background_color='white')
  cloud.generate_from_frequencies(top_descriptions)
  plt.figure()
  plt.imshow(cloud, interpolation = 'bilinear')
  plt.axis('off')
  plt.savefig('descriptions_blr.png')
  print(fd_descriptions.most_common(10))