In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import os

Initialize core requirements for the notebook.

In [2]:
# Location of the dataset files and the filename suffixes that follow the
# two-letter country prefix when the per-country paths are built.
directory = "/archive/"
video_string = "videos.csv"
json_string = "_category_id.json"

# Collect the first two characters of every file name found under the input
# tree; a set keeps each prefix only once.
# NOTE(review): the walk scans '/kaggle/input' while `directory` points at
# "/archive/" -- confirm that both refer to the same dataset.
country_code = set()
for _root, _, filenames in os.walk('/kaggle/input'):
    country_code.update(filename[0:2] for filename in filenames)

# A set has no defined iteration order, so sort before printing to keep the
# output stable from one run to the next.
print("Country codes:", ", ".join(sorted(country_code)))

For the exploratory data analysis (EDA), we use the US data as a sample.

In [4]:
# Country prefix of the files used for the EDA. The notebook text and the
# `us_*` variable names both refer to US data, so the "US" files are loaded
# (loading "FR" here would silently analyse a different country).
sample_country = "US"
us_video_dir = directory + sample_country + video_string
us_json_dir = directory + sample_country + json_string

# Video table (CSV) and category-id file (JSON) for the selected country.
us_video = pd.read_csv(us_video_dir)
us_json = pd.read_json(us_json_dir)

In [5]:
# Structural overview of the loaded frame: its (rows, columns) shape followed
# by the number of distinct values in each column.
for summary in (us_video.shape, us_video.nunique()):
    print(summary)

# info() lists every column's dtype and non-null count, which shows where
# values are missing.
us_video.info()

In [6]:
# Preview the first five rows of the frame.
us_video.head(n=5)

In [7]:
# Log-transform the count columns so that the plots are easier to read.
# log1p(x) computes log(1 + x): a count of zero maps to 0 instead of -inf,
# and it is more accurate than np.log(x + 1) when x is small.
log_columns = {
    'likes': 'likes_log',
    'views': 'views_log',
    'dislikes': 'dislikes_log',
    'comment_count': 'comment_log',
}
for source_column, log_column in log_columns.items():
    us_video[log_column] = np.log1p(us_video[source_column])

In [8]:
# Print the 1st, 25th, 50th, 75th and 99th percentiles of each raw count
# column, with a blank line separating consecutive sections.
quantile_points = [.01, .25, .5, .75, .99]
quantile_sections = [
    ("Views quantiles", 'views'),
    ("Likes quantiles", 'likes'),
    ("Dislikes quantiles", 'dislikes'),
    ("Comment quantiles", 'comment_count'),
]
for section_index, (heading, column) in enumerate(quantile_sections):
    if section_index > 0:
        print("")
    print(heading)
    print(us_video[column].quantile(quantile_points))

In [11]:
# 2x2 grid with the distribution of each log-transformed count column.
plt.figure(figsize=(12, 6))

# (subplot position, column, title, colour), listed in drawing order.
# A colour of None leaves seaborn's default colour in place.
distribution_panels = [
    (221, 'views_log', "VIEWS LOG DISTRIBUITION", None),
    (224, 'comment_log', "COMMENTS LOG DISTRIBUITION", None),
    (223, 'dislikes_log', "DISLIKES LOG DISTRIBUITION", 'r'),
    (222, 'likes_log', 'LIKES LOG DISTRIBUITION', 'green'),
]
for position, column, title, colour in distribution_panels:
    plt.subplot(position)
    panel = sns.distplot(us_video[column], color=colour)
    panel.set_title(title, fontsize=16)

# Leave room between the panels so the titles do not overlap the plots.
plt.subplots_adjust(wspace=0.2, hspace=0.4, top=0.9)

plt.show()

In [12]:
# Human-readable name for each category id handled in this notebook.
# Id 25 used to be assigned twice ('News and Politics', then
# 'News & Politics'); only the later value ever took effect, so that is the
# one kept here.
category_names = {
    1: 'Film and Animation',
    2: 'Cars and Vehicles',
    10: 'Music',
    15: 'Pets and Animals',
    17: 'Sport',
    19: 'Travel and Events',
    20: 'Gaming',
    22: 'People and Blogs',
    23: 'Comedy',
    24: 'Entertainment',
    25: 'News & Politics',
    26: 'How to and Style',
    27: 'Education',
    28: 'Science and Technology',
    29: 'Non Profits and Activism',
}

# map() builds the whole column in one pass; ids that are not in the table
# become NaN. Creating the column directly from the mapping also avoids
# writing strings into a float column pre-filled with np.nan.
us_video['category_name'] = us_video['category_id'].map(category_names)

In [13]:
# Five most frequent category names.
print("Category Name count")
print(us_video.category_name.value_counts()[:5])

plt.figure(figsize = (14,9))

# Top panel: number of rows per category name.
plt.subplot(211)
# The column is passed as x=...: recent seaborn releases accept only `data`
# positionally, so a bare positional column name is rejected.
g = sns.countplot(x='category_name', data=us_video, palette="Set1")
g.set_xticklabels(g.get_xticklabels(),rotation=45)
g.set_title("Counting the Video Category's ", fontsize=15)
g.set_xlabel("", fontsize=12)
g.set_ylabel("Count", fontsize=12)

# Bottom panel: distribution of log-scaled views per category name.
plt.subplot(212)
g1 = sns.boxplot(x='category_name', y='views_log', data=us_video, palette="Set1")
# Rotate this panel's own tick labels rather than the ones of the top panel.
g1.set_xticklabels(g1.get_xticklabels(),rotation=45)
g1.set_title("Views Distribuition by Category Names", fontsize=20)
g1.set_xlabel("", fontsize=15)
g1.set_ylabel("Views(log)", fontsize=15)

# Extra vertical space keeps the rotated labels clear of the lower title.
plt.subplots_adjust(hspace = 0.9, top = 0.9)

plt.show()

In [14]:
# Box plot of the log-scaled comment counts, one box per category name.
plt.figure(figsize=(14, 6))

comment_ax = sns.boxplot(
    x='category_name',
    y='comment_log',
    data=us_video,
    palette="Set1",
)
# Tilt the category labels so that long names do not overlap.
comment_ax.set_xticklabels(comment_ax.get_xticklabels(), rotation=45)
comment_ax.set_title("Comments Distribuition by Category Names", fontsize=15)
comment_ax.set_xlabel("", fontsize=12)
comment_ax.set_ylabel("Comments Count(log)", fontsize=12)

plt.show()

In [15]:
# Box plot of the log-scaled like counts, one box per category name.
plt.figure(figsize=(14, 6))

likes_ax = sns.boxplot(
    x='category_name',
    y='likes_log',
    data=us_video,
    palette="Set1",
)
# Tilt the category labels so that long names do not overlap.
likes_ax.set_xticklabels(likes_ax.get_xticklabels(), rotation=45)
likes_ax.set_title("Likes Distribuition by Category Names ", fontsize=15)
likes_ax.set_xlabel("", fontsize=12)
likes_ax.set_ylabel("Likes(log)", fontsize=12)
plt.show()

In [16]:
# NOTE(review): this cell repeats the category count / views figure that an
# earlier cell of this notebook already draws -- consider removing one copy.
print("Category Name count")
print(us_video.category_name.value_counts()[:5])

plt.figure(figsize = (14,9))

# Top panel: number of rows per category name.
plt.subplot(211)
# The column is passed as x=...: recent seaborn releases accept only `data`
# positionally, so a bare positional column name is rejected.
g = sns.countplot(x='category_name', data=us_video, palette="Set1")
g.set_xticklabels(g.get_xticklabels(),rotation=45)
g.set_title("Counting the Video Category's ", fontsize=15)
g.set_xlabel("", fontsize=12)
g.set_ylabel("Count", fontsize=12)

# Bottom panel: distribution of log-scaled views per category name.
plt.subplot(212)
g1 = sns.boxplot(x='category_name', y='views_log', data=us_video, palette="Set1")
# Rotate this panel's own tick labels rather than the ones of the top panel.
g1.set_xticklabels(g1.get_xticklabels(),rotation=45)
g1.set_title("Views Distribuition by Category Names", fontsize=20)
g1.set_xlabel("", fontsize=15)
g1.set_ylabel("Views(log)", fontsize=15)

# Extra vertical space keeps the rotated labels clear of the lower title.
plt.subplots_adjust(hspace = 0.9, top = 0.9)

plt.show()

In [17]:
# Count plots for the three flag columns, laid out on a 2x2 grid
# (the fourth slot stays empty).
plt.figure(figsize=(14, 8))
plt.subplots_adjust(wspace=0.2, hspace=0.4, top=0.9)

flag_panels = [
    ('comments_disabled', "Comments Disabled"),
    ('ratings_disabled', "Rating Disabled"),
    ('video_error_or_removed', "Video Error or Removed"),
]
for slot, (flag_column, panel_title) in enumerate(flag_panels, start=1):
    plt.subplot(2, 2, slot)
    flag_ax = sns.countplot(x=flag_column, data=us_video)
    flag_ax.set_title(panel_title, fontsize=16)
plt.show()

In [18]:
# For each log-scaled metric, overlay the density curves of the rows where
# each flag column is True.
plt.figure(figsize = (12,10))

# One panel per metric: (subplot position, column, title).
metric_panels = [
    (221, 'views_log', "VIEWS LOG DISTRIBUITION"),
    (222, 'comment_log', "COMMENTS LOG DISTRIBUITION"),
    (223, 'dislikes_log', "DISLIKES LOG DISTRIBUITION"),
    (224, 'likes_log', 'LIKES LOG DISTRIBUITION'),
]
# Flag columns whose True rows are compared, with their legend labels.
flag_labels = [
    ('comments_disabled', 'Comm_dis'),
    ('ratings_disabled', 'Rati_dis'),
    ('video_error_or_removed', 'vide_rmv_err'),
]

# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# kdeplot is the documented replacement for the hist=False case.
for position, metric, title in metric_panels:
    ax = plt.subplot(position)
    for flag_column, label in flag_labels:
        # Explicit comparison with True keeps the row selection unchanged
        # whatever dtype the flag column was parsed as.
        flagged = us_video.loc[us_video[flag_column] == True, metric]
        sns.distplot(flagged, hist=False, label=label, ax=ax)
    ax.set_title(title, fontsize=16)
    # Draw the legend on every panel; a single trailing plt.legend() would
    # only label the last subplot.
    ax.legend()

plt.subplots_adjust(wspace = 0.2, hspace = 0.3,top = 0.9)
plt.show()

In [21]:
# Likes, dislikes and comments expressed as a percentage of views
# (each rate is computed once).
# NOTE(review): a row with views == 0 would give inf/NaN here -- confirm
# that the data contains none.
rate_sources = [
    ('like_rate', 'likes'),
    ('dislike_rate', 'dislikes'),
    ('comment_rate', 'comment_count'),
]
for rate_column, count_column in rate_sources:
    us_video[rate_column] = us_video[count_column] / us_video['views'] * 100

plt.figure(figsize = (10,8))

# Correlation is only defined for numeric columns. The text column
# 'category_name' is left out of the selection: pandas >= 2.0 raises on
# non-numeric columns in corr(), while older versions silently dropped it,
# so the resulting matrix is the same as before.
correlation_columns = ['like_rate', 'dislike_rate', 'comment_rate', 'comment_log',
                       'views_log', 'likes_log', 'dislikes_log']
sns.heatmap(us_video[correlation_columns].corr(), annot=True)
plt.show()

We now perform model training.