# Detecting Empty Notebooks

A quick demo to back up a possible bug report, filed [here](https://www.kaggle.com/product-feedback/266800).


In [None]:
from jt_mk_utils import *

In [None]:
import html

import matplotlib.pyplot as plt
import numpy as np
from IPython.display import HTML, Image, display

In [None]:
# Notebook-wide matplotlib defaults: wide 10x5 figures and a 13pt base font.
plt.rcParams.update({
    "figure.figsize": (10, 5),
    "font.size": 13,
})

In [None]:
# Load the kernel-versions table. `read_kernel_versions` is not defined in this
# notebook; presumably it comes from the `jt_mk_utils` star import — confirm.
ver = read_kernel_versions()
# Leave the column index as the cell's output so the available fields are visible.
ver.columns

In [None]:
# One group per notebook: every row of `ver` that shares a ScriptId.
gb = ver.groupby('ScriptId')

# Per-notebook summary, one column per aggregate.
# Note the difference between the last two columns: `count()` only counts
# rows whose VersionNumber is non-null, whereas `size()` counts every row.
stats = pd.concat(
    [
        gb['CreationDate'].min().rename('Date'),
        gb['Title'].first(),
        gb['ParentScriptVersionId'].max(),
        gb['TotalVotes'].sum().rename('VoteSum'),
        gb['VersionNumber'].count().rename('VersionCount'),
        gb.size().rename('VersionEntries'),
    ],
    axis='columns',
)

# "Empty" notebooks: those without a single non-null VersionNumber.
empty = stats[stats['VersionCount'] == 0]

In [None]:
# Look up the kernel records for the empty notebooks and index them by Id.
# NOTE(review): assumes `filter=(column, values)` restricts the rows returned
# by read_kernels to those whose column value is in `values` — confirm.
kernels = read_kernels(filter=('Id', empty.index))
kernels = kernels.set_index('Id')

# Index-on-index left join: every empty notebook is kept and gains the kernel columns.
empty = empty.join(kernels, how='left')

In [None]:
# Look up the authors of the empty notebooks and index them by user Id.
users = read_users(filter=('Id', empty['AuthorUserId']))
users = users.set_index('Id')

# Inner join on the author id: notebooks whose author is not in `users` are dropped.
empty = empty.join(users, on='AuthorUserId', how='inner')

# Persist the enriched table next to the notebook.
empty.to_csv('EmptyNotebooks.csv')

In [None]:
# Headline numbers: total version rows, distinct notebooks, and how many of
# those notebooks have no non-null VersionNumber at all.
display(HTML(''.join([
    f'there are {len(ver)} versions of public notebooks<br/>',
    f'there are {ver.ScriptId.nunique()} public notebooks<br/>',
    f'there are {(stats.VersionCount==0).sum()} empty public notebooks<br/>',
])))

# Sample Titles

Many of these notebooks still carry an auto-generated default title such as `Notebook9ca0a8389d`.

In [None]:
# Print up to 100 randomly chosen titles (fixed seed, so the sample is repeatable).
# - n is capped at the number of rows, because Series.sample raises ValueError
#   when asked for more rows than exist (sampling without replacement).
# - Missing titles are dropped *after* sampling, so the draw itself is unchanged
#   and ' // '.join() only ever receives strings.
print(' // '.join(
    empty.Title
    .sample(n=min(100, len(empty)), random_state=42)
    .dropna()
    .astype(str)
))

# Votes for Empty Notebooks

In [None]:
# Total number of votes received by all empty notebooks combined.
empty['TotalVotes'].sum()

In [None]:
# Public address of each notebook: https://www.kaggle.com/<UserName>/<CurrentUrlSlug>
empty['Url'] = "https://www.kaggle.com/" + empty.UserName + "/" + empty.CurrentUrlSlug

# Clickable anchor for the results table. Titles are free text entered by
# notebook authors and the table is rendered as raw HTML, so both the href and
# the link text are HTML-escaped; otherwise a title containing '<', '&' or '"'
# would break (or inject into) the markup. str() mirrors what str.format did
# for non-string values such as NaN.
empty['Link'] = [
    f'<a href="{html.escape(str(url))}">{html.escape(str(title))}</a>'
    for url, title in zip(empty['Url'], empty['Title'])
]

In [None]:
# Columns shown in the table, in display order.
SHOW = ['Date', 'Link', 'DisplayName', 'TotalVotes', 'Medal', 'VersionEntries']

# Medal code -> HTML entity for the 1st / 2nd / 3rd place medal emoji.
MEDALS = {
    1.0: '&#129351;',
    2.0: '&#129352;',
    3.0: '&#129353;',
}

# The 200 most-voted empty notebooks; missing values are rendered as blanks.
(
    empty
    .sort_values('TotalVotes', ascending=False)
    .loc[:, SHOW]
    .head(200)
    .style
    .format({'Medal': MEDALS.get}, na_rep='')
)

# Empty Notebooks Over Time

In [None]:
# Count empty notebooks per year of their earliest version date and plot the
# counts in chronological order. The trailing ';' hides the Axes repr.
(
    empty['Date'].dt.year
    .value_counts()
    .sort_index()
    .plot(title='Empty Notebooks Over Time')
);

# Conclusions

Try following the links in the table above - the notebooks they lead to are *mostly* empty!

These notebooks could simply be hidden from view, since there is nothing to see in them. What I do not understand is why [*Hotness* sometimes ranks these *hotter* than valid work](https://www.kaggle.com/product-feedback/266800).