# Explore your notebook stats using Meta-Kaggle

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time
import os

# plots
import matplotlib.pyplot as plt

# file overview
!ls -l /kaggle/input/meta-kaggle/

In [None]:
# load user table and report how long the read took
# perf_counter is a monotonic clock meant for measuring intervals, whereas
# time.time() follows the wall clock, which can be adjusted during the read
t1 = time.perf_counter()
df_users = pd.read_csv('../input/meta-kaggle/Users.csv')
t2 = time.perf_counter()
print('Elapsed time:', np.round(t2-t1,2),'secs')

In [None]:
# every row of the user table is one registered account
print('Number of registered users:', len(df_users))

In [None]:
# select the row(s) of the user table whose UserName matches the chosen
# account; the Id column of this selection is read in the next cell
my_user = df_users.loc[df_users['UserName'] == 'docxian']
my_user

In [None]:
# an unknown user name leaves the selection empty; stop with a clear
# message instead of the opaque IndexError raised by values[0]
if my_user.shape[0] == 0:
    raise ValueError('User name not found in Users.csv - check the spelling')
# Id of the first matching row
my_id = my_user.Id.values[0]
print('User ID:',my_id)

In [None]:
# read the table of all kernels (notebooks) and report its row count
df_kernels = pd.read_csv('../input/meta-kaggle/Kernels.csv')
print('Number of notebooks:', len(df_kernels))

In [None]:
# keep only the notebooks authored by the selected user
df_kernels_select = df_kernels[df_kernels['AuthorUserId'] == my_id]

# simplify the table by removing some columns
columns_to_drop = [
    'AuthorUserId',
    'CurrentKernelVersionId',
    'ForkParentKernelVersionId',
    'FirstKernelVersionId',
    'IsProjectLanguageTemplate',
    'ForumTopicId',
    'CreationDate',
]
df_kernels_select = df_kernels_select.drop(columns=columns_to_drop)

# parse the three date columns into datetimes
for date_column in ['EvaluationDate', 'MadePublicDate', 'MedalAwardDate']:
    df_kernels_select[date_column] = pd.to_datetime(df_kernels_select[date_column])

# show preview
df_kernels_select

# Statistics

In [None]:
# totals over all notebooks of the selected user
n_NB = len(df_kernels_select)
n_View, n_Comments, n_Votes = (
    df_kernels_select[column].sum()
    for column in ('TotalViews', 'TotalComments', 'TotalVotes')
)

# ratios, rounded to two decimals for display
views_per_nb = np.round(n_View / n_NB, 2)
comments_per_nb = np.round(n_Comments / n_NB, 2)
votes_per_nb = np.round(n_Votes / n_NB, 2)
views_per_vote = np.round(n_View / n_Votes, 2)

# report: counts first, then per-notebook averages, then views per vote
print('Number of public Notebooks :', n_NB)
print()
print('Total Views    :', n_View)
print('Total Comments :', n_Comments)
print('Total Votes    :', n_Votes)
print()
print('Views per Notebook    :', views_per_nb)
print('Comments per Notebook :', comments_per_nb)
print('Votes per Notebook    :', votes_per_nb)
print()
print('Views per Vote :', views_per_vote)


In [None]:
# number of notebooks per medal class (1: gold, 2: silver, 3: bronze);
# rows with a missing Medal value are left out of the count
df_kernels_select['Medal'].value_counts()

In [None]:
# add custom features
# views per vote; a notebook without votes gets NaN instead of the inf that
# dividing by zero would produce, because a non-finite value makes the
# histogram of this feature fail with a "range is not finite" error
votes_nonzero = df_kernels_select.TotalVotes.replace(0, np.nan)
df_kernels_select['ViewsPerVote'] = np.round(df_kernels_select.TotalViews/votes_nonzero,2)

In [None]:
# draw one 20-bin histogram per numerical feature, each in its own figure
features_num = ['TotalViews','TotalComments','TotalVotes','ViewsPerVote']

for feature in features_num:
    values = df_kernels_select[feature]
    values.plot(kind='hist', bins=20)
    # the title names the column being shown
    plt.title(feature)
    plt.grid()
    plt.show()

In [None]:
# write the per-notebook table to a CSV file in the working directory
export_path = 'df_kernels_select.csv'
df_kernels_select.to_csv(export_path)

### Medal development

In [None]:
# extract kernels with medals and order them by the date the medal was awarded
df_kernels_select_medals = df_kernels_select[~df_kernels_select.Medal.isna()].copy()
df_kernels_select_medals = df_kernels_select_medals.sort_values(by='MedalAwardDate', ascending=True).reset_index(drop=True)

# and plot development over time
# after reset_index the index runs 0..n-1, so the running number of medals
# at each award date is index+1 (the first medal counts as 1, not 0)
fig, ax = plt.subplots(figsize=(18,6))
ax.plot(df_kernels_select_medals.MedalAwardDate, df_kernels_select_medals.index + 1)
ax.xaxis.set_major_locator(plt.MaxNLocator(20)) # reduce number of x-labels
plt.xticks(rotation=90)
plt.title('Development of Medals')
plt.grid()
plt.show()

# Top 10

### Votes

In [None]:
# the ten notebooks with the highest TotalVotes, largest first
df_kernels_select.nlargest(10, 'TotalVotes')

### Views

In [None]:
# the ten notebooks with the highest TotalViews, largest first
df_kernels_select.nlargest(10, 'TotalViews')

### Comments

In [None]:
# the ten notebooks with the highest TotalComments, largest first
df_kernels_select.nlargest(10, 'TotalComments')

# Deep dive into votes

In [None]:
# load table with notebooks including versions
df_kernels_versions = pd.read_csv('../input/meta-kaggle/KernelVersions.csv')
# filter for current user; take an explicit copy so the column assignment
# below works on an independent frame and does not trigger pandas'
# SettingWithCopyWarning
df_kernels_versions = df_kernels_versions[df_kernels_versions.AuthorUserId==my_id].copy()
# convert date
df_kernels_versions.EvaluationDate = pd.to_datetime(df_kernels_versions.EvaluationDate)

df_kernels_versions.head()

In [None]:
# add up the votes over all notebook versions of the selected user
total_version_votes = df_kernels_versions['TotalVotes'].sum()
print('Total votes:', total_version_votes)

#### This is less than the total votes above, but that is fine: the difference can be explained by additional votes from accepted task submissions.

In [None]:
# sum the votes for each distinct EvaluationDate value
votes_per_date = df_kernels_versions.groupby('EvaluationDate', as_index=True)['TotalVotes'].sum()
votes_by_date = pd.DataFrame(votes_per_date)

# scatter the summed votes against their date
fig, ax = plt.subplots(figsize=(18,6))
dates = votes_by_date.index
vote_sums = votes_by_date['TotalVotes']
ax.scatter(dates, vote_sums, alpha=0.5)
# limit the x-axis to at most 20 tick intervals so the labels stay readable
ax.xaxis.set_major_locator(plt.MaxNLocator(20))
plt.xticks(rotation=90)
plt.title('Votes by date')
plt.grid()
plt.show()