# This notebook shows:
1. Overall distribution of code blocks and markdown blocks across all notebooks
2. Distribution across ancestor_ids
3. Distribution across parent_ids
4. Top 10 users with the most notebooks

In [None]:
import os
import re
import json
import numpy as np
import pandas as pd 
from collections import Counter
import matplotlib.pyplot as plt
from joblib import Parallel, delayed

**#1 Code blocks and Markdown blocks distribution**

In [None]:
# Load the training order table; later cells read its "id" column and drop
# its "id" and "cell_order" columns.
df = pd.read_csv("../input/AI4Code/train_orders.csv")

In [None]:
def count_code_md(filename):
    """Count the cells of one training notebook by type.

    Args:
        filename: File name (including the ``.json`` extension) of a
            notebook inside ``../input/AI4Code/train/``.

    Returns:
        A ``(total, code, markdown)`` tuple: the number of entries under the
        notebook's ``"cell_type"`` key, and how many of those entries are
        ``"code"`` and ``"markdown"`` respectively.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # default text encoding.
    with open("../input/AI4Code/train/" + filename, "r", encoding="utf-8") as notebook_file:
        contents = json.load(notebook_file)
    # The file is no longer needed once parsed, so count outside the `with`.
    cell_types = list(contents["cell_type"].values())
    return len(cell_types), cell_types.count("code"), cell_types.count("markdown")

In [None]:
# Count the cells of every notebook in one pass, then attach the three count
# columns in bulk. Building the counts on df's own index keeps each row lined
# up with its notebook whatever the index labels are, and avoids growing the
# frame one scalar at a time.
block_counts = pd.DataFrame(
    [count_code_md(notebook_id + ".json") for notebook_id in df["id"]],
    columns=["total_blocks", "code_blocks", "md_blocks"],
    index=df.index,
)
for column in block_counts.columns:
    df[column] = block_counts[column]

In [None]:
# Work on a copy that holds only the per-notebook count columns; df itself
# is left untouched.
df_blocks = df.drop(columns=["id", "cell_order"])

In [None]:
# Summary statistics for the columns of df_blocks.
df_blocks.describe()

In [None]:
# Histogram of the total number of blocks per notebook (1000 bins).
total_axes = plt.gca()
total_axes.hist(df_blocks["total_blocks"], bins=1000)
total_axes.set_title('Frequency')
total_axes.set_ylabel('Frequency');

In [None]:
# Histogram of the number of code blocks per notebook (200 bins).
code_axes = plt.gca()
code_axes.hist(df_blocks["code_blocks"], bins=200)
code_axes.set_title('Frequency')
code_axes.set_ylabel('Frequency');

In [None]:
# Histogram of the number of markdown blocks per notebook (100 bins).
md_axes = plt.gca()
md_axes.hist(df_blocks["md_blocks"], bins=100)
md_axes.set_title('Frequency')
md_axes.set_ylabel('Frequency');

**#2 Distribution across ancestor_ids**

In [None]:
# Load the ancestry table and group its rows by ancestor_id.
ancestors_csv = "../input/AI4Code/train_ancestors.csv"
df_ancestors = pd.read_csv(ancestors_csv)
gb_aid = df_ancestors.groupby(by="ancestor_id")

In [None]:
# Per-ancestor non-null counts, largest "id" count first.
gb_aid.count().sort_values(by='id', ascending=False)

In [None]:
# Rows per ancestor_id, keeping only the groups with more than 5 rows.
gb_aid = df_ancestors.groupby("ancestor_id").size()
gb_lst = [group_size for group_size in gb_aid if group_size > 5]

In [None]:
# Histogram of the retained ancestor group sizes (100 bins).
ancestor_axes = plt.gca()
ancestor_axes.hist(gb_lst, bins=100)
ancestor_axes.set_ylabel('Frequency')

**#3 Distribution across parent_ids**

In [None]:
# Load the ancestry table and group its rows by parent_id.
ancestors_csv = "../input/AI4Code/train_ancestors.csv"
df_ancestors = pd.read_csv(ancestors_csv)
gb_pid = df_ancestors.groupby(by="parent_id")

In [None]:
# Per-parent non-null counts, largest "id" count first.
gb_pid.count().sort_values(by='id', ascending=False)

In [None]:
# Rows per parent_id, keeping only the groups with more than 5 rows.
gb_pid = df_ancestors.groupby("parent_id").size()
gbp_lst = [group_size for group_size in gb_pid if group_size > 5]

In [None]:
# Histogram of the retained parent group sizes (100 bins).
parent_axes = plt.gca()
parent_axes.hist(gbp_lst, bins=100)
parent_axes.set_ylabel('Frequency')

**#4 Top 10 users with most notebooks**

In [None]:
def extract_user(file):
    """Collect the names that follow ``www.kaggle.com/`` in one notebook.

    All cell sources of the notebook are joined and searched for links of
    the form ``www.kaggle.com/<name>/``. The names ``c``, ``kernels`` and
    ``learn`` are dropped (they appear to be site sections rather than
    users).

    Args:
        file: File name of a notebook inside ``../input/AI4Code/train/``.

    Returns:
        A sorted list of the distinct names found, each at most once per
        notebook. Empty when the notebook contains no matching link.
    """
    # `with` guarantees the file handle is closed once the JSON is parsed.
    with open(f'../input/AI4Code/train/{file}', 'r', encoding='utf-8') as json_open:
        json_load = json.load(json_open)
    all_sources = '\n'.join(json_load['source'].values())
    # Dots are escaped so they match only a literal '.' in the host name.
    links = re.findall(r'www\.kaggle\.com/+[a-zA-Z0-9_]+/', all_sources)
    # Every match ends with '/', so the name is the second-to-last piece.
    names = {link.split('/')[-2] for link in links}
    names -= {'c', 'kernels', 'learn'}
    # Sorting gives a reproducible order; plain set iteration does not.
    return sorted(names)

In [None]:
# Scan every training notebook with 4 parallel workers; each call returns
# the list of names found in one notebook. os.listdir returns entries in
# arbitrary order, so sort them to keep runs (and Counter tie order) stable.
files = sorted(os.listdir('../input/AI4Code/train'))
result = Parallel(n_jobs=4, verbose=1)(delayed(extract_user)(file) for file in files)
# Flatten in a single pass: sum(result, []) builds a new list on every
# addition, which is quadratic in the number of notebooks.
result = [user for users in result for user in users]
count = Counter(result)

In [None]:
# Print the ten most frequent names, one per line, as: rank (name, occurrences).
for rank, entry in enumerate(count.most_common(10), start=1):
    print(rank, entry)