In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
from tqdm import tqdm
from pathlib import Path
import re
import os

# Pandas display: wider console output and longer text columns, so that cell
# sources stay readable when frames are shown.
pd.set_option("display.width", 180)
pd.set_option("display.max_colwidth", 100)

# Root directory of the AI4Code input data.
data_dir = Path("../input/AI4Code")

# Matplotlib defaults applied to every figure in this notebook.
rc = {
    # Keep only the left and bottom spines.
    "axes.spines.left": True,
    "axes.spines.right": False,
    "axes.spines.bottom": True,
    "axes.spines.top": False,
    # Ticks and tick labels on the bottom and left axes.
    "xtick.bottom": True,
    "xtick.labelbottom": True,
    "ytick.labelleft": True,
    "ytick.left": True,
    # Vertical spacing between stacked subplots.
    "figure.subplot.hspace": 0.7,
    # Bold titles and text.
    "figure.titleweight": "bold",
    "axes.titleweight": "bold",
    "font.weight": "bold",
}
plt.rcParams.update(rc)

In [None]:
def read_notebook(path):
    """Load one notebook JSON file as a DataFrame of its cells.

    Parameters
    ----------
    path : pathlib.Path
        Location of the notebook JSON file. The file stem is used as the
        notebook id.

    Returns
    -------
    pandas.DataFrame
        One row per cell, indexed by ``cell_id``, with the columns read from
        the file (``cell_type`` as category, ``source`` as str) plus an ``id``
        column holding the notebook id.
    """
    cells = pd.read_json(path, dtype={'cell_type': 'category', 'source': 'str'})
    cells = cells.assign(id=path.stem)
    return cells.rename_axis('cell_id')

# Only a subset of the training notebooks is loaded because the full set is large.
# Set NUM_TRAIN = None to load every notebook.
NUM_TRAIN = 20000
paths_train = list((data_dir / 'train').glob('*.json'))[:NUM_TRAIN]

notebooks_train = [read_notebook(path) for path in tqdm(paths_train, desc='Train NBs')]

# Stack all notebooks into one frame indexed by (id, cell_id), grouped by notebook id.
df_notebooks = (
    pd.concat(notebooks_train)
    .set_index('id', append=True)
    .swaplevel()
    .sort_index(level='id', sort_remaining=False)
)

# Correct cell order: one row per (notebook id, cell id), in the order given by the file.
df_orders = pd.read_csv(data_dir / 'train_orders.csv', index_col='id')
df_orders = df_orders["cell_order"].str.split(" ").explode().to_frame()

# Position of each cell inside its notebook. cumcount() numbers the rows of each
# id group in file order, so it stays correct even if train_orders.csv is not
# sorted by id (building the ranks from a sorted groupby count and assigning
# them positionally is only correct for a sorted file).
# The column is kept as object dtype, as before, so that numeric-only
# aggregations over `df` in later cells keep ignoring it.
df_orders["rank"] = df_orders.groupby(level="id").cumcount().astype(object).to_numpy()

# Attach the rank to every loaded cell (inner join drops notebooks that were not loaded).
df = df_notebooks.reset_index().merge(
    df_orders.reset_index().rename(columns={'cell_order': 'cell_id'}),
    how='inner', on=['id', 'cell_id'])

# Ancestor information per notebook.
df_ancestors = pd.read_csv(data_dir / 'train_ancestors.csv', index_col='id')

# Final combined dataframe: one row per cell, ordered by rank within each notebook.
df = df.merge(df_ancestors, on="id").sort_values(["id", "rank"]).set_index(["id", "cell_id"])

# Per-notebook cell counts - used throughout the EDA.
mkd = df[df["cell_type"] == "markdown"].groupby("id")["source"].count()
code = df[df["cell_type"] == "code"].groupby("id")["source"].count()
df_counts = pd.concat([mkd, code], axis=1, keys=['markdown_count', 'code_count'])
# A notebook without any cell of one type is missing from that series; count it
# as 0 instead of NaN so that `tot` and `ratio` are still defined for it.
df_counts = df_counts.fillna(0).astype(int)
df_counts["tot"] = df_counts.markdown_count + df_counts.code_count
df_counts["ratio"] = df_counts.code_count / df_counts.tot

In [None]:
# Preview the first rows of the combined cell-level frame.
df.head()

In [None]:
# List the distinct cell types present in the data.
df["cell_type"].unique()

In [None]:
# Count missing values per column.
df.isna().sum()

## Most cells in these notebooks contain code

In [None]:
# Total number of cells of each type across all loaded notebooks.
type_labels = ["Code", "Markdown"]
cell_totals = [df_counts.code_count.sum(), df_counts.markdown_count.sum()]

fig1, ax = plt.subplots(figsize=(8,8))
wedges, texts, autotexts = ax.pie(
    cell_totals,
    labels=type_labels,
    colors=["#008b8b", "#8b0000"],
    explode=(0, 0.1),
    shadow=True,
    autopct='%1.1f%%',
    textprops=dict(color="w"),
)

# Make the percentage labels inside the wedges large and bold.
plt.setp(autotexts, size=15, weight="bold")

# Legend placed to the right of the pie.
ax.legend(
    wedges,
    type_labels,
    loc="center left",
    bbox_to_anchor=(1, 0, 0.5, 1),
    prop={'size': 13},
)

ax.set_title("Proportion of Code vs Markdown", size=20);

## Length of notebooks (number of cells)
Most notebooks have fewer than 160 cells, although some have more than 200.

In [None]:
fig, axs = plt.subplots(2, figsize=(10,10), gridspec_kw={"height_ratios": [7, 1]})
fig.subplots_adjust(hspace=0.2)
hist_ax, blank_ax = axs

# Number of cells in each notebook.
count = df.groupby("id")["source"].count().to_numpy()

hist_ax.hist(count, bins=200, color="#8b0000")
hist_ax.set_xlabel("# of Cells", weight="bold")
hist_ax.set_ylabel("Frequency", weight="bold")
hist_ax.set_title("Length of Notebooks", size=20);

# The lower panel is left empty: hide its spines and both axes.
for side in ("top", "right", "left", "bottom"):
    blank_ax.spines[side].set_visible(False)
blank_ax.xaxis.set_visible(False)
blank_ax.yaxis.set_visible(False)

## Shortest and longest notebooks
Every notebook has at least 2 cells. The longest notebook has 800 cells.

In [None]:
colors = sns.color_palette("rocket_r", n_colors=20)

fig, axs = plt.subplots(1,2, figsize=(15, 10))

# Notebook lengths in ascending order; the two panels show both ends.
tot_sorted = df_counts.sort_values("tot").tot
panels = [
    (tot_sorted.head(10), colors[:10], "Shortest Notebooks"),
    (tot_sorted.tail(10), colors[10:], "Longest Notebooks"),
]

for panel_ax, (count, bar_colors, title) in zip(axs, panels):
    panel_ax.bar(range(10), count.values, color=bar_colors)
    panel_ax.set_title(title, size=20)
    panel_ax.set_xlabel("Notebook", weight="bold", size=13)
    # Individual notebooks are not labelled.
    panel_ax.set_xticks([])

# Only the left panel gets a fixed y-range and the shared y-label.
axs[0].set_ylim(0, 10)
axs[0].set_ylabel("# of Cells", weight="bold", size=16)

## The first cell of a notebook is often a markdown cell


In [None]:
# Matrix for 2-cell notebooks: rows = type of the first cell,
# columns = type of the second cell, values = number of notebooks.
matrix = pd.DataFrame(data = np.zeros((2,2)), columns=["code", "markdown"], index=["code", "markdown"])
for notebook in df_counts[df_counts["tot"]==2].index:
    # The cell types of one notebook are indexed by cell_id (strings), so the
    # first and second cell must be taken by position with .iloc; plain [0] /
    # [1] relies on the deprecated positional fallback of Series.__getitem__.
    cell_types = df.loc[notebook].cell_type
    cell1 = cell_types.iloc[0]
    cell2 = cell_types.iloc[1]
    matrix.loc[cell1, cell2] += 1

# setup the figure and axes
fig = plt.figure(figsize=(15, 15))
# Placeholder 2x2 grid coordinates; not used by the pie chart in this cell.
_x = np.arange(2)
_y = np.arange(2)
_xx, _yy = np.meshgrid(_x, _y)
x, y = _xx.ravel(), _yy.ravel()

top = x + y
bottom = np.zeros_like(top)
width = depth = 1

# Pie chart of the type of the first cell (rank 0) of every notebook.
first_cell_counts = df[df["rank"]==0]["cell_type"].value_counts()
first_cell_types = [str(cell_type) for cell_type in first_cell_counts.index]

# value_counts() orders the types by frequency, so colours and legend labels
# are derived from its index instead of assuming that markdown comes first.
type_colors = {"markdown": "#8b0000", "code": "#008b8b"}
wedge_colors = [type_colors.get(cell_type, "grey") for cell_type in first_cell_types]
# Pull every wedge except the largest slightly out of the pie.
wedge_explode = [0] + [0.05] * (len(first_cell_types) - 1)

ax2 = fig.add_subplot(222)
wedges, texts, autotexts = ax2.pie(first_cell_counts.to_numpy(), shadow=True,
                                  explode=wedge_explode,
                                  autopct='%1.1f%%', textprops=dict(color="w"), colors=wedge_colors)
plt.setp(autotexts, size=18, weight="bold")

ax2.legend(wedges, [cell_type.capitalize() for cell_type in first_cell_types],
          loc="center left",
          bbox_to_anchor=(1, 0, 0.5, 1), prop={'size': 13})

ax2.set_title("What is the most common \n first cell in a notebook?", size=18);

## Probability that a cell is code or markdown, by position in the notebook

The first cell is often a markdown cell. From the 1st to the 20th cell, a cell is more likely to be code.

In [None]:
# Most frequent notebook length (the mode of the total cell count).
most_freq_length = int(df_counts.tot.value_counts().idxmax())
count_cells = pd.DataFrame(np.zeros((2, most_freq_length)), index=["code", "markdown"])

# For every notebook of that length, tally which cell type sits at each position.
for notebook in df_counts[df_counts.tot == most_freq_length].index:

    dt = df.loc[notebook, ["cell_type", "rank"]]
    count_cells.loc["code"] = count_cells.loc["code"].to_numpy() + (dt.cell_type == "code").to_numpy()
    count_cells.loc["markdown"] = count_cells.loc["markdown"].to_numpy() + (dt.cell_type == "markdown").to_numpy()

# Share of each cell type at every position.
cell_ratios = count_cells / count_cells.sum(axis=0)

# Bars are drawn bottom-up, so the shares are reversed to put cell 0 at the top.
positions = np.arange(most_freq_length)
code_share = cell_ratios.loc["code"].to_numpy()[::-1]
mkd_share = cell_ratios.loc["markdown"].to_numpy()[::-1]


# -- Plot
fig, ax = plt.subplots(1, figsize=(10, 15))

# Stacked horizontal bars: code first, markdown stacked to its right.
ax.barh(positions, code_share, height=0.9, color="#008b8b", label="Code")
ax.barh(positions, mkd_share, left=code_share,
         height=0.9, color="#8b0000", label="Markdown")

# One tick per cell position; derived from the computed length rather than a
# fixed 21 so the labels stay correct whatever the most common length is.
ax.set_yticks(positions)
ax.set_yticklabels([f"Cell {x}" for x in positions[::-1]]);

ax.set_title("Distributions of cell types for the most \n common notebook length", size=20)
ax.legend(loc="upper center", frameon=False, prop={'size': 13}, ncol=2)

# Dashed reference line at 50%.
ax.plot(np.ones(most_freq_length+1)*0.5, np.arange(-0.5, most_freq_length+0.5, 1), linestyle="--", c="r", alpha=0.4)

# Print the percentage of the dominant type inside its segment of each bar.
for i, (dt_code, dt_mkd) in enumerate(zip(code_share, mkd_share)):

    if dt_code>dt_mkd:
        ax.annotate(f"{np.round(dt_code*100)}%", (dt_code/2, i-0.12), c="w", size=14)
    else:
        ax.annotate(f"{np.round(dt_mkd*100)}%", (dt_code + dt_mkd/2, i-0.12), c="w", size=14)

# Hide axis spine, ticks
for txt in ["top","right","left","bottom"]:
    ax.spines[txt].set_visible(False)
ax.xaxis.set_visible(False)

## Percentage of code cells in notebook
The histogram shows that, in most notebooks, more than 40% of the cells are code.

In [None]:
# Share of code cells per notebook, highest first.
counts_by_ratio = df_counts.sort_values("ratio", ascending=False)
ratio = counts_by_ratio.ratio

fig, axs = plt.subplots(2, figsize=(10,10), gridspec_kw={"height_ratios": [7, 1]})
fig.subplots_adjust(hspace=0.2)
hist_ax, blank_ax = axs

# Histogram
hist_ax.hist(ratio, bins=200, color="#008b8b");
hist_ax.set_ylabel("Frequency", weight="bold", size=16)
hist_ax.set_title("The Percentage of Code Cells in Notebook", size=20)

# The lower panel is left empty: hide its spines and both axes.
for side in ("top", "right", "left", "bottom"):
    blank_ax.spines[side].set_visible(False)
blank_ax.xaxis.set_visible(False)
blank_ax.yaxis.set_visible(False)

## Number of code cells and markdown cells in a notebook
A notebook usually has fewer than 100 code cells and fewer than 100 markdown cells.

In [None]:
# One point per notebook: markdown cells on x, code cells on y,
# marker area given by the total number of cells.
n_markdown = df_counts.markdown_count
n_code = df_counts.code_count

fig, ax = plt.subplots(1, figsize=(10,10))
ax.scatter(
    n_markdown,
    n_code,
    s=df_counts.tot,
    c="#8b0000",
    alpha=0.5,
    zorder=3,
)

# Grid drawn with a lower zorder than the points.
ax.grid(zorder=0)
ax.set_xlabel("Markdown cells", size=16, weight="bold")
ax.set_ylabel("Code cells", size=16, weight="bold")
ax.set_title("Code v.s Markdown", size=24);

## Percentage of code cells versus notebook length


In [None]:
# Sort once and take both series from the same frame: every ratio stays paired
# with the length of its own notebook, and the cell no longer depends on a
# `ratio` variable left over from an earlier cell.
counts_by_ratio = df_counts.sort_values("ratio", ascending=False)
ratio = counts_by_ratio.ratio
tot = counts_by_ratio.tot

fig, ax = plt.subplots(1, figsize=(10,10))
# One point per notebook; marker area is the number of cells.
ax.scatter(ratio, tot, s=tot, alpha=0.1, color="#008b8b")
ax.set_xlabel("Share of code cells", size=16, weight="bold")
ax.set_ylabel("# of Cells", size=16, weight="bold")
ax.set_title("Percentage of Code Cells \n  against Length of Notebook", size=18);

## Length of cells
Cells are usually shorter than 10,000 characters, although some are longer. Very long cells are more often markdown than code.


In [None]:
# Add length (number of characters of the source) to df
df["length"] = df["source"].str.len()

is_code = df["cell_type"] == "code"
is_mkd = df["cell_type"] == "markdown"

# Add average cell length per notebook to the count df.
# The "length" column is selected before aggregating: groupby(...).mean("length")
# passes "length" as the `numeric_only` flag and returns a DataFrame of every
# numeric column, which only fits a single column by accident.
df_counts["avg_len"] = df.groupby("id")["length"].mean()
df_counts["avg_len_code"] = df[is_code].groupby("id")["length"].mean()
df_counts["avg_len_mkd"] = df[is_mkd].groupby("id")["length"].mean()

# For plot: cell lengths per type, longest first.
code_length = df[is_code].sort_values("length", ascending=False).length.values
mkd_length = df[is_mkd].sort_values("length", ascending=False).length.values

fig, axs = plt.subplots(1, figsize=(14, 4))

# Box fills follow the palette used in the rest of the notebook
# (teal = code, dark red = markdown) so the two boxes can be told apart.
axs.boxplot(code_length, positions=[2], vert=False, patch_artist=True,
            boxprops=dict(facecolor="#008b8b", color="#008b8b"),
            flierprops=dict(markeredgecolor="#008b8b"))

box = axs.boxplot(mkd_length, positions=[1], vert=False, patch_artist=True,
                  boxprops=dict(facecolor="#8b0000", color="#8b0000"),
                  flierprops=dict(markeredgecolor="#8b0000"))

axs.set_title("Comparing length of Cells", size = 20)
axs.set_yticks([1, 2], ["Markdown", "Code"], size=15);

## Compare length of code and markdown
Among cells shorter than 1000 characters, code cells tend to be longer than markdown cells.

In [None]:
# Same comparison as the previous boxplot, with outliers hidden and the
# x-range limited to (0, 1000).

# Cell lengths per type, longest first.
code_cells = df[df["cell_type"]=="code"]
mkd_cells = df[df["cell_type"]=="markdown"]
code_length = code_cells.sort_values("length", ascending=False).length.values
mkd_length = mkd_cells.sort_values("length", ascending=False).length.values

fig, axs = plt.subplots(1, figsize=(14, 4))

# (data, y-position, fill colour) for each box; code is drawn first.
box_specs = [
    (code_length, 2, "#008b8b"),
    (mkd_length, 1, "#8b0000"),
]

drawn = []
for lengths, position, fill in box_specs:
    drawn.append(
        axs.boxplot(
            lengths,
            positions=[position],
            vert=False,
            patch_artist=True,
            widths= 0.4,
            showfliers=False,
            boxprops=dict(facecolor=fill, color="black", linewidth=3),
            # Fresh dicts per call so the boxes never share a props object.
            capprops=dict(linestyle='-', linewidth=3),
            whiskerprops=dict(linestyle='-', linewidth=3),
            medianprops=dict(linestyle='-', linewidth=3),
        )
    )
bplot1, bplot2 = drawn

axs.set_xlim(0, 1000)
axs.set_yticks([1, 2], ["Markdown", "Code"], size=15)
axs.set_title("Comparing length of Cells", size = 20);

## Average length of code and markdown cells versus notebook length

In [None]:
fig = plt.figure(figsize=(13,15))
fig.subplots_adjust(hspace=0.2)
spec = fig.add_gridspec(2, 2)

# Marker settings shared by every scatter in this figure.
point_style = dict(zorder=3, alpha=0.5)

# Top row (full width): average cell length against total notebook length.
ax0 = fig.add_subplot(spec[0, :])
top_series = [
    (df_counts.avg_len_mkd, "#8b0000", "Markdown"),
    (df_counts.avg_len_code, "#008b8b", "Code"),
]
for avg_len, color, label in top_series:
    ax0.scatter(df_counts.tot, avg_len, c=color, label=label, **point_style)
ax0.set_xlabel("Length of Notebook", size=16, weight="bold")
ax0.set_ylabel("Avg. length of Cells", size=16, weight="bold")
ax0.legend()

# Bottom left: average markdown length against the number of markdown cells.
ax1 = fig.add_subplot(spec[1, 0])
ax1.scatter(df_counts.markdown_count, df_counts.avg_len_mkd, c="#8b0000", **point_style)
ax1.set_xlabel("# of Markdown Cells", size=13, weight="bold")

# Bottom right: average code length against the number of code cells.
ax2 = fig.add_subplot(spec[1, 1])
ax2.scatter(df_counts.code_count, df_counts.avg_len_code, c="#008b8b", **point_style)
ax2.set_xlabel("# of Code Cells", size=13, weight="bold");

In [None]:
!pip install langdetect
from langdetect import detect_langs

markdowns_df = df[df["cell_type"]=="markdown"].reset_index()

df_lang = pd.DataFrame(columns=["Count"])
failed_identifications = 0

# Check notebooks
for i, notebook in enumerate(tqdm(markdowns_df.id.unique())):
    
    # Add probs to df
    prob_df = pd.DataFrame(columns=["Prob"])
    
    # Look at text within notebook
    for txt in markdowns_df[markdowns_df.id == notebook].source:
        
        # Normalize a bit
        txt = re.sub(r'[^\w]', ' ', txt).strip()
        
        # Skip too long or too short txt
        if len(txt) > 5000 or len(txt.split(" ")) < 10:
            continue
            
        try:
            # Detect prob
            lang = detect_langs(txt)
            for l in lang:
                if l.lang in prob_df.index:
                    prob_df.loc[l.lang] = l.prob + prob_df.loc[l.lang]
                else:
                    prob_df.loc[l.lang] = l.prob
        except:
            failed_identifications += 1
    
    # Add highest prob. lang in notebook to counter
    if len(prob_df) > 0:
        lang = prob_df.sort_values("Prob", ascending=False).index[0]

        if lang in df_lang.index:
             df_lang.loc[lang, "Count"] = 1 + df_lang.loc[lang, "Count"]
        else:
             df_lang.loc[lang, "Count"] = 1
    
df_lang = df_lang.sort_values("Count", ascending=False)