In [9]:
## imports
import os 
import re
import json
import pandas as pd
from markdown import markdown
from bs4 import BeautifulSoup
from tqdm import tqdm

## input directory 
input_dir = "KT_dataset/"
## os.listdir returns entries in arbitrary order; sorting keeps each file's
## position (later used as notebook_id) identical from run to run
filenames = sorted(os.listdir(input_dir))
print("Total Notebooks in the dataset = ", len(filenames))

## utility functions
def check_df(df):
    """Return True when a notebook has both code and markdown cells only.

    The frame qualifies when its ``cell_type`` column holds exactly two
    distinct non-null values and each of ``"code"`` and ``"markdown"``
    occurs at least once.

    Args:
        df: DataFrame of notebook cells with a ``cell_type`` column.

    Returns:
        bool: True if the frame qualifies; False otherwise, including when
        the column is missing or its values cannot be counted.
    """
    try:
        # Count once and reuse; value_counts() ignores missing values.
        counts = df.cell_type.value_counts()
        return bool(
            len(counts.index) == 2
            and counts.loc["code"] >= 1
            and counts.loc["markdown"] >= 1
        )
    except Exception:
        # Any failure to inspect the frame (missing column, absent label,
        # uncountable values) means the notebook is unusable. Unlike a bare
        # ``except``, this lets KeyboardInterrupt/SystemExit propagate.
        return False

def markdown_to_text(markdown_string):
    """Convert a markdown string to plain text, dropping code snippets.

    The markdown is rendered to HTML, every ``<pre>...</pre>`` and
    ``<code>...</code>`` element is replaced by a single space, and the
    remaining text nodes are concatenated.

    Args:
        markdown_string: Markdown source as a single string.

    Returns:
        str: Text content of the rendered markdown with code removed.
    """
    html = markdown(markdown_string)
    # DOTALL lets "." cross newlines; without it a code block spanning
    # several lines is never matched and its content leaks into the text.
    html = re.sub(r'<pre>(.*?)</pre>', ' ', html, flags=re.DOTALL)
    # The closing tag is "</code>" (no space before ">"); a pattern with a
    # stray space can never match rendered HTML.
    html = re.sub(r'<code>(.*?)</code>', ' ', html, flags=re.DOTALL)
    soup = BeautifulSoup(html, "html.parser")
    # find_all(string=True) is the current spelling of findAll(text=True).
    return ''.join(soup.find_all(string=True))

def preprocess_text(text):
    """Return a cell's source as one string.

    Args:
        text: Either a string, or an iterable of strings (for example a
            list of lines).

    Returns:
        str: ``text`` unchanged when it is already a string; otherwise its
        items joined with a single space.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses, which would otherwise be split into characters by join.
    if isinstance(text, str):
        return text
    return " ".join(text)
    
## generating data
## Build one row per notebook cell (source text, cell type, position within
## the notebook, notebook id) and write the combined table to a CSV file.
## Per-notebook frames are collected in a list and concatenated once at the
## end: concatenating onto a growing DataFrame inside the loop re-copies every
## accumulated row on each iteration.
frames = []
count = 0
for i, filename in enumerate(tqdm(filenames)):
    try:
        ## loading notebook; the context manager closes the file handle, and
        ## the encoding is explicit instead of depending on the locale default
        with open(os.path.join(input_dir, filename), encoding="utf-8") as fh:
            notebook = json.load(fh)
    except (OSError, ValueError):
        ## invalid files (unreadable, not valid JSON, or not valid UTF-8)
        continue

    ## JSON that is not an object with a "cells" entry is skipped like any
    ## other invalid file
    cells = notebook.get("cells") if isinstance(notebook, dict) else None
    if cells is None:
        continue

    df = pd.DataFrame(cells, columns = ["source","cell_type"])

    if check_df(df):
        df.dropna(inplace = True)
        ## rank keeps the cell's original position; pct_rank divides it by the
        ## number of cells remaining after dropping incomplete rows
        df["rank"] = df.index
        df["pct_rank"] = df.index/ len(df)
        df["source"] = df["source"].apply(preprocess_text)
        df["source"] = df["source"].apply(markdown_to_text)
        df["notebook_id"] = i
        frames.append(df)
        count += 1

print("Total notebooks used for generating dataset = ", count)
if frames:
    data = pd.concat(frames)
else:
    ## no usable notebook: still emit a CSV with the expected header
    data = pd.DataFrame(columns = ["source", "cell_type", "rank", "pct_rank", "notebook_id"])
data["rank"] = data["rank"].astype("int")
data["notebook_id"] = data["notebook_id"].astype("int")
data.to_csv("custom_gen_data.csv", index = False)

Total Notebooks in the dataset =  248761


100%|███████████████████████████████████████████████████████████████████████| 248761/248761 [14:50:16<00:00,  4.66it/s]


Total notebooks used for generating dataset =  168787
