# Needed Code snippets 
Some code snippets to be familiar with
Loading json:- 

```python 
content=json.load(open("file.json")) 
```

Display Markdown and code in different colors :- 

```python 
print(f"\033[92m Markdown {content}") 
print(f"\033[94m code {content}") 
``` 



# Import

In [None]:
!pip install pycld2

In [None]:
import pandas as pd 
import json 
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt 
import glob 
from tqdm import tqdm
import os
import nltk
from nltk.corpus import stopwords
import pycld2 as cld2

# Configurations

In [None]:
# Notebook-wide settings: how many notebooks to load and where the
# competition files live.
CFG = dict(
    no_of_samples=20000,  # number of notebooks read from the train folder
    train_orders="../input/AI4Code/train_orders.csv",  # CSV read into train_orders
    train="../input/AI4Code/train",  # folder holding one <id>.json per notebook
)

# Loading Data 

Json file format :- 
```json
"root":{
  "cell_type":{
     "cell_id":"markdown/code",
      ...
  },
  "source":{
     "cell_id":"content" ,
      ...
  }
}
```

In [None]:
# Read the CSV located at CFG["train_orders"]; later cells rely on its
# "id" and "cell_order" columns.
train_orders=pd.read_csv(CFG["train_orders"])
# Show the first rows as a quick sanity check.
train_orders.head()

In [None]:
# Build the path of every notebook's JSON file: <CFG["train"]>/<id>.json
train_dir = CFG["train"]
train_orders["full_path"] = [
    os.path.join(train_dir, notebook_id + ".json")
    for notebook_id in train_orders["id"]
]
train_orders.head()

In [None]:

# Load the first CFG["no_of_samples"] notebooks (in train_orders order) as dicts.
notebooks = []
for notebook_pth in tqdm(train_orders["full_path"][:CFG["no_of_samples"]]):
    # The context manager closes each file right after parsing; a bare open()
    # would leave one file handle per notebook for the garbage collector.
    with open(notebook_pth, encoding="utf-8") as notebook_file:
        notebooks.append(json.load(notebook_file))

# Understanding the data at a basic level 


## Structure

In [None]:
notebooks[0].keys()

In [None]:
notebooks[0]["cell_type"]

In [None]:
notebooks[0]["source"]

In [None]:
def get_info(notebook):
    """Return the cell sources and cell types of a notebook as two lists.

    Args:
        notebook: dict with "source" and "cell_type" entries, each of which
            is itself a dict.

    Returns:
        A tuple ``(cell_contents, cell_types)`` holding the values of
        ``notebook["source"]`` and ``notebook["cell_type"]`` in dict order.
    """
    sources = notebook["source"]
    kinds = notebook["cell_type"]
    return list(sources.values()), list(kinds.values())
contents,types=get_info(notebooks[0])

In [None]:
# reading notebooks
# Preview notebooks 1-4: markdown cells are printed in green, code cells in blue.
ANSI_GREEN, ANSI_BLUE, ANSI_RESET = "\033[92m", "\033[94m", "\033[0m"
for i in range(1, 5):
    print("--------------- {} notebook".format(i))
    contents, types = get_info(notebooks[i])
    for type_cell, content in zip(types, contents):
        color = ANSI_GREEN if type_cell == "markdown" else ANSI_BLUE
        # Reset the colour after every cell so it does not bleed into the next
        # header line or into later output.
        print(f"{color} {content}{ANSI_RESET}")

In [None]:
# One (cell_contents, cell_types) pair per loaded notebook.
notebook_infos = [get_info(notebook) for notebook in notebooks]

# Cell position analysis before ordering

In [None]:
def get_position_of_stuff(classes):
    """Return the mean relative position of code cells and of markdown cells.

    A cell's relative position is its index divided by ``len(classes)``.

    Args:
        classes: sequence of cell types ("code" / "markdown") in file order.

    Returns:
        A tuple ``(mean_code_position, mean_markdown_position)``.

    Raises:
        ZeroDivisionError: if there are no code cells or no markdown cells.
    """
    total = len(classes)
    code_positions = [
        index / total for index, kind in enumerate(classes) if kind == "code"
    ]
    markdown_positions = [
        index / total for index, kind in enumerate(classes) if kind == "markdown"
    ]
    mean_code = sum(code_positions) / len(code_positions)
    mean_markdown = sum(markdown_positions) / len(markdown_positions)
    return mean_code, mean_markdown

In [None]:
get_position_of_stuff(notebook_infos[0][1])

In [None]:
get_position_of_stuff(notebook_infos[1][1])

In [None]:
# Mean relative position of code / markdown cells for every notebook,
# measured in the original (unordered) cell sequence.
code_positions = []
markdown_positions = []
for _, cell_types in notebook_infos:
    code_pos, markdown_pos = get_position_of_stuff(cell_types)
    code_positions.append(code_pos)
    markdown_positions.append(markdown_pos)

In [None]:
print("Code position")

print("mean"+str(np.mean(code_positions)))

In [None]:
print("markdown position")

print("mean"+str(np.mean(markdown_positions)))

In [None]:
px.histogram(code_positions,marginal="box")

In [None]:
px.histogram(markdown_positions,marginal="box") 

# Before ordering, markdown cells are generally placed at the end of the notebook and code cells before them, although there are some outliers. Let's see what happens after ordering.

In [None]:
def get_info_with_keys(notebook):
    """Return the notebook's "source" and "cell_type" dicts unchanged.

    Unlike ``list(d.values())``, the returned objects keep their keys, so a
    cell can be looked up by key.

    Args:
        notebook: dict with "source" and "cell_type" entries.

    Returns:
        A tuple ``(notebook["source"], notebook["cell_type"])``.
    """
    return notebook["source"], notebook["cell_type"]
contents,types=get_info_with_keys(notebooks[0])

In [None]:
# One (source dict, cell_type dict) pair per loaded notebook.
key_with_notebook_infos = [get_info_with_keys(notebook) for notebook in notebooks]


In [None]:
# Split each whitespace-separated "cell_order" string into a list of tokens.
order_of_search = [cell_order.split() for cell_order in train_orders["cell_order"]]
order_of_search[0]

In [None]:
def get_position_of_stuff(classes, according_to):
    """Return mean relative positions of code and markdown cells in a given order.

    This two-argument version replaces the one-argument function of the same
    name defined earlier in the notebook.

    Args:
        classes: mapping from cell key to cell type ("code" / "markdown").
        according_to: sequence of cell keys giving the order to measure in.

    Returns:
        A tuple ``(mean_code_position, mean_markdown_position)`` where each
        position is the key's index in ``according_to`` divided by
        ``len(according_to)``.

    Raises:
        ZeroDivisionError: if there are no code cells or no markdown cells.
    """
    total = len(according_to)
    code_positions = []
    markdown_positions = []
    for rank, cell_key in enumerate(according_to):
        kind = classes[cell_key]
        if kind == "markdown":
            markdown_positions.append(rank / total)
        elif kind == "code":
            code_positions.append(rank / total)
    mean_code = sum(code_positions) / len(code_positions)
    mean_markdown = sum(markdown_positions) / len(markdown_positions)
    return mean_code, mean_markdown
get_position_of_stuff(types,order_of_search[0])

In [None]:
# Mean relative position of code / markdown cells for every notebook,
# measured in the order given by order_of_search.
code_positions = []
markdown_positions = []
for (_, cell_types), order in zip(key_with_notebook_infos, order_of_search):
    code_pos, markdown_pos = get_position_of_stuff(cell_types, order)
    code_positions.append(code_pos)
    markdown_positions.append(markdown_pos)

In [None]:
print("Code position")

print("mean"+str(np.mean(code_positions)))

In [None]:
print("markdown position")

print("mean"+str(np.mean(markdown_positions)))

In [None]:
px.histogram(code_positions,marginal="box") 

In [None]:
px.histogram(markdown_positions,marginal="box") 

# After ordering, code and markdown cells end up at almost the same average position, following a bell curve; for markdown, however, some unexpected outliers are found.

# Let's see the percentage of markdown cells in the notebooks

In [None]:
def get_amount(classes):
    """Return the fraction of entries in *classes* equal to "markdown".

    Args:
        classes: sequence of cell types.

    Returns:
        Number of "markdown" entries divided by ``len(classes)``.
    """
    is_markdown = np.array(classes) == "markdown"
    markdown_count = np.sum(is_markdown)
    return markdown_count / len(classes)
get_amount(notebook_infos[0][1])

In [None]:
# Fraction of markdown cells in each notebook.
markdown_amounts = [get_amount(cell_types) for _, cell_types in notebook_infos]

In [None]:
px.histogram(markdown_amounts,marginal="box")

In [None]:
np.mean(markdown_amounts)

In [None]:
px.histogram(1-np.array(markdown_amounts),marginal="box")

In [None]:
np.mean(1-np.array(markdown_amounts))

# On average, almost 33% of the cells in a notebook are markdown cells

# Analyzing Text 



# loading stopwords and seeing the stopword count in the markdown

In [None]:
# Import the corpus under an alias so this cell stays re-runnable: rebinding
# the imported `stopwords` name to a list would make a second run fail,
# because a list has no `.words` attribute.
from nltk.corpus import stopwords as nltk_stopwords
stopwords = nltk_stopwords.words('english')

In [None]:
def stopword_percent(text, stopwords):
    """Return the fraction of whitespace-separated words in *text* that are stopwords.

    Words are lower-cased before the lookup; entries of *stopwords* are used
    as given.

    Args:
        text: the text to analyse.
        stopwords: iterable of stopword strings.

    Returns:
        ``stopword_count / word_count`` as a float, or ``0.0`` when *text*
        contains no words (empty or whitespace-only), instead of raising
        ZeroDivisionError.
    """
    words = text.split()
    if not words:
        return 0.0
    # Hash-based membership test instead of a linear scan per word.
    if isinstance(stopwords, (set, frozenset)):
        lookup = stopwords
    else:
        lookup = frozenset(stopwords)
    stopword_count = sum(1 for word in words if word.lower() in lookup)
    return stopword_count / len(words)
stopword_percent("Math is good and my favourite subject",stopwords)

# Filter out the markdown cells

In [None]:
def filter_cells(cells, types, req="markdown"):
    """Return the entries of *cells* whose corresponding type equals *req*.

    Args:
        cells: sequence of cell contents, indexable by position.
        types: sequence of cell types, parallel to *cells*.
        req: the cell type to keep.

    Returns:
        A list of the matching cells, in their original order.
    """
    return [cells[position] for position, kind in enumerate(types) if kind == req]
filter_cells(notebook_infos[0][0],notebook_infos[0][1])

In [None]:
# Mean stopword ratio over the markdown cells of each notebook.
markdown_percents = []
for cell_sources, cell_types in tqdm(notebook_infos):
    markdown_cells = filter_cells(cell_sources, cell_types)
    cell_ratios = [stopword_percent(cell, stopwords) for cell in markdown_cells]
    markdown_percents.append(np.mean(cell_ratios))

# Plot the dist

In [None]:
px.histogram(markdown_percents,marginal="box")

# We see that, on average, about 21% of the words in the markdown cells are stopwords

In [None]:
np.mean(markdown_percents)

# Language detection: we check whether each markdown cell is written in English or not

In [None]:
def detect_language(text_content):
    """Return True when cld2 labels the first detected span of the text "ENGLISH".

    Args:
        text_content: the text handed to ``cld2.detect``.

    Returns:
        Result of comparing element ``[0][2]`` of the fourth value returned
        by ``cld2.detect(..., returnVectors=True)`` with ``"ENGLISH"``.
    """
    # NOTE(review): presumably the fourth value is the per-span "vectors"
    # tuple and index 2 of a span is its language name; confirm against the
    # pycld2 documentation.
    _reliable, _bytes_found, _details, vectors = cld2.detect(
        text_content, returnVectors=True
    )
    first_span = vectors[0]
    return first_span[2] == "ENGLISH"
detect_language("Деление признаков на числовые и текстовые")

In [None]:
detect_language("Machine Learning is imporantant")

In [None]:
# Collect one English / not-English flag per markdown cell.
lang = []
skipped_cells = 0  # markdown cells for which language detection failed
for ninfo in tqdm(notebook_infos):
    filtered_cells = filter_cells(ninfo[0], ninfo[1])
    for cell_text in filtered_cells:
        # Guard each cell separately: one undetectable cell must not discard
        # the results of every other cell in the same notebook. Catching
        # Exception (not a bare except) keeps Ctrl-C working.
        try:
            lang.append(detect_language(cell_text))
        except Exception:
            skipped_cells += 1
print(f"language detection skipped {skipped_cells} markdown cells")

# In the histogram we see that a large number of markdown cells are not in English: almost half as many as the number of markdown cells that are in English

In [None]:
px.histogram(lang)

# In this part I will be focusing on understanding and visualizing the changes in the position after ordering

In [None]:
infos=key_with_notebook_infos 

In [None]:
# For every notebook build two parallel lists of (x, y) points, one per cell:
#   origin    -> (relative position in the file order, 1)
#   finishing -> (relative position in order_of_search, 0)
origin = []
finishing = []
# Iterate tqdm directly: wrapping it in list() would exhaust the progress bar
# before any work is done.
for i, (cell_sources, _) in enumerate(tqdm(infos)):
    cell_ids = list(cell_sources)
    n_cells = len(cell_ids)
    # Precompute each cell's rank once instead of calling .index() per cell,
    # which made the loop quadratic in the number of cells. setdefault keeps
    # the first occurrence, matching list.index().
    ordered_rank = {}
    for rank, cell_id in enumerate(order_of_search[i]):
        ordered_rank.setdefault(cell_id, rank)
    origin.append([(position / n_cells, 1) for position in range(n_cells)])
    finishing.append([(ordered_rank[cell_id] / n_cells, 0) for cell_id in cell_ids])

In [None]:
# Arrow colour used for each cell type in the plots below.
color_mapping = dict(code="blue", markdown="green")

In [None]:
# Draw one arrow per cell of notebook 0, from its original position (y=1)
# to its ordered position (y=0), coloured by cell type.
notebook_idx = 0
# Build the list of cell types once instead of on every loop iteration.
cell_kinds = list(infos[notebook_idx][1].values())
for kind, (x0, y0), (x1, y1) in zip(cell_kinds, origin[notebook_idx], finishing[notebook_idx]):
    plt.arrow(x=x0, y=y0, dx=x1 - x0, dy=y1 - y0, head_width=0.07, color=color_mapping[kind])
plt.xlim(0, 1)
plt.ylim(0, 1)

In [None]:
# Draw one arrow per cell of notebook 1, from its original position (y=1)
# to its ordered position (y=0), coloured by cell type.
notebook_idx = 1
# Build the list of cell types once instead of on every loop iteration.
cell_kinds = list(infos[notebook_idx][1].values())
for kind, (x0, y0), (x1, y1) in zip(cell_kinds, origin[notebook_idx], finishing[notebook_idx]):
    plt.arrow(x=x0, y=y0, dx=x1 - x0, dy=y1 - y0, head_width=0.07, color=color_mapping[kind])
plt.xlim(0, 1)
plt.ylim(0, 1)

In [None]:
# Draw one arrow per cell of notebook 2, from its original position (y=1)
# to its ordered position (y=0), coloured by cell type.
notebook_idx = 2
# Build the list of cell types once instead of on every loop iteration.
cell_kinds = list(infos[notebook_idx][1].values())
for kind, (x0, y0), (x1, y1) in zip(cell_kinds, origin[notebook_idx], finishing[notebook_idx]):
    plt.arrow(x=x0, y=y0, dx=x1 - x0, dy=y1 - y0, head_width=0.07, color=color_mapping[kind])
plt.xlim(0, 1)
plt.ylim(0, 1)

In [None]:
# Draw one arrow per cell of notebook 3, from its original position (y=1)
# to its ordered position (y=0), coloured by cell type.
notebook_idx = 3
# Build the list of cell types once instead of on every loop iteration.
cell_kinds = list(infos[notebook_idx][1].values())
for kind, (x0, y0), (x1, y1) in zip(cell_kinds, origin[notebook_idx], finishing[notebook_idx]):
    plt.arrow(x=x0, y=y0, dx=x1 - x0, dy=y1 - y0, head_width=0.07, color=color_mapping[kind])
plt.xlim(0, 1)
plt.ylim(0, 1)

In [None]:
# Draw one arrow per cell of notebook 4, from its original position (y=1)
# to its ordered position (y=0), coloured by cell type.
notebook_idx = 4
# Build the list of cell types once instead of on every loop iteration.
cell_kinds = list(infos[notebook_idx][1].values())
for kind, (x0, y0), (x1, y1) in zip(cell_kinds, origin[notebook_idx], finishing[notebook_idx]):
    plt.arrow(x=x0, y=y0, dx=x1 - x0, dy=y1 - y0, head_width=0.07, color=color_mapping[kind])
plt.xlim(0, 1)
plt.ylim(0, 1)

In [None]:
# Draw one arrow per cell of notebook 5, from its original position (y=1)
# to its ordered position (y=0), coloured by cell type.
notebook_idx = 5
# Build the list of cell types once instead of on every loop iteration.
cell_kinds = list(infos[notebook_idx][1].values())
for kind, (x0, y0), (x1, y1) in zip(cell_kinds, origin[notebook_idx], finishing[notebook_idx]):
    plt.arrow(x=x0, y=y0, dx=x1 - x0, dy=y1 - y0, head_width=0.07, color=color_mapping[kind])
plt.xlim(0, 1)
plt.ylim(0, 1)

# The above graphs show the mapping from the original notebook to the ordered notebook. Each arrow starts at y=1, where its x value is the position of the cell before ordering, and ends at y=0, where its x value is the position of the cell after ordering. Markdown cells are drawn in green and code cells in blue. At this point we have a strong hypothesis that markdown cells move either to the start of the notebook or next to the code cells they belong with.

# Measuring the amount of change after ordering 

This is a somewhat inaccurate way of measuring the change, because moving one cell also affects the position of the next cell and of all the cells after it, but for now I am using the following:

$$dP=\frac{1}{n}\sum_{i=1}^{n}\left| {F_i}_x-{O_i}_x \right|$$ 

Where ${F_i}_x$ is the position of cell $i$ after ordering and ${O_i}_x$ is its position before ordering. We take the absolute value of the difference because we want the distance between the two positions (in one dimension the Euclidean distance is just the absolute difference). The same positions were used to create the graphs above. $n$ is the number of cells, and the average over all cells indicates the amount of change.

In [None]:
def get_distance(origins, finished):
    """Return the mean absolute difference between the first components of two arrays.

    Args:
        origins: NumPy array whose last axis holds the start point of each cell.
        finished: NumPy array of the same layout holding the end points.

    Returns:
        Mean of ``|finished[..., 0] - origins[..., 0]|``.
    """
    displacement = finished[..., 0] - origins[..., 0]
    return np.mean(np.abs(displacement))
get_distance(np.array(origin[0]),np.array(finishing[0]))

In [None]:
# Mean displacement of the cells of each notebook.
distances = [
    get_distance(np.array(before), np.array(after))
    for before, after in zip(origin, finishing)
]

In [None]:
px.histogram(distances)

In [None]:
np.mean(distances)

# So my initial estimate is that, on average, a cell moves by almost 23% of the notebook's length when the notebook is ordered

# Thank you :) 


More updates will be coming to this notebook.
Feedback that helps me improve this kernel is very welcome.
Please upvote if you like this notebook.