In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
from tqdm import tqdm
import glob
import plotly.express as px


In [None]:
# Load the ground-truth cell ordering (notebook id plus a space-separated
# `cell_order` string).
train_order = pd.read_csv('../input/AI4Code/train_orders.csv')
# Index by notebook id so rows can be looked up with `.loc[notebook_id]`, but
# keep `id` as a regular column too: a later cell iterates over `train_order.id`.
train_order = train_order.set_index('id', drop=False)
# Preview only. The previous bare `train_order.drop(columns=['id'])` looked like
# a mutation but was never assigned, so it merely rendered a copy of the frame.
train_order.drop(columns=['id']).head()

In [None]:
# Map each notebook id to its ground-truth cell-order string.
t_order = dict(zip(train_order.id, train_order.cell_order))

#### Just going to randomly sample 50,000 notebooks for the EDA.

In [None]:
# Sample a random subset of the training notebooks and load each JSON file into
# a DataFrame, keyed by notebook id (the file name without its extension).
N_NOTEBOOKS = 50000  # upper bound on the number of notebooks used for the EDA
SEED = 42            # fixed so that the same subset is drawn on every run

# Sort first: glob order is filesystem-dependent, which would defeat the seed.
notebook_paths = sorted(glob.glob('../input/AI4Code/train/*'))
rng = np.random.default_rng(SEED)
# Cap the sample size so that sampling without replacement cannot raise when
# fewer files are available than requested.
sampled_paths = rng.choice(
    notebook_paths,
    size=min(N_NOTEBOOKS, len(notebook_paths)),
    replace=False,
)

notebooks = {}
for path in tqdm(sampled_paths):
    # splitext instead of slicing off the last five characters, so the key does
    # not depend on the extension being exactly ".json".
    notebook_id = os.path.splitext(os.path.basename(path))[0]
    notebooks[notebook_id] = pd.read_json(path)

In [None]:
# Put every notebook's cells into their ground-truth order and number them
# from 1 in a new `cell_rank` column.
for nb_id in tqdm(notebooks):
    ordered_ids = train_order.loc[nb_id, 'cell_order'].split(" ")
    ordered_cells = notebooks[nb_id].loc[ordered_ids]
    ordered_cells['cell_rank'] = range(1, len(ordered_cells) + 1)
    notebooks[nb_id] = ordered_cells

In [None]:
# Number of notebooks held in the `notebooks` dict.
len(notebooks)

In [None]:
# Stack all per-notebook frames into one long, cell-level frame.
# Each frame's own index labels are kept as-is (no `ignore_index`).
combined = pd.concat(notebooks.values())

## Languages used in notebook

In [None]:
#https://stackoverflow.com/a/47106810/7429803

!wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz
import fasttext
model = fasttext.load_model('lid.176.ftz')
print(model.predict('الشمس تشرق'))

In [None]:
# Tag every cell with a language and record, per notebook, the most frequent
# language among its markdown cells.
language = []
for k in tqdm(notebooks):
    nb = notebooks[k]  # same object as notebooks[k]; the new column is added in place
    is_markdown = nb.cell_type == "markdown"
    nb['language'] = "Python"  # default; overwritten below for markdown cells
    if not is_markdown.any():
        # Nothing to classify: `.mode()[0]` on an empty selection would raise,
        # so such a notebook simply contributes no entry to `language`.
        continue
    # Newlines are replaced with "." so each cell is passed to the model as a
    # single line; [0][0] picks the first label of the prediction.
    nb.loc[is_markdown, 'language'] = nb.loc[is_markdown, 'source'].apply(
        lambda text: model.predict(text.replace("\n", "."))[0][0]
    )
    language.append(nb.loc[is_markdown, 'language'].mode()[0])
px.bar(pd.Series(language).value_counts().head(10),title="Notebook Language distribution(top 10)")

#### Number of languages in training subset: 57
#### Most popular languages: English, Portuguese, Russian, Korean, Japanese, ...
#### Percentage of Notebooks written in English: 92.286%

### Distribution of length of notebooks

In [None]:
# Total number of cells (code + markdown) in each sampled notebook.
cells_per_notebook = [len(nb) for nb in notebooks.values()]
px.histogram(cells_per_notebook, title="Distribution of Cells (Markdown and Code)", nbins=200)

In [None]:
# Number of markdown cells in each sampled notebook.
markdown_per_notebook = [
    len(nb[nb.cell_type == "markdown"]) for nb in notebooks.values()
]
px.histogram(markdown_per_notebook, title="Distribution of Markdown Cells", nbins=200)

In [None]:
# Number of code cells in each sampled notebook.
code_per_notebook = [
    len(nb[nb.cell_type == "code"]) for nb in notebooks.values()
]
px.histogram(code_per_notebook, title="Distribution of Code Cells", nbins=200)

#### Percentage of notebooks with no code in training subset: 0
#### Percentage of notebooks with no markdown in training subset: 0

### Ratio of Type of Cells

In [None]:
# Overall number of cells of each type across all sampled notebooks.
cell_type_counts = combined["cell_type"].value_counts()
px.bar(cell_type_counts, title="cell type")

### Ratio of markdown:Code in the nth Cell

In [None]:
# Count the cells of each type at every position (`cell_rank`) across all
# sampled notebooks. The count ends up in the `source` column.
MAX_RANK = 150  # only the first 150 cell positions are plotted
ctype = (
    combined.groupby(['cell_rank', 'cell_type'])['source']
    .count()
    .reset_index()
)
# Filter on the rank itself instead of slicing the first 300 rows: a row slice
# only equals "the first 150 ranks" when every rank has both cell types.
ctype = ctype[ctype.cell_rank <= MAX_RANK]
px.bar(
    ctype,
    x='cell_rank',
    y='source',
    color='cell_type',
    labels={'source': 'number of cells'},  # y holds counts, not source text
    title="Proportion of markdown and Code for each cell rank",
)

### Code cells, Code lines with comments

In [None]:
# Per-cell comment statistics. Non-code cells keep the defaults (False / 0 / 0);
# code cells are filled in from their source text. A line counts as "commented"
# whenever it contains a "#" anywhere.
is_code = combined.cell_type == "code"
code_source = combined.source[is_code]

combined['has_comments'] = False
combined['lines_with_comments'] = 0
combined['number_lines'] = 0

combined.loc[is_code, 'has_comments'] = code_source.apply(lambda src: "#" in src)
combined.loc[is_code, 'lines_with_comments'] = code_source.apply(
    lambda src: sum(1 for line in src.split('\n') if "#" in line)
)
combined.loc[is_code, 'number_lines'] = code_source.apply(
    lambda src: src.count('\n') + 1  # same value as len(src.split('\n'))
)


In [None]:
# Histogram of code-cell length in lines, restricted to cells under 50 lines;
# the maximum over the whole `number_lines` column is printed alongside.
code_cells = combined[combined.cell_type == "code"]
tmp = code_cells.loc[code_cells.number_lines < 50, "number_lines"]
print(f"Maximum lines: {combined.number_lines.max()}")
px.histogram(tmp, title="Number of lines in code cells", nbins=100)

In [None]:
# Share of lines containing a comment, for code cells that have at least one
# such line. The tiny constant in the denominator guards against division by zero.
has_commented_code = (combined.cell_type == "code") & (combined.lines_with_comments > 0)
tmp = combined[has_commented_code]
comment_ratio = tmp.lines_with_comments / (tmp.number_lines + 1e-10)
px.histogram(
    comment_ratio,
    title="distribution of ratio of lines with comments in code cells(0 has been omitted)",
    nbins=100,
)

In [None]:
# How many code cells contain at least one "#" (True) versus none (False).
code_has_comments = combined.loc[combined.cell_type == "code", "has_comments"]
px.bar(
    code_has_comments.value_counts(),
    title="Code cells without any comments(false)/ at least 1 comment(true)",
)