In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only; prompts for authorization).
drive.mount('/content/drive')

In [None]:
# Extract the archive stored on the mounted Drive into the current working directory.
# NOTE(review): presumably this provides the train/ folder and CSV files read below — confirm.
!unzip drive/MyDrive/AI4Code/AI4Code

In [None]:
import json

In [None]:
!pip install tensorflow
!pip install transformers
!pip install pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 5.2 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 10.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 60.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 52.1 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-h

In [None]:
!pip install torch
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [None]:
# Load the notebook-ordering table: one row per notebook with its `id` and a
# space-separated `cell_order` string of cell ids (see the preview below).
data = pd.read_csv('train_orders.csv')

In [None]:
# Preview the first rows of the ordering table.
data.head()

Unnamed: 0,id,cell_order
0,00001756c60be8,1862f0a6 448eb224 2a9e43d6 7e2f170a 038b763d 7...
1,00015c83e2717b,2e94bd7a 3e99dee9 b5e286ea da4f7550 c417225b 5...
2,0001bdd4021779,3fdc37be 073782ca 8ea7263c 80543cd8 38310c80 0...
3,0001daf4c2c76d,97266564 a898e555 86605076 76cc2642 ef279279 d...
4,0002115f48f982,9ec225f0 18281c6c e3b6b115 4a044c54 365fe576 a...


In [None]:
# Load the raw JSON of the notebook at positional row 1 of `data` into `doc1`
# for ad-hoc inspection.
with open('train/' + data.id.iloc[1] + '.json') as f:
  doc1 = json.load(f)

Here we define helper functions that return the code and markdown of a training notebook. Each takes the document index as input and returns a list of strings, one per matching notebook cell, in the correct order.

In [None]:
def get_markdown(doc_index):
  '''
  Returns the sources of the markdown cells of one training notebook.

  doc_index is the positional row of the notebook in `data`. The notebook is
  read from train/<id>.json and the result is a list of strings, one per
  markdown cell, in the order given by that row's cell_order.
  '''
  notebook_path = 'train/' + data.id.iloc[doc_index] + '.json'
  with open(notebook_path) as notebook_file:
    notebook = json.load(notebook_file)

  markdown_sources = []
  for cell_id in data.cell_order.iloc[doc_index].split():
    # Keep only the cells the notebook labels as markdown.
    if notebook['cell_type'][cell_id] == 'markdown':
      markdown_sources.append(notebook['source'][cell_id])
  return markdown_sources

def get_code(doc_index):
  '''
  Returns the sources of the code cells of one training notebook.

  doc_index is the positional row of the notebook in `data`. The notebook is
  read from train/<id>.json and the result is a list of strings, one per
  code cell, in the order given by that row's cell_order.
  '''
  with open('train/' + data.id.iloc[doc_index] + '.json') as handle:
    notebook = json.load(handle)
  ordered_ids = data.cell_order.iloc[doc_index].split()
  # First pick the ids labelled as code, then look up their sources.
  code_ids = [cell_id for cell_id in ordered_ids if notebook['cell_type'][cell_id] == 'code']
  return [notebook['source'][cell_id] for cell_id in code_ids]

def get_notebook(doc_index):
  '''
  Returns the sources of every cell of one training notebook.

  doc_index is the positional row of the notebook in `data`. The notebook is
  read from train/<id>.json and the result is a list of strings, one per
  cell (code and markdown alike), in the order given by that row's
  cell_order, which is split on single spaces.
  '''
  json_path = 'train/' + data.id.iloc[doc_index] + '.json'
  with open(json_path) as json_file:
    notebook = json.load(json_file)

  ordered_sources = []
  for cell_id in data.cell_order.iloc[doc_index].split(' '):
    ordered_sources.append(notebook['source'][cell_id])
  return ordered_sources
    

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
# Load the tokenizer and the model from the same pretrained checkpoint.
CODEBERT_CHECKPOINT = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(CODEBERT_CHECKPOINT)
model = AutoModel.from_pretrained(CODEBERT_CHECKPOINT)




Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/498 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

In [None]:
!pip install sklearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from sklearn.model_selection import train_test_split

As a baseline, we can try a bag-of-words approach with a random forest classifier. We can treat the problem as a pairwise matching problem, where a code cell and a markdown cell are a match if the code cell immediately follows the markdown cell. If we want a more balanced set of categories, we could instead call a code + markdown pair a match if the code cell follows the markdown cell at all (anywhere later in the notebook). Not sure if this would help, though.

We can do bag of words on the CodeBERT tokens, take the difference between the markdown vector and the code vector, and feed the result through a random forest.

We will start with a small training set just to see how long things take.

In [None]:
# Load the notebook ancestry table; the following cells read its `id` and `parent_id` columns.
ancestor_data = pd.read_csv('train_ancestors.csv')

In [None]:
# Array of the ids of all notebooks whose parent_id is missing.
no_parents = ancestor_data.loc[ancestor_data['parent_id'].isnull(), 'id'].values

In [None]:
# Rows whose parent_id is missing. NaN never compares equal to anything (not even
# itself), so `parent_id == np.nan` is False for every row and always selected an
# empty frame; isnull() is the correct missing-value test.
parent_counts = ancestor_data[ancestor_data.parent_id.isnull()]

In [None]:
# Keep only 5% of the parentless notebooks for training (small set to gauge run time).
# A fixed random_state makes the split reproducible across kernel restarts.
train, test = train_test_split(no_parents, test_size = .95, random_state = 42)

In [None]:
# Size of the training split.
len(train)

5985

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 

In [None]:
# Shared TF-IDF vectorizer with default settings; print_keywords refits it on every call.
tf_idf = TfidfVectorizer()

Below is some preprocessing for training a neural network which would use a transformer as a first layer and take as input a pair of cells and output 1 or 0 depending on whether the first cell should precede the second.

In [None]:
# Build pairwise training examples from the first 500 notebooks.
# X collects every ordered pair (cell1, cell2) of tokenized cells of a notebook
# (self-pairs included); y holds True when cell1 comes before cell2 in the
# correct order and False otherwise.
X = []
y = []
for doc_index in range(500):
  print(doc_index)
  with open('train/' + data.id.iloc[doc_index] + '.json') as f:
    doc = json.load(f)
  cell_order = data.cell_order.iloc[doc_index].split(' ')

  # Tokenize each cell once, in the correct order; every encoding is
  # truncated / padded to the tokenizer's maximum length.
  cells = []
  for cell in cell_order:
    cells.append(tokenizer(doc['source'][cell], return_tensors="pt", truncation=True, padding = 'max_length' ))

  # Emit pairs and labels in lockstep so X[k] and y[k] always correspond.
  for i, cell1 in enumerate(cells):
    for j, cell2 in enumerate(cells):
      X.append((cell1, cell2))
      y.append(i < j)

In [None]:
def print_keywords(doc_index, n, cell_type):
  '''
  Prints the top n keywords according to tf-idf at the document level. cell_type should be code or markdown

  Each cell of the requested type is treated as one tf-idf "document". A
  token's notebook-level score is the sum of its tf-idf weights over those
  cells, and tokens are printed from highest to lowest score.

  doc_index: positional row of the notebook in `data`.
  n: maximum number of keywords to print.
  cell_type: value compared against the notebook's 'cell_type' entries.

  Refits the module-level `tf_idf` vectorizer as a side effect. Prints an
  empty array when the notebook has no cells of the requested type.
  '''

  with open('train/' + data.id.iloc[doc_index] + '.json') as f:
    doc = json.load(f)
  cell_order = data.cell_order.iloc[doc_index].split(' ')
  # Re-join the tokenizer's sub-word tokens with spaces so the vectorizer
  # sees them as separate terms.
  cell_strings = [' '.join(tokenizer.tokenize(doc['source'][cell])) for cell in cell_order if doc['cell_type'][cell] == cell_type]

  if not cell_strings:
    # Nothing to rank; the vectorizer cannot be fitted on an empty corpus.
    print(np.array([], dtype=object))
    return

  response = tf_idf.fit_transform(cell_strings)

  # get_feature_names() was removed from newer scikit-learn releases; prefer
  # get_feature_names_out() and fall back only when it is unavailable.
  names_getter = getattr(tf_idf, 'get_feature_names_out', None)
  if names_getter is None:
    names_getter = tf_idf.get_feature_names
  feature_array = np.asarray(names_getter())

  # Collapse the (cells x features) matrix to one score per feature. Sorting
  # the flattened per-cell argsort, as was done before, only ranked the
  # features of the last cell and padded the result with zero-score tokens.
  doc_scores = np.asarray(response.sum(axis=0)).ravel()
  # Stable sort on the negated scores: descending order, deterministic ties.
  tfidf_sorting = np.argsort(-doc_scores, kind='stable')

  top_n = feature_array[tfidf_sorting][:n]
  print(top_n)

In [None]:
# Show up to 200 keywords for the code cells of the notebook at row 1.
print_keywords(1, 200, 'code')

['desc' 'ribe' 'final' 'test' 'ġzip' 'kernel' 'le' 'layout' 'lane' 'label'
 'lab' 'kw' 'ks' 'kind' 'keys' 'jun' 'learn' 'jpg' 'join' 'ize' 'ixels'
 'ive' 'itle' 'itions' 'ition' 'iter' 'ist' 'isson' 'lear' 'lect' 'is'
 'ma' 'medium' 'med' 'mean' 'max' 'matrix' 'matically' 'math' 'mat' 'mask'
 'mas' 'map' 'lr' 'len' 'loss' 'log' 'loc' 'list' 'lins' 'lines' 'line'
 'limit' 'lighting' 'lib' 'length' 'ise' 'ipped' 'mer' 'ics' 'il' 'iguous'
 'ignore' 'igma' 'igm' 'ify' 'iform' 'if' 'ie' 'id' 'iction' 'ic' 'ip'
 'ian' 'http' 'html' 'hs' 'hl' 'height' 'heat' 'head' 'he' 'group' 'grid'
 'ile' 'im' 'image' 'images' 'ior' 'ions' 'ional' 'ion' 'io' 'inv' 'inter'
 'intensity' 'int' 'ins' 'input' 'ink' 'init' 'ings' 'ing' 'ine' 'index'
 'inc' 'in' 'ims' 'improve' 'import' 'img' 'ment' 'missing' 'min' 'pal'
 'place' 'pl' 'pixel' 'pi' 'pers' 'percent' 'pect' 'pe' 'pd' 'path' 'par'
 'pad' 'plot' 'pace' 'over' 'output' 'ours' 'ots' 'otation' 'oss' 'osc'
 'osa' 'os' 'ortion' 'ple' 'po' 'gr' 'processing'

