# Vectorizing code/markdown using CodeBERT
This notebook contains the code I used to vectorize the code and markdown cells of the notebooks in the training set. The extracted feature vectors can then be used for various downstream tasks.  
Link to the dataset (covers 60% of the training data): https://www.kaggle.com/datasets/samratthapa/codebert-ai4code-features

Each file corresponds to one notebook in the training dataset and contains the feature vectors of the first 256 cells of that notebook, taken in the notebook's correct cell order.

In [None]:
# %% [code] {"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2022-05-31T09:31:33.409460Z","iopub.execute_input":"2022-05-31T09:31:33.410190Z","iopub.status.idle":"2022-05-31T09:31:40.576230Z","shell.execute_reply.started":"2022-05-31T09:31:33.410092Z","shell.execute_reply":"2022-05-31T09:31:40.575454Z"}}
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from transformers import AutoTokenizer, AutoModel
import os
from torch.utils.data import Dataset,DataLoader
import glob
import torch
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel
import fasttext
from tqdm import tqdm

# Select the compute device: use the GPU when one is available, otherwise fall
# back to the CPU so the notebook still runs (slowly) on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# CUDA version this PyTorch build was compiled against (None for CPU-only builds).
print(torch.version.cuda)

In [None]:
# Ground-truth cell orders: one row per training notebook, with the notebook
# id and its cell order stored as a single space-separated string of cell ids.
train_order = pd.read_csv('../input/AI4Code/train_orders.csv')
# Index the frame by notebook id. The `id` column is intentionally kept: it is
# read again just below (the original `drop(columns=['id'])` discarded its
# result and therefore never had any effect).
train_order.index = train_order.id
# notebook id -> cell-order string
t_order = dict(zip(train_order.id, train_order.cell_order))

# notebook id (file name without the ".json" extension) -> path of the notebook
# file. Only ".json" files are collected so that stripping the extension is
# always well-defined.
all_notebooks = {
    os.path.splitext(os.path.basename(path))[0]: path
    for path in glob.glob('../input/AI4Code/train/*.json')
}

In [None]:
# Load the pretrained CodeBERT tokenizer and encoder from the same checkpoint,
# then place the encoder on the selected compute device.
codebert_checkpoint = "microsoft/codebert-base"
tokenizer = RobertaTokenizer.from_pretrained(codebert_checkpoint)
encoder_model = RobertaModel.from_pretrained(codebert_checkpoint).to(device)

In [None]:
# Encode every training notebook with CodeBERT and write one tensor file per
# notebook into ./features/<notebook_id>.cuda.
MAX_CELLS = 256   # only the first 256 cells (in ground-truth order) are encoded
MAX_TOKENS = 512  # per-cell token limit handed to the tokenizer

os.makedirs('features', exist_ok=True)
# The keys of `all_notebooks` already are the notebook ids, so iterate over
# (id, path) pairs instead of re-deriving the id from the path.
for enum, (notebook_id, notebook_path) in enumerate(tqdm(all_notebooks.items())):
    if enum % 1000 == 0:
        print(enum)
    notebook = pd.read_json(notebook_path)
    # Reorder the cells into their ground-truth order, then keep the leading
    # MAX_CELLS of them.
    cell_order = t_order[notebook_id].split(" ")
    notebook = notebook.loc[cell_order].reset_index()[:MAX_CELLS]
    source_code = notebook.source.to_list()
    # One tokenized sequence per cell, padded to the longest cell in this
    # notebook and truncated to MAX_TOKENS tokens.
    tokens = tokenizer(
        source_code,
        padding='longest',
        truncation='longest_first',
        return_tensors='pt',
        max_length=MAX_TOKENS,
    )
    # Inference only: no gradients are needed.
    with torch.no_grad():
        # With return_dict=False the model returns a tuple; the second element
        # is kept as the per-cell feature (the pooled output, per the
        # transformers RobertaModel documentation).
        _, output = encoder_model(
            input_ids=tokens['input_ids'].to(device),
            attention_mask=tokens['attention_mask'].to(device),
            return_dict=False,
        )
    # NOTE(review): the tensor is saved while still on `device`; loading it on
    # a machine without that device presumably needs
    # torch.load(..., map_location=...) — confirm with downstream consumers
    # before changing the on-disk format.
    torch.save(output, os.path.join("./features", notebook_id + ".cuda"))