In [None]:
# !pip install spacy_language_detection

# Generating a New Dataset

<p>
    The data provided by the competition host takes too long to read in full: it took more than 10 hours just to read and concatenate all the files. This notebook therefore generates a new dataset in the parquet format so that the data can be read quickly. It is impossible to work on the competition if it takes 10 hours every time just to read the data. Moreover, I added some features which I think may help in matching markdown cells with code cells.
</p>

<p> 
    A lot of the code is commented out to avoid duplicate work, because the notebook was run many times.
</p>

<p>
    I relied on two Kaggle notebooks (plus some additional searching) to write the scripts:
    <ul>
        <li><a href='https://www.kaggle.com/code/ryanholbrook/getting-started-with-ai4code'>Getting Started with AI4Code</a></li>
        <li><a href='https://www.kaggle.com/code/andradaolteanu/ai4code-language-detection-and-model-tuning'>AI4Code - Language Detection and Model Tuning</a></li>
    </ul>
</p>

<p>
    This notebook generates the <a href='https://www.kaggle.com/datasets/fmakarem/ai4code-train'>AI4Code Train Dataset</a> which contains the dataframes used in this notebook.
</p>
    

In [None]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm
import os
import re
import time
import json

import spacy
from spacy.language import Language
# from spacy_language_detection import LanguageDetector

# Widen pandas' text rendering so long `source` strings stay readable in cell outputs.
pd.set_option('display.width', 180)
pd.set_option('display.max_colwidth', 120)

# data_dir = Path('../input/AI4Code')
# Parquet files that the read cell loads.
data_path = '../input/ai4code-train/train_with_features.parquet'
language_path = '../input/ai4code-train/language_mapping.parquet'

### Regex:

In [None]:
# Regular expressions that pull identifier-like names out of the source text of a code cell.
# They are written as raw strings so that \w, \s, \d, \. and \( reach the regex engine
# untouched instead of being parsed as (invalid) Python string escapes.

# An identifier followed, after optional whitespace, by `=` or `,`.
variable_regex = r'\w[\d\w_]*(?=\s*=|\s*,)'
# An identifier that directly follows `<word character>.`.
method_regex = r'(?<=\w\.)\w[\d\w_]*'
# The identifier after `class ` when it is followed by `:` or `(`.
class_regex = r'(?<=class\s)\w[\d\w_]*(?=\s*:|\s*\()'
# The identifier after `def ` when it is followed by `:` or `(`.
func_regex = r'(?<=def\s)\w[\d\w_]*(?=\s*:|\s*\()'
# The name directly after `import `, or the name between `from ` and ` import`.
# `(?=\W|$)` also accepts a name that ends the text, so `import os` as the last
# thing in a cell (no trailing newline) is still captured.
import_regex = r'(?<=import\s)\w+(?=\W|$)|(?<=from\s)\w+(?=\simport)'

# Compile once; the compiled objects are what the `patterns` list refers to.
variable_pattern = re.compile(variable_regex)
method_pattern = re.compile(method_regex)
class_pattern = re.compile(class_regex)
func_pattern = re.compile(func_regex)
import_pattern = re.compile(import_regex)

In [None]:
# (feature name, compiled regex) pairs, in the order the features are extracted.
patterns = list(zip(
    ('variable', 'method', 'class', 'func', 'import'),
    (variable_pattern, method_pattern, class_pattern, func_pattern, import_pattern),
))

In [None]:
# paths_train = list((data_dir / 'train').glob('*.json'))
# features_df=pd.DataFrame()

# for path in paths_train:
#     nb_id=path.stem
#     print(nb_id)
#     with open(path) as file: 
#         print(path)
#         data=json.load(file)
#         print(data['cell_type'])
#         print(data['source'])
#     break

In [None]:
# def read_notebook(path):
#     df_row=(
#         pd.read_json(
#             path,
#             dtype={'cell_type': 'category', 'source': 'str'})
#         .assign(id=path.stem)
#         .rename_axis('cell_id')
#     )
    
#     return df_row


# paths_train = list((data_dir / 'train').glob('*.json'))#[:10]

# notebooks_train = [
#     read_notebook(path) for path in tqdm(paths_train, desc='Train NBs')
# ]
# print('started concatenation.')
# df = (
#     pd.concat(notebooks_train)
#     .set_index('id', append=True)
#     .swaplevel()
#     .sort_index(level='id', sort_remaining=False)
# )

# print('finished reading the data')

# df

In [None]:
# for pattern_name, compiled_pattern in patterns:
#     df[pattern_name]=df_row.apply(lambda row: ','.join(compiled_pattern.findall(row['source'])) if row['cell_type']=='code' else np.nan,axis=1)

### Read the Data

In [None]:
# Load the two parquet files configured above plus train_orders.csv from the AI4Code
# input directory, announcing each read as soon as it succeeds.
data_df = pd.read_parquet(data_path)
print('data_df successful')

language_df = pd.read_parquet(language_path)
print('language_df successful')

order_df = pd.read_csv('../input/AI4Code/train_orders.csv')
print('order_df successful')

In [None]:
# Preview the first five rows of data_df.
data_df.head(n=5)

In [None]:
# Preview the first five rows of language_df.
language_df.head(n=5)

In [None]:
# Preview the first five rows of order_df.
order_df.head(n=5)

## **Features**
#### The code below flags cells whose source is duplicated within the same notebook; the regular-expression feature extraction that follows it is commented out.

In [None]:
# Flag every cell whose `source` text occurs more than once inside the same notebook,
# and record per notebook whether it contains any such cell.
#
# Results:
#   data_df['has_duplicates']  - bool column, True for each cell that has a twin in its notebook
#   nb_with_duplicate_cells    - dict: notebook id -> 1 if the notebook has duplicated cells, else 0
#   count                      - number of notebooks with at least one duplicated cell

# `id` may be an index level or a plain column (groupby('id') accepts either), so handle both.
if 'id' in data_df.index.names:
    notebook_ids = data_df.index.get_level_values('id')
else:
    notebook_ids = pd.Index(data_df['id'], name='id')

# One vectorised pass over (notebook id, source) pairs instead of a Python loop with a
# `.loc` write per notebook; keep=False marks every member of a duplicated group.
cell_keys = pd.DataFrame({
    'id': notebook_ids.to_numpy(),
    'source': data_df['source'].to_numpy(),
})
is_duplicate = cell_keys.duplicated(keep=False).to_numpy()
data_df['has_duplicates'] = is_duplicate

# Collapse the per-cell flags to one flag per notebook (groups come out sorted by id).
notebook_has_duplicates = pd.Series(is_duplicate, index=notebook_ids).groupby(level=0).any()
nb_with_duplicate_cells = notebook_has_duplicates.astype(int).to_dict()

count = int(notebook_has_duplicates.sum())
count

In [None]:
# Turn the {notebook id: 0/1} mapping into a one-column frame indexed by notebook id.
nb_with_duplicate_cells_df = (
    pd.DataFrame
    .from_dict(nb_with_duplicate_cells, orient='index')
    .rename(columns={0: 'has_duplicates'})
)

In [None]:
# Display the per-notebook duplicate flags (index: notebook id, column: has_duplicates).
nb_with_duplicate_cells_df

In [None]:
# df_copy=df.copy()
# start_time=time.time()

# print('started with the patterns')

# patterns=[('variable',variable_pattern),
#           ('method',method_pattern),
#           ('class',class_pattern),
#           ('func',func_pattern),
#           ('imported',import_pattern)
#          ]

# for pattern_name, compiled_pattern in tqdm(patterns, desc='Feature Extraction'):
#     df_copy[pattern_name]=df_copy.apply(lambda row: ','.join(compiled_pattern.findall(row['source'])) if row['cell_type']=='code' else np.nan,axis=1)
#     print(f'finished {pattern_name}\n')
    
# print('it took: ',time.time()-start_time)
# df_copy

## **Language**

#### The code below is used to detect the natural language of each notebook from its markdown cells using spaCy.

In [None]:
# Language Detector Function
# def get_lang_detector(nlp, name):
#     return LanguageDetector()

# nlp_model = spacy.load("en_core_web_sm")

# Language.factory("language_detector", func=get_lang_detector)
# nlp_model.add_pipe('language_detector', last=True)

In [None]:
# def apply_language_detection(row: pd.Series,nlp=nlp_model,num_char=100):
#     string_type=row['cell_type']
    
#     if 'markdown' in string_type.lower():
#         text=row['source']
#         doc = nlp(text[:num_char])
#         return doc._.language
#     else:
#         return np.nan

In [None]:
# id_to_lang={}

# i=0
# print('start ID to language mapping')
# for key, group_df in df_copy.groupby('id'):
#     markdown_df=group_df[group_df['cell_type']=='markdown']
    
#     if len(markdown_df):
#         source=markdown_df.reset_index().loc[0]
    
#     language=apply_language_detection(source)
    
#     if i<5: 
#         print(f'language: {language["language"]} score: {language["score"]}')
        
#         i+=1
    
#     id_to_lang[key]=language

# print('finished mapping')

In [None]:
# id_to_lang={i:f'{i}-th' for i in range(20)}
# print('length: ',len(id_to_lang))
# key=list(id_to_lang.keys())[0]
# print('Sample: ',key, '--', id_to_lang[key])

In [None]:
# df_copy.copy().reset_index().apply(lambda row: str(id_to_lang[row['id']]['language']),axis=1)

In [None]:
# id_to_lang_df=pd.DataFrame.from_dict(id_to_lang,orient='index')

In [None]:
# print('add the language alongside each column')
# df_copy=df_copy.drop(columns=['lang'])
# language_values=df_copy.copy().reset_index().apply(lambda row: id_to_lang[row['id']]['language'],axis=1)

## **Order of cells**

In [None]:
# Attach each cell's position within its notebook, as listed in order_df, as `cell_order`.
# Nothing is computed when the loaded frame already carries the column.
if 'cell_order' not in data_df.columns:
    print("'cell_order' not in data_df columns")

    # notebook id -> list of cell ids, in the order given by order_df['cell_order']
    # (one whitespace-separated string per notebook).
    order_dict = {
        nb_id: cells.split()
        for nb_id, cells in zip(order_df['id'], order_df['cell_order'])
    }

    # Show the first two entries as a quick sanity check.
    for key, val in list(order_dict.items())[:2]:
        print(f'key: {key}')
        print(f'val: {val}')

    # notebook id -> {cell id: position}. Building this once makes every lookup O(1)
    # instead of a linear list.index() scan per cell. setdefault keeps the first
    # occurrence, which is what list.index() would return.
    position_lookup = {}
    for nb_id, cells in order_dict.items():
        positions = {}
        for position, cell_id in enumerate(cells):
            positions.setdefault(cell_id, position)
        position_lookup[nb_id] = positions

    # data_df is indexed by (notebook id, cell id). A cell that is missing from
    # order_df yields None here and is reported by the assert below.
    cell_order = pd.Series(
        [
            position_lookup.get(str(key[0]), {}).get(str(key[1]))
            for key in data_df.index
        ],
        index=data_df.index,
    )

    # Validate before writing, so a failed run does not leave a half-filled column
    # that would make a re-run take the "already present" branch.
    assert not cell_order.isna().any(), "There is a notebook cell without a 'cell_order' value which is wrong."
    data_df['cell_order'] = cell_order.astype('int64')
else:
    print("'cell_order' is in data_df columns")

## **Save The Data**

#### Save the data in parquet format

In [None]:
# df.to_parquet('preprocessed_train.parquet')
# Write the three frames to the working directory as parquet, in this order.
for frame, filename in (
    (data_df, 'train_with_features.parquet'),
    (language_df, 'language_mapping.parquet'),
    (nb_with_duplicate_cells_df, 'nb_with_duplicate_cells.parquet'),
):
    frame.to_parquet(filename)