# AI4Code EDA

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read each input table and print a confirmation as soon as it has loaded,
# so a failing read is easy to attribute to a specific file.
train_df = pd.read_parquet('../input/ai4code-train/train_with_features.parquet')
print('train_df successful')

language_mapping = pd.read_parquet('../input/ai4code-train/language_mapping.parquet')
print('language_mapping successful')

order_df = pd.read_csv('../input/AI4Code/train_orders.csv')
print('order_df successful')

nb_with_duplicate_cells_df = pd.read_parquet('../input/ai4code-train/nb_with_duplicate_cells.parquet')
print('nb_with_duplicate_cells_df successful')

## Load the data and take a peek at the columns

In [None]:
# Preview the first rows of train_df to see which columns are available.
train_df.head()

In [None]:
# Row count first, then a preview of the first rows.
print(language_mapping.shape[0])
language_mapping.head()

In [None]:
# Preview the first rows of order_df (its 'id' and 'cell_order' columns are used below).
order_df.head()

In [None]:
# Map each notebook id to the list of its cell ids in the correct order.
# `cell_order` stores the ids as one whitespace-separated string, so it is split
# column-wise with the vectorised `.str.split()` instead of looping over
# `iterrows()`, which builds a Series per row and is slow on large frames.
order_dict = dict(zip(order_df['id'], order_df['cell_order'].str.split()))

In [None]:
# Row count first, then a preview of the first rows.
print(nb_with_duplicate_cells_df.shape[0])
nb_with_duplicate_cells_df.head()

## Data Analysis

In [None]:
# Fraction of all cells whose type is markdown (mean of a boolean mask).
is_markdown = train_df['cell_type'] == 'markdown'
markdown_mean = is_markdown.mean()
markdown_percent = markdown_mean * 100
print(f'mean number of markdown cells: {markdown_percent}%')

# Absolute number of cells per cell type.
sns.countplot(data=train_df, x='cell_type')

In [None]:
# Fraction of notebooks flagged as containing duplicate cells.
duplicate_share = nb_with_duplicate_cells_df["has_duplicates"].mean()
print(f'mean number of notebooks with duplicates: {duplicate_share*100}%')

# Absolute number of notebooks per flag value.
sns.countplot(data=nb_with_duplicate_cells_df, x='has_duplicates')

I expected the proportion of markdown cells to be higher for the notebooks with duplicates than for the whole dataset. However, it was higher for the whole dataset. After checking the duplicated cells (at the end of the notebook), I think the authors of most notebooks with duplicated cells were lazy and did not want to write a lot of markdown cells.

In [None]:
# Share of notebooks whose detected language is English, over ALL notebooks
# listed in language_mapping.
english_mean = np.mean(language_mapping['language'] == 'en')

# Two wedges: the English share and everything else.
data = [english_mean, 1 - english_mean]
keys = ['english', 'other']

# Seaborn colour palette used for the wedges.
palette_color = sns.color_palette('bright')

plt.pie(
    data,
    labels=keys,
    colors=palette_color,
    radius=1.9,
    autopct='%.1f%%',
    shadow=True,
    explode=[0.1, 0],  # pull the English wedge slightly out of the pie
    startangle=90,
)

# This chart covers every notebook, so the title must not mention duplicates
# (the previous title was copied from the duplicates-only chart further down).
plt.title('Proportion of english notebooks in the whole dataset', pad=80)
plt.show()

In [None]:
# Wide figure so that every language label fits on the x axis.
plt.figure(figsize=(25, 5))
sns.countplot(data=language_mapping, x='language')

<p>
    Dealing with notebooks containing duplicate cells is tricky. Misordering one cell affects the order of all other cells. Moreover, depending on how the selected model works, it can get confused when predicting, since two identical inputs have different positions. Do these cells come right after each other in the notebooks? What about removing one and always putting it either before or after its duplicate?
</p>

In [None]:
# Number of notebooks flagged as having duplicates BEFORE joining, used below
# to verify that the join neither added nor dropped rows.
expected_duplicate_nbs = int((nb_with_duplicate_cells_df['has_duplicates'] == 1).sum())

# Attach the detected language to each notebook (DataFrame.join aligns on the
# index) and keep only the notebooks that contain duplicates.
tmp_df = nb_with_duplicate_cells_df.join(language_mapping)
tmp_df = tmp_df[tmp_df['has_duplicates'] == 1]

# The previous check compared the filtered frame with itself and could never
# fail; comparing against the pre-join count catches a join that fans out
# (e.g. repeated index values in language_mapping).
assert len(tmp_df) == expected_duplicate_nbs, 'There is something wrong'

# Share of English notebooks among the notebooks with duplicates.
english_mean = np.mean(tmp_df['language'] == 'en')

# Two wedges: the English share and everything else.
data = [english_mean, 1 - english_mean]
keys = ['english', 'other']

# Seaborn colour palette used for the wedges.
palette_color = sns.color_palette('bright')

plt.pie(
    data,
    labels=keys,
    colors=palette_color,
    radius=1.9,
    autopct='%.1f%%',
    shadow=True,
    explode=[0.1, 0],  # pull the English wedge slightly out of the pie
    startangle=90,
)

plt.title('Proportion of english notebooks knowing the notebook have duplicates', pad=80)
plt.show()


<p>
    The proportion of English markdown cells is the same in both the whole dataset and the subset of notebooks that have duplicate code cells.
</p>

<p>
    Next, I need to check whether duplicate cells sit next to each other or are distributed throughout the notebook.
</p>

In [None]:
# Wide figure so that every language label fits on the x axis;
# tmp_df holds only the notebooks that contain duplicates.
plt.figure(figsize=(25, 5))
sns.countplot(data=tmp_df, x='language')

In [None]:
# Get the markdown cells of notebooks with duplicate cells.
# Both conditions are combined into one mask, so no intermediate copy of the
# (large) markdown subset is needed.
markdown_df = train_df[(train_df['cell_type'] == 'markdown') & (train_df['has_duplicates'] == True)]
print('length of markdown cells', len(markdown_df))

# How many notebooks get their duplicated cells printed as a preview.
preview_notebooks = 10

# notebook id -> (True if all duplicated cells share one text,
#                 positions of the duplicated cells in the correct cell order)
id_to_dup_order = {}

# Group by the scalar key 'id', not the list ['id']: on pandas >= 2.0 a
# one-element list makes groupby yield 1-tuples as keys, which would break the
# `order_dict[...]` lookup below with a KeyError.
for nb_number, (nb_id, group_df) in enumerate(markdown_df.reset_index().groupby('id')):
    # keep=False marks every occurrence of a repeated source, not just the later ones
    duplicated_cells = group_df[group_df['source'].duplicated(keep=False)]

    if nb_number < preview_notebooks:
        print(duplicated_cells[['cell_id', 'source']])
        print('-----------------------')

    # Position of each duplicated cell within the notebook's correct ordering.
    cell_order = order_dict[nb_id]
    dup_order = [cell_order.index(cell_id) for cell_id in duplicated_cells['cell_id']]

    # First element: True when there is exactly one distinct duplicated text,
    # False when several different texts are duplicated (or none at all).
    id_to_dup_order[nb_id] = (duplicated_cells['source'].nunique() == 1, dup_order)

# Release the filtered frame; only id_to_dup_order is needed afterwards.
del markdown_df

In [None]:
iterations = 10

# Print only the first `iterations` entries. The limit is checked BEFORE
# printing; the previous check ran after the print with `>` and therefore
# showed two entries more than requested.
for shown, (key, value) in enumerate(id_to_dup_order.items()):
    if shown >= iterations:
        break
    print(f'{key}: {value}')

<p> 
    It looks like the duplicates are lazy comments or generic lines that the authors add to many experiments. For example, there is a markdown text that just says export, and there are 3 of them! Maybe when exporting many files, the owner of the document just writes export each time. Another example is the one that starts with "Green markers denote crimes commited within". It is similar to adding footnotes to graphs in reports: you add the same text to all graphs to make it clear.
</p>

<p>
    How should we deal with these values? If we swap any of these cells, the meaning does not change and the order is not wrong. Thus, they could potentially become malicious examples that confuse the model. One idea is to treat all 3 of them as the same cell (same 'cell_id').
</p>

In [None]:
def get_token_statistics(df,column='cell_type',value='markdown',spliter=' ',lower_percentile=10,upper_percentile=95,title='',max_plot_rows=50000):
    """Print length statistics for the cells matching a filter and plot their distribution.

    The length of a cell is the number of pieces obtained by splitting
    ``str(source)`` on ``spliter`` (``' '`` gives words, ``'\\n'`` gives lines).

    Parameters
    ----------
    df : DataFrame with a ``source`` column and the column named by ``column``.
    column, value : only rows where ``df[column] == value`` are analysed.
    spliter : separator passed to ``str.split``.
    lower_percentile, upper_percentile : percentiles (0-100) to report.
    title : title set on the current axes after plotting.
    max_plot_rows : only the first ``max_plot_rows`` selected rows are plotted
        (``None`` plots all of them). The printed statistics always use every
        selected row.

    Returns
    -------
    None. The statistics are printed and a KDE plot is drawn.
    """
    # Filter first, then copy: only the selected rows are duplicated instead of
    # the whole input frame.
    selected = df.loc[df[column] == value].copy()

    # Nothing matched: the reductions below would raise on an empty array.
    if selected.empty:
        print(f'No rows where {column} == {value!r}; nothing to summarise')
        return

    # Get the number of tokens
    selected['cell_len'] = selected['source'].apply(lambda source: len(str(source).split(spliter)))
    lengths = selected['cell_len'].to_numpy()

    print(f'The max number of tokens: {np.max(lengths)}')
    print(f'The min number of tokens: {np.min(lengths)}')
    print(f'The mean number of tokens: {np.mean(lengths)}')
    print(f'The median number of tokens: {np.median(lengths)}')
    print(f'The std number of tokens: {np.std(lengths)}')
    print(f'The {upper_percentile} percentile of the number of tokens: {np.percentile(lengths,upper_percentile)}')
    print(f'The {lower_percentile} percentile of the number of tokens: {np.percentile(lengths,lower_percentile)}')

    # Colour by the column that was filtered on (previously hard-coded to
    # 'cell_type', which fails when filtering on a frame without that column).
    sns.displot(data=selected.iloc[:max_plot_rows].reset_index(),x='cell_len',kind='kde',hue=column)

    ax=plt.gca()
    ax.set_title(title)

In [None]:
# Word counts of markdown cells (default filter and the default ' ' separator).
title = 'Number words in markdown cells'
get_token_statistics(train_df, title=title)

<p>
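    The 95<sup>th</sup> percentile is 115. Thus, maybe a max length of 128 for markdown is enough. Note that these are whitespace-separated words; a subword tokenizer usually produces more tokens than words, so the limit may need to be larger in practice.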
</p>

In [None]:
# Line counts of code cells: splitting on '\n' makes every "token" one line.
title = 'Number of lines in code cells'
get_token_statistics(
    train_df,
    column='cell_type',
    value='code',
    spliter='\n',
    lower_percentile=10,
    upper_percentile=95,
    title=title,
)

<p>
    The 95<sup>th</sup> percentile is 26. Thus, most cells have less than 26 lines of code.
</p>

In [None]:
# Largest number of markdown cells found in a single notebook.
markdown_cells_per_nb = train_df[train_df['cell_type'] == 'markdown'].groupby('id').size()
max_markdown_cells = markdown_cells_per_nb.max()
print(f'max markdown cells: {max_markdown_cells}')

# Largest number of code cells found in a single notebook.
code_cells_per_nb = train_df[train_df['cell_type'] == 'code'].groupby('id').size()
max_code_cells = code_cells_per_nb.max()
print(f'max code cells: {max_code_cells}')

In [None]:
# 90th percentile of the number of markdown cells per notebook.
markdown_sizes = train_df[train_df['cell_type'] == 'markdown'].groupby('id').size()
markdown_cells_90 = np.percentile(markdown_sizes, 90)
print(f'90th percentile markdown cells: {markdown_cells_90}')

# 90th percentile of the number of code cells per notebook.
code_sizes = train_df[train_df['cell_type'] == 'code'].groupby('id').size()
code_cells_90 = np.percentile(code_sizes, 90)
print(f'90th percentile code cells: {code_cells_90}')

<p>
    It seems there are a lot of cells in some notebooks. I do not expect deep learning to work well on these examples. If we use a sequential model, vanishing gradients will make it difficult for the model to learn. Sequence models do not do well on long sequences, and the long sequences make up only about 10% of the whole dataset. Perhaps using different techniques depending on the number of cells is best? Or maybe finding a method to concatenate cells?
</p>
I will check some of these notebooks:

In [None]:
# Notebooks with more cells than this are collected for manual inspection.
min_cells = 800

# Gather the qualifying groups in a list and concatenate ONCE. Calling
# pd.concat inside the loop re-copies everything collected so far on every
# iteration, which is quadratic in the number of rows.
large_notebooks = [group for _, group in train_df.groupby('id') if len(group) > min_cells]

# pd.concat raises on an empty list, so fall back to an empty frame when no
# notebook exceeds the threshold.
examples_df = pd.concat(large_notebooks) if large_notebooks else pd.DataFrame()

examples_df.to_csv('high_cardinal_examples.csv')

In [None]:
# Display the rows collected from the large notebooks.
examples_df

I prefer to download the CSV file and open it in Excel. In Excel I can add a table and filter by "id". I think it is easier than in the notebook. However, I have added a code snippet below to display the cells of a notebook:

In [None]:
# 1-based position (in groupby('id') iteration order) of the notebook to show.
example_number = 4

notebook_groups = examples_df.groupby('id')

# Number of distinct notebooks in examples_df. The previous message called
# this a "number of cells", which is not what is counted.
max_number = notebook_groups.ngroups
print(f'There are {max_number} notebooks in examples_df')

# Pick the example_number-th notebook; stays empty when the number is out of range.
demostration_df = pd.DataFrame()
for position, (_, group) in enumerate(notebook_groups, start=1):
    if position == example_number:
        demostration_df = group.copy()
        break

if demostration_df.empty:
    # Explicit message instead of a KeyError on the missing 'source' column.
    print(f'example_number={example_number} is out of range (1..{max_number})')
else:
    # A plain loop instead of Series.apply(print): apply is meant for
    # transformations and would also leave a Series of None as the cell output.
    for cell_number, source in enumerate(demostration_df['source']):
        print(f'[cell {cell_number}]', source)