In [0]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import copy
import numpy as np

**BLOCK PREPROCESSING**

In [0]:
def closest_tag (code_block, tag_arr, pos):
    closest = -1
    for i in range(len(tag_arr) - 1):
        if (tag_arr[i] < code_block and tag_arr[i + 1] > code_block):
            closest = i
            break
    if pos == 'end':
        closest += 1
    return tag_arr[closest]

In [0]:
def tag_preproc(tag_arr):
    for i in range(len(tag_arr)):
        tag_arr[i] = tag_arr[i].lower()
        tag_arr[i] = re.sub(r'[^a-z0-9\s]', '', tag_arr[i])
        # tag_arr[i] = tag_arr[i].replace(' ', '_') #   ????????????
        try:
            if tag_arr[i][0] == ' ':
                tag_arr[i][0] = ''
        except:
            pass

In [0]:
def block_preproc(blocks_arr, key, block_type):
    if block_type == 'code':
        too_many_ns = r'\\n' * 10
        for i in range(len(blocks_arr)):
            if too_many_ns in blocks_arr[i]:
                blocks_arr[i] = ''
    elif block_type == 'tag':
        for i in range(len(blocks_arr)):
            if 'https://' in blocks_arr[i]:
                if i == 0:
                    for j in range(i, len(blocks_arr)):
                        if 'https://' in blocks_arr[j]:
                            continue
                        else:
                            blocks_arr[i] = blocks_arr[j]
                            break
                else:
                    blocks_arr[i] = blocks_arr[i - 1]

    for i in range(len(blocks_arr)):
        blocks_arr[i] = re.sub(r'\\\\\\u0022|\\u0027', "'", blocks_arr[i])
        blocks_arr[i] = re.sub(r'\\u....', '', blocks_arr[i])
        blocks_arr[i] = re.sub(r'\\\\n', '\n', blocks_arr[i])
        blocks_arr[i] = re.sub(r'\\', '', blocks_arr[i])
        blocks_arr[i] = re.sub(key, '', blocks_arr[i])

        if block_type == 'tag':
            blocks_arr[i] = blocks_arr[i].lower()
            blocks_arr[i] = re.sub(r'[^a-z0-9\s]', '', blocks_arr[i])
            # blocks_arr[i] = blocks_arr[i].replace(' ', '_') #   ????????????

    

In [0]:
def comment_finder(text):
    comments = []
    line_end = [i for i in range(len(text)) if text.startswith('\n', i)]
    try:
        if text[0] == '#':          # text is non-empty by the condition before running
            comments.append(text[0:line_end[0]])
    except: #let's add a try-pass in case there is a code block with just 1 comment
        pass

    comments_begin = [(i) for i in range(len(text)) if text.startswith('#', i)]
    comments_end = []
    try:
        for i in range(len(comments_begin)):
            comments_end.append(closest_tag(comments_begin[i], line_end, pos='end'))
    except:
        pass

    try:
        for i in range(len(comments_begin)):
            comments.append(text[comments_begin[i] : comments_end[i]])
    except:
        pass

    no_preproc = copy.deepcopy(comments)
    tag_preproc(comments)
    return comments, no_preproc


**NOTEBOOK PARSING**

In [0]:
def notebook_parse (link):
    key_begin = r'\u0022cell_type\u0022:\u0022code\u0022,\u0022source\u0022:'
    key_end = r'\u0022execution_count\u0022:'#((null)|\d{1}),\u0022outputs\u0022:[]'
    tag_key_begin = r'\u0022cell_type\u0022:\u0022markdown\u0022,\u0022source\u0022:'
    tag_key_end = r'},{\u0022metadata\u0022:{'
    key = '@@@@'
# \u0022execution_count\u0022:,\u0022outputs\u0022:[]
    r = requests.get('https://www.kaggle.com/' + link) # 'https://www.kaggle.com/roshansharma/amazon-alexa-reviews'

    # r = requests.get('https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python')

    soup = BeautifulSoup(r.text, 'html.parser')
    scripts = soup.find_all('script')

    blocks = []
    tags = []

    for scr in scripts:
        test_str = scr.text

        if (key_begin in test_str) and (key_end in test_str):
            #   finding code blocks ---------------------------------------
            res_begin = [i for i in range(len(test_str)) if test_str.startswith(key_begin, i)] 
            res_end = [i for i in range(len(test_str)) if test_str.startswith(key_end, i)]

            try:
                for i in range(min(len(res_begin), len(res_end))):
                    blocks.append(test_str[res_begin[i] : res_end[i]])
            except:
                pass

            blocks = list(dict.fromkeys(blocks))
            length = len(blocks)
            res_begin = res_begin[:length]
            res_end = res_end[:length]
            #   finding tags ---------------------------------------
            tags_begin_ = [i for i in range(res_end[-1]) if test_str.startswith(tag_key_begin, i)]
            
            tags_begin = []
            try:
                for i in range(len(res_begin)):
                    tags_begin.append(closest_tag(res_begin[i], tags_begin_, pos = 'begin'))
            except:
                pass

            tags_end_ = [i for i in range(res_end[-1]) if test_str.startswith(tag_key_end, i)]
            
            tags_end = []
            try:
                for i in range(len(tags_begin)):
                    tags_end.append(closest_tag(tags_begin[i], tags_end_, pos='end'))
            except:
                pass

            try:
                for i in range(len(tags_begin)):
                    if tags_end[i] < tags_begin[i] and i != len(tags_begin) - 1:
                        tags_end[i] = tags_end[i + 1]
                    tags.append(test_str[tags_begin[i] : tags_end[i]])
            except:
                pass

    block_preproc(blocks, key = r'cell_type:code,source:', block_type = 'code')
    block_preproc(tags, key = r'cell_type:markdown,source:', block_type = 'tag')

    code_block = pd.Series(blocks, name='code_block')
    tag = pd.Series(tags, name='tag')
    out = pd.concat([code_block, tag], axis=1).fillna(key)

    for i in range(len(blocks)):
        if '#' in blocks[i]:
          comments, no_preproc_comments = comment_finder(blocks[i])
          if len([x for x in str(out.loc[i, 'tag']).split()]) < 10:
            comments.append(out.loc[i, 'tag'])
          
          try:
            comments.remove(key)
          except ValueError:
            pass
          
          if comments == []:
            out.at[i, 'tag'] = np.nan
          else:
            out.at[i, 'tag'] = list(dict.fromkeys(comments))
          for comment in no_preproc_comments:
            out.at[i, 'code_block'] = out.loc[i, 'code_block'].replace(comment, '')

        if out.loc[i, 'tag'] == key or out.loc[i, 'tag'] == [key]:
          out.at[i, 'tag'] = np.nan

    out['code_block'] = out['code_block'].apply(lambda text: re.sub(r"(\n)\1+", '', text))

    return out.dropna().reset_index(drop=True)


In [20]:
df_1 = pd.read_csv('/content/kaggle_kernels_hotness.csv')
df_2 = pd.read_csv('/content/kaggle_kernels_scoreAscending.csv')
df_3 = pd.read_csv('/content/kaggle_kernels_scoreDescending.csv')
df_4 = pd.read_csv('/content/kaggle_kernels_voteCount.csv')
df_5 = pd.read_csv('/content/kaggle_kernels_commentCount.csv')
df_6 = pd.read_csv('/content/kaggle_kernels_viewCount.csv')
df_7 = pd.read_csv('/content/kk_2_4_2020.csv')
df_8 = pd.read_csv('/content/kk_4_4_2020.csv')
df_9 = pd.read_csv('/content/kk_6_4_2020.csv')
df_10 = pd.read_csv('/content/kk_8_4_2020.csv')
df_11 = pd.read_csv('/content/kk_10_4_2020.csv')
df_12 = pd.read_csv('/content/kk_12_4_2020.csv')
df_13 = pd.read_csv('/content/kk_14_4_2020.csv')
df_14 = pd.read_csv('/content/kk_16_4_2020.csv')
df_15 = pd.read_csv('/content/kk_18_4_2020.csv')
df_16 = pd.read_csv('/content/kk_20_4_2020.csv')
df_17 = pd.read_csv('/content/kk_22_4_2020.csv')
df_18 = pd.read_csv('/content/kk_25_4_2020.csv')


in_df = pd.concat([df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9, df_10, 
                  df_11, df_12, df_13, df_14, df_15, df_16, df_17, df_18])

in_df.drop_duplicates(subset = 'ref', inplace = True)
in_df.sort_values(by = 'totalVotes', inplace = True, ascending = False)
in_df = in_df.reset_index(drop=True)

print(len(in_df))

in_df.to_csv('/content/all_kernels.csv', sep='\t', encoding='utf-8')

13657


In [13]:
res = []
kernels = in_df.iloc[:, 0]
empty_counter = 0

for i in range(len(kernels)):
    start_time = time.time()
    try:
      res.append(notebook_parse(kernels[i]))
    except:
      pass
    end_time = time.time()
    if (len(notebook_parse(kernels[i])) == 0):
      empty_counter += 1
    print("notebook: # "+str(i)+'\t'+"number of code blocks: "+str(len(res[-1]))+'\t'+"time: "+str(end_time - start_time))

notebook: # 0	number of code blocks: 32	time: 0.47432827949523926
notebook: # 1	number of code blocks: 25	time: 0.3840353488922119
notebook: # 2	number of code blocks: 52	time: 0.33525609970092773




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
notebook: # 8658	number of code blocks: 4	time: 0.41478490829467773
notebook: # 8659	number of code blocks: 22	time: 0.4834122657775879
notebook: # 8660	number of code blocks: 12	time: 0.26920151710510254
notebook: # 8661	number of code blocks: 12	time: 0.19915437698364258
notebook: # 8662	number of code blocks: 10	time: 0.22484612464904785
notebook: # 8663	number of code blocks: 0	time: 0.18678975105285645
notebook: # 8664	number of code blocks: 20	time: 0.26114416122436523
notebook: # 8665	number of code blocks: 2	time: 0.34743261337280273
notebook: # 8666	number of code blocks: 5	time: 0.33632588386535645
notebook: # 8667	number of code blocks: 4	time: 0.3240034580230713
notebook: # 8668	number of code blocks: 5	time: 0.21591973304748535
notebook: # 8669	number of code blocks: 1	time: 0.22997403144836426
notebook: # 8670	number of code blocks: 5	time: 0.30541157722473145
notebook: # 8671	number of code blocks: 4	time: 

In [14]:
print(empty_counter * 100 / len(kernels))

8.815991799077397


In [0]:
out_df = pd.concat(res)

In [0]:
out_df = out_df[out_df['code_block'].astype(bool)]
out_df = out_df[out_df['tag'].astype(bool)]

In [0]:
out_df.to_csv('/content/drive/My Drive/code_blocks.csv', sep='\t', encoding='utf-8', index=False)

In [18]:
all_code = ''

for i in range(len(out_df)):
  code = out_df.iloc[i, 0]
  if code[0] == '\n':
    code = code.replace('\n', '', 1)
  if code[-1] != '\n':
    code += '\n'
  all_code += code

print(len(all_code))

83459809


In [19]:
line_ends = [i for i in range(len(all_code)) if all_code.startswith('\n', i)]
print(len(line_ends))
cnt_30 = 0
chunks_30 = []
for i in range(len(line_ends)):
  cnt_30 += 1
  if cnt_30 < 30:
    continue
  else:
    cnt_30 = 0
    chunks_30.append(all_code[line_ends[i - 29] + 1: line_ends[i]])

1642180


In [0]:
pd.DataFrame(data={'code': chunks_30}).to_csv('/content/drive/My Drive/chunks_30.csv', sep='\t', encoding='utf-8', index=False)

In [0]:
# # code_df = pd.read_csv('/content/code_blocks.csv')
# code_df = out_df
# df = pd.DataFrame(data={'code': [], 'tag': []})
# buffer = pd.DataFrame(data={'code':[], 'tag': []})
# length = 0
# counter = 0
# code_df.head()
# buf = []

# for i in range(len(code_df)):
#   if length < 30:
#     buffer = buffer.append({'code': code_df.iloc[i, 0], 'tag': code_df.iloc[i, 1]}, ignore_index=True)
#     buf = [j for j in range(len(code_df.iloc[i, 0])) if code_df.iloc[i, 0].startswith('\n', j)]
#     if 0 in buf:
#       buf.remove(0)
#     print(buf)
#     length += len(buf)
#   else:
#     df = df.append({'code': buffer.iloc[0, 0], 'tag': buffer.iloc[0, 1]}, ignore_index=True)
#     print("block number:\t" + str(counter))
#     counter += 1
#     buffer = buffer.iloc[0:0]
#     length = 0

In [0]:
# df.to_csv('/content/final_df.csv', sep='\t', encoding='utf-8')