In [0]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import copy

**BLOCK PREPROCESSING**

In [0]:
def closest_tag (code_block, tag_arr, pos):
    """Pick the entry of ``tag_arr`` adjacent to the position ``code_block``.

    ``tag_arr`` is scanned for the first pair of neighbouring entries that
    strictly bracket ``code_block``
    (``tag_arr[k] < code_block < tag_arr[k + 1]``).

    Args:
        code_block: position to locate.
        tag_arr: list of positions (the callers in this file pass
            ascending offsets).
        pos: ``'end'`` selects the upper entry of the bracketing pair; any
            other value selects the lower one.

    Returns:
        The selected entry. When no pair brackets ``code_block`` the result
        falls back to ``tag_arr[-1]``, or to ``tag_arr[0]`` when
        ``pos == 'end'``.

    Raises:
        IndexError: if ``tag_arr`` is empty.
    """
    lower_idx = -1  # sentinel: "no bracketing pair found"
    for k, (left, right) in enumerate(zip(tag_arr, tag_arr[1:])):
        if left < code_block < right:
            lower_idx = k
            break
    chosen = lower_idx + 1 if pos == 'end' else lower_idx
    return tag_arr[chosen]

In [0]:
def tag_preproc(tag_arr):
    """Normalise a list of tag strings in place.

    Each entry is lower-cased, stripped of every character other than
    ``a-z``, ``0-9`` and whitespace, and finally loses one leading space if
    it starts with one (e.g. ``'# Load Data!'`` becomes ``'load data'``).

    Args:
        tag_arr: mutable list of strings; its items are overwritten.

    Returns:
        None. The list is modified in place.
    """
    for i, raw in enumerate(tag_arr):
        cleaned = re.sub(r'[^a-z0-9\s]', '', raw.lower())
        # Strings are immutable, so the leading space must be dropped by
        # slicing; item assignment (``s[0] = ''``) raises TypeError.
        if cleaned.startswith(' '):
            cleaned = cleaned[1:]
        tag_arr[i] = cleaned

In [0]:
def block_preproc(blocks_arr, key, block_type):
    """Clean raw notebook fragments in place.

    Type-specific filtering runs first:

    * ``'code'``: a block containing ten consecutive escaped newlines is
      replaced by an empty string.
    * ``'tag'``: a block containing ``'https://'`` is replaced by the block
      before it; for the very first block, by the first block in the list
      that has no link (left untouched if there is none).

    After that every block has its escape sequences unwrapped, backslashes
    removed and ``key`` deleted. Tag blocks are additionally lower-cased and
    reduced to ``a-z``, ``0-9`` and whitespace.

    Args:
        blocks_arr: mutable list of strings; its items are overwritten.
        key: regular expression whose matches are removed from every block.
        block_type: ``'code'`` or ``'tag'``; any other value skips the
            type-specific filtering.

    Returns:
        None. The list is modified in place.
    """
    total = len(blocks_arr)

    if block_type == 'code':
        newline_run = r'\\n' * 10
        for idx, block in enumerate(blocks_arr):
            if newline_run in block:
                blocks_arr[idx] = ''
    elif block_type == 'tag':
        for idx in range(total):
            if 'https://' not in blocks_arr[idx]:
                continue
            if idx > 0:
                # Reuse the (possibly already substituted) previous tag.
                blocks_arr[idx] = blocks_arr[idx - 1]
                continue
            # First tag: borrow the earliest tag that carries no link.
            for candidate in blocks_arr:
                if 'https://' not in candidate:
                    blocks_arr[idx] = candidate
                    break

    # Order matters: quotes are rewritten before the generic \u00XX removal,
    # and escaped newlines before the blanket backslash removal.
    substitutions = (
        (r'\\\\\\u0022|\\u0027', "'"),
        (r'\\u00..', ''),
        (r'\\\\n', '\n'),
        (r'\\', ''),
        (key, ''),
    )
    for idx in range(total):
        cleaned = blocks_arr[idx]
        for pattern, replacement in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned)
        if block_type == 'tag':
            cleaned = re.sub(r'[^a-z0-9\s]', '', cleaned.lower())
            # cleaned = cleaned.replace(' ', '_') #   ????????????
        blocks_arr[idx] = cleaned

In [0]:
def comment_finder(text):
    """Collect the full-line ``#`` comments of a code block.

    A comment is any newline-separated line of ``text`` whose first
    character is ``#``; indented or trailing comments are not collected.
    Comments on the last line (no trailing newline) and single-line blocks
    are included, so no offset bookkeeping is needed.

    Args:
        text: source code of one block.

    Returns:
        A ``(comments, no_preproc)`` tuple of equally long lists:
        ``no_preproc`` holds the comment lines exactly as found in ``text``
        and ``comments`` holds the same lines after ``tag_preproc``.
    """
    no_preproc = [line for line in text.split('\n') if line.startswith('#')]
    # Separate list so that the in-place normalisation below leaves the raw
    # comment text available to the caller.
    comments = list(no_preproc)
    tag_preproc(comments)
    return comments, no_preproc


**NOTEBOOK PARSING**

In [0]:
def notebook_parse (link):
    """Fetch a Kaggle notebook page and slice out its code cells and markdown text.

    The page ``'https://www.kaggle.com/' + link`` is downloaded, every
    ``<script>`` element is scanned for the cell markers defined below, and
    the text between those markers is collected. Both lists are then cleaned
    with ``block_preproc``.

    Args:
        link: path appended to ``'https://www.kaggle.com/'``.

    Returns:
        DataFrame obtained by concatenating the ``code_block`` and ``tag``
        Series column-wise. It has no rows when no script contains both code
        markers.
    """
    # Markers are raw strings: the page text carries the literal escape
    # sequence \u0022 (a double quote) rather than the quote character.
    key_begin = r'\u0022cell_type\u0022:\u0022code\u0022,\u0022source\u0022:'
    # NOTE(review): this end marker only matches cells serialised with a null
    # execution count and empty outputs - confirm that this is intended.
    key_end = r'\u0022execution_count\u0022:null,\u0022outputs\u0022:[]'

    tag_key_begin = r'\u0022cell_type\u0022:\u0022markdown\u0022,\u0022source\u0022:'
    tag_key_end = r'},{\u0022metadata\u0022:{'

    # NOTE(review): no timeout and no status check on the request.
    r = requests.get('https://www.kaggle.com/' + link) # 'https://www.kaggle.com/roshansharma/amazon-alexa-reviews'

    soup = BeautifulSoup(r.text, 'html.parser')
    scripts = soup.find_all('script')

    blocks = []
    tags = []

    for scr in scripts:
        test_str = scr.text

        # Only scripts that contain both code markers are processed.
        if (key_begin in test_str) and (key_end in test_str):
            #   finding code blocks ---------------------------------------
            # Offsets of every occurrence of the begin / end markers.
            res_begin = [i for i in range(len(test_str)) if test_str.startswith(key_begin, i)] 
            res_end = [i for i in range(len(test_str)) if test_str.startswith(key_end, i)]

            # The k-th begin offset is paired with the k-th end offset.
            try:
                for i in range(min(len(res_begin), len(res_end))):
                    blocks.append(test_str[res_begin[i] : res_end[i]])
            except:
                pass

            # Drop duplicate blocks while keeping first-seen order.
            blocks = list(dict.fromkeys(blocks))
            length = len(blocks)
            # NOTE(review): truncation keeps the first `length` offsets, while
            # `blocks` accumulates across scripts and de-duplication may drop
            # entries other than the last ones - the offsets may then no
            # longer line up with `blocks`; confirm.
            res_begin = res_begin[:length]
            res_end = res_end[:length]
            #   finding tags ---------------------------------------
            # Markdown markers are only searched before the last code end offset.
            tags_begin_ = [i for i in range(res_end[-1]) if test_str.startswith(tag_key_begin, i)]
            
            # For every code block pick a markdown start offset via closest_tag.
            # Any exception (e.g. no markdown marker found) silently ends the
            # loop and leaves the list partially filled.
            tags_begin = []
            try:
                for i in range(len(res_begin)):
                    tags_begin.append(closest_tag(res_begin[i], tags_begin_, pos = 'begin'))
            except:
                pass

            tags_end_ = [i for i in range(res_end[-1]) if test_str.startswith(tag_key_end, i)]
            
            # Matching end offset for every chosen markdown start.
            tags_end = []
            try:
                for i in range(len(tags_begin)):
                    tags_end.append(closest_tag(tags_begin[i], tags_end_, pos='end'))
            except:
                pass

            try:
                for i in range(len(tags_begin)):
                    # An end offset lying before its start is replaced by the
                    # next entry's end offset (not possible for the last entry).
                    if tags_end[i] < tags_begin[i] and i != len(tags_begin) - 1:
                        tags_end[i] = tags_end[i + 1]
                    tags.append(test_str[tags_begin[i] : tags_end[i]])
            except:
                pass

    # Both lists are cleaned in place; by this point the \u0022 escapes have
    # been stripped, which is why the keys are written without them.
    block_preproc(blocks, key = r'cell_type:code,source:', block_type = 'code')
    block_preproc(tags, key = r'cell_type:markdown,source:', block_type = 'tag')

    # NOTE(review): `blocks` and `tags` are not forced to the same length
    # before being placed side by side - confirm how downstream code handles
    # rows where one of the two columns is missing.
    code_block = pd.Series(blocks, name='code_block')
    tag = pd.Series(tags, name='tag')
    out = pd.concat([code_block, tag], axis=1)

    # Disabled step: would move '#' comments found by comment_finder out of
    # the code block and into the 'tag' column.
    # for i in range(len(blocks)):
    #     if '#' in blocks[i]:
    #         comments, no_preproc_comments = comment_finder(blocks[i])
    #         if len([x for x in out.loc[i, 'tag'].split()]) < 10:
    #             comments.append(out.loc[i, 'tag'])
    #         out.at[i, 'tag'] = comments
    #         for comment in no_preproc_comments:
    #             out.at[i, 'code_block'] = out.loc[i, 'code_block'].replace(comment, '')
    return out


In [44]:
# Kernel listings exported from Kaggle. Every CSV must provide the 'ref' and
# 'totalVotes' columns used below. The order of the files matters: when the
# same 'ref' appears in several files, the first occurrence is kept.
KERNEL_CSV_DIR = '/content/'
KERNEL_CSV_FILES = [
    'kaggle_kernels_hotness.csv',
    'kaggle_kernels_scoreAscending.csv',
    'kaggle_kernels_scoreDescending.csv',
    'kaggle_kernels_voteCount.csv',
    'kaggle_kernels_commentCount.csv',
    'kaggle_kernels_viewCount.csv',
    'kk_2_4_2020.csv',
    'kk_4_4_2020.csv',
    'kk_6_4_2020.csv',
    'kk_8_4_2020.csv',
    'kk_10_4_2020.csv',
    'kk_12_4_2020.csv',
    'kk_14_4_2020.csv',
]

# One chained expression instead of in-place mutation: re-running the cell
# always rebuilds `df` from the files.
df = (
    pd.concat([pd.read_csv(KERNEL_CSV_DIR + file_name) for file_name in KERNEL_CSV_FILES])
    .drop_duplicates(subset = 'ref')
    .sort_values(by = 'totalVotes', ascending = False)
    .reset_index(drop=True)
)

print(df.head())

                                                 ref  ... totalVotes
0  pmarcelino/comprehensive-data-exploration-with...  ...       6993
1  arthurtok/introduction-to-ensembling-stacking-...  ...       4656
2          startupsci/titanic-data-science-solutions  ...       4636
3   serigne/stacked-regressions-top-4-on-leaderboard  ...       4540
4  yassineghouzam/introduction-to-cnn-keras-0-997...  ...       4496

[5 rows x 5 columns]


In [45]:
# Parse every kernel listed in the first column of `df`, keeping one result
# frame per kernel and counting the kernels that yielded no rows.
res = []
empty_counter = 0
kernels = df.iloc[:, 0]

for i in range(len(kernels)):
    start_time = time.time()
    parsed = notebook_parse(kernels[i])
    end_time = time.time()
    res.append(parsed)

    n_blocks = len(parsed)
    if n_blocks == 0:
        empty_counter += 1

    # Tab-separated progress line: index, row count, seconds spent parsing.
    report = [
        "notebook: # " + str(i),
        "number of code blocks: " + str(n_blocks),
        "time: " + str(end_time - start_time),
    ]
    print('\t'.join(report))

notebook: # 0	number of code blocks: 32	time: 0.43831658363342285
notebook: # 1	number of code blocks: 2	time: 0.44982457160949707
notebook: # 2	number of code blocks: 52	time: 0.3543815612792969




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
notebook: # 4443	number of code blocks: 0	time: 0.25525975227355957
notebook: # 4444	number of code blocks: 5	time: 0.306577205657959
notebook: # 4445	number of code blocks: 2	time: 0.2758755683898926
notebook: # 4446	number of code blocks: 4	time: 0.4048893451690674
notebook: # 4447	number of code blocks: 31	time: 0.26604151725769043
notebook: # 4448	number of code blocks: 6	time: 0.356980562210083
notebook: # 4449	number of code blocks: 9	time: 0.24741291999816895
notebook: # 4450	number of code blocks: 21	time: 0.28610682487487793
notebook: # 4451	number of code blocks: 27	time: 0.4768240451812744
notebook: # 4452	number of code blocks: 3	time: 0.20545339584350586
notebook: # 4453	number of code blocks: 7	time: 0.2524600028991699
notebook: # 4454	number of code blocks: 31	time: 0.16892290115356445
notebook: # 4455	number of code blocks: 52	time: 0.28977131843566895
notebook: # 4456	number of code blocks: 20	time: 0.243

In [50]:
# Share (in percent) of the listed kernels for which parsing returned no rows.
empty_share = empty_counter * 100 / len(df)
print(empty_share)

24.539292522770598
