# Data Scrapping notebook for devsearch engine

This notebook is used to scrape and explore data to form the dataset for the devsearch engine. It includes functions to scrape data from various sources, clean it, and prepare it for analysis.

In [93]:
import polars as pl
from trafilatura.spider import focused_crawler
from trafilatura.sitemaps import sitemap_search
from trafilatura import bare_extraction

In [17]:
to_visit, known_links = focused_crawler("https://docs.python.org/3.10/", max_seen_urls = 1, max_known_urls=15)
to_visit

['https://docs.python.org/search.html',
 'https://docs.python.org/3.4/',
 'https://docs.python.org/contents.html',
 'https://docs.python.org/3.11/',
 'https://docs.python.org/about.html',
 'https://docs.python.org/3.2/',
 'https://docs.python.org/download.html',
 'https://docs.python.org/extending/index.html',
 'https://docs.python.org/3.6/',
 'https://docs.python.org/3.9/',
 'https://docs.python.org/3.7/',
 'https://docs.python.org/bugs.html',
 'https://docs.python.org/2.6/',
 'https://docs.python.org/glossary.html',
 'https://docs.python.org/genindex.html',
 'https://docs.python.org/distributing/index.html',
 'https://docs.python.org/3.5/',
 'https://docs.python.org/copyright.html',
 'https://docs.python.org/faq/index.html',
 'https://docs.python.org/reference/index.html',
 'https://docs.python.org/whatsnew/3.13.html',
 'https://docs.python.org/license.html',
 'https://docs.python.org/whatsnew/index.html',
 'https://docs.python.org/c-api/index.html',
 'https://docs.python.org/3.14/',

In [None]:
https://scikit-learn.org/stable/user_guide.html

In [31]:
from random import sample

In [62]:
kubernetes_url=sitemap_search("https://kubernetes.io", target_lang="fr")
python_url = sitemap_search("https://docs.python.org")
js_urls = sitemap_search("https://developer.mozilla.org", target_lang="fr")


In [63]:
site_list = ["docs.docker.com", "docs.djangoproject.com", "kubernetes.io", "docs.pytorch.org/docs/stable/index.html", "nextjs.org/sitemap.xml", "scikit-learn.org/stable/"]
url_list = []
for site in site_list:
    urls = sitemap_search(f"https://{site}", max_sitemaps=10)
    try :
        samples = sample(urls, 10)
    except ValueError as e:
        print("Error caught, not sampling")
        samples = urls

    url_list.extend(urls)
url_list

ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://docs.pytorch.org/sitemap_news.xml
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://docs.pytorch.org/sitemap_index.xml
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://docs.pytorch.org/sitemap
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://docs.pytorch.org/sitemap.xml.gz
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://docs.pytorch.org/sitemap.xml


Error caught, not sampling


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://scikit-learn.org/robots.txt
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://scikit-learn.org/sitemap_news.xml
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://scikit-learn.org/sitemap_index.xml
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://scikit-learn.org/sitemap
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://scikit-learn.org/sitemap.xml.gz
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://scikit-learn.org/sitemap.xml


Error caught, not sampling


['https://docs.docker.com/reference/api/hub/latest/',
 'https://docs.docker.com/get-started/introduction/get-docker-desktop/',
 'https://docs.docker.com/get-started/docker-concepts/running-containers/publishing-ports/',
 'https://docs.docker.com/get-started/docker-concepts/building-images/understanding-image-layers/',
 'https://docs.docker.com/get-started/introduction/develop-with-containers/',
 'https://docs.docker.com/reference/api/hub/latest-changelog/',
 'https://docs.docker.com/get-started/docker-concepts/running-containers/overriding-container-defaults/',
 'https://docs.docker.com/get-started/docker-concepts/building-images/writing-a-dockerfile/',
 'https://docs.docker.com/get-started/introduction/build-and-push-first-image/',
 'https://docs.docker.com/get-started/docker-concepts/building-images/build-tag-and-publish-an-image/',
 'https://docs.docker.com/reference/api/hub/deprecated/',
 'https://docs.docker.com/get-started/docker-concepts/running-containers/persisting-container-d

In [64]:
url_list.extend(kubernetes_url)
url_list.extend(js_urls)
url_list.extend(["https://developer.mozilla.org/fr/docs/Web/JavaScript", "https://scikit-learn.org/stable/user_guide.html"])

In [65]:
df = pl.DataFrame({
    "url": url_list
})
df

url
str
"""https://docs.docker.com/refere…"
"""https://docs.docker.com/get-st…"
"""https://docs.docker.com/get-st…"
"""https://docs.docker.com/get-st…"
"""https://docs.docker.com/get-st…"
…
"""https://kubernetes.io/fr/docs/…"
"""https://kubernetes.io/fr/"""
"""https://kubernetes.io/fr/docs/…"
"""https://developer.mozilla.org/…"


In [66]:
def get_language_from_url(url:str) -> str | None :
    try:
        if "python.org" in url: return "Python"
        if "nextjs.org" in url: return "NextJS"
        if "docker.com" in url: return "Docker"
        if "kubernetes" in url: return "Kubernetes"
        if "djangoproject.com" in url: return "Django"
        if "pytorch" in url: return "Pytorch"
        if"scikit-learn.org" in url: return "Scikit-Learn"
        if "mozilla" in url: return "JavaScript"
    except Exception as e:
        raise

In [67]:
df = df.with_columns(
    pl.col("url").map_elements(
        get_language_from_url,
        return_dtype=pl.String
    ).alias("language")
)
df

url,language
str,str
"""https://docs.docker.com/refere…","""Docker"""
"""https://docs.docker.com/get-st…","""Docker"""
"""https://docs.docker.com/get-st…","""Docker"""
"""https://docs.docker.com/get-st…","""Docker"""
"""https://docs.docker.com/get-st…","""Docker"""
…,…
"""https://kubernetes.io/fr/docs/…","""Kubernetes"""
"""https://kubernetes.io/fr/""","""Kubernetes"""
"""https://kubernetes.io/fr/docs/…","""Kubernetes"""
"""https://developer.mozilla.org/…","""JavaScript"""


In [68]:
df.group_by("language").count()

  df.group_by("language").count()


language,count
str,u32
"""JavaScript""",1
"""NextJS""",568
"""Scikit-Learn""",1
"""Kubernetes""",1817
"""Django""",8938
"""Docker""",1209


In [69]:
df.filter(
    pl.int_range(pl.len()).shuffle().over("language") < 10
)

url,language
str,str
"""https://docs.docker.com/get-st…","""Docker"""
"""https://docs.docker.com/guides…","""Docker"""
"""https://docs.docker.com/refere…","""Docker"""
"""https://docs.docker.com/refere…","""Docker"""
"""https://docs.docker.com/docker…","""Docker"""
…,…
"""https://nextjs.org/learn/pages…","""NextJS"""
"""https://nextjs.org/learn/pages…","""NextJS"""
"""https://nextjs.org/learn/seo/i…","""NextJS"""
"""https://developer.mozilla.org/…","""JavaScript"""


In [None]:
def get_crawled_urls(url:str) :
    to_visit, known_links = focused_crawler(url, max_seen_urls = 2, max_known_urls=150)
    return known_links    

In [75]:
df.with_columns(
    pl.col("url").map_elements(
        get_crawled_urls,
        return_dtype=list
    ).alias("crawled_urls")
)
df

KeyboardInterrupt: 

In [59]:
from trafilatura import fetch_url, extract

In [61]:
downloaded = fetch_url("https://docs.python.org/3.13/")
extract(downloaded, output_format="json", with_metadata=True)

'{"title": "Python 3.13 documentation", "author": null, "hostname": "python.org", "date": "2013-04-03", "fingerprint": "10333eedeccee290", "id": null, "license": null, "comments": "", "raw_text": "Python 3.13.4 documentation Welcome! This is the official documentation for Python 3.13.4. Documentation sections: What\'s new in Python 3.13? Or all \\"What\'s new\\" documents since Python 2.0 Tutorial Start here: a tour of Python\'s syntax and features Library reference Standard library and builtins Language reference Syntax and language elements Python setup and usage How to install, configure, and use Python Python HOWTOs In-depth topic manuals Installing Python modules Third-party modules and PyPI.org Distributing Python modules Publishing modules for use by other people Extending and embedding For C/C++ programmers Python\'s C API C API reference FAQs Frequently asked questions (with answers!) Deprecations Deprecated functionality Indices, glossary, and search: Global module index All 

In [107]:
df_SO = pl.read_csv("./SO_sample.csv", separator=";")
df_SO.head(10)

Unnamed: 0_level_0,id,title,body,comment_count,creation_date,last_activity_date,tags
i64,i64,str,str,i64,str,str,str
0,46372662,"""Is there something in an error…","""<p>I had an application where …",11,"""2017-09-22 20:21:24.467000+00:…","""2017-09-25 14:54:02.053000+00:…","""go|runtime-error"""
1,46495883,"""Appending a string at the end …","""<p>I'm relatively new to Java …",11,"""2017-09-29 19:40:39.813000+00:…","""2017-10-29 00:07:23.610000+00:…","""java|string|append|string-conc…"
2,46379422,"""How do I display a JavaScript …","""<p>I am trying to bring up a j…",11,"""2017-09-23 12:10:02.480000+00:…","""2017-09-25 02:29:41.053000+00:…","""javascript|php|html"""
3,46274943,"""Pagination of nested objects i…","""<p>I am starting out with angu…",11,"""2017-09-18 08:38:08.777000+00:…","""2017-09-18 14:49:35.607000+00:…","""angularjs|pagination"""
4,46271242,"""You do not have permission to …","""<p>I'm trying to write a progr…",11,"""2017-09-18 03:37:38.540000+00:…","""2017-09-18 17:17:28.080000+00:…","""python|python-2.7|libraries|no…"
5,46187761,"""what does this error mean (jqu…","""<p>its been a while since I us…",11,"""2017-09-13 02:08:04.683000+00:…","""2017-09-13 02:31:11.050000+00:…","""javascript|jquery"""
6,46362227,"""Passing a type specifier to a …","""<p>I am trying to replace a Ma…",11,"""2017-09-22 10:12:18.853000+00:…","""2017-09-22 14:02:40.967000+00:…","""c++|templates"""
7,46464557,"""View [layouts.default] not fou…","""<p>I am trying to run my larav…",11,"""2017-09-28 08:34:46.317000+00:…","""2017-09-28 10:06:19.807000+00:…","""laravel|layout|view"""
8,46467915,"""Volley Error Response Null""","""<p>I'm first time using JSONVo…",11,"""2017-09-28 11:22:40.257000+00:…","""2017-09-28 12:36:37.517000+00:…","""android|json|android-volley"""
9,46220852,"""Why does a text box appear beh…","""<p>I am writing an app that re…",11,"""2017-09-14 13:45:29.197000+00:…","""2017-09-14 14:27:00.727000+00:…","""android|bluetooth|android-ble"""


In [108]:
df_SO = df_SO.with_columns(
    pl.col("tags").str.split(by="|").alias("tag_list")
)
df_SO

Unnamed: 0_level_0,id,title,body,comment_count,creation_date,last_activity_date,tags,tag_list
i64,i64,str,str,i64,str,str,str,list[str]
0,46372662,"""Is there something in an error…","""<p>I had an application where …",11,"""2017-09-22 20:21:24.467000+00:…","""2017-09-25 14:54:02.053000+00:…","""go|runtime-error""","[""go"", ""runtime-error""]"
1,46495883,"""Appending a string at the end …","""<p>I'm relatively new to Java …",11,"""2017-09-29 19:40:39.813000+00:…","""2017-10-29 00:07:23.610000+00:…","""java|string|append|string-conc…","[""java"", ""string"", … ""string-concatenation""]"
2,46379422,"""How do I display a JavaScript …","""<p>I am trying to bring up a j…",11,"""2017-09-23 12:10:02.480000+00:…","""2017-09-25 02:29:41.053000+00:…","""javascript|php|html""","[""javascript"", ""php"", ""html""]"
3,46274943,"""Pagination of nested objects i…","""<p>I am starting out with angu…",11,"""2017-09-18 08:38:08.777000+00:…","""2017-09-18 14:49:35.607000+00:…","""angularjs|pagination""","[""angularjs"", ""pagination""]"
4,46271242,"""You do not have permission to …","""<p>I'm trying to write a progr…",11,"""2017-09-18 03:37:38.540000+00:…","""2017-09-18 17:17:28.080000+00:…","""python|python-2.7|libraries|no…","[""python"", ""python-2.7"", … ""notepad""]"
…,…,…,…,…,…,…,…,…
995,46299414,"""Symfony kernel.terminate Event…","""<p>Here's my service.yml</p> …",12,"""2017-09-19 11:36:11.733000+00:…","""2017-09-19 12:36:08.437000+00:…","""symfony|symfony-3.2""","[""symfony"", ""symfony-3.2""]"
996,46214068,"""Can't connect to my SQL Server…","""<p>I'm working on a tool that …",12,"""2017-09-14 08:25:15.837000+00:…","""2017-09-14 11:59:19.247000+00:…","""asp.net|entity-framework|azure…","[""asp.net"", ""entity-framework"", … ""sql-server-express""]"
997,46245951,"""Convert string to number in ty…","""<p>I need to split a string of…",12,"""2017-09-15 18:57:44.077000+00:…","""2022-04-09 05:54:04.073000+00:…","""javascript|typescript""","[""javascript"", ""typescript""]"
998,46463169,"""iOS linphone-SDK after adding …","""<p>My project use cocoa pods t…",13,"""2017-09-28 07:18:52.857000+00:…","""2017-09-28 09:33:21.263000+00:…","""ios|swift|linphone|linphone-sd…","[""ios"", ""swift"", … ""linphone-sdk""]"


In [109]:
df_SO = df_SO.drop(["comment_count", "tags"]).rename({
    "creation_date": "created_at",
    "last_activity_date": "modified_at",
    "tag_list": "tags",
    "body" : "content"
}
).with_columns(
    path = pl.lit(""),
    file_type = pl.lit("blog"),
    language = pl.lit("en"),
    github_stars = pl.lit(0),
    view_count = pl.lit(0),
    source = pl.lit("Stack Overflow"),
)
df_SO.head(10)

Unnamed: 0_level_0,id,title,content,created_at,modified_at,tags,path,file_type,language,github_stars,view_count,source
i64,i64,str,str,str,str,list[str],str,str,str,i32,i32,str
0,46372662,"""Is there something in an error…","""<p>I had an application where …","""2017-09-22 20:21:24.467000+00:…","""2017-09-25 14:54:02.053000+00:…","[""go"", ""runtime-error""]","""""","""blog""","""en""",0,0,"""Stack Overflow"""
1,46495883,"""Appending a string at the end …","""<p>I'm relatively new to Java …","""2017-09-29 19:40:39.813000+00:…","""2017-10-29 00:07:23.610000+00:…","[""java"", ""string"", … ""string-concatenation""]","""""","""blog""","""en""",0,0,"""Stack Overflow"""
2,46379422,"""How do I display a JavaScript …","""<p>I am trying to bring up a j…","""2017-09-23 12:10:02.480000+00:…","""2017-09-25 02:29:41.053000+00:…","[""javascript"", ""php"", ""html""]","""""","""blog""","""en""",0,0,"""Stack Overflow"""
3,46274943,"""Pagination of nested objects i…","""<p>I am starting out with angu…","""2017-09-18 08:38:08.777000+00:…","""2017-09-18 14:49:35.607000+00:…","[""angularjs"", ""pagination""]","""""","""blog""","""en""",0,0,"""Stack Overflow"""
4,46271242,"""You do not have permission to …","""<p>I'm trying to write a progr…","""2017-09-18 03:37:38.540000+00:…","""2017-09-18 17:17:28.080000+00:…","[""python"", ""python-2.7"", … ""notepad""]","""""","""blog""","""en""",0,0,"""Stack Overflow"""
5,46187761,"""what does this error mean (jqu…","""<p>its been a while since I us…","""2017-09-13 02:08:04.683000+00:…","""2017-09-13 02:31:11.050000+00:…","[""javascript"", ""jquery""]","""""","""blog""","""en""",0,0,"""Stack Overflow"""
6,46362227,"""Passing a type specifier to a …","""<p>I am trying to replace a Ma…","""2017-09-22 10:12:18.853000+00:…","""2017-09-22 14:02:40.967000+00:…","[""c++"", ""templates""]","""""","""blog""","""en""",0,0,"""Stack Overflow"""
7,46464557,"""View [layouts.default] not fou…","""<p>I am trying to run my larav…","""2017-09-28 08:34:46.317000+00:…","""2017-09-28 10:06:19.807000+00:…","[""laravel"", ""layout"", ""view""]","""""","""blog""","""en""",0,0,"""Stack Overflow"""
8,46467915,"""Volley Error Response Null""","""<p>I'm first time using JSONVo…","""2017-09-28 11:22:40.257000+00:…","""2017-09-28 12:36:37.517000+00:…","[""android"", ""json"", ""android-volley""]","""""","""blog""","""en""",0,0,"""Stack Overflow"""
9,46220852,"""Why does a text box appear beh…","""<p>I am writing an app that re…","""2017-09-14 13:45:29.197000+00:…","""2017-09-14 14:27:00.727000+00:…","[""android"", ""bluetooth"", ""android-ble""]","""""","""blog""","""en""",0,0,"""Stack Overflow"""


In [110]:
from bs4 import BeautifulSoup
import re

In [111]:
def personal_extractor(text: str):

    if not isinstance(text, str):
        return ""
   
    soup = BeautifulSoup(text, "html.parser")
   
    for pre_tag in soup.find_all("pre"):
        pre_tag.decompose()
    
    for code_tag in soup.find_all("code"):
        code_tag.decompose()
    
    text_content = soup.get_text(separator=" ", strip=True)

    text_content = re.sub(r'\s+', " ", text_content).strip()
    return text_content


In [113]:
df_SO = df_SO.with_columns(
    pl.col("content").map_elements(
        personal_extractor,
        return_dtype=str
    ).alias("raw HTML")
).drop(["content"]).rename({"raw HTML": "content"})
df_SO.head(10)

Unnamed: 0_level_0,id,title,created_at,modified_at,tags,path,file_type,language,github_stars,view_count,source,content
i64,i64,str,str,str,list[str],str,str,str,i32,i32,str,str
0,46372662,"""Is there something in an error…","""2017-09-22 20:21:24.467000+00:…","""2017-09-25 14:54:02.053000+00:…","[""go"", ""runtime-error""]","""""","""blog""","""en""",0,0,"""Stack Overflow""","""I had an application where I w…"
1,46495883,"""Appending a string at the end …","""2017-09-29 19:40:39.813000+00:…","""2017-10-29 00:07:23.610000+00:…","[""java"", ""string"", … ""string-concatenation""]","""""","""blog""","""en""",0,0,"""Stack Overflow""","""I'm relatively new to Java but…"
2,46379422,"""How do I display a JavaScript …","""2017-09-23 12:10:02.480000+00:…","""2017-09-25 02:29:41.053000+00:…","[""javascript"", ""php"", ""html""]","""""","""blog""","""en""",0,0,"""Stack Overflow""","""I am trying to bring up a java…"
3,46274943,"""Pagination of nested objects i…","""2017-09-18 08:38:08.777000+00:…","""2017-09-18 14:49:35.607000+00:…","[""angularjs"", ""pagination""]","""""","""blog""","""en""",0,0,"""Stack Overflow""","""I am starting out with angular…"
4,46271242,"""You do not have permission to …","""2017-09-18 03:37:38.540000+00:…","""2017-09-18 17:17:28.080000+00:…","[""python"", ""python-2.7"", … ""notepad""]","""""","""blog""","""en""",0,0,"""Stack Overflow""","""I'm trying to write a program …"
5,46187761,"""what does this error mean (jqu…","""2017-09-13 02:08:04.683000+00:…","""2017-09-13 02:31:11.050000+00:…","[""javascript"", ""jquery""]","""""","""blog""","""en""",0,0,"""Stack Overflow""","""its been a while since I used …"
6,46362227,"""Passing a type specifier to a …","""2017-09-22 10:12:18.853000+00:…","""2017-09-22 14:02:40.967000+00:…","[""c++"", ""templates""]","""""","""blog""","""en""",0,0,"""Stack Overflow""","""I am trying to replace a Macro…"
7,46464557,"""View [layouts.default] not fou…","""2017-09-28 08:34:46.317000+00:…","""2017-09-28 10:06:19.807000+00:…","[""laravel"", ""layout"", ""view""]","""""","""blog""","""en""",0,0,"""Stack Overflow""","""I am trying to run my laravel …"
8,46467915,"""Volley Error Response Null""","""2017-09-28 11:22:40.257000+00:…","""2017-09-28 12:36:37.517000+00:…","[""android"", ""json"", ""android-volley""]","""""","""blog""","""en""",0,0,"""Stack Overflow""","""I'm first time using JSONVolle…"
9,46220852,"""Why does a text box appear beh…","""2017-09-14 13:45:29.197000+00:…","""2017-09-14 14:27:00.727000+00:…","[""android"", ""bluetooth"", ""android-ble""]","""""","""blog""","""en""",0,0,"""Stack Overflow""","""I am writing an app that requi…"


In [117]:
import json
with open("./sample_dataset.json", "a") as f:
    df_SO.drop('').write_json(f)

In [120]:
df_github = pl.read_parquet("./train-00000-of-00008.parquet")
df_github.head(5)

repository_name,func_path_in_repository,func_name,whole_func_string,language,func_code_string,func_documentation_string,func_code_url
str,str,str,str,str,str,str,str
"""proycon/pynlpl""","""pynlpl/formats/folia.py""","""AbstractElement.addidsuffix""","""def addidsuffix(self, idsuffix…","""python""","""def addidsuffix(self, idsuffix…","""Appends a suffix to this eleme…","""https://github.com/proycon/pyn…"
"""proycon/pynlpl""","""pynlpl/formats/folia.py""","""AbstractElement.setparents""","""def setparents(self):  …","""python""","""def setparents(self):  …","""Correct all parent relations f…","""https://github.com/proycon/pyn…"
"""proycon/pynlpl""","""pynlpl/formats/folia.py""","""AbstractElement.setdoc""","""def setdoc(self,newdoc):  …","""python""","""def setdoc(self,newdoc):  …","""Set a different document. Usua…","""https://github.com/proycon/pyn…"
"""proycon/pynlpl""","""pynlpl/formats/folia.py""","""AbstractElement.hastext""","""def hastext(self,cls='current'…","""python""","""def hastext(self,cls='current'…","""Does this element have text (o…","""https://github.com/proycon/pyn…"
"""proycon/pynlpl""","""pynlpl/formats/folia.py""","""AbstractElement.hasphon""","""def hasphon(self,cls='current'…","""python""","""def hasphon(self,cls='current'…","""Does this element have phoneti…","""https://github.com/proycon/pyn…"


In [123]:
df_github = df_github.drop(['func_path_in_repository', "repository_name", "whole_func_string","func_code_string"]).rename({
    "language" : 'tags',
    "func_documentation_string": "content",
    "func_code_url": "path",
}).with_columns(
    id = "doc_" + pl.col("func_name"),
    file_type = pl.lit("repository"),
    language = pl.lit("en"),
    github_stars = pl.lit(0),
    view_count = pl.lit(0),
    source = pl.lit("GitHub"),
    title = pl.col("func_name") + "Docstring Documentation"
)
df_github.head(5)

func_name,tags,content,path,id,file_type,language,github_stars,view_count,source,title
str,str,str,str,str,str,str,i32,i32,str,str
"""AbstractElement.addidsuffix""","""python""","""Appends a suffix to this eleme…","""https://github.com/proycon/pyn…","""doc_AbstractElement.addidsuffi…","""repository""","""en""",0,0,"""GitHub""","""AbstractElement.addidsuffixDoc…"
"""AbstractElement.setparents""","""python""","""Correct all parent relations f…","""https://github.com/proycon/pyn…","""doc_AbstractElement.setparents""","""repository""","""en""",0,0,"""GitHub""","""AbstractElement.setparentsDocs…"
"""AbstractElement.setdoc""","""python""","""Set a different document. Usua…","""https://github.com/proycon/pyn…","""doc_AbstractElement.setdoc""","""repository""","""en""",0,0,"""GitHub""","""AbstractElement.setdocDocstrin…"
"""AbstractElement.hastext""","""python""","""Does this element have text (o…","""https://github.com/proycon/pyn…","""doc_AbstractElement.hastext""","""repository""","""en""",0,0,"""GitHub""","""AbstractElement.hastextDocstri…"
"""AbstractElement.hasphon""","""python""","""Does this element have phoneti…","""https://github.com/proycon/pyn…","""doc_AbstractElement.hasphon""","""repository""","""en""",0,0,"""GitHub""","""AbstractElement.hasphonDocstri…"


In [124]:
with open("./test.json", "w") as f:
    df_github.sample(5000).write_json(f)