# Data Scrapping notebook for devsearch engine

This notebook is used to scrape and explore data to form the dataset for the devsearch engine. It includes functions to scrape data from various sources, clean it, and prepare it for analysis.

In [93]:
import polars as pl
from trafilatura.spider import focused_crawler
from trafilatura.sitemaps import sitemap_search
from trafilatura import bare_extraction

In [17]:
to_visit, known_links = focused_crawler("https://docs.python.org/3.10/", max_seen_urls = 1, max_known_urls=15)
to_visit

['https://docs.python.org/search.html',
 'https://docs.python.org/3.4/',
 'https://docs.python.org/contents.html',
 'https://docs.python.org/3.11/',
 'https://docs.python.org/about.html',
 'https://docs.python.org/3.2/',
 'https://docs.python.org/download.html',
 'https://docs.python.org/extending/index.html',
 'https://docs.python.org/3.6/',
 'https://docs.python.org/3.9/',
 'https://docs.python.org/3.7/',
 'https://docs.python.org/bugs.html',
 'https://docs.python.org/2.6/',
 'https://docs.python.org/glossary.html',
 'https://docs.python.org/genindex.html',
 'https://docs.python.org/distributing/index.html',
 'https://docs.python.org/3.5/',
 'https://docs.python.org/copyright.html',
 'https://docs.python.org/faq/index.html',
 'https://docs.python.org/reference/index.html',
 'https://docs.python.org/whatsnew/3.13.html',
 'https://docs.python.org/license.html',
 'https://docs.python.org/whatsnew/index.html',
 'https://docs.python.org/c-api/index.html',
 'https://docs.python.org/3.14/',

In [None]:
https://scikit-learn.org/stable/user_guide.html

In [31]:
from random import sample

In [62]:
kubernetes_url=sitemap_search("https://kubernetes.io", target_lang="fr")
python_url = sitemap_search("https://docs.python.org")
js_urls = sitemap_search("https://developer.mozilla.org", target_lang="fr")


In [63]:
site_list = ["docs.docker.com", "docs.djangoproject.com", "kubernetes.io", "docs.pytorch.org/docs/stable/index.html", "nextjs.org/sitemap.xml", "scikit-learn.org/stable/"]
url_list = []
for site in site_list:
    urls = sitemap_search(f"https://{site}", max_sitemaps=10)
    try :
        samples = sample(urls, 10)
    except ValueError as e:
        print("Error caught, not sampling")
        samples = urls

    url_list.extend(urls)
url_list

ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://docs.pytorch.org/sitemap_news.xml
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://docs.pytorch.org/sitemap_index.xml
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://docs.pytorch.org/sitemap
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://docs.pytorch.org/sitemap.xml.gz
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://docs.pytorch.org/sitemap.xml


Error caught, not sampling


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://scikit-learn.org/robots.txt
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://scikit-learn.org/sitemap_news.xml
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://scikit-learn.org/sitemap_index.xml
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://scikit-learn.org/sitemap
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://scikit-learn.org/sitemap.xml.gz
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://scikit-learn.org/sitemap.xml


Error caught, not sampling


['https://docs.docker.com/reference/api/hub/latest/',
 'https://docs.docker.com/get-started/introduction/get-docker-desktop/',
 'https://docs.docker.com/get-started/docker-concepts/running-containers/publishing-ports/',
 'https://docs.docker.com/get-started/docker-concepts/building-images/understanding-image-layers/',
 'https://docs.docker.com/get-started/introduction/develop-with-containers/',
 'https://docs.docker.com/reference/api/hub/latest-changelog/',
 'https://docs.docker.com/get-started/docker-concepts/running-containers/overriding-container-defaults/',
 'https://docs.docker.com/get-started/docker-concepts/building-images/writing-a-dockerfile/',
 'https://docs.docker.com/get-started/introduction/build-and-push-first-image/',
 'https://docs.docker.com/get-started/docker-concepts/building-images/build-tag-and-publish-an-image/',
 'https://docs.docker.com/reference/api/hub/deprecated/',
 'https://docs.docker.com/get-started/docker-concepts/running-containers/persisting-container-d

In [64]:
url_list.extend(kubernetes_url)
url_list.extend(js_urls)
url_list.extend(["https://developer.mozilla.org/fr/docs/Web/JavaScript", "https://scikit-learn.org/stable/user_guide.html"])

In [65]:
df = pl.DataFrame({
    "url": url_list
})
df

url
str
"""https://docs.docker.com/refere…"
"""https://docs.docker.com/get-st…"
"""https://docs.docker.com/get-st…"
"""https://docs.docker.com/get-st…"
"""https://docs.docker.com/get-st…"
…
"""https://kubernetes.io/fr/docs/…"
"""https://kubernetes.io/fr/"""
"""https://kubernetes.io/fr/docs/…"
"""https://developer.mozilla.org/…"


In [66]:
def get_language_from_url(url:str) -> str | None :
    try:
        if "python.org" in url: return "Python"
        if "nextjs.org" in url: return "NextJS"
        if "docker.com" in url: return "Docker"
        if "kubernetes" in url: return "Kubernetes"
        if "djangoproject.com" in url: return "Django"
        if "pytorch" in url: return "Pytorch"
        if"scikit-learn.org" in url: return "Scikit-Learn"
        if "mozilla" in url: return "JavaScript"
    except Exception as e:
        raise

In [67]:
df = df.with_columns(
    pl.col("url").map_elements(
        get_language_from_url,
        return_dtype=pl.String
    ).alias("language")
)
df

url,language
str,str
"""https://docs.docker.com/refere…","""Docker"""
"""https://docs.docker.com/get-st…","""Docker"""
"""https://docs.docker.com/get-st…","""Docker"""
"""https://docs.docker.com/get-st…","""Docker"""
"""https://docs.docker.com/get-st…","""Docker"""
…,…
"""https://kubernetes.io/fr/docs/…","""Kubernetes"""
"""https://kubernetes.io/fr/""","""Kubernetes"""
"""https://kubernetes.io/fr/docs/…","""Kubernetes"""
"""https://developer.mozilla.org/…","""JavaScript"""


In [68]:
df.group_by("language").count()

  df.group_by("language").count()


language,count
str,u32
"""JavaScript""",1
"""NextJS""",568
"""Scikit-Learn""",1
"""Kubernetes""",1817
"""Django""",8938
"""Docker""",1209


In [69]:
df.filter(
    pl.int_range(pl.len()).shuffle().over("language") < 10
)

url,language
str,str
"""https://docs.docker.com/get-st…","""Docker"""
"""https://docs.docker.com/guides…","""Docker"""
"""https://docs.docker.com/refere…","""Docker"""
"""https://docs.docker.com/refere…","""Docker"""
"""https://docs.docker.com/docker…","""Docker"""
…,…
"""https://nextjs.org/learn/pages…","""NextJS"""
"""https://nextjs.org/learn/pages…","""NextJS"""
"""https://nextjs.org/learn/seo/i…","""NextJS"""
"""https://developer.mozilla.org/…","""JavaScript"""


In [None]:
def get_crawled_urls(url:str) :
    to_visit, known_links = focused_crawler(url, max_seen_urls = 2, max_known_urls=150)
    return known_links    

In [75]:
df.with_columns(
    pl.col("url").map_elements(
        get_crawled_urls,
        return_dtype=list
    ).alias("crawled_urls")
)
df

KeyboardInterrupt: 

In [59]:
from trafilatura import fetch_url, extract

In [61]:
downloaded = fetch_url("https://docs.python.org/3.13/")
extract(downloaded, output_format="json", with_metadata=True)

'{"title": "Python 3.13 documentation", "author": null, "hostname": "python.org", "date": "2013-04-03", "fingerprint": "10333eedeccee290", "id": null, "license": null, "comments": "", "raw_text": "Python 3.13.4 documentation Welcome! This is the official documentation for Python 3.13.4. Documentation sections: What\'s new in Python 3.13? Or all \\"What\'s new\\" documents since Python 2.0 Tutorial Start here: a tour of Python\'s syntax and features Library reference Standard library and builtins Language reference Syntax and language elements Python setup and usage How to install, configure, and use Python Python HOWTOs In-depth topic manuals Installing Python modules Third-party modules and PyPI.org Distributing Python modules Publishing modules for use by other people Extending and embedding For C/C++ programmers Python\'s C API C API reference FAQs Frequently asked questions (with answers!) Deprecations Deprecated functionality Indices, glossary, and search: Global module index All 

In [171]:
ORDER = ['id',"title", "content", "file_type", "path", "created_at", "modified_at", "github_stars", "view_count","language", "tags", "source"]

In [163]:
df_SO = pl.read_csv("./SO_sample.csv", separator=";", try_parse_dates=True)
df_SO.head(10)

Unnamed: 0_level_0,id,title,body,comment_count,creation_date,last_activity_date,tags
i64,i64,str,str,i64,"datetime[μs, UTC]","datetime[μs, UTC]",str
0,42067644,"""Data sharing (one way communic…","""<p>Consider a data being gener…",11,2017-02-06 12:18:31.570 UTC,2017-02-06 15:08:30.253 UTC,"""android|service|usb|background…"
1,41695757,"""Error encountered while pushin…","""<p>When i try to push any git …",11,2017-01-17 11:18:08.233 UTC,2017-01-17 12:45:51.670 UTC,"""git|visual-studio|visual-studi…"
2,41901824,"""Seems like a bug in Azure SQL …","""<p>Below is the script I am ru…",11,2017-01-27 19:42:52.373 UTC,2017-02-25 16:40:22.680 UTC,"""sql|sql-server|azure|azure-sql…"
3,42122582,"""Android deep link gives web pa…","""<p>I'm trying to share a link …",11,2017-02-08 20:03:58.070 UTC,2019-12-18 04:20:18.570 UTC,"""android|deep-linking"""
4,42082800,"""segfault when indexing [] into…","""<p>So my unordered map is a ma…",11,2017-02-07 06:02:19.583 UTC,2017-02-07 06:18:36.600 UTC,"""c++"""
5,41808100,"""Read file as hex in C""","""<p>This is my current code:</p…",11,2017-01-23 13:59:24.080 UTC,2017-01-23 16:05:32.263 UTC,"""c|file"""
6,41993460,"""500 Internal Server Error. mal…","""<p>Kindly help me, </p> <p>M…",11,2017-02-02 02:54:12.757 UTC,2017-02-02 02:54:12.757 UTC,"""php|apache|laravel|laravel-5.3"""
7,41973876,"""Strange behavior while calling…","""<p>I am trying to use REFPROPs…",11,2017-02-01 07:09:25.650 UTC,2017-02-02 09:33:52.140 UTC,"""fortran|gfortran"""
8,41816885,"""Copying the lines from a file …","""<p>I'm trying to copy the file…",11,2017-01-23 22:34:28.947 UTC,2017-01-28 13:55:57.107 UTC,"""c"""
9,41794627,"""Can't create cron job in Googl…","""<p>When I deploy my applicatio…",11,2017-01-22 18:28:39.233 UTC,2017-03-07 00:20:34.963 UTC,"""java|google-app-engine|cron"""


In [164]:
df_SO = df_SO.with_columns(
    pl.col("tags").str.split(by="|").alias("tag_list")
)
df_SO

Unnamed: 0_level_0,id,title,body,comment_count,creation_date,last_activity_date,tags,tag_list
i64,i64,str,str,i64,"datetime[μs, UTC]","datetime[μs, UTC]",str,list[str]
0,42067644,"""Data sharing (one way communic…","""<p>Consider a data being gener…",11,2017-02-06 12:18:31.570 UTC,2017-02-06 15:08:30.253 UTC,"""android|service|usb|background…","[""android"", ""service"", … ""background-process""]"
1,41695757,"""Error encountered while pushin…","""<p>When i try to push any git …",11,2017-01-17 11:18:08.233 UTC,2017-01-17 12:45:51.670 UTC,"""git|visual-studio|visual-studi…","[""git"", ""visual-studio"", … ""tfs-2015""]"
2,41901824,"""Seems like a bug in Azure SQL …","""<p>Below is the script I am ru…",11,2017-01-27 19:42:52.373 UTC,2017-02-25 16:40:22.680 UTC,"""sql|sql-server|azure|azure-sql…","[""sql"", ""sql-server"", … ""data-warehouse""]"
3,42122582,"""Android deep link gives web pa…","""<p>I'm trying to share a link …",11,2017-02-08 20:03:58.070 UTC,2019-12-18 04:20:18.570 UTC,"""android|deep-linking""","[""android"", ""deep-linking""]"
4,42082800,"""segfault when indexing [] into…","""<p>So my unordered map is a ma…",11,2017-02-07 06:02:19.583 UTC,2017-02-07 06:18:36.600 UTC,"""c++""","[""c++""]"
…,…,…,…,…,…,…,…,…
4995,39697065,"""How to display downloaded file…","""<p>In one of my mvc4 applicati…",16,2016-09-26 07:33:20.363 UTC,2016-09-26 09:30:45.770 UTC,"""jquery|asp.net-mvc|asp.net-mvc…","[""jquery"", ""asp.net-mvc"", ""asp.net-mvc-4""]"
4996,39769970,"""Inserting logo image into appl…","""<p>I have been trying and tryi…",16,2016-09-29 12:10:48.987 UTC,2016-09-29 13:13:43.700 UTC,"""android|imageview""","[""android"", ""imageview""]"
4997,39708082,"""What problem or threat does co…","""<p>We use GitHub and we have a…",16,2016-09-26 16:40:35.873 UTC,2018-11-11 12:02:59.420 UTC,"""git|git-commit|sign""","[""git"", ""git-commit"", ""sign""]"
4998,39455252,"""Variable-Length Arrays in c pr…","""<p>I want to create an variabl…",16,2016-09-12 16:59:04.300 UTC,2016-09-12 17:54:25.417 UTC,"""c|arrays|variable-length-array""","[""c"", ""arrays"", ""variable-length-array""]"


In [165]:
df_SO = df_SO.drop(["comment_count", "tags", '']).rename({
    "creation_date": "created_at",
    "last_activity_date": "modified_at",
    "tag_list": "tags",
    "body" : "content"
}
).with_columns(
    path = pl.lit(""),
    file_type = pl.lit("blog"),
    language = pl.lit("en"),
    github_stars = pl.lit(0),
    view_count = pl.lit(0),
    source = pl.lit("Stack Overflow"),
)
df_SO.head(10)

id,title,content,created_at,modified_at,tags,path,file_type,language,github_stars,view_count,source
i64,str,str,"datetime[μs, UTC]","datetime[μs, UTC]",list[str],str,str,str,i32,i32,str
42067644,"""Data sharing (one way communic…","""<p>Consider a data being gener…",2017-02-06 12:18:31.570 UTC,2017-02-06 15:08:30.253 UTC,"[""android"", ""service"", … ""background-process""]","""""","""blog""","""en""",0,0,"""Stack Overflow"""
41695757,"""Error encountered while pushin…","""<p>When i try to push any git …",2017-01-17 11:18:08.233 UTC,2017-01-17 12:45:51.670 UTC,"[""git"", ""visual-studio"", … ""tfs-2015""]","""""","""blog""","""en""",0,0,"""Stack Overflow"""
41901824,"""Seems like a bug in Azure SQL …","""<p>Below is the script I am ru…",2017-01-27 19:42:52.373 UTC,2017-02-25 16:40:22.680 UTC,"[""sql"", ""sql-server"", … ""data-warehouse""]","""""","""blog""","""en""",0,0,"""Stack Overflow"""
42122582,"""Android deep link gives web pa…","""<p>I'm trying to share a link …",2017-02-08 20:03:58.070 UTC,2019-12-18 04:20:18.570 UTC,"[""android"", ""deep-linking""]","""""","""blog""","""en""",0,0,"""Stack Overflow"""
42082800,"""segfault when indexing [] into…","""<p>So my unordered map is a ma…",2017-02-07 06:02:19.583 UTC,2017-02-07 06:18:36.600 UTC,"[""c++""]","""""","""blog""","""en""",0,0,"""Stack Overflow"""
41808100,"""Read file as hex in C""","""<p>This is my current code:</p…",2017-01-23 13:59:24.080 UTC,2017-01-23 16:05:32.263 UTC,"[""c"", ""file""]","""""","""blog""","""en""",0,0,"""Stack Overflow"""
41993460,"""500 Internal Server Error. mal…","""<p>Kindly help me, </p> <p>M…",2017-02-02 02:54:12.757 UTC,2017-02-02 02:54:12.757 UTC,"[""php"", ""apache"", … ""laravel-5.3""]","""""","""blog""","""en""",0,0,"""Stack Overflow"""
41973876,"""Strange behavior while calling…","""<p>I am trying to use REFPROPs…",2017-02-01 07:09:25.650 UTC,2017-02-02 09:33:52.140 UTC,"[""fortran"", ""gfortran""]","""""","""blog""","""en""",0,0,"""Stack Overflow"""
41816885,"""Copying the lines from a file …","""<p>I'm trying to copy the file…",2017-01-23 22:34:28.947 UTC,2017-01-28 13:55:57.107 UTC,"[""c""]","""""","""blog""","""en""",0,0,"""Stack Overflow"""
41794627,"""Can't create cron job in Googl…","""<p>When I deploy my applicatio…",2017-01-22 18:28:39.233 UTC,2017-03-07 00:20:34.963 UTC,"[""java"", ""google-app-engine"", ""cron""]","""""","""blog""","""en""",0,0,"""Stack Overflow"""


In [166]:
from bs4 import BeautifulSoup
import re

In [167]:
def personal_extractor(text: str):

    if not isinstance(text, str):
        return ""
   
    soup = BeautifulSoup(text, "html.parser")
   
    for pre_tag in soup.find_all("pre"):
        pre_tag.decompose()
    
    for code_tag in soup.find_all("code"):
        code_tag.decompose()
    
    text_content = soup.get_text(separator=" ", strip=True)

    text_content = re.sub(r'\s+', " ", text_content).strip()
    return text_content


In [173]:
df_SO = df_SO.with_columns([
    pl.col("content").map_elements(
        personal_extractor,
        return_dtype=str
    ).alias("raw HTML"),
    pl.col("id").map_elements(
        str,
        return_dtype=str
    )]
).drop(["content"]).rename({"raw HTML": "content"})
df_SO.head(10)

Expr.map_elements is significantly slower than the native expressions API.
Only use if you absolutely CANNOT implement your logic otherwise.
Replace this expression...
  - pl.col("id").map_elements(str)
with this one instead:
  + pl.col("id").cast(pl.String)

  pl.col("id").map_elements(

If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  soup = BeautifulSoup(text, "html.parser")


id,title,created_at,modified_at,tags,path,file_type,language,github_stars,view_count,source,content
str,str,"datetime[μs, UTC]","datetime[μs, UTC]",list[str],str,str,str,i32,i32,str,str
"""42067644""","""Data sharing (one way communic…",2017-02-06 12:18:31.570 UTC,2017-02-06 15:08:30.253 UTC,"[""android"", ""service"", … ""background-process""]","""""","""blog""","""en""",0,0,"""Stack Overflow""","""Consider a data being generate…"
"""41695757""","""Error encountered while pushin…",2017-01-17 11:18:08.233 UTC,2017-01-17 12:45:51.670 UTC,"[""git"", ""visual-studio"", … ""tfs-2015""]","""""","""blog""","""en""",0,0,"""Stack Overflow""","""When i try to push any git com…"
"""41901824""","""Seems like a bug in Azure SQL …",2017-01-27 19:42:52.373 UTC,2017-02-25 16:40:22.680 UTC,"[""sql"", ""sql-server"", … ""data-warehouse""]","""""","""blog""","""en""",0,0,"""Stack Overflow""","""Below is the script I am runni…"
"""42122582""","""Android deep link gives web pa…",2017-02-08 20:03:58.070 UTC,2019-12-18 04:20:18.570 UTC,"[""android"", ""deep-linking""]","""""","""blog""","""en""",0,0,"""Stack Overflow""","""I'm trying to share a link for…"
"""42082800""","""segfault when indexing [] into…",2017-02-07 06:02:19.583 UTC,2017-02-07 06:18:36.600 UTC,"[""c++""]","""""","""blog""","""en""",0,0,"""Stack Overflow""","""So my unordered map is a mappi…"
"""41808100""","""Read file as hex in C""",2017-01-23 13:59:24.080 UTC,2017-01-23 16:05:32.263 UTC,"[""c"", ""file""]","""""","""blog""","""en""",0,0,"""Stack Overflow""","""This is my current code: It wo…"
"""41993460""","""500 Internal Server Error. mal…",2017-02-02 02:54:12.757 UTC,2017-02-02 02:54:12.757 UTC,"[""php"", ""apache"", … ""laravel-5.3""]","""""","""blog""","""en""",0,0,"""Stack Overflow""","""Kindly help me, My Ubuntu Serv…"
"""41973876""","""Strange behavior while calling…",2017-02-01 07:09:25.650 UTC,2017-02-02 09:33:52.140 UTC,"[""fortran"", ""gfortran""]","""""","""blog""","""en""",0,0,"""Stack Overflow""","""I am trying to use REFPROPs su…"
"""41816885""","""Copying the lines from a file …",2017-01-23 22:34:28.947 UTC,2017-01-28 13:55:57.107 UTC,"[""c""]","""""","""blog""","""en""",0,0,"""Stack Overflow""","""I'm trying to copy the file it…"
"""41794627""","""Can't create cron job in Googl…",2017-01-22 18:28:39.233 UTC,2017-03-07 00:20:34.963 UTC,"[""java"", ""google-app-engine"", ""cron""]","""""","""blog""","""en""",0,0,"""Stack Overflow""","""When I deploy my application w…"


In [None]:
import json
with open("./test.json", "a") as f:
    df_SO.write_json(f)

In [190]:
df_github = pl.read_parquet("./train-00000-of-00008.parquet")
df_github.head(5)

repository_name,func_path_in_repository,func_name,whole_func_string,language,func_code_string,func_documentation_string,func_code_url
str,str,str,str,str,str,str,str
"""proycon/pynlpl""","""pynlpl/formats/folia.py""","""AbstractElement.addidsuffix""","""def addidsuffix(self, idsuffix…","""python""","""def addidsuffix(self, idsuffix…","""Appends a suffix to this eleme…","""https://github.com/proycon/pyn…"
"""proycon/pynlpl""","""pynlpl/formats/folia.py""","""AbstractElement.setparents""","""def setparents(self):  …","""python""","""def setparents(self):  …","""Correct all parent relations f…","""https://github.com/proycon/pyn…"
"""proycon/pynlpl""","""pynlpl/formats/folia.py""","""AbstractElement.setdoc""","""def setdoc(self,newdoc):  …","""python""","""def setdoc(self,newdoc):  …","""Set a different document. Usua…","""https://github.com/proycon/pyn…"
"""proycon/pynlpl""","""pynlpl/formats/folia.py""","""AbstractElement.hastext""","""def hastext(self,cls='current'…","""python""","""def hastext(self,cls='current'…","""Does this element have text (o…","""https://github.com/proycon/pyn…"
"""proycon/pynlpl""","""pynlpl/formats/folia.py""","""AbstractElement.hasphon""","""def hasphon(self,cls='current'…","""python""","""def hasphon(self,cls='current'…","""Does this element have phoneti…","""https://github.com/proycon/pyn…"


In [191]:
from datetime import datetime, timezone

df_github = df_github.drop(['func_path_in_repository', "repository_name", "whole_func_string","func_code_string"]).rename({
    "func_documentation_string": "content",
    "func_code_url": "path",
}).with_columns(
    id = "doc_" + pl.col("func_name"),
    tags = pl.col("language").map_elements(lambda lang: [lang] if lang else [], return_dtype=pl.List(pl.String)),
    file_type = pl.lit("repository"),
    language = pl.lit("en"),
    github_stars = pl.lit(0),
    view_count = pl.lit(0),
    source = pl.lit("GitHub"),
    title = "Docstring Documentation of " + pl.col("func_name"),
    created_at = pl.lit(datetime.now(timezone.utc)),
    modified_at = pl.lit(datetime.now(timezone.utc))
).drop(['func_name'])
df_github.head(5)

language,content,path,id,tags,file_type,github_stars,view_count,source,title,created_at,modified_at
str,str,str,str,list[str],str,i32,i32,str,str,"datetime[μs, UTC]","datetime[μs, UTC]"
"""en""","""Appends a suffix to this eleme…","""https://github.com/proycon/pyn…","""doc_AbstractElement.addidsuffi…","[""python""]","""repository""",0,0,"""GitHub""","""Docstring Documentation of Abs…",2025-06-06 13:05:12.578753 UTC,2025-06-06 13:05:12.578820 UTC
"""en""","""Correct all parent relations f…","""https://github.com/proycon/pyn…","""doc_AbstractElement.setparents""","[""python""]","""repository""",0,0,"""GitHub""","""Docstring Documentation of Abs…",2025-06-06 13:05:12.578753 UTC,2025-06-06 13:05:12.578820 UTC
"""en""","""Set a different document. Usua…","""https://github.com/proycon/pyn…","""doc_AbstractElement.setdoc""","[""python""]","""repository""",0,0,"""GitHub""","""Docstring Documentation of Abs…",2025-06-06 13:05:12.578753 UTC,2025-06-06 13:05:12.578820 UTC
"""en""","""Does this element have text (o…","""https://github.com/proycon/pyn…","""doc_AbstractElement.hastext""","[""python""]","""repository""",0,0,"""GitHub""","""Docstring Documentation of Abs…",2025-06-06 13:05:12.578753 UTC,2025-06-06 13:05:12.578820 UTC
"""en""","""Does this element have phoneti…","""https://github.com/proycon/pyn…","""doc_AbstractElement.hasphon""","[""python""]","""repository""",0,0,"""GitHub""","""Docstring Documentation of Abs…",2025-06-06 13:05:12.578753 UTC,2025-06-06 13:05:12.578820 UTC


In [134]:
with open("../sample_data/sample_dataset.json", "a") as f:
    df_github.sample(5000).write_json(f)

In [161]:
set(df_SO.columns).difference(set(df_github.columns))

{''}

In [192]:
final = pl.concat([df_github.sample(5000).select(ORDER), df_SO.select(ORDER)])

In [193]:
with open("../sample_data/sample_dataset.json", "a") as f:
    final.write_json(f)