In [1]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("regularpooria/llm_generated_code_snippets")

Resolving data files:   0%|          | 0/25 [00:00<?, ?it/s]

In [2]:
import json
import ast
import os

In [3]:

def extract_imports(source_code: str):
    tree = ast.parse(source_code)
    libs = []

    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            for alias in node.names:
                libs.append(alias.name.split('.')[0])

        elif isinstance(node, ast.ImportFrom):
            if node.level == 0 and node.module:
                libs.append(node.module.split('.')[0])

    return sorted(set(lib for lib in libs if lib != "__future__"))


In [4]:
python_rows = list(filter(lambda x: x["language"] == "Python", ds["train"]))
for idx, row in enumerate(python_rows):
    python_rows[idx]["libraries"] = extract_imports(row["code"])

In [5]:
python_rows[0]

{'conversation_hash': '0fff51c4307e569be97a912d71e0d44c',
 'code_index': 0,
 'language': 'Python',
 'libraries': ['selenium'],
 'code': 'from selenium import webdriver\nfrom selenium.webdriver.common.alert import Alert\n\n\ndef handle_alert(browser):\n    try:\n        # Wait for alert to appear\n        alert = browser.switch_to.alert\n        # Accept the alert\n        alert.accept()\n    except Exception as e:\n        # No alert found or any other exception\n        print("No alert found or exception occurred:", e)\n        pass\n\n\n# Create a webdriver instance\ndriver = webdriver.Chrome()\n\n# Navigate to the desired URL\ndriver.get("http://your_website_url.com")\n\n# Call the alert handling function\nhandle_alert(driver)\n\n# Continue to perform your tasks\n# ...\n\n# Close the driver\ndriver.quit()',
 'filename': '0fff51c4307e569be97a912d71e0d44c_0.py'}

In [6]:
libraries = {}
conversation_hashes = {}
for row in python_rows:
    for library in row["libraries"]:
        if library in libraries:
            libraries[library] += 1
        else:
            libraries[library] = 1
            
        if library in conversation_hashes:
            conversation_hashes[library].append(row["conversation_hash"])
        else:
            conversation_hashes[library] = [row["conversation_hash"]]


In [12]:
libraries

{'selenium': 559,
 'cv2': 1409,
 'bs4': 861,
 'os': 3456,
 'requests': 2522,
 'urllib': 246,
 'first_order_model_demo': 1,
 'imageio': 22,
 'numpy': 6155,
 'pandas': 5266,
 'skimage': 63,
 'torch': 1597,
 'torchvision': 181,
 'tqdm': 126,
 'transformers': 669,
 'zipfile': 109,
 'typing': 280,
 'solution': 3,
 'unittest': 53,
 'pytesseract': 74,
 'PyQt5': 571,
 'sys': 1208,
 'googlesearch': 5,
 'pygame': 513,
 'webbrowser': 37,
 'datetime': 1252,
 'json': 1628,
 'plyer': 14,
 'time': 1780,
 'alpha_vantage': 3,
 'sklearn': 2082,
 'gensim': 78,
 'keras': 413,
 'Sastrawi': 10,
 'nltk': 187,
 're': 1152,
 'silero_speech_synthesis': 1,
 'omegaconf': 2,
 'openai_simplified_tts': 1,
 'tts': 1,
 'math': 835,
 'random': 2020,
 'torchtext': 22,
 'socket': 446,
 'threading': 512,
 'base64': 192,
 'asyncio': 495,
 'websockets': 18,
 'soundfile': 34,
 'torchaudio': 51,
 'pydub': 164,
 'gym': 41,
 'collections': 467,
 'matplotlib': 2703,
 'tensorflow': 1105,
 'logging': 305,
 'aws_cdk': 9,
 'scipy': 

In [7]:
os.makedirs("tmp", exist_ok=True)
with open("tmp/python_imports_list_hash.json", "w", encoding="utf-8") as f:
    json.dump(conversation_hashes, f)

In [17]:
os.makedirs("tmp", exist_ok=True)
with open("tmp/python_imports_list.json", "w", encoding="utf-8") as f:
    json.dump(libraries, f)

In [18]:
data = None
with open("tmp/python_imports_list.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [20]:
with open("utils/internal_modules.txt", "r", encoding="utf-8") as f:
    internal_modules = f.read().splitlines()

In [39]:
import re

with open("utils/simple", "r", encoding="utf-8") as f:
    html = f.read()

libs = re.findall(r'<a href="[^"]+">([^<]+)</a>', html)

print(len(libs))
print(libs[:20])


670523
['0', '0-._.-._.-._.-._.-._.-._.-0', '000', '0.0.1', '00101s', '001-hello-uv', '00-merlin-hu-mcpdemo-pipy', '00print_lol', '00SMALINUX', '0101', '01changer', '01-cuda-pybind11', '01d61084-d29e-11e9-96d1-7c5cf84ffe8e', '01-distributions', '01memories', '01OS', '021', '02122Group14', '024travis-test024', '02exercicio']


In [24]:
data = {k: v for k, v in data.items() if k not in internal_modules}


In [25]:
data

{'selenium': 559,
 'cv2': 1409,
 'bs4': 861,
 'requests': 2522,
 'first_order_model_demo': 1,
 'imageio': 22,
 'numpy': 6155,
 'pandas': 5266,
 'skimage': 63,
 'torch': 1597,
 'torchvision': 181,
 'tqdm': 126,
 'transformers': 669,
 'solution': 3,
 'pytesseract': 74,
 'PyQt5': 571,
 'sys': 1208,
 'googlesearch': 5,
 'pygame': 513,
 'plyer': 14,
 'time': 1780,
 'alpha_vantage': 3,
 'sklearn': 2082,
 'gensim': 78,
 'keras': 413,
 'Sastrawi': 10,
 'nltk': 187,
 'silero_speech_synthesis': 1,
 'omegaconf': 2,
 'openai_simplified_tts': 1,
 'tts': 1,
 'math': 835,
 'torchtext': 22,
 'websockets': 18,
 'soundfile': 34,
 'torchaudio': 51,
 'pydub': 164,
 'gym': 41,
 'matplotlib': 2703,
 'tensorflow': 1105,
 'aws_cdk': 9,
 'scipy': 775,
 'fire': 5,
 'peft': 13,
 'PIL': 1092,
 'plotly': 120,
 'streamlit': 226,
 'tweepy': 22,
 'flask': 885,
 'coinbase': 1,
 'wordcloud': 13,
 'open3d': 38,
 'yfinance': 84,
 'textblob': 10,
 'ta': 30,
 'bpy': 132,
 'openpyxl': 340,
 'xlrd': 9,
 'geopy': 20,
 'noise'

In [40]:
def package_exists_on_pypi(pkg_name: str) -> bool:
    """Return True if pkg_name exists on PyPI."""
    return pkg_name in libs
    # url = f"https://pypi.org/pypi/{pkg_name}/json"
    # headers = {"User-Agent": "module-check-script/1.0 (+your-email@example.com)"}
    # resp = requests.get(url, headers=headers)
    # return resp.status_code == 200



In [42]:
package_exists_on_pypi("opencv-python")

True

In [46]:
results = {}

In [47]:

for mod in data:
    if mod in results:
        pass
    exists = package_exists_on_pypi(mod)
    results[mod] = exists
    print(len(data) - len(results))

1984
1983
1982
1981
1980
1979
1978
1977
1976
1975
1974
1973
1972
1971
1970
1969
1968
1967
1966
1965
1964
1963
1962
1961
1960
1959
1958
1957
1956
1955
1954
1953
1952
1951
1950
1949
1948
1947
1946
1945
1944
1943
1942
1941
1940
1939
1938
1937
1936
1935
1934
1933
1932
1931
1930
1929
1928
1927
1926
1925
1924
1923
1922
1921
1920
1919
1918
1917
1916
1915
1914
1913
1912
1911
1910
1909
1908
1907
1906
1905
1904
1903
1902
1901
1900
1899
1898
1897
1896
1895
1894
1893
1892
1891
1890
1889
1888
1887
1886
1885
1884
1883
1882
1881
1880
1879
1878
1877
1876
1875
1874
1873
1872
1871
1870
1869
1868
1867
1866
1865
1864
1863
1862
1861
1860
1859
1858
1857
1856
1855
1854
1853
1852
1851
1850
1849
1848
1847
1846
1845
1844
1843
1842
1841
1840
1839
1838
1837
1836
1835
1834
1833
1832
1831
1830
1829
1828
1827
1826
1825
1824
1823
1822
1821
1820
1819
1818
1817
1816
1815
1814
1813
1812
1811
1810
1809
1808
1807
1806
1805
1804
1803
1802
1801
1800
1799
1798
1797
1796
1795
1794
1793
1792
1791
1790
1789
1788
1787
1786
1785


In [50]:
results

{'selenium': True,
 'cv2': True,
 'bs4': True,
 'requests': True,
 'first_order_model_demo': False,
 'imageio': True,
 'numpy': True,
 'pandas': True,
 'skimage': True,
 'torch': True,
 'torchvision': True,
 'tqdm': True,
 'transformers': True,
 'solution': False,
 'pytesseract': True,
 'PyQt5': True,
 'sys': False,
 'googlesearch': True,
 'pygame': True,
 'plyer': True,
 'time': True,
 'alpha_vantage': False,
 'sklearn': True,
 'gensim': True,
 'keras': True,
 'Sastrawi': True,
 'nltk': True,
 'silero_speech_synthesis': False,
 'omegaconf': True,
 'openai_simplified_tts': False,
 'tts': False,
 'math': False,
 'torchtext': True,
 'websockets': True,
 'soundfile': True,
 'torchaudio': True,
 'pydub': True,
 'gym': True,
 'matplotlib': True,
 'tensorflow': True,
 'aws_cdk': False,
 'scipy': True,
 'fire': True,
 'peft': True,
 'PIL': True,
 'plotly': True,
 'streamlit': True,
 'tweepy': True,
 'flask': False,
 'coinbase': True,
 'wordcloud': True,
 'open3d': True,
 'yfinance': True,
 't

In [51]:
len()

865

In [54]:
for item in list(filter(lambda x: results[x] == True, results)):
    del data[item]

In [57]:
os.makedirs("tmp", exist_ok=True)
with open("tmp/python_imports_list.json", "w", encoding="utf-8") as f:
    json.dump(data, f)