In [8]:
pip install sklearn

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import uuid
import subprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
from langchain.document_loaders import DirectoryLoader, NotebookLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from utils import clean_and_tokenize

[nltk_data] Downloading package punkt to /home/spanidea/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=200)

In [28]:
def load_and_index_files(repo_path):
    extensions = ['txt', 'md', 'markdown', 'rst', 'py', 'js', 'java', 'c', 'cpp', 'cs', 'go', 'rb', 'php', 'scala', 'html', 'htm', 'xml', 'json', 'yaml', 'yml', 'ini', 'toml', 'cfg', 'conf', 'sh', 'bash', 'css', 'scss', 'sql', 'gitignore', 'dockerignore', 'editorconfig', 'ipynb']

    file_type_counts = {}
    documents_dict = {}

    for ext in extensions:
        glob_pattern = f'**/*.{ext}'
        #print(glob_pattern)
        try:
            loader = None
            if ext == 'ipynb':
                loader = NotebookLoader(str(repo_path), include_outputs=True, max_output_length=20, remove_newline=True)
            else:
                loader = DirectoryLoader(repo_path, glob=glob_pattern)

            loaded_documents = loader.load() if callable(loader.load) else []
            print(loaded_documents)
            print('1')
            if loaded_documents:
                file_type_counts[ext] = len(loaded_documents)
                for doc in loaded_documents:
                    file_path = doc.metadata['source']
                    relative_path = os.path.relpath(file_path, repo_path)
                    file_id = str(uuid.uuid4())
                    doc.metadata['source'] = relative_path
                    doc.metadata['file_id'] = file_id

                    documents_dict[file_id] = doc
            print('2')
        except Exception as e:
            print(f"Error loading files with pattern '{glob_pattern}': {e}")
            continue

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=200)

    split_documents = []
    for file_id, original_doc in documents_dict.items():
        split_docs = text_splitter.split_documents([original_doc])
        for split_doc in split_docs:
            split_doc.metadata['file_id'] = original_doc.metadata['file_id']
            split_doc.metadata['source'] = original_doc.metadata['source']

        split_documents.extend(split_docs)

    index = None
    if split_documents:
        tokenized_documents = [clean_and_tokenize(doc.page_content) for doc in split_documents]
        index = BM25Okapi(tokenized_documents)
    return index, split_documents, file_type_counts, [doc.metadata['source'] for doc in split_documents]

In [29]:
def clone_github_repo(github_url, local_path):
    try:
        subprocess.run(['git', 'clone', github_url, local_path], check=True)
        return True
    except subprocess.CalledProcessError as e:
        print(f"Failed to clone repository: {e}")
        return False


In [30]:
import tempfile
github_url='https://github.com/riffs14/blog.git' 


with tempfile.TemporaryDirectory() as local_path:
            #print(clone_github_repo(github_url, local_path))
            #input()
            if clone_github_repo(github_url, local_path):
                print("local : ",local_path)
                #input()
                index, documents, file_type_counts, filenames = load_and_index_files(local_path)

Cloning into '/tmp/tmpnmtcesbk'...
The MIME type of '/tmp/tmpnmtcesbk/src/app/app.component.css' is 'inode/x-empty'. This file type is not currently supported in unstructured.


local :  /tmp/tmpnmtcesbk
[]
1
2
[Document(page_content='Blog\n\nThis project was generated with Angular CLI version 11.0.1.\n\nDevelopment server\n\nRun ng serve for a dev server. Navigate to http://localhost:4200/. The app will automatically reload if you change any of the source files.\n\nCode scaffolding\n\nRun ng generate component component-name to generate a new component. You can also use ng generate directive|pipe|service|class|guard|interface|enum|module.\n\nBuild\n\nRun ng build to build the project. The build artifacts will be stored in the dist/ directory. Use the --prod flag for a production build.\n\nRunning unit tests\n\nRun ng test to execute the unit tests via Karma.\n\nRunning end-to-end tests\n\nRun ng e2e to execute the end-to-end tests via Protractor.\n\nFurther help\n\nTo get more help on the Angular CLI use ng help or go check out the Angular CLI Overview and Command Reference page.', metadata={'source': '/tmp/tmpnmtcesbk/README.md'})]
1
2
[]
1
2
[]
1
2
[]
1
2
[

In [6]:
# !pip install curl

In [7]:
# tt=!curl https://api.github.com/users/riffs14/repos
import os
result = os.popen("curl https://api.github.com/users/riffs14/repos ").read()
#ruby -rubygems -e 'require "json"; JSON.load(STDIN.read).each { |repo| %x[git clone #{repo["ssh_url"]} ]}'
result

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  151k    0  151k    0     0   132k      0 --:--:--  0:00:01 --:--:--  132k


'[\n  {\n    "id": 345369398,\n    "node_id": "MDEwOlJlcG9zaXRvcnkzNDUzNjkzOTg=",\n    "name": "blog",\n    "full_name": "riffs14/blog",\n    "private": false,\n    "owner": {\n      "login": "riffs14",\n      "id": 41971843,\n      "node_id": "MDQ6VXNlcjQxOTcxODQz",\n      "avatar_url": "https://avatars.githubusercontent.com/u/41971843?v=4",\n      "gravatar_id": "",\n      "url": "https://api.github.com/users/riffs14",\n      "html_url": "https://github.com/riffs14",\n      "followers_url": "https://api.github.com/users/riffs14/followers",\n      "following_url": "https://api.github.com/users/riffs14/following{/other_user}",\n      "gists_url": "https://api.github.com/users/riffs14/gists{/gist_id}",\n      "starred_url": "https://api.github.com/users/riffs14/starred{/owner}{/repo}",\n      "subscriptions_url": "https://api.github.com/users/riffs14/subscriptions",\n      "organizations_url": "https://api.github.com/users/riffs14/orgs",\n      "repos_url": "https://api.github.com/users

In [3]:
result


''

In [8]:
result
import json
json_object = json.loads(result)

In [9]:
json_object[0]['visibility']

'public'

In [10]:
json_object[0]['html_url']

'https://github.com/riffs14/blog'

In [17]:
for count,i in enumerate(json_object):
    print(count)
    if i['visibility']=='public':
        print(i['html_url'])

0
https://github.com/riffs14/blog
1
https://github.com/riffs14/Computer_Vision
2
https://github.com/riffs14/demo-repo
3
https://github.com/riffs14/discover
4
https://github.com/riffs14/FirstProject
5
https://github.com/riffs14/fork
6
https://github.com/riffs14/Game15
7
https://github.com/riffs14/geckodriver
8
https://github.com/riffs14/GeeksForGeeks
9
https://github.com/riffs14/git_totorial
10
https://github.com/riffs14/google-research
11
https://github.com/riffs14/image_and_files
12
https://github.com/riffs14/LeetCode
13
https://github.com/riffs14/M.Tech_MTP
14
https://github.com/riffs14/mega-front
15
https://github.com/riffs14/ML
16
https://github.com/riffs14/ML-2
17
https://github.com/riffs14/NLU
18
https://github.com/riffs14/nodekb
19
https://github.com/riffs14/Portfolio
20
https://github.com/riffs14/proctor
21
https://github.com/riffs14/pytorch-examples
22
https://github.com/riffs14/pytorchTutorial
23
https://github.com/riffs14/Real-Time-Voice-Cloning
24
https://github.com/riffs14

In [16]:
count

27