# Generate Data

In [None]:
import requests
import os
import re
import pandas as pd
from urllib.parse import urlparse, quote
from urllib.parse import urlparse

def parse_markdown_to_csv(md_content, csv_file_path):
    heading_pattern = re.compile(r'^(#+)\s*(.*)', re.MULTILINE)
    headings_contents = []
    current_heading = None
    current_content = []
    
    for line in md_content.split('\n'):
        match = heading_pattern.match(line)
        if match:
            if current_heading is not None:
                headings_contents.append([current_heading, ' '.join(current_content).strip()])
            current_heading = match.group(2).strip()
            current_content = []
        else:
            if line.strip():
                current_content.append(line.strip())
    
    if current_heading is not None:
        headings_contents.append([current_heading, ' '.join(current_content).strip()])
    
    df = pd.DataFrame(headings_contents, columns=['Title', 'Content'])
    df.to_csv(csv_file_path, index=False, encoding='utf-8')

def fetch_and_convert_readme_to_csv(repo_urls, output_dir):
    # Ensure the output directory exists
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # GitHub API endpoint for fetching the contents of the README file
    for url in repo_urls:
        parsed_url = urlparse(url)
        parts = parsed_url.path.strip('/').split('/')
        repo_user, repo_name = parts[0], parts[1]
        api_url = f"https://api.github.com/repos/{repo_user}/{repo_name}/readme"
        
        # Set up appropriate headers for GitHub API including the token for authorization
        headers = {
            'Accept': 'application/vnd.github.v3.raw',
            'Authorization': 'YOUR_GITHUB_TOKEN'  # Replace 'YOUR_GITHUB_TOKEN' with your actual GitHub token
        }
        
        response = requests.get(api_url, headers=headers)
        if response.status_code == 200:
            readme_content = response.text
            csv_file_path = os.path.join(output_dir, f"{repo_name}.csv")
            parse_markdown_to_csv(readme_content, csv_file_path)
            print(f"Processed {repo_name}.csv")
        else:
            print(f"Failed to fetch README for {repo_name}: {response.status_code}")

# Example usage:
repo_urls = [
    'https://github.com/context-labs/autodoc'
]

fetch_and_convert_readme_to_csv(repo_urls, 'output_csv_files')


In [None]:
import requests
import os
import pandas as pd
import base64
from urllib.parse import urlparse

def fetch_and_concatenate_source_code(repo_urls, output_dir, token):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    headers = {
        'Authorization': f'token {token}',
        'Accept': 'application/vnd.github.v3.raw'  # Requests raw content directly
    }

    for url in repo_urls:
        parsed_url = urlparse(url)
        parts = parsed_url.path.strip('/').split('/')
        repo_user, repo_name = parts[0], parts[1]

        # Fetch the default branch
        repo_info_url = f'https://api.github.com/repos/{repo_user}/{repo_name}'
        repo_info_response = requests.get(repo_info_url, headers=headers)
        if repo_info_response.status_code == 200:
            default_branch = repo_info_response.json()['default_branch']
        else:
            print(f'Failed to fetch repo info for {repo_name}: {repo_info_response.status_code}')
            continue

        api_url = f'https://api.github.com/repos/{repo_user}/{repo_name}/git/trees/{default_branch}?recursive=true'
        response = requests.get(api_url, headers={'Authorization': f'token {token}', 'Accept': 'application/vnd.github.v3+json'})
        
        if response.status_code == 200:
            data = response.json()
            all_files_content = []

            for file in data['tree']:
                if file['type'] == 'blob' and file['path'].endswith(('.py', '.c', '.cpp', '.java', '.js', '.ts', '.go')):
                    file_url = f"https://api.github.com/repos/{repo_user}/{repo_name}/contents/{file['path']}?ref={default_branch}"
                    file_response = requests.get(file_url, headers=headers)
                    if file_response.status_code == 200:
                        file_content = file_response.text
                        all_files_content.append(file_content)

            concatenated_content = "\n".join(all_files_content)
            df = pd.DataFrame([concatenated_content], columns=['SourceCode'])
            df.to_csv(os.path.join(output_dir, f'{repo_name}_context.csv'), index=False)
            print(f'Saved {repo_name}_context.csv')
        else:
            print(f'Failed to fetch repository data for {repo_name}: {response.status_code}')

# Example usage:
repo_urls = [
    "https://github.com/context-labs/autodoc"
]
output_directory = 'output_csv_files'
github_token = 'YOUR_GITHUB_TOKEN'  # Replace with your GitHub access token

fetch_and_concatenate_source_code(repo_urls, output_directory, github_token)


In [None]:
import os
import subprocess
import csv
from pathlib import Path
import shutil
import requests
import os
import re
import pandas as pd
from urllib.parse import urlparse, quote
from urllib.parse import urlparse


# Clone repository to a local path
def git_clone(repo_url, clone_path):
    if os.path.exists(clone_path):
        subprocess.run(['rm', '-rf', clone_path], check=True)
    subprocess.run(['git', 'clone', repo_url, clone_path], check=True)

# Parse the README.md content into a CSV
def parse_markdown_to_csv(md_file_path, csv_file_path):
    with open(md_file_path, 'r', encoding='utf-8') as file:
        md_content = file.read()

    heading_pattern = re.compile(r'^(#+)\s*(.*)', re.MULTILINE)
    headings_contents = []
    current_heading = None
    current_content = []

    for line in md_content.split('\n'):
        match = heading_pattern.match(line)
        if match:
            if current_heading is not None:
                headings_contents.append([current_heading, ' '.join(current_content).strip()])
            current_heading = match.group(2).strip()
            current_content = []
        else:
            if line.strip():
                current_content.append(line.strip())

    if current_heading is not None:
        headings_contents.append([current_heading, ' '.join(current_content).strip()])

    df = pd.DataFrame(headings_contents, columns=['Title', 'Content'])
    df.to_csv(csv_file_path, index=False, encoding='utf-8')

# Process a list of GitHub repository URLs
def process_repos(repo_urls, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for url in repo_urls:
        parsed_url = urlparse(url)
        parts = parsed_url.path.strip('/').split('/')
        repo_user, repo_name = parts[0], parts[1]
        clone_path = f"/tmp/{repo_name}"  # Temporary path for cloning
        git_clone(url, clone_path)

        readme_path = os.path.join(clone_path, 'README.md')
        csv_file_path = os.path.join(output_dir, f"{repo_name}.csv")
        if os.path.exists(readme_path):
            parse_markdown_to_csv(readme_path, csv_file_path)
            print(f"Processed {repo_name}.csv")
        else:
            print(f"README.md not found for {repo_name}")

        # Remove the repository directory to clean up
        subprocess.run(['rm', '-rf', clone_path], check=True)


In [None]:
# Replace this list with your own list of 300 URLs
repo_urls = []
output_directory = 'output_csv_files'
process_repos(repo_urls, output_directory)

In [None]:
import os
import subprocess
import csv
from pathlib import Path
import shutil

In [None]:
# Function to clone a GitHub repository and collect all source code into a single string
def collect_source_code(repo_url):
    # Extract the repo name from the URL
    repo_name = repo_url.rstrip('/').split('/')[-1]
    subprocess.run(['git', 'clone', repo_url], check=True)
    
    # Collect all source code files into a single string
    source_code = []
    for root, dirs, files in os.walk(repo_name):
        for file in files:
            # Filter for source code files only (adjust filters as needed)
            if file.endswith(('.py', '.js', '.java', '.cpp', '.c', '.h', '.html', '.css', '.ts', '.go', '.rb', '.php')):
                file_path = os.path.join(root, file)
                with open(file_path, 'r', errors='ignore') as f:
                    source_code.append(f.read())
                    
    # Join all source code files as one big string
    concatenated_code = "\n".join(source_code)
    
    # Delete the repo after extraction
    shutil.rmtree(repo_name)
    
    return repo_name, concatenated_code


In [None]:
# Directory to store CSV files
output_dir = "github_repo_source_code"
os.makedirs(output_dir, exist_ok=True)

# Create a CSV file per GitHub repo
for url in github_urls:
    try:
        repo_name, concatenated_code = collect_source_code(url)
        csv_file_name = f"{repo_name}.csv"
        csv_file_path = os.path.join(output_dir, csv_file_name)
        with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow([concatenated_code])
        print(f"Successfully processed and saved {url} to {csv_file_name}")
    except Exception as e:
        print(f"Error processing {url}: {e}")

print("All repositories processed successfully.")


In [None]:
import os
import subprocess
import csv
from pathlib import Path
import shutil
import requests
import os
import re
import pandas as pd
from urllib.parse import urlparse, quote
from urllib.parse import urlparse

# Function to clone a GitHub repository and collect all source code into a single string
def collect_source_code(repo_url):
    # Extract the repo name from the URL
    repo_name = repo_url.rstrip('/').split('/')[-1]
    subprocess.run(['git', 'clone', repo_url], check=True)
    
    # Collect all source code files into a single string
    source_code = []
    for root, dirs, files in os.walk(repo_name):
        for file in files:
            # Filter for source code files only (adjust filters as needed)
            if file.endswith(('.py', '.js', '.java', '.cpp', '.c', '.h', '.html', '.css', '.ts', '.go', '.rb', '.php')):
                file_path = os.path.join(root, file)
                with open(file_path, 'r', errors='ignore') as f:
                    source_code.append(f.read())
                    
    # Join all source code files as one big string
    concatenated_code = "\n".join(source_code)
    print(type(concatenated_code))
    
    # Delete the repo after extraction
    shutil.rmtree(repo_name)
    
    return repo_name, concatenated_code



In [None]:
# Directory to store CSV files
output_dir = "github_repo_source_code"
os.makedirs(output_dir, exist_ok=True)
for url in github_urls:
    try:
        repo_name, concatenated_code = collect_source_code(url)
        txt_file_name = f"{repo_name}.txt"
        txt_file_name = os.path.join(output_dir, txt_file_name)
        with open(txt_file_name, 'w', encoding='utf-8') as txt_file:
            txt_file.write(concatenated_code)
        print(f"Successfully processed and saved {url} to {txt_file_name}")
    except Exception as e:
        print(f"Error processing {url}: {e}")

print("All repositories processed successfully.")

# HNSWLIB Context Generation

In [None]:
%pip install hnswlib sentence_transformers langchain_text_splitters langdetect

In [None]:
import hnswlib
import numpy as np

def get_context(sentences, embeds, question_embed):
    dim = embeds.shape[1]
    num_elements = embeds.shape[0]

    # Generating sample data
    data = embeds
    ids = np.arange(num_elements)

    # Declaring index
    p = hnswlib.Index(space = 'cosine', dim = dim) # possible options are l2, cosine or ip

    # Initializing index - the maximum number of elements should be known beforehand
    p.init_index(max_elements = num_elements, ef_construction = 200, M = 16)

    # Element insertion (can be called several times):
    p.add_items(data, ids)

    # Controlling the recall by setting ef:
    p.set_ef(50) # ef should always be > k

    # Query dataset, k - number of the closest elements (returns 2 numpy arrays)
    labels, distances = p.knn_query(question_embed, k = 4)

    return "".join([sentences[index] for index in labels[0]])

In [None]:
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter
import pandas as pd
import os
import pickle

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=1000,
    chunk_overlap=100
)

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device = 'cuda:0')

root_dir = "./"
context_root_dir = "./github_repo_source_code/"
readme_root_dir = "./output_csv_files/"

with open('./repo_urls.pickle', 'rb') as f:
    repo_name_list = pickle.load(f)

new_rows = []
for repo in repo_name_list:
    repo_name = repo.split("/")[-1]
    file1 = repo_name +".txt"
    with open(os.path.join(context_root_dir, file1)) as f:
        data = f.read()
    sentences = text_splitter.split_text(data)
    embeddings = model.encode(sentences)
    print(embeddings.shape)

    file2 = repo_name +".csv"
    df2 = pd.read_csv(os.path.join(readme_root_dir, file2))
    for i, row in df2.iterrows():
        title = row["Title"]
        content = row["Content"]
        if "?" in title:
            question = f"In context to the project {repo_name}, answer the following. " + title
            question_embedding = model.encode([question])
            context = get_context(sentences, embeddings, question_embedding)
            new_row  = {"Question": question, "Context": context, "Answer": content, "Repo Url": repo, "Repo": repo_name}
            new_rows.append(new_row)
        else:
            question = f"Provide the README content for the section with heading \"{title}\" starting with ## {title}."
            question_embedding = model.encode([question])
            context = get_context(sentences, embeddings, question_embedding)
            new_row  = {"Question": question, "Context": context, "Answer": content, "Repo Url": repo, "Repo": repo_name}
            new_rows.append(new_row)
    print(len(new_rows))
    df3 = pd.DataFrame(new_rows, index=None)
    df3.to_csv(os.path.join(root_dir, "readme_qa.csv"), mode="a")

# Clean Data

In [13]:
def remove_urls(text):
  """Remove URLs from a given text string."""
  url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
  return re.sub(url_pattern, '', text)

def remove_html_tags(text):
  """Remove HTML tags from a given text string."""
  html_pattern = r'<.*?>'
  return re.sub(html_pattern, '', text)

In [14]:
def clean_text(text):
    # Define the regular expression pattern for HTTP URLs
    http_pattern = re.compile(r'http://[^\s]+')
    # Remove HTTP URLs
    text = http_pattern.sub('', str(text))

    https_pattern = re.compile(r'https://[^\s]+')
    # Remove HTTPS URLs
    text = https_pattern.sub('', str(text))
    
    # Define the regular expression pattern for <img> tags
    img_pattern = re.compile(r'<img[^>]*>')
    # Remove <img> tags
    text = img_pattern.sub('', str(text))
    
    return text

In [12]:
import re
def clean_emoji(tx):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols 
                           u"\U0001F680-\U0001F6FF"  # transport 
                           u"\U0001F1E0-\U0001F1FF"  # flags 
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)

    return emoji_pattern.sub(r'', tx)

def text_cleaner(tx):

    text = re.sub(r"won\'t", "would not", tx)
    text = re.sub(r"im", "i am", tx)
    text = re.sub(r"Im", "I am", tx)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"don\'t", "do not", text)
    text = re.sub(r"shouldn\'t", "should not", text)
    text = re.sub(r"needn\'t", "need not", text)
    text = re.sub(r"hasn\'t", "has not", text)
    text = re.sub(r"haven\'t", "have not", text)
    text = re.sub(r"weren\'t", "were not", text)
    text = re.sub(r"mightn\'t", "might not", text)
    text = re.sub(r"didn\'t", "did not", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    # text = re.sub('https?://\S+|www\.\S+', ' ', text)
    text = re.sub(r'https?://[^\s\")]+', '', text)
    text = re.sub(r'http?://[^\s\")]+', '', text)
    text = re.sub(r'http%3A%2F%2F[^\s\")]+', '', text)
    text = re.sub(r'https%3A%2F%2F[^\s\")]+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\!\?\.\@]',' ' , text)
    text = re.sub(r'[!]+' , '!' , text)
    text = re.sub(r'[?]+' , '?' , text)
    text = re.sub(r'[.]+' , '.' , text)
    text = re.sub(r'[@]+' , '@' , text)
    text = re.sub(r'unk' , '<UNK>' , text)
    # text = re.sub('\n', '<NL>', text)
    # text = re.sub('\t', '<TAB>', text)
    # text = re.sub(r'\s+', '<SP>', text)
    # text = re.sub(r'(<img[^>]*\bsrc=")[^"]*(")', '<img src=<IMG_SRC>', text)
    
    text = text.lower()
    text = re.sub(r'[ ]+' , ' ' , text)

    return text

In [3]:
import pandas as pd
df = pd.read_csv("readme_qa.csv")
df.columns = [str(q).strip() for q in df.columns]

In [6]:
df["Question"].values[0:5]

array(['Provide the README content for the section with heading "Try Public APIs for free" starting with ## Try Public APIs for free.',
       'Provide the README content for the section with heading "APILayer APIs" starting with ## APILayer APIs.',
       'Provide the README content for the section with heading "Popular APIs" starting with ## Popular APIs.',
       'Provide the README content for the section with heading "Popular categories" starting with ## Popular categories.',
       'Provide the README content for the section with heading "Learn more about Public APIs" starting with ## Learn more about Public APIs.'],
      dtype=object)

In [4]:
df["Answer"].values[0:5]

array(['Explore popular APIs and see them work in Postman. <br > <p> <a href="https://apilayer.com"> <div> <img src=".github/cs1586-APILayerLogoUpdate2022-LJ_v2-HighRes.png" width="250" alt="APILayer Logo" /> </div> </a> </p> [APILayer](https://apilayer.com/) is the fastest way to integrate APIs into any product. They created this repository to support the community in easily finding public APIs. Explore their collections on the [Postman API Network](https://www.postman.com/apilayer/workspace/apilayer/overview).',
       '| API | Description | Call this API | |:---|:---|:---| | [IP Stack](https://ipstack.com/) | Locate and Identify Website Visitors by IP Address | [<img src="https://run.pstmn.io/button.svg" alt="Run In Postman" style="width: 128px; height: 32px;">](https://god.gw.postman.com/run-collection/10131015-55145132-244c-448c-8e6f-8780866e4862?action=collection%2Ffork&source=rip_markdown&collection-url=entityId%3D10131015-55145132-244c-448c-8e6f-8780866e4862%26entityType%3Dcoll

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,Question,Context,Answer,Repo Url,Repo
0,0.0,Provide the README content for the section wit...,Discussions in issues and pull requests:\n ...,Explore popular APIs and see them work in Post...,https://github.com/public-apis/public-apis,public-apis
1,1.0,Provide the README content for the section wit...,Discussions in issues and pull requests:\n ...,| API | Description | Call this API | |:---|:-...,https://github.com/public-apis/public-apis,public-apis
2,2.0,Provide the README content for the section wit...,Discussions in issues and pull requests:\n ...,| API | Description | Auth | Call this API | |...,https://github.com/public-apis/public-apis,public-apis
3,3.0,Provide the README content for the section wit...,# check each category for the minimum number o...,* [Animals](#animals) * [Anime](#anime) * [Art...,https://github.com/public-apis/public-apis,public-apis
4,4.0,Provide the README content for the section wit...,Discussions in issues and pull requests:\n ...,<br > <strong>Get Involved</strong> * [Contrib...,https://github.com/public-apis/public-apis,public-apis


In [7]:
import numpy as np
df.replace('', np.nan, inplace=True)
df.dropna(subset=["Answer"], inplace=True)
df = df[["Question", "Context", "Answer", "Repo Url", "Repo"]]
df.head()

Unnamed: 0,Question,Context,Answer,Repo Url,Repo
0,Provide the README content for the section wit...,Discussions in issues and pull requests:\n ...,Explore popular APIs and see them work in Post...,https://github.com/public-apis/public-apis,public-apis
1,Provide the README content for the section wit...,Discussions in issues and pull requests:\n ...,| API | Description | Call this API | |:---|:-...,https://github.com/public-apis/public-apis,public-apis
2,Provide the README content for the section wit...,Discussions in issues and pull requests:\n ...,| API | Description | Auth | Call this API | |...,https://github.com/public-apis/public-apis,public-apis
3,Provide the README content for the section wit...,# check each category for the minimum number o...,* [Animals](#animals) * [Anime](#anime) * [Art...,https://github.com/public-apis/public-apis,public-apis
4,Provide the README content for the section wit...,Discussions in issues and pull requests:\n ...,<br > <strong>Get Involved</strong> * [Contrib...,https://github.com/public-apis/public-apis,public-apis


In [10]:
from langdetect import detect
df['detect'] = detect(str(df['Answer']))
df.head()

Unnamed: 0,Question,Context,Answer,Repo Url,Repo,detect
0,Provide the README content for the section wit...,Discussions in issues and pull requests:\n ...,Explore popular APIs and see them work in Post...,https://github.com/public-apis/public-apis,public-apis,en
1,Provide the README content for the section wit...,Discussions in issues and pull requests:\n ...,| API | Description | Call this API | |:---|:-...,https://github.com/public-apis/public-apis,public-apis,en
2,Provide the README content for the section wit...,Discussions in issues and pull requests:\n ...,| API | Description | Auth | Call this API | |...,https://github.com/public-apis/public-apis,public-apis,en
3,Provide the README content for the section wit...,# check each category for the minimum number o...,* [Animals](#animals) * [Anime](#anime) * [Art...,https://github.com/public-apis/public-apis,public-apis,en
4,Provide the README content for the section wit...,Discussions in issues and pull requests:\n ...,<br > <strong>Get Involved</strong> * [Contrib...,https://github.com/public-apis/public-apis,public-apis,en


In [11]:
df = df[df['detect'] == 'en']
df = df[["Question", "Context", "Answer", "Repo Url", "Repo"]]
len(df)

12803

In [15]:
# df["Answer"] = df["Answer"].apply(clean_text)
df["Answer"] = df["Answer"].apply(text_cleaner)
df["Answer"] = df["Answer"].apply(clean_emoji)
#df["Context"] = df["Context"].apply(text_cleaner)
df["Answer"].values[0:5]

array(['explore popular apis and see them work in postman. br p a href div img src .github cs1586 apilayerlogoupdate2022 lj v2 highres.png width 250 alt apilayer logo div a p apilayer is the fastest way to integrate apis into any product. they created this repository to support the community in easily finding public apis. explore their collections on the postman api network .',
       ' api description call this api ip stack locate and identify website visitors by ip address img src alt run in postman style width 128px height 32px marketstack free easy to use rest api interface delivering worldwide stock market data in json format img src alt run in postman style width 128px height 32px weatherstack retrieve instant accurate weather information for any location in the world in lightweight json format img src alt run in postman style width 128px height 32px numverify global phone number validation lookup json api img src alt run in postman style width 128px height 32px fixer fixer is a 

In [16]:
df["Context"].values[0:5]

array(['Discussions in issues and pull requests:\n        - https://github.com/public-apis/public-apis/pull/2409\n        - https://github.com/public-apis/public-apis/issues/2960 \n    """\n\n    code = resp.status_code\n    server = resp.headers.get(\'Server\') or resp.headers.get(\'server\')\n    cloudflare_flags = [\n        \'403 Forbidden\',\n        \'cloudflare\',\n        \'Cloudflare\',\n        \'Security check\',\n        \'Please Wait... | Cloudflare\',\n        \'We are checking your browser...\',\n        \'Please stand by, while we are checking your browser...\',\n        \'Checking your browser before accessing\',\n        \'This process is automatic.\',\n        \'Your browser will redirect to your requested content shortly.\',\n        \'Please allow up to 5 seconds\',\n        \'DDoS protection by\',\n        \'Ray ID:\',\n        \'Cloudflare Ray ID:\',\n        \'_cf_chl\',\n        \'_cf_chl_opt\',\n        \'__cf_chl_rt_tk\',\n        \'cf-spinner-please-wait\',\

In [None]:
options = ['allennlp', 'autojump', 'typer', 'spotify-downloader', 'spleeter', 'python-fire', 'numpy-ml', 'magenta'] 
   
# selecting rows based on condition 
df = df[df['Repo'].isin(options)]
len(df)

In [None]:
df["Answer"].values[-50:]

In [None]:
df.to_csv("scripts/readme_qa.csv", index =False)

# Scoring

In [None]:
from pprint import pprint
# from torchmetrics.text.bert import BERTScore
import bert_score
import re

with open("README_LLAMA2_7B_CHAT_GPTQ.md", 'r', encoding='utf-8') as f:
    pred = f.read()
with open("spleeter/README.md", 'r', encoding='utf-8') as f:
    target = f.read()

pred = re.sub(r' +', ' ', pred)
target = re.sub(r' +', ' ', target)
P, R, F1 = bert_score.score([pred], [target], lang='en', model_type='roberta-large', verbose=True)
print(P,R,F1)

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from transformers import AutoTokenizer

# Download the required NLTK data
# nltk.download('punkt')

pred = ""
target = ""
tokenizer = AutoTokenizer.from_pretrained("TheBloke/Llama-2-7b-Chat-GPTQ")
tokenizer.pad_token = tokenizer.eos_token

def calculate_bleu(reference, candidate):
    reference_tokens = tokenizer.tokenize(reference)
    candidate_tokens = tokenizer.tokenize(candidate)
    smoothie = SmoothingFunction().method4
    return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=smoothie)

bleu_score = calculate_bleu(target, pred)
print(bleu_score)

In [None]:
from pymarkdown.api import PyMarkdownApi

source_path = "README_LLAMA2_7B_CHAT_GPTQ.md"
errors = PyMarkdownApi().scan_path(source_path)
print(len(errors.scan_failures))

# Markdown to JSON

In [None]:
%pip install markdown-to-json

In [None]:
import markdown_to_json
md_file_path = "/content/README.md"
with open(md_file_path, 'r', encoding='utf-8') as file:
    md_content = file.read()

# The simple way:
dictified = markdown_to_json.dictify(md_content)
dictified

In [None]:
print(dictified['Try Public APIs for free']['Index']['Anime'])