In [10]:
import pyodbc
import requests
import pandas as pd

# Connection string
# Enters your database 'SERVER' name:
conn_str = 'DRIVER={ODBC Driver 18 for SQL Server};' \
           'SERVER=QUOCBUI-PERSONA\\MSSQLSERVER01;' \
           'DATABASE=FixFox;' \
           'Connection Timeout=300;' \
           'Login Timeout=300;' \
           'LongAsMax=yes;' \
           'TrustServerCertificate=yes;' \
           'Trusted_Connection=yes;'

def print_raw_file_links_by_changeset_hash(input_changeset_hash):
    global conn_str
    conn = pyodbc.connect(conn_str)
    cursor = conn.cursor()

    try:
        cursor.execute('''  
            SELECT cf.Previous_File_Name,
                cf.Updated_File_Name,
                c.Mercurial_Type,
                cf.Changeset_Hash_ID,
                c.Parent_Hashes
            FROM Changeset_Files cf
            INNER JOIN Changesets c ON c.Hash_Id = cf.Changeset_Hash_ID
            WHERE cf.Changeset_Hash_ID = ?;
        ''', input_changeset_hash)

        records = cursor.fetchall()
    finally:
        cursor.close()
        conn.close()

    link_format = "https://hg.mozilla.org/{mercurial_type}/raw-file/{changeset_hash_id}/{file_path}"

    def generate_file_link(mercurial_type, changeset_hash_id, file_path):
        return link_format.format(
            mercurial_type=mercurial_type,
            changeset_hash_id=changeset_hash_id,
            file_path=file_path[2:]  # Remove "a/" or "b/" prefixes
        )

    count = 1
    for record in records:
        prev_file, updated_file, mercurial_type, changeset_hash, parent_hash = record
        
        mercurial_types = mercurial_type.split(' | ')
        print(f"File {str(count)} - Previous File: '{prev_file}' && Updated File: '{updated_file}'")
        print(f"File {str(count)} - List of Raw File Links for Previous File:")
        if prev_file != '/dev/null':
            for mercurial in mercurial_types:
                print(f"[{mercurial}]: {generate_file_link(mercurial, changeset_hash, prev_file)}")
        else:
            print("- No available")

        print(f"File {str(count)} - List of Raw File Links for Updated File:")
        if updated_file != '/dev/null':
            for mercurial in mercurial_types:
                print(f"[{mercurial}]: {generate_file_link(mercurial, changeset_hash, updated_file)}")
        else:
            print("- No available")
        print("\n")
        count += 1

    if len(records) == 0:
        print(f"No files associated with the changeset: {input_changeset_hash}")


def extract_file_content(changeset_hash_id, mercurial_type, fully_qualified_filename):
    sanitized_filename = fully_qualified_filename[2:] if fully_qualified_filename.startswith(("a/", "b/")) else fully_qualified_filename

    link_format = "https://hg.mozilla.org/{mercurial_type}/raw-file/{changeset_hash_id}/{file_path}"
    url = link_format.format(
        mercurial_type=mercurial_type,
        changeset_hash_id=changeset_hash_id,
        file_path=sanitized_filename
    )

    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch file content from {url}: {e}")
        return None


def get_file_type_distribution():
    global conn_str

    query = '''
    -- Get file type statistics
    SELECT 
        File_Type,
        Total_Count,
        ROUND((CAST(Total_Count AS FLOAT) / (SELECT COUNT(*) FROM Changeset_Files WHERE 
            (Previous_File_Name LIKE '%.js' OR Previous_File_Name LIKE '%.py' OR Previous_File_Name LIKE '%.c' OR Previous_File_Name LIKE '%.cpp')
            OR 
            (Updated_File_Name LIKE '%.js' OR Updated_File_Name LIKE '%.py' OR Updated_File_Name LIKE '%.c' OR Updated_File_Name LIKE '%.cpp')
        )) * 100, 2) AS Percentage
    FROM (
        SELECT 
            'JavaScript (.js)' AS File_Type, 
            COUNT(*) AS Total_Count
        FROM Changeset_Files
        WHERE 
            (Previous_File_Name LIKE '%.js' OR Updated_File_Name LIKE '%.js')
        UNION ALL
        SELECT 
            'Python (.py)' AS File_Type, 
            COUNT(*) AS Total_Count
        FROM Changeset_Files
        WHERE 
            (Previous_File_Name LIKE '%.py' OR Updated_File_Name LIKE '%.py')
        UNION ALL
        SELECT 
            'C (.c)' AS File_Type, 
            COUNT(*) AS Total_Count
        FROM Changeset_Files
        WHERE 
            (Previous_File_Name LIKE '%.c' OR Updated_File_Name LIKE '%.c')
        UNION ALL
        SELECT 
            'C++ (.cpp)' AS File_Type, 
            COUNT(*) AS Total_Count
        FROM Changeset_Files
        WHERE 
            (Previous_File_Name LIKE '%.cpp' OR Updated_File_Name LIKE '%.cpp')
    ) AS File_Stats
    ORDER BY Percentage DESC;
    '''

    try:
        conn = pyodbc.connect(conn_str)
        cursor = conn.cursor()
        try:
            df = pd.read_sql_query(query, conn)
        finally:
            cursor.close()
        conn.close()
        return df
    except Exception as e:
        print(f"An error occurred: {e}")
        return None


def get_changeset_files(changeset_hash):
    global conn_str

    try:
        conn = pyodbc.connect(conn_str)
        cursor = conn.cursor()
        try:
            cursor.execute('''
                SELECT 
                    Previous_File_Name, 
                    Updated_File_Name, 
                    File_Status
                FROM Changeset_Files
                WHERE Changeset_Hash_ID = ?;
                ''', (changeset_hash,))
            records = cursor.fetchall()
        finally:
            cursor.close()
        conn.close()

        if not records:
            print(f"No records found for Changeset Hash ID: {changeset_hash}")
            return None
        else:
            df = pd.DataFrame.from_records(
                records, 
                columns=["Previous File Name", "Updated File Name", "File Status"]
            )
            return df
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    
def get_list_of_changesets(bug_id):
    global conn_str

    try:
        conn = pyodbc.connect(conn_str)
        cursor = conn.cursor()
        try:
            cursor.execute('''
                SELECT Changeset_Hash_ID,
                       Type
                FROM Changeset_BugMapping
                WHERE Bug_ID = ?;
                ''', (bug_id,))
            records = cursor.fetchall()
        finally:
            cursor.close()
        conn.close()

        if not records:
            print(f"No records found for Changeset Bug ID: {bug_id}")
            return None
        else:
            # Create a DataFrame
            df = pd.DataFrame.from_records(
                records,
                columns=["Changeset Hash ID", "Type"]
            )

            # Pivot data to create two new columns
            df["Bug ID mentioned in Changeset Summary"] = df["Type"].apply(lambda x: "X" if x == "InTitle" else "")
            df["Changeset mentioned in Bug Comments"] = df["Type"].apply(lambda x: "X" if x == "InComment" else "")

            # Combine rows with the same Changeset Hash ID
            df = df.groupby("Changeset Hash ID", as_index=False).agg({
                "Bug ID mentioned in Changeset Summary": "max",
                "Changeset mentioned in Bug Comments": "max"
            })

            return df
    except Exception as e:
        print(f"An error occurred: {e}")
        return None




How to formulate URL links to access raw file contents associated with a specific changeset hash?

In [13]:
# Print out the list of raw file links based on the input changeset hash
changeset_hash_id = "010f3ed252635ba277844b24ed7b073592d5e8df"
print_raw_file_links_by_changeset_hash(changeset_hash_id)

File 1 - Previous File: '/dev/null' && Updated File: 'b/browser/components/extensions/test/browser/browser_ext_contextMenus_srcUrl_redirect.js'
File 1 - List of Raw File Links for Previous File:
- No available
File 1 - List of Raw File Links for Updated File:
[mozilla-central]: https://hg.mozilla.org/mozilla-central/raw-file/010f3ed252635ba277844b24ed7b073592d5e8df/browser/components/extensions/test/browser/browser_ext_contextMenus_srcUrl_redirect.js
[integration/autoland]: https://hg.mozilla.org/integration/autoland/raw-file/010f3ed252635ba277844b24ed7b073592d5e8df/browser/components/extensions/test/browser/browser_ext_contextMenus_srcUrl_redirect.js


File 2 - Previous File: 'a/browser/base/content/nsContextMenu.js' && Updated File: 'b/browser/base/content/nsContextMenu.js'
File 2 - List of Raw File Links for Previous File:
[mozilla-central]: https://hg.mozilla.org/mozilla-central/raw-file/010f3ed252635ba277844b24ed7b073592d5e8df/browser/base/content/nsContextMenu.js
[integration/aut

How to extract a specific file content?

In [14]:
# Define changeset hash and mercurial type:
changeset_hash_id = "010f3ed252635ba277844b24ed7b073592d5e8df"
mercurial_type = "mozilla-central"

# Example of print out file content:
fully_qualified_filename_a = "a/browser/base/content/nsContextMenu.js"
file_content_a = extract_file_content(changeset_hash_id, mercurial_type, fully_qualified_filename_a)
print(file_content_a) # Print out file content


# Example of saving file content to .txt:
fully_qualified_filename_b = "b/browser/base/content/nsContextMenu.js"
file_content_b = extract_file_content(changeset_hash_id, mercurial_type, fully_qualified_filename_b)
if file_content_b:
    # Simplify the filename for saving
    simplified_filename = fully_qualified_filename_b.replace("/", "_").replace("\\", "_")
    output_path = f"./{simplified_filename}.txt"

    # Write to file
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(file_content_b)
        print(f"Content saved to {output_path}")
else:
    print("Failed to fetch file content.")

/* -*- tab-width: 2; indent-tabs-mode: nil; js-indent-level: 2 -*- */
/* vim: set ts=2 sw=2 sts=2 et tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

const PASSWORD_FIELDNAME_HINTS = ["current-password", "new-password"];
const USERNAME_FIELDNAME_HINT = "username";

function openContextMenu(aMessage, aBrowser, aActor) {
  if (BrowserHandler.kiosk) {
    // Don't display context menus in kiosk mode
    return;
  }
  let data = aMessage.data;
  let browser = aBrowser;
  let actor = aActor;
  let spellInfo = data.spellInfo;
  let frameReferrerInfo = data.frameReferrerInfo;
  let linkReferrerInfo = data.linkReferrerInfo;
  let principal = data.principal;
  let storagePrincipal = data.storagePrincipal;

  let documentURIObject = makeURI(
    data.docLocation,
    data.charSet,
    makeURI(data.baseURI)
  );

  if (frameReferrer

How do I get the distribution of file types in the dataset?

In [16]:
# Print out File Type Distribution:
df = get_file_type_distribution()
df.index = range(1, len(df) + 1)
df.style.set_caption("File Type Distribution").format({'Percentage': "{:.2f}%"})

  df = pd.read_sql_query(query, conn)


Unnamed: 0,File_Type,Total_Count,Percentage
1,C++ (.cpp),7987,68.55%
2,JavaScript (.js),2481,21.29%
3,C (.c),1024,8.79%
4,Python (.py),159,1.36%


How to get the list of files associated with a specific changeset/commit?

In [17]:
changeset_hash = "01c3e1ae707de16d3e72bf5a9cfa8bf26d85fec3"
df = get_changeset_files(changeset_hash)

if df is not None:
    df.index = range(1, len(df) + 1)
    styled_df = df.style.set_caption(f"Files in Changeset: {changeset_hash}")
    display(styled_df)  # Use IPython's display function to render in Jupyter Notebook

Unnamed: 0,Previous File Name,Updated File Name,File Status
1,a/browser/components/newtab/prerendered/locales/bg/activity-stream-strings.js,b/browser/components/newtab/prerendered/locales/bg/activity-stream-strings.js,modified
2,a/browser/components/newtab/prerendered/locales/an/activity-stream-strings.js,b/browser/components/newtab/prerendered/locales/an/activity-stream-strings.js,modified
3,a/browser/components/newtab/prerendered/locales/kk/activity-stream-strings.js,b/browser/components/newtab/prerendered/locales/kk/activity-stream-strings.js,modified
4,a/browser/components/newtab/prerendered/locales/cy/activity-stream-strings.js,b/browser/components/newtab/prerendered/locales/cy/activity-stream-strings.js,modified
5,a/browser/components/newtab/prerendered/locales/zh-TW/activity-stream-strings.js,b/browser/components/newtab/prerendered/locales/zh-TW/activity-stream-strings.js,modified
6,a/browser/components/newtab/prerendered/locales/oc/activity-stream-strings.js,b/browser/components/newtab/prerendered/locales/oc/activity-stream-strings.js,modified
7,a/browser/components/newtab/prerendered/locales/cs/activity-stream-strings.js,b/browser/components/newtab/prerendered/locales/cs/activity-stream-strings.js,modified
8,a/browser/components/newtab/prerendered/locales/sv-SE/activity-stream-strings.js,b/browser/components/newtab/prerendered/locales/sv-SE/activity-stream-strings.js,modified
9,a/browser/components/newtab/prerendered/locales/pt-PT/activity-stream-strings.js,b/browser/components/newtab/prerendered/locales/pt-PT/activity-stream-strings.js,modified
10,a/browser/components/newtab/prerendered/locales/cak/activity-stream-strings.js,b/browser/components/newtab/prerendered/locales/cak/activity-stream-strings.js,modified


How do I retrieve the list of changesets associated with a specific bug?

In [11]:
bug_id = "1000185"

df = get_list_of_changesets(bug_id)
if df is not None:
    df.index = range(1, len(df) + 1)
    styled_df = df.style.set_caption(f"Changesets in Bug Id: {bug_id}")
    display(styled_df)  # Use IPython's display function to render in Jupyter Notebook

Unnamed: 0,Changeset Hash ID,Bug ID mentioned in Changeset Summary,Changeset mentioned in Bug Comments
1,2ea2a3b3df0dbab34ad4dd38dc706a4d1e08f518,,X
2,542f83ec634547e7abdaea27a5fdf117ca1f0a97,X,X
3,6578c5aa9eb317b1322d0b1d24f28fe36ed19797,X,X
4,66992fa55eb6dee7884da95fac674d5c6d8193bb,X,X
5,6792ae50367fa0ac3fdb6a5f4575bc01f20c55a1,X,X
6,728150a9571eb79e56df021b044caedfb5e72e93,X,X
7,8c3afb2c80aea620026cc895ff670c1516a746db,X,X
8,92ecf648b737f9d31a35bd13b4b33f6138ec5975,X,X
9,a399ae7c942562a66edb1e0667100dd93dff468e,X,X
10,b832be241524ff3e007625a1a5abe51572bb5b7d,X,X
