In [None]:
import pyodbc
import requests
import pandas as pd

# ODBC connection string shared by every query helper in this notebook.
# Replace the SERVER placeholder below with the name of your SQL Server instance.
conn_str = (
    'DRIVER={ODBC Driver 18 for SQL Server};'
    'SERVER=<Enter your SQL Server Name>;'
    'DATABASE=FixFox;'
    'Connection Timeout=300;'
    'Login Timeout=300;'
    'LongAsMax=yes;'
    'TrustServerCertificate=yes;'
    'Trusted_Connection=yes;'
)

def print_raw_file_links_by_changeset_hash(input_changeset_hash):
    """Print hg.mozilla.org raw-file links for every file in a changeset.

    Looks up the files recorded for ``input_changeset_hash`` in
    ``Changeset_Files`` (joined with ``Changeset_Details``) and, for each
    file, prints one link per repository listed in ``Mercurial_Type`` for
    both the previous and the updated file name. A name of ``/dev/null``
    is reported as not available.

    Args:
        input_changeset_hash: Changeset hash used to filter
            ``Changeset_Files.Changeset_Hash_ID``.

    Returns:
        None. All results are written to standard output.

    Raises:
        pyodbc.Error: Database errors are not caught here and propagate
            to the caller.
    """
    conn = pyodbc.connect(conn_str)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute('''
                SELECT cf.Previous_File_Name,
                    cf.Updated_File_Name,
                    c.Mercurial_Type,
                    cf.Changeset_Hash_ID,
                    c.Parent_Hashes
                FROM Changeset_Files cf
                INNER JOIN Changeset_Details c ON c.Hash_Id = cf.Changeset_Hash_ID
                WHERE cf.Changeset_Hash_ID = ?;
            ''', (input_changeset_hash,))
            records = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Close the connection even when cursor creation or the query fails.
        conn.close()

    if not records:
        print(f"No files associated with the changeset: {input_changeset_hash}")
        return

    link_format = "https://hg.mozilla.org/{mercurial_type}/raw-file/{changeset_hash_id}/{file_path}"

    def generate_file_link(mercurial_type, changeset_hash_id, file_path):
        # Strip the diff-style "a/" or "b/" prefix only when it is actually
        # present, so an unprefixed path does not lose its first two characters
        # (same rule as extract_file_content).
        if file_path.startswith(("a/", "b/")):
            file_path = file_path[2:]
        return link_format.format(
            mercurial_type=mercurial_type,
            changeset_hash_id=changeset_hash_id,
            file_path=file_path
        )

    def print_links(label, index, file_name, mercurial_types, changeset_hash):
        # Emit the link list for one side (previous/updated) of a file entry.
        print(f"File {index} - List of Raw File Links for {label} File:")
        if file_name != '/dev/null':
            for mercurial in mercurial_types:
                print(f"[{mercurial}]: {generate_file_link(mercurial, changeset_hash, file_name)}")
        else:
            print("- No available")

    for count, record in enumerate(records, start=1):
        # Parent_Hashes (the fifth column) is selected but not needed for the links.
        prev_file, updated_file, mercurial_type, changeset_hash, _parent_hashes = record

        # Mercurial_Type holds one or more repository names separated by " | ".
        # A NULL value yields no links instead of an AttributeError.
        mercurial_types = mercurial_type.split(' | ') if mercurial_type else []

        print(f"File {count} - Previous File: '{prev_file}' && Updated File: '{updated_file}'")
        print_links("Previous", count, prev_file, mercurial_types, changeset_hash)
        print_links("Updated", count, updated_file, mercurial_types, changeset_hash)
        print("\n")


def extract_file_content(changeset_hash_id, mercurial_type, fully_qualified_filename, timeout=30):
    """Download the raw content of one file at a given changeset.

    Builds an ``https://hg.mozilla.org/<repo>/raw-file/<hash>/<path>`` URL
    and fetches it with ``requests``.

    Args:
        changeset_hash_id: Changeset hash placed in the URL.
        mercurial_type: Repository path segment, e.g. ``"mozilla-central"``.
        fully_qualified_filename: File path; a leading ``"a/"`` or ``"b/"``
            prefix is removed before the URL is built.
        timeout: Seconds passed to ``requests.get`` as its timeout.
            Defaults to 30 so a stalled server cannot block the notebook
            indefinitely.

    Returns:
        The response body as text, or ``None`` if the request failed
        (the failure is printed, not raised).
    """
    if fully_qualified_filename.startswith(("a/", "b/")):
        sanitized_filename = fully_qualified_filename[2:]
    else:
        sanitized_filename = fully_qualified_filename

    link_format = "https://hg.mozilla.org/{mercurial_type}/raw-file/{changeset_hash_id}/{file_path}"
    url = link_format.format(
        mercurial_type=mercurial_type,
        changeset_hash_id=changeset_hash_id,
        file_path=sanitized_filename
    )

    try:
        # requests applies no timeout unless one is given explicitly.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # turn HTTP 4xx/5xx into an exception
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch file content from {url}: {e}")
        return None


def get_file_type_distribution():
    """Return counts and percentages of .js/.py/.c/.cpp rows in Changeset_Files.

    A row is counted for a file type when either its previous or its updated
    file name ends with that extension. The percentage is each count divided
    by the number of rows matching any of the four extensions.

    Returns:
        A DataFrame with the columns returned by the query (``File_Type``,
        ``Total_Count``, ``Percentage``), ordered by percentage descending,
        or ``None`` if any error occurred (the error is printed).
    """
    query = '''
    -- Get file type statistics
    SELECT 
        File_Type,
        Total_Count,
        ROUND((CAST(Total_Count AS FLOAT) / (SELECT COUNT(*) FROM Changeset_Files WHERE 
            (Previous_File_Name LIKE '%.js' OR Previous_File_Name LIKE '%.py' OR Previous_File_Name LIKE '%.c' OR Previous_File_Name LIKE '%.cpp')
            OR 
            (Updated_File_Name LIKE '%.js' OR Updated_File_Name LIKE '%.py' OR Updated_File_Name LIKE '%.c' OR Updated_File_Name LIKE '%.cpp')
        )) * 100, 2) AS Percentage
    FROM (
        SELECT 
            'JavaScript (.js)' AS File_Type, 
            COUNT(*) AS Total_Count
        FROM Changeset_Files
        WHERE 
            (Previous_File_Name LIKE '%.js' OR Updated_File_Name LIKE '%.js')
        UNION ALL
        SELECT 
            'Python (.py)' AS File_Type, 
            COUNT(*) AS Total_Count
        FROM Changeset_Files
        WHERE 
            (Previous_File_Name LIKE '%.py' OR Updated_File_Name LIKE '%.py')
        UNION ALL
        SELECT 
            'C (.c)' AS File_Type, 
            COUNT(*) AS Total_Count
        FROM Changeset_Files
        WHERE 
            (Previous_File_Name LIKE '%.c' OR Updated_File_Name LIKE '%.c')
        UNION ALL
        SELECT 
            'C++ (.cpp)' AS File_Type, 
            COUNT(*) AS Total_Count
        FROM Changeset_Files
        WHERE 
            (Previous_File_Name LIKE '%.cpp' OR Updated_File_Name LIKE '%.cpp')
    ) AS File_Stats
    ORDER BY Percentage DESC;
    '''

    try:
        conn = pyodbc.connect(conn_str)
        try:
            cursor = conn.cursor()
            try:
                cursor.execute(query)
                records = cursor.fetchall()
                # Column names must be read while the cursor is still open.
                columns = [column[0] for column in cursor.description]
            finally:
                cursor.close()
        finally:
            # Previously skipped when the query raised, leaking the connection.
            conn.close()
        return pd.DataFrame.from_records(records, columns=columns)
    except Exception as e:
        print(f"An error occurred: {e}")
        return None


def get_changeset_files(changeset_hash):
    """Return the files recorded for one changeset.

    Args:
        changeset_hash: Value matched against
            ``Changeset_Files.Changeset_Hash_ID``.

    Returns:
        A DataFrame with columns ``File Status``, ``Previous File Name`` and
        ``Updated File Name``; ``None`` when no rows match or when an error
        occurred (a message is printed in both cases).
    """
    try:
        conn = pyodbc.connect(conn_str)
        try:
            cursor = conn.cursor()
            try:
                cursor.execute('''
                    SELECT File_Status,
                        Previous_File_Name, 
                        Updated_File_Name
                    FROM Changeset_Files
                    WHERE Changeset_Hash_ID = ?;
                    ''', (changeset_hash,))
                records = cursor.fetchall()
            finally:
                cursor.close()
        finally:
            # Previously skipped when the query raised, leaking the connection.
            conn.close()

        if not records:
            print(f"No records found for Changeset Hash ID: {changeset_hash}")
            return None

        return pd.DataFrame.from_records(
            records,
            columns=["File Status", "Previous File Name", "Updated File Name"]
        )
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    
def get_list_of_changesets(bug_id):
    """Return the distinct changesets mapped to a bug.

    Joins ``Changeset_Bug_Mapping`` with ``Changeset_Details`` and filters on
    ``Bug_ID``.

    Args:
        bug_id: Bug identifier bound to the ``Bug_ID = ?`` filter.

    Returns:
        A DataFrame with columns ``Changeset_Hash_ID``, ``Changeset_Summary``
        and ``Mercurial_Type``; ``None`` when no rows match or when an error
        occurred (a message is printed in both cases).
    """
    try:
        conn = pyodbc.connect(conn_str)
        try:
            cursor = conn.cursor()
            try:
                cursor.execute('''
                    SELECT DISTINCT
                        Changeset_Hash_ID,
                        Changeset_Summary,
                        Mercurial_Type
                    FROM Changeset_Bug_Mapping m
                    INNER JOIN Changeset_Details c ON c.Hash_ID = m.Changeset_Hash_ID
                    WHERE Bug_ID = ?;
                    ''', (bug_id,))
                records = cursor.fetchall()
            finally:
                cursor.close()
        finally:
            # Previously skipped when the query raised, leaking the connection.
            conn.close()

        if not records:
            print(f"No records found for Changeset Bug ID: {bug_id}")
            return None

        return pd.DataFrame.from_records(
            records,
            columns=["Changeset_Hash_ID", "Changeset_Summary", "Mercurial_Type"]
        )
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

def get_list_of_modified_functions(changeset_hash_id):
    """Return the functions modified by one changeset, grouped by file.

    Joins ``Changeset_Files`` with ``Changeset_Modified_Functions``. The file
    name reported is the updated name, or the previous name when the updated
    name is ``/dev/null``. Rows are ordered by file name, then function name.

    Args:
        changeset_hash_id: Value matched against
            ``Changeset_Files.Changeset_Hash_ID``.

    Returns:
        A DataFrame with columns ``Changeset_Hash_ID``, ``Modified_File_Name``
        and ``Modified_Function_Name``; ``None`` when no rows match or when an
        error occurred (a message is printed in both cases).
    """
    try:
        conn = pyodbc.connect(conn_str)
        try:
            cursor = conn.cursor()
            try:
                cursor.execute('''
                    SELECT
                        cfile.Changeset_Hash_ID,
                        CASE
                            WHEN cfile.Updated_File_Name = '/dev/null' THEN cfile.Previous_File_Name
                            ELSE cfile.Updated_File_Name
                        END AS [Modified_File_Name],
                        cfunc.Function_Name AS [Modified_Function_Name]
                    FROM Changeset_Files cfile
                    INNER JOIN Changeset_Modified_Functions cfunc
                        ON cfunc.Changeset_File_Unique_Hash = cfile.Unique_Hash
                    WHERE cfile.Changeset_Hash_ID = ?
                    ORDER BY [Modified_File_Name], [Modified_Function_Name]
                    ''', (changeset_hash_id,))
                records = cursor.fetchall()
            finally:
                cursor.close()
        finally:
            # Previously skipped when the query raised, leaking the connection.
            conn.close()

        if not records:
            print(f"No records found for Changeset Hash ID: {changeset_hash_id}")
            return None

        return pd.DataFrame.from_records(
            records,
            columns=["Changeset_Hash_ID", "Modified_File_Name", "Modified_Function_Name"]
        )
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

How do I build raw-file URLs for the files in a specific changeset?

In [8]:
# List the raw-file links of every file touched by the changeset below.
changeset_hash_id = "010f3ed252635ba277844b24ed7b073592d5e8df"
print_raw_file_links_by_changeset_hash(input_changeset_hash=changeset_hash_id)

File 1 - Previous File: '/dev/null' && Updated File: 'b/browser/components/extensions/test/browser/browser_ext_contextMenus_srcUrl_redirect.js'
File 1 - List of Raw File Links for Previous File:
- No available
File 1 - List of Raw File Links for Updated File:
[mozilla-central]: https://hg.mozilla.org/mozilla-central/raw-file/010f3ed252635ba277844b24ed7b073592d5e8df/browser/components/extensions/test/browser/browser_ext_contextMenus_srcUrl_redirect.js
[integration/autoland]: https://hg.mozilla.org/integration/autoland/raw-file/010f3ed252635ba277844b24ed7b073592d5e8df/browser/components/extensions/test/browser/browser_ext_contextMenus_srcUrl_redirect.js


File 2 - Previous File: 'a/browser/base/content/nsContextMenu.js' && Updated File: 'b/browser/base/content/nsContextMenu.js'
File 2 - List of Raw File Links for Previous File:
[mozilla-central]: https://hg.mozilla.org/mozilla-central/raw-file/010f3ed252635ba277844b24ed7b073592d5e8df/browser/base/content/nsContextMenu.js
[integration/aut

How do I extract the content of a specific file?

In [9]:
# Define changeset hash and mercurial type:
changeset_hash_id = "010f3ed252635ba277844b24ed7b073592d5e8df"
mercurial_type = "mozilla-central"

# Example of printing out file content.
# extract_file_content returns None on failure, so guard before printing
# instead of emitting the literal text "None".
fully_qualified_filename_a = "a/browser/base/content/nsContextMenu.js"
file_content_a = extract_file_content(changeset_hash_id, mercurial_type, fully_qualified_filename_a)
if file_content_a is not None:
    print(file_content_a)
else:
    print("Failed to fetch file content.")


# Example of saving file content to .txt:
fully_qualified_filename_b = "b/browser/base/content/nsContextMenu.js"
file_content_b = extract_file_content(changeset_hash_id, mercurial_type, fully_qualified_filename_b)
# Compare against None explicitly: an empty file is a successful fetch.
if file_content_b is not None:
    # Flatten path separators so the name is a single file in the current directory.
    simplified_filename = fully_qualified_filename_b.replace("/", "_").replace("\\", "_")
    output_path = f"./{simplified_filename}.txt"

    # Write to file
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(file_content_b)
    # Report success only after the file has been closed.
    print(f"Content saved to {output_path}")
else:
    print("Failed to fetch file content.")

/* -*- tab-width: 2; indent-tabs-mode: nil; js-indent-level: 2 -*- */
/* vim: set ts=2 sw=2 sts=2 et tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

const PASSWORD_FIELDNAME_HINTS = ["current-password", "new-password"];
const USERNAME_FIELDNAME_HINT = "username";

function openContextMenu(aMessage, aBrowser, aActor) {
  if (BrowserHandler.kiosk) {
    // Don't display context menus in kiosk mode
    return;
  }
  let data = aMessage.data;
  let browser = aBrowser;
  let actor = aActor;
  let spellInfo = data.spellInfo;
  let frameReferrerInfo = data.frameReferrerInfo;
  let linkReferrerInfo = data.linkReferrerInfo;
  let principal = data.principal;
  let storagePrincipal = data.storagePrincipal;

  let documentURIObject = makeURI(
    data.docLocation,
    data.charSet,
    makeURI(data.baseURI)
  );

  if (frameReferrer

How do I get the distribution of file types in the dataset?

In [10]:
# Show the file type distribution as a styled table.
df = get_file_type_distribution()

# get_file_type_distribution returns None when the query fails; skip the
# rendering in that case instead of raising a TypeError on len(None).
if df is not None:
    df.index = range(1, len(df) + 1)  # number rows from 1 for display
    styled_df = df.style.set_caption("File Type Distribution").format({'Percentage': "{:.2f}%"})
    display(styled_df)

Unnamed: 0,File_Type,Total_Count,Percentage
1,C++ (.cpp),2606,66.84%
2,JavaScript (.js),894,22.93%
3,C (.c),333,8.54%
4,Python (.py),66,1.69%


How do I get the list of files associated with a specific changeset/commit?

In [11]:
changeset_hash = "d53e368934524dd77444c4b607c8b02a9aab96c2"
df = get_changeset_files(changeset_hash)

# get_changeset_files returns None when nothing was found or the query failed.
if df is not None:
    df.index = range(1, len(df) + 1)  # number rows from 1 for display

    # Build the styled table step by step: caption, left-aligned cells,
    # then left-aligned header cells.
    left_aligned_header = [{'selector': 'thead th', 'props': [('text-align', 'left')]}]
    styled_df = df.style.set_caption(f"Files in Changeset: {changeset_hash}")
    styled_df = styled_df.set_properties(**{'text-align': 'left'})
    styled_df = styled_df.set_table_styles(left_aligned_header)

    display(styled_df)  # IPython's display renders the Styler as HTML in the notebook

Unnamed: 0,File Status,Previous File Name,Updated File Name
1,modified,a/layout/generic/nsTextFrameThebes.cpp,b/layout/generic/nsTextFrameThebes.cpp
2,modified,a/layout/mathml/nsMathMLmoFrame.cpp,b/layout/mathml/nsMathMLmoFrame.cpp
3,modified,a/layout/mathml/nsMathMLTokenFrame.cpp,b/layout/mathml/nsMathMLTokenFrame.cpp


How do I retrieve the list of changesets associated with a specific bug?

In [12]:
bug_id = "1000185"
df = get_list_of_changesets(bug_id)

# get_list_of_changesets returns None when nothing was found or the query failed.
if df is not None:
    df.index = range(1, len(df) + 1)  # number rows from 1 for display

    # Build the styled table step by step: caption, left-aligned cells,
    # then left-aligned header cells.
    left_aligned_header = [{'selector': 'thead th', 'props': [('text-align', 'left')]}]
    styled_df = df.style.set_caption(f"Changesets in Bug Id: {bug_id}")
    styled_df = styled_df.set_properties(**{'text-align': 'left'})
    styled_df = styled_df.set_table_styles(left_aligned_header)

    display(styled_df)

Unnamed: 0,Changeset_Hash_ID,Changeset_Summary,Mercurial_Type
1,66992fa55eb6dee7884da95fac674d5c6d8193bb,"Daniel Holbert - Bug 1000185 - Part 1: Perform synchronous SMIL sample after registering with refresh driver, not before, for consistency. r=birtles",mozilla-central
2,bd0a38a9b756183b9d68f2f0bdea08c99ed34acb,Daniel Holbert - Bug 1000185 - Part 2: Add a bool to keep track of whether nsSMILAnimationController instances are registered with a refresh driver. r=birtles,mozilla-central


How do I get the list of modified functions in a given changeset/commit?

In [13]:
changeset_hash_id = "d53e368934524dd77444c4b607c8b02a9aab96c2"
df = get_list_of_modified_functions(changeset_hash_id)

# get_list_of_modified_functions returns None when nothing was found or the query failed.
if df is not None:
    df.index = range(1, len(df) + 1)  # number rows from 1 for display

    # Build the styled table step by step: caption, left-aligned cells,
    # then left-aligned header cells.
    left_aligned_header = [{'selector': 'thead th', 'props': [('text-align', 'left')]}]
    styled_df = df.style.set_caption(f"Modified Functions in Changeset: {changeset_hash_id}")
    styled_df = styled_df.set_properties(**{'text-align': 'left'})
    styled_df = styled_df.set_table_styles(left_aligned_header)

    display(styled_df)

Unnamed: 0,Changeset_Hash_ID,Modified_File_Name,Modified_Function_Name
1,d53e368934524dd77444c4b607c8b02a9aab96c2,b/layout/generic/nsTextFrameThebes.cpp,nsTextFrame::ReflowText
2,d53e368934524dd77444c4b607c8b02a9aab96c2,b/layout/mathml/nsMathMLmoFrame.cpp,nsMathMLmoFrame::ProcessTextData
3,d53e368934524dd77444c4b607c8b02a9aab96c2,b/layout/mathml/nsMathMLTokenFrame.cpp,CompressWhitespace
4,d53e368934524dd77444c4b607c8b02a9aab96c2,b/layout/mathml/nsMathMLTokenFrame.cpp,nsMathMLTokenFrame::AppendFrames
5,d53e368934524dd77444c4b607c8b02a9aab96c2,b/layout/mathml/nsMathMLTokenFrame.cpp,nsMathMLTokenFrame::ForceTrimChildTextFrames
6,d53e368934524dd77444c4b607c8b02a9aab96c2,b/layout/mathml/nsMathMLTokenFrame.cpp,nsMathMLTokenFrame::GetMathMLFrameType
7,d53e368934524dd77444c4b607c8b02a9aab96c2,b/layout/mathml/nsMathMLTokenFrame.cpp,nsMathMLTokenFrame::Init
8,d53e368934524dd77444c4b607c8b02a9aab96c2,b/layout/mathml/nsMathMLTokenFrame.cpp,nsMathMLTokenFrame::InsertFrames
9,d53e368934524dd77444c4b607c8b02a9aab96c2,b/layout/mathml/nsMathMLTokenFrame.cpp,nsMathMLTokenFrame::SetInitialChildList
10,d53e368934524dd77444c4b607c8b02a9aab96c2,b/layout/mathml/nsMathMLTokenFrame.cpp,nsMathMLTokenFrame::SetTextStyle
