In [1]:
import os
import re

import emoji
import psycopg2

In [None]:
def process_images(text):
    """Swap markdown image tokens in *text* for the ``<IMAGE>`` placeholder.

    A token is the literal ``![Image]`` plus any run of non-whitespace
    characters that immediately follows it (typically the ``(url)`` part).
    The alt text must be exactly ``Image``; the match is case-sensitive.

    Returns the rewritten text. If the substitution raises (for instance
    because *text* is not a string), the error is printed and *text* is
    returned unchanged.
    """
    image_token = r'!\[Image\]\S*'
    try:
        return re.sub(image_token, '<IMAGE>', text)
    except Exception as e:
        print(f"Unexpected error occurred: {e}")
        return text

def process_urls(text):
    """Replace every URL in *text* with the ``<URL>`` placeholder.

    Two forms are recognised:

    * ``http://`` or ``https://`` followed by non-whitespace characters;
    * ``www.`` at a word boundary followed by non-whitespace characters.

    The scheme separator ``://`` is required, so ordinary identifiers that
    merely begin with "http" (``http_archive``, ``httplib2``, ``httpd``) are
    left intact instead of being rewritten as URLs.

    Returns the rewritten text. If the substitution raises (for instance
    because *text* is not a string), the error is printed and *text* is
    returned unchanged.
    """
    try:
        text = re.sub(r'https?://\S+|\bwww\.\S+', '<URL>', text)
    except Exception as e:
        print(f"Unexpected error occurred: {e}")
    return text

def process_code_blocks(text):
    """Collapse each triple-backtick fenced section of *text* to ``<CODE_BLOCK>``.

    Matching is non-greedy and spans newlines, so every opening/closing fence
    pair is replaced on its own. An opening fence with no closing fence is
    left as it is.

    Returns the rewritten text. If the substitution raises (for instance
    because *text* is not a string), the error is printed and *text* is
    returned unchanged.
    """
    fenced = re.compile(r'```(.*?)```', flags=re.DOTALL)
    try:
        return fenced.sub('<CODE_BLOCK>', text)
    except Exception as e:
        print(f"Unexpected error occurred: {e}")
        return text

def process_reply(text):
    """Rewrite markdown quote lines as ``<user is replying to: ... >`` markers.

    A quote line is one that starts (after at most three spaces) with one or
    more ``>`` markers, each optionally followed by a single space. The rest
    of the line becomes the quoted content. Matching is done per line and is
    anchored to the start of the line, so a ``>`` in the middle of a sentence
    (``a -> b``) or inside a placeholder token is never treated as a quote.
    The quoted content is kept in full whether or not a space follows the
    ``>`` marker.

    A carriage return at the end of a quote line is kept after the marker so
    CRLF line endings survive the rewrite.

    Returns the rewritten text. If the substitution raises (for instance
    because *text* is not a string), the error is printed and *text* is
    returned unchanged.
    """
    try:
        text = re.sub(
            r'^ {0,3}(?:> ?)+(.*?)(\r?)$',
            r'<user is replying to: \1 >\2',
            text,
            flags=re.MULTILINE,
        )
    except Exception as e:
        print(f"Unexpected error occurred: {e}")
    return text

def process_emojis(text):
    """Run *text* through ``emoji.demojize`` with ``<emoji_`` / ``>`` delimiters.

    Each emoji the library recognises is thereby turned into a textual token
    that opens with ``<emoji_`` and closes with ``>``.

    Returns the converted text. If the call raises for any reason, the error
    is printed and *text* is returned unchanged.
    """
    token_delimiters = ("<emoji_", ">")
    try:
        return emoji.demojize(text, delimiters=token_delimiters)
    except Exception as e:
        print(f"Unexpected error occurred: {e}")
        return text

def create_pipeline(functions):
    """Build a callable that applies *functions* to a value, in order.

    Args:
        functions: Any iterable of one-argument callables. It is consumed
            once, when the pipeline is created.

    Returns:
        A function ``pipeline(text)`` that feeds ``text`` through every step,
        passing each step's return value to the next, and returns the final
        result. With no steps the input is returned as is.
    """
    # Snapshot the steps up front. Iterating the caller's object on every call
    # would make a generator argument work only once (later calls would
    # silently do nothing) and would let later edits to the caller's list
    # change an already-built pipeline.
    steps = tuple(functions)

    def pipeline(text):
        for func in steps:
            text = func(text)
        return text
    return pipeline


In [3]:
# Cleaning steps, applied in list order. Fenced code is collapsed first, so
# the later steps only ever see text that was outside a code block.
# process_reply is defined above but is not part of this list.
pipeline_functions = [
    process_code_blocks,
    process_images,
    process_urls,
    process_emojis,
]
pipeline = create_pipeline(pipeline_functions)

In [None]:
# Connection settings come from the standard libpq environment variables so
# that real credentials never have to be typed into (and saved with) the
# notebook. Each fallback is the value this cell previously hard-coded, so an
# environment without these variables behaves exactly as before.
DATABASE_CONFIG = {
    "dbname": os.environ.get("PGDATABASE", "tensorflow_data"),
    "user": os.environ.get("PGUSER", "postgres"),
    "password": os.environ.get("PGPASSWORD", ""),
    "host": os.environ.get("PGHOST", ""),
    "port": int(os.environ.get("PGPORT", "5432")),
}

# One connection and one cursor, reused by the cells below.
conn = psycopg2.connect(**DATABASE_CONFIG)
cursor = conn.cursor()

In [5]:
# Fetch (issue_id, body) for every row of issues_from_release whose
# release_number contains the substring "2.12".
cursor.execute(
    """
    SELECT 
        issue_id,
        body
    FROM 
        issues_from_release
    WHERE
        release_number LIKE '%2.12%'
    """
)
# Materialise all rows now: the processing cell below reuses this same cursor
# for its own statements while walking over `issues`.
issues = cursor.fetchall()
# Bare last expression so the cell displays how many issues were selected.
len(issues)

305

In [6]:
def _clean_issue_and_comments(cursor, issue_id, issue_body):
    """Run one issue body and all of its comment bodies through `pipeline`.

    The processed texts overwrite the ``body`` columns of
    ``issues_from_release`` and ``issue_comments_from_release`` through
    *cursor*. Nothing is committed here; the caller owns the transaction.
    """
    # Process the body of the issue
    issue_body = pipeline(issue_body)

    # Save the processed issue body
    cursor.execute(
        "UPDATE issues_from_release SET body = %s WHERE issue_id = %s", (issue_body, issue_id)
    )

    # Get the comments of the current issue
    cursor.execute(
        """
        SELECT 
            comment_id,
            body
        FROM 
            issue_comments_from_release
        WHERE 
            issue_id = %s;
        """, (issue_id,)
    )
    comments = cursor.fetchall()

    # Process the body of each comment
    for comment_id, comment_body in comments:
        print(f"Processing comment {comment_id}...")
        comment_body = pipeline(comment_body)

        # Save the processed comment body
        cursor.execute(
            "UPDATE issue_comments_from_release SET body = %s WHERE comment_id = %s",
            (comment_body, comment_id,)
        )


# All updates form a single transaction: either every issue and comment is
# rewritten, or none is.
try:
    for issue in issues:
        issue_id, issue_body = issue
        _clean_issue_and_comments(cursor, issue_id, issue_body)
except BaseException:
    # Covers kernel interrupts as well as errors. Without the rollback the
    # shared connection stays inside the failed transaction and every later
    # statement on it is rejected until someone rolls back by hand.
    conn.rollback()
    raise
else:
    conn.commit()
    print("Alterations successfully saved!")

Processing comment 1695513249...
Processing comment 1695535407...
Processing comment 1695558072...
Processing comment 1696165580...
Processing comment 1715143031...
Processing comment 1719905957...
Processing comment 1719924313...
Processing comment 1731207277...
Processing comment 1774201238...
Processing comment 1971701292...
Processing comment 1973750640...
Processing comment 1985293609...
Processing comment 2505483522...
Processing comment 2512464527...
Processing comment 1643249722...
Processing comment 1654858331...
Processing comment 1665594103...
Processing comment 1674244153...
Processing comment 1674262801...
Processing comment 1675929397...
Processing comment 1676338546...
Processing comment 1685497041...
Processing comment 1686093005...
Processing comment 1771228139...
Processing comment 1771659417...
Processing comment 1791589854...
Processing comment 1793844504...
Processing comment 1854870692...
Processing comment 1914122112...
Processing comment 1928633287...
Processing