In [1]:
import os
import re

import emoji
import psycopg2

In [2]:
def process_images(text):
    """Replace Markdown image markers with an ``<IMAGE>`` placeholder.

    A match is the literal ``![Image]`` together with any non-whitespace
    characters that directly follow it (typically the parenthesised link).

    Args:
        text: The string to clean.

    Returns:
        The string with every match replaced by ``<IMAGE>``.
    """
    image_marker = re.compile(r'!\[Image\]\S*')
    return image_marker.sub('<IMAGE>', text)

def process_urls(text):
    """Replace URLs with a ``<URL>`` placeholder.

    Anything starting with ``http`` or ``www.`` is treated as a URL and is
    consumed up to the next whitespace character, so trailing punctuation
    attached to the link is swallowed as well.

    Args:
        text: The string to clean.

    Returns:
        The string with every match replaced by ``<URL>``.
    """
    url_pattern = re.compile(r'http\S+|www\.\S+')
    return url_pattern.sub('<URL>', text)

def process_code_blocks(text):
    """Replace fenced (triple-backtick) code blocks with ``<CODE_BLOCK>``.

    The match is non-greedy, so each opening fence pairs with the nearest
    closing fence, and it spans newlines. A fence that is never closed is
    left untouched.

    Args:
        text: The string to clean.

    Returns:
        The string with every fenced block replaced by ``<CODE_BLOCK>``.
    """
    fenced_block = re.compile(r'```(.*?)```', re.DOTALL)
    return fenced_block.sub('<CODE_BLOCK>', text)

def process_reply(text):
    """Rewrite Markdown quote lines as ``<user is replying to: ... >``.

    Only lines that *start* with a quote marker are rewritten: up to three
    leading spaces, then one or more ``>`` markers, each optionally followed
    by a space or tab. Everything after the markers on that line becomes the
    quoted text.

    Anchoring to the start of the line keeps a ``>`` that appears mid-line
    (``->``, comparisons, placeholders such as ``<URL>``) from being
    mistaken for a quote. Making the space after ``>`` optional keeps the
    first quoted character when the author wrote ``>text``.

    Args:
        text: The string to clean; may contain several lines.

    Returns:
        The string with each quote line replaced by the reply placeholder.
    """
    quote_line = re.compile(r'^ {0,3}(?:>[ \t]?)+(.*)', re.MULTILINE)
    return quote_line.sub(r'<user is replying to: \1 >', text)

def process_emojis(text):
    """Convert emoji characters into textual placeholders.

    Delegates to ``emoji.demojize`` with ``<emoji_`` and ``>`` as the opening
    and closing delimiters, so each emoji is emitted as ``<emoji_NAME>``.

    Args:
        text: The string to clean.

    Returns:
        The string returned by ``emoji.demojize``.
    """
    placeholder_delimiters = ("<emoji_", ">")
    return emoji.demojize(text, delimiters=placeholder_delimiters)

def create_pipeline(functions):
    """Compose single-argument text processors into one callable.

    Args:
        functions: Iterable of callables. Each one receives the output of
            the previous step and returns the input for the next.

    Returns:
        A function ``pipeline(text)`` that applies every step in order and
        returns the final value. With no steps it returns ``text`` unchanged.
    """
    # Snapshot the steps once. Without this, a one-shot iterable (e.g. a
    # generator) would be exhausted after the first call and the pipeline
    # would silently do nothing afterwards; it also shields the pipeline
    # from later mutation of the caller's list.
    steps = tuple(functions)

    def pipeline(text):
        """Apply every configured step to ``text``, in order."""
        for step in steps:
            text = step(text)
        return text

    return pipeline


In [None]:
# Order matters:
#   1. fenced code blocks are collapsed first, so nothing inside them is
#      rewritten by the later steps;
#   2. images go before URLs, so the link attached to an image marker is
#      consumed together with the marker;
#   3. emojis are converted last.
pipeline_functions = [
    process_code_blocks,
    process_images,
    process_urls,
    process_emojis,
]
pipeline = create_pipeline(pipeline_functions)

In [None]:
# Connection settings for the PostgreSQL database holding the issue data.
# Each value can be overridden through an environment variable so that
# credentials do not have to be typed into (and saved with) the notebook.
# The defaults are the values that were previously hard-coded here.
DATABASE_CONFIG = {
    "dbname": os.environ.get("PGDATABASE", "tensorflow_data"),
    "user": os.environ.get("PGUSER", "postgres"),
    "password": os.environ.get("PGPASSWORD", ""),
    "host": os.environ.get("PGHOST", ""),
    # `or` also covers PGPORT being set to an empty string.
    "port": int(os.environ.get("PGPORT") or 5432),
}

# One connection and one cursor are shared by the cells below.
conn = psycopg2.connect(**DATABASE_CONFIG)
cursor = conn.cursor()

In [5]:
# LIKE pattern selecting which releases' issues get processed. Change this
# one value to target another release.
# NOTE(review): this is a substring match, so it would also hit values such
# as "12.18" or "2.180" if they exist in release_number; confirm intended.
RELEASE_PATTERN = "%2.18%"

# Fetch (issue_id, body) for every issue of the selected release. The pattern
# is bound as a query parameter instead of being embedded in the SQL text.
cursor.execute(
    """
    SELECT
        issue_id,
        body
    FROM
        issues_from_release
    WHERE
        release_number LIKE %s
    """,
    (RELEASE_PATTERN,),
)
issues = cursor.fetchall()
len(issues)

78

In [None]:
# Run the preprocessing pipeline over every fetched issue and over each of
# its comments, writing the cleaned text back to the same `body` column.
#
# NOTE: the original text is overwritten in place; keep a backup of the
# tables if the raw bodies are still needed.
#
# All updates happen in one transaction: it is committed only if the whole
# loop succeeds and rolled back on any error, so a failure never leaves the
# connection stuck in a half-finished transaction.
try:
    for issue_id, issue_body in issues:
        # A NULL body arrives as None, which the regex-based pipeline steps
        # cannot handle; leave such rows untouched.
        if issue_body is not None:
            cursor.execute(
                "UPDATE issues_from_release SET body = %s WHERE issue_id = %s",
                (pipeline(issue_body), issue_id),
            )

        # Fetch the comments that belong to the current issue.
        cursor.execute(
            """
            SELECT
                comment_id,
                body
            FROM
                issue_comments_from_release
            WHERE
                issue_id = %s;
            """,
            (issue_id,),
        )
        comments = cursor.fetchall()
        print(f"Found {len(comments)} comments for issue {issue_id}.")

        # Process and save the body of each comment.
        for comment_id, comment_body in comments:
            if comment_body is None:
                continue
            cursor.execute(
                "UPDATE issue_comments_from_release SET body = %s WHERE comment_id = %s",
                (pipeline(comment_body), comment_id),
            )
except Exception:
    conn.rollback()
    raise
else:
    conn.commit()