In [1]:
from getpass import getpass

# Prompt interactively for the MySQL password so that it is not stored in the
# notebook; it is used further down to build the URL of the remote engine.
PASSWORD=getpass("MySQL Password: ")

MySQL Password: ········


# Helper functions

In [2]:
from urllib.parse import quote

from sqlalchemy import create_engine
from sqlalchemy import text

# Engine for the database that gets repaired (user "serlo" on localhost:7777).
# The password is percent-encoded before it is embedded in the URL: characters
# such as "@", "/", ":" or "%" would otherwise be read as URL syntax and the
# credentials would be parsed incorrectly.
engine_remote = create_engine(
    f"mysql+mysqlconnector://serlo:{quote(PASSWORD, safe='')}@localhost:7777/serlo?charset=utf8mb4"
)

# Engine for the local copy on localhost:3306. It connects with charset=latin1;
# update_values() reads from this engine and passes the text through
# decode_text() to recover the original UTF-8 characters.
# NOTE(review): the credentials are hard-coded -- presumably a throwaway local
# development database; confirm before sharing this notebook.
engine_local = create_engine(
    "mysql+mysqlconnector://root:secret@localhost:3306/serlo?charset=latin1"
)

In [4]:
class MySQLSession:
    """Context manager around a single connection obtained from an engine.

    The connection is opened when the ``with`` block is entered and closed
    when it is left. SQL is passed in as plain strings, which are wrapped
    with ``text`` before execution.
    """

    def __init__(self, engine):
        # Only the engine is stored here; the connection is created in __enter__.
        self.engine = engine

    def __enter__(self):
        self.connection = self.engine.connect()
        return self

    def __exit__(self, *exc_info):
        # Nothing is returned, so an exception raised inside the block propagates.
        self.connection.close()

    def execute(self, statement, **kwargs):
        """Execute the SQL string, passing the keyword arguments as parameters."""
        sql = text(statement)
        return self.connection.execute(sql, kwargs)

    def query(self, statement, **kwargs):
        """Execute the SQL string and return all result rows as a list."""
        return [row for row in self.execute(statement, **kwargs)]

    def begin(self):
        """Begin a transaction on the connection and return the transaction object."""
        return self.connection.begin()
    
# Smoke test: open a connection to the remote database and close it again
# straight away, so connection problems surface before any statement is run.
with MySQLSession(engine_remote) as session:
    pass

# Convert all tables to utf8mb4

In [5]:
def change_default_character_set():
    """Make utf8mb4 / utf8mb4_unicode_520_ci the default of the ``serlo`` schema.

    After the ``ALTER DATABASE`` statement the schema defaults are read back
    from ``information_schema.SCHEMATA``. The transaction is committed when
    they match the target values and rolled back otherwise.

    NOTE(review): MySQL is documented to commit DDL statements implicitly, so
    the rollback branch presumably cannot revert the ALTER DATABASE -- confirm.
    """
    expected_defaults = ("utf8mb4", "utf8mb4_unicode_520_ci")

    with MySQLSession(engine_remote) as session:
        transaction = session.begin()

        session.execute("""
            alter database serlo character set utf8mb4 COLLATE utf8mb4_unicode_520_ci
        """)

        schema_rows = session.query("""
            SELECT default_character_set_name, default_collation_name 
                FROM information_schema.SCHEMATA 
                WHERE schema_name = 'serlo';
        """)
        # Only the first row is inspected.
        charset_name, collation_name = schema_rows[0]

        if (charset_name, collation_name) != expected_defaults:
            transaction.rollback()
        else:
            transaction.commit()

# Apply the new schema defaults on the remote database.
change_default_character_set()

In [6]:
def convert_to_utf8(session, table):
    """Convert ``table`` to utf8mb4 with the utf8mb4_unicode_520_ci collation.

    Args:
        session: object with an ``execute(statement)`` method (a MySQLSession).
        table: bare, unquoted name of the table to convert.
    """
    # Quote the identifier with backticks (doubling any embedded backtick) so
    # that table names which are reserved words or contain special characters
    # still produce a valid statement.
    quoted_table = "`" + table.replace("`", "``") + "`"
    session.execute(f"""
        ALTER TABLE {quoted_table} Convert To CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci
    """)

def fix_serlo_db_character_set():
    """Convert every table of the ``serlo`` schema to utf8mb4 and verify the result.

    The table names are read from ``information_schema.COLUMNS`` and each one
    is passed to ``convert_to_utf8``. Afterwards the distinct (character set,
    collation) pairs of all columns are read back: the transaction is
    committed when only the accepted pairs remain, otherwise the offending
    pairs are printed and the transaction is rolled back.

    NOTE(review): MySQL is documented to commit DDL statements implicitly, so
    the rollback presumably cannot undo conversions that already ran -- confirm.
    """
    accepted_character_sets = {
        (None, None),  # columns that report no character set at all
        ("utf8mb4", "utf8mb4_unicode_520_ci"),
    }

    with MySQLSession(engine_remote) as session:
        trans = session.begin()

        # Avoid error for invalid timestamps in column
        result = session.execute("""
            SET sql_mode = 'ALLOW_INVALID_DATES';
        """)

        assert result.rowcount == 0

        tables = [t[0] for t in session.query("""
            SELECT DISTINCT TABLE_NAME FROM information_schema.COLUMNS WHERE TABLE_SCHEMA = 'serlo';
        """)]

        for table in tables:
            print(f"Update {table} to utf8mb4")
            convert_to_utf8(session, table)

        result = session.query("""
            SELECT DISTINCT CHARACTER_SET_NAME, COLLATION_NAME FROM information_schema.COLUMNS WHERE TABLE_SCHEMA = 'serlo';
        """)

        # Result rows behave like tuples, and a tuple never compares equal to
        # a list, so each row is converted to a plain tuple before the check.
        unexpected = [
            tuple(row) for row in result
            if tuple(row) not in accepted_character_sets
        ]

        if not unexpected:
            trans.commit()
        else:
            print(f"WARNING: unexpected character sets remain: {unexpected}")
            trans.rollback()

# Convert all tables of the remote database; progress is printed per table.
fix_serlo_db_character_set()

Update ad to utf8mb4
Update ad_page to utf8mb4
Update attachment_container to utf8mb4
Update attachment_file to utf8mb4
Update blog_post to utf8mb4
Update comment to utf8mb4
Update comment_vote to utf8mb4
Update entity to utf8mb4
Update entity_link to utf8mb4
Update entity_revision to utf8mb4
Update entity_revision_field to utf8mb4
Update event to utf8mb4
Update event_log to utf8mb4
Update event_parameter to utf8mb4
Update event_parameter_name to utf8mb4
Update event_parameter_string to utf8mb4
Update event_parameter_uuid to utf8mb4
Update flag to utf8mb4
Update instance to utf8mb4
Update instance_permission to utf8mb4
Update language to utf8mb4
Update license to utf8mb4
Update metadata to utf8mb4
Update metadata_key to utf8mb4
Update migrations to utf8mb4
Update navigation_container to utf8mb4
Update navigation_page to utf8mb4
Update navigation_parameter to utf8mb4
Update navigation_parameter_key to utf8mb4
Update notification to utf8mb4
Update notification_event to utf8mb4
Update pag

# Fix entries

In [7]:
def is_not_utf8mb3_compatible(text):
    """Return True when ``text`` has at least one code point above U+FFFF.

    An empty string yields False.
    """
    return any(ord(character) > 0xFFFF for character in text)

def decode_text(content):
    """Re-interpret a latin1-decoded string as UTF-8.

    The string is turned back into bytes with the latin1 codec and those
    bytes are decoded as UTF-8. Byte sequences that are not valid UTF-8 are
    replaced with U+FFFD instead of raising an error.
    """
    raw_bytes = bytes(content, "latin1")
    return raw_bytes.decode("utf8", errors="replace")

def update_values(session_remote, session_local, table, column):
    """Repair the values of ``table.column`` on the remote database.

    All ``(id, column)`` pairs are read through ``session_local`` and passed
    through ``decode_text``. Rows whose decoded value contains a character
    above U+FFFF are then written to the row with the same ``id`` through
    ``session_remote``. NULL values are skipped.

    Returns:
        A list with one dict per repaired row, holding the keys ``table``,
        ``column``, ``id``, ``old_value`` and ``new_value``.
    """
    local_rows = session_local.query(f"""
        select id, {column} from {table}
    """)

    # Keep only the rows whose decoded text needs characters beyond U+FFFF.
    candidates = []
    for row_id, raw_value in local_rows:
        if raw_value is None:
            continue
        decoded = decode_text(raw_value)
        if is_not_utf8mb3_compatible(decoded):
            candidates.append((row_id, decoded))

    print(f"INFO: {len(candidates)} items need to be updated in {table}.{column}")

    logs = []

    for column_id, new_value in candidates:
        # Remember the current remote value so that the change can be logged.
        previous = session_remote.query(f"""
            select {column} from {table} where id = :column_id
        """, column_id=column_id)
        old_value = previous[0][0]

        result = session_remote.execute(f"""
            update {table} set {column} = :value where id = :column_id
        """, value=new_value, column_id=column_id)

        # Let's make sure that we only change one row
        assert result.rowcount == 1

        print(f"INFO: Repair column {column} of {table} with id {column_id}")

        entry = {
            "table": table,
            "column": column,
            "id": column_id,
            "old_value": old_value,
            "new_value": new_value,
        }
        logs.append(entry)

    return logs

def fix_serlo_db():
    """Repair all utf8 text columns of the ``serlo`` schema on the remote database.

    The column list is read from ``information_schema.COLUMNS`` of the remote
    database. Every column whose character set name starts with ``utf8`` is
    handed to ``update_values``. All updates run in one transaction on the
    remote connection, which is committed at the end.

    Returns:
        The concatenated change logs of all ``update_values`` calls.
    """
    with MySQLSession(engine_remote) as session_remote:
        with MySQLSession(engine_local) as session_local:
            trans = session_remote.begin()
            db_info = session_remote.query("""
                SELECT TABLE_NAME, COLUMN_NAME, CHARACTER_SET_NAME, COLLATION_NAME
                FROM information_schema.COLUMNS WHERE TABLE_SCHEMA = 'serlo';
            """)

            changes = []

            for table, column, character_set, _collation in db_info:
                # Skip columns without a character set or with a non-utf8 one.
                if character_set is None or not character_set.startswith("utf8"):
                    continue

                print(f"Update {table}.{column}")
                changes.extend(
                    update_values(session_remote, session_local, table, column)
                )

            trans.commit()
            return changes
    
# Run the repair and keep the returned change log; it is written to a JSON
# file in the last cell.
changes = fix_serlo_db()

print(f"Number of changes: {len(changes)}")

Update ad.content
INFO: 0 items need to be updated in ad.content
Update ad.title
INFO: 0 items need to be updated in ad.title
Update ad.url
INFO: 0 items need to be updated in ad.url
Update attachment_file.location
INFO: 0 items need to be updated in attachment_file.location
Update attachment_file.name
INFO: 0 items need to be updated in attachment_file.name
Update attachment_file.type
INFO: 0 items need to be updated in attachment_file.type
Update blog_post.content
INFO: 0 items need to be updated in blog_post.content
Update blog_post.title
INFO: 0 items need to be updated in blog_post.title
Update comment.content
INFO: 74 items need to be updated in comment.content
INFO: Repair column content of comment with id 68487
INFO: Repair column content of comment with id 83727
INFO: Repair column content of comment with id 100243
INFO: Repair column content of comment with id 135978
INFO: Repair column content of comment with id 139613
INFO: Repair column content of comment with id 147228
IN

INFO: Repair column value of entity_revision_field with id 204611
INFO: Repair column value of entity_revision_field with id 204955
INFO: Repair column value of entity_revision_field with id 205212
INFO: Repair column value of entity_revision_field with id 205230
INFO: Repair column value of entity_revision_field with id 205236
INFO: Repair column value of entity_revision_field with id 205242
INFO: Repair column value of entity_revision_field with id 205248
INFO: Repair column value of entity_revision_field with id 205254
INFO: Repair column value of entity_revision_field with id 205260
INFO: Repair column value of entity_revision_field with id 205266
INFO: Repair column value of entity_revision_field with id 205272
INFO: Repair column value of entity_revision_field with id 205278
INFO: Repair column value of entity_revision_field with id 205947
INFO: Repair column value of entity_revision_field with id 205979
INFO: Repair column value of entity_revision_field with id 206034
INFO: Repa

INFO: Repair column value of entity_revision_field with id 357436
INFO: Repair column value of entity_revision_field with id 357456
INFO: Repair column value of entity_revision_field with id 357462
INFO: Repair column value of entity_revision_field with id 357909
INFO: Repair column value of entity_revision_field with id 358993
INFO: Repair column value of entity_revision_field with id 358999
INFO: Repair column value of entity_revision_field with id 359011
INFO: Repair column value of entity_revision_field with id 359017
INFO: Repair column value of entity_revision_field with id 359023
INFO: Repair column value of entity_revision_field with id 359035
INFO: Repair column value of entity_revision_field with id 359139
INFO: Repair column value of entity_revision_field with id 359163
INFO: Repair column value of entity_revision_field with id 363154
INFO: Repair column value of entity_revision_field with id 363170
INFO: Repair column value of entity_revision_field with id 363871
INFO: Repa

INFO: Repair column value of entity_revision_field with id 418007
INFO: Repair column value of entity_revision_field with id 418227
INFO: Repair column value of entity_revision_field with id 418233
INFO: Repair column value of entity_revision_field with id 418238
INFO: Repair column value of entity_revision_field with id 418242
INFO: Repair column value of entity_revision_field with id 418248
INFO: Repair column value of entity_revision_field with id 418254
INFO: Repair column value of entity_revision_field with id 418264
INFO: Repair column value of entity_revision_field with id 418268
INFO: Repair column value of entity_revision_field with id 418275
INFO: Repair column value of entity_revision_field with id 418279
INFO: Repair column value of entity_revision_field with id 418282
INFO: Repair column value of entity_revision_field with id 418292
INFO: Repair column value of entity_revision_field with id 418301
INFO: Repair column value of entity_revision_field with id 418308
INFO: Repa

INFO: Repair column value of entity_revision_field with id 433337
INFO: Repair column value of entity_revision_field with id 433344
INFO: Repair column value of entity_revision_field with id 434566
INFO: Repair column value of entity_revision_field with id 434574
INFO: Repair column value of entity_revision_field with id 434577
INFO: Repair column value of entity_revision_field with id 434583
INFO: Repair column value of entity_revision_field with id 434589
INFO: Repair column value of entity_revision_field with id 434590
INFO: Repair column value of entity_revision_field with id 434597
INFO: Repair column value of entity_revision_field with id 434604
INFO: Repair column value of entity_revision_field with id 434608
INFO: Repair column value of entity_revision_field with id 434876
INFO: Repair column value of entity_revision_field with id 434878
INFO: Repair column value of entity_revision_field with id 434884
INFO: Repair column value of entity_revision_field with id 434890
INFO: Repa

INFO: Repair column content of page_revision with id 259053
INFO: Repair column content of page_revision with id 259061
INFO: Repair column content of page_revision with id 259083
INFO: Repair column content of page_revision with id 259791
INFO: Repair column content of page_revision with id 259837
INFO: Repair column content of page_revision with id 259839
INFO: Repair column content of page_revision with id 259843
Update page_revision.title
INFO: 0 items need to be updated in page_revision.title
Update permission.name
INFO: 0 items need to be updated in permission.name
Update related_content_category.name
INFO: 0 items need to be updated in related_content_category.name
Update related_content_external.title
INFO: 0 items need to be updated in related_content_external.title
Update related_content_external.url
INFO: 0 items need to be updated in related_content_external.url
Update related_content_internal.title
INFO: 0 items need to be updated in related_content_internal.title
Update r

In [8]:
import json

# Write the change log (one entry per repaired row, including the old and the
# new value) to a JSON file.
with open("/tmp/changes_to_db.json", "w") as f:
    json.dump(changes, f, indent=2)