In [1]:
import sys
import os
from functools import wraps
from collections.abc import Iterable
import logging

from tqdm import tqdm
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine import Connection

import config

# Module-level logger named after this module.
logger = logging.getLogger(__name__)


def loose_equality(string1: str, string2: str, length: int = 100):
    """Return True when both strings start with the same ``length`` characters.

    Only the leading ``length`` characters of each string are compared, so two
    strings that differ later on still count as equal. A string shorter than
    ``length`` is compared in full.

    Args:
        string1: First string.
        string2: Second string.
        length: Number of leading characters to compare; expected to be
            non-negative. Defaults to 100.

    Returns:
        bool: whether the compared prefixes are identical.
    """
    return string1[:length] == string2[:length]


def count_invocations(func):
    """Decorate ``func`` so that every call is tallied.

    The returned wrapper exposes the tally as its ``invocations`` attribute,
    which starts at 0. The tally is increased before ``func`` runs, so a call
    that raises is still counted.
    """
    @wraps(func)
    def counted(*args, **kwargs):
        counted.invocations += 1
        return func(*args, **kwargs)

    counted.invocations = 0
    return counted


def binary_search(key: str, target: Iterable) -> int:
    """Find an element of a sorted sequence that loosely equals ``key``.

    Elements are matched with ``loose_equality`` instead of ``==``, so the
    returned index points at an element that shares the compared prefix with
    ``key``; it is not guaranteed to be an exact match, nor the first such
    element.

    Args:
        key: String to look for.
        target: Strings sorted in ascending order. Despite the ``Iterable``
            annotation, it must support ``len()`` and integer indexing
            (for example a list).

    Returns:
        Index of a matching element, or -1 when nothing matches, including
        when ``target`` is empty.
    """
    start = 0
    end = len(target)
    # Guard: an empty sequence has no pivot element to inspect.
    if end == 0:
        return -1

    # Narrow the half-open window [start, end) until one candidate is left.
    while end - start > 1:
        pivot = start + (end - start) // 2
        pivot_element = target[pivot]
        if loose_equality(pivot_element, key):
            return pivot
        if pivot_element > key:
            end = pivot
        else:
            # The pivot stays inside the window; it is re-checked at most once more.
            start = pivot

    # Exactly one candidate remains at ``start``.
    if loose_equality(target[start], key):
        return start
    return -1


def read_original_table(connection: Connection) -> pd.DataFrame:
    """Load the ``texts`` table from the schema named by ``config.DB``.

    Args:
        connection: Open database connection handed to pandas.

    Returns:
        The table as a DataFrame; no column is promoted to the index.
    """
    return pd.read_sql_table(
        "texts",
        con=connection,
        schema=config.DB,
        index_col=None,
    )


def read_new_table(filename: str) -> pd.DataFrame:
    """Read the ``текст`` column of a tab-separated file.

    Args:
        filename: Path to the tab-separated file; it must contain a column
            named ``текст``.

    Returns:
        A single-column DataFrame (not a Series) with the rows whose value is
        missing removed.
    """
    table: pd.DataFrame = pd.read_csv(filename, usecols=["текст"], sep="\t")
    return table.dropna(axis=0)


def get_sorted_column(new_table: pd.DataFrame) -> list:
    """Return the values of the ``текст`` column as an ascending list.

    The input frame is not modified. An ``assert`` checks that the column is
    present.
    """
    column_name = "текст"
    assert column_name in new_table.columns
    ordered = new_table[column_name].sort_values()
    return ordered.tolist()


def parse_args(args: list) -> str:
    """Extract and validate the CSV filename from a ``sys.argv``-style list.

    The checks raise ``AssertionError`` explicitly rather than through
    ``assert`` statements, so they still run when Python is started with
    ``-O`` (which strips ``assert``).

    Args:
        args: Argument list whose element at index 1 is the filename.

    Returns:
        The validated filename.

    Raises:
        AssertionError: if the filename is missing, does not point to an
            existing file, or does not end with ``.csv``.
    """
    if len(args) <= 1:
        raise AssertionError("Filename is missing")
    filename: str = args[1]
    if not os.path.isfile(filename):
        raise AssertionError(f"File {filename} does not exist")
    if not filename.endswith(".csv"):
        raise AssertionError("Invalid file type, .csv required")
    return filename


@count_invocations
def reinsert(new_text: str, _id: int, conn: Connection) -> None:
    """Overwrite ``raw_text`` of the ``texts`` row with the given id.

    Calls are tallied by the ``count_invocations`` decorator.

    Args:
        new_text: Replacement text; an ``assert`` checks that it is a ``str``.
        _id: Id of the row to update; sent to the database as a string.
        conn: Connection used to execute the statement.
    """
    assert isinstance(new_text, str)
    # Values travel as bound parameters, separate from the SQL text.
    statement = "UPDATE texts set raw_text = %s where id = %s"
    parameters = (new_text, str(_id))
    conn.execute(statement, parameters)


def find_equals(row: pd.Series, updated: pd.Series, conn: Connection) -> None:
    """Update one database row if a loosely matching but different text exists.

    Args:
        row: Record providing the ``id`` and ``raw_text`` fields.
        updated: Sorted candidate texts searched with ``binary_search``
            (``main`` in this file passes a plain sorted list).
        conn: Connection forwarded to ``reinsert``.

    Nothing happens when no candidate matches or when the matching candidate
    is identical to the current text.
    """
    _id, text = row[["id", "raw_text"]]
    match_idx = binary_search(text, updated)
    if match_idx != -1:
        candidate = updated[match_idx]
        # Only write when the stored text actually differs from the candidate.
        if candidate != text:
            reinsert(candidate, _id, conn)


def main(filename: str) -> None:
    """Update ``texts.raw_text`` in the database from a tab-separated file.

    The new texts are read from ``filename`` and sorted. Every database row
    with a non-null ``raw_text`` is then passed to ``find_equals``, which
    rewrites the row when a loosely matching but different text is found.
    Finally the number of ``reinsert`` calls is printed.

    The connection is closed and the engine disposed even if processing
    fails part-way through.

    Args:
        filename: Path to the tab-separated file containing the new texts.

    Note:
        ``reinsert.invocations`` is never reset, so the printed number is
        cumulative across calls to ``main`` within the same process.
    """
    new_table: pd.DataFrame = read_new_table(filename)
    new_values: list = get_sorted_column(new_table)
    engine = create_engine("mysql+pymysql://{}:{}@{}:{}/{}".format(
        config.USER,
        config.PASSWORD,
        config.HOST,
        config.PORT,
        config.DB
    ))
    try:
        # The context manager closes the connection on success and on error.
        with engine.connect() as conn:
            original_table: pd.DataFrame = read_original_table(connection=conn)
            original_table.dropna(axis=0, subset=["raw_text"], inplace=True)
            # Passing the row count lets tqdm show real progress, not just a counter.
            rows = tqdm(original_table.iterrows(), total=len(original_table))
            for _, row in rows:
                find_equals(row, new_values, conn)
    finally:
        # Release the pooled connections held by the engine.
        engine.dispose()
    print(f"reinsert function has been invoked {reinsert.invocations} times.")

In [3]:
# Notebook cell: run the update against the export stored at this relative path.
main("data/undone/tblCards.csv")

11081it [00:27, 406.01it/s] 

2988





In [None]:
if __name__ == "__main__":
    # Script entry point: validate the filename taken from sys.argv, then run the update.
    filename = parse_args(sys.argv)
    main(filename)