In [None]:
import polars as pl
from tqdm import tqdm

## Paths

In [None]:
# Root directory of the dump files; the input and output CSV paths below are built by appending to it.
base = "../game-dump/"

In [None]:
# paths to input CSVs
# fn0 and fn1 are the two call listings that get merged into a single calls table.
fn0 = base + "broken-byte-calls.csv"
fn1 = base + "broken-analyzed-calls.csv"
# Dump of the old IAT (loaded later as `iat_entries`).
old_iat_csv = base + "old-iat.csv"
# Function listing (loaded later as `functions_table`).
fn_calls = base + "game.exe.export.csv"

In [None]:
# path to exe with prefilled IAT
# NOTE(review): absolute, machine-specific path — adjust it before running elsewhere.
out_path = r"G:\Games\FA\FA-EMU\Shipping\GAME_dump_mod.exe"

In [None]:
# path to output CSVs
# Thunk table.
fn_thunks = base + "thunks.csv"
# Patch tables (columns: patch_addr, mem_old, patch).
calls_patch_path = base + "calls_patch.csv"
thunks_patch_path = base + "thunks_patch.csv"
old_iat_patch_path = base + "old_iat_patch.csv"
# Old IAT entries resolved to module/function.
new_idt_path = base + "new_idt.csv"

# Import data

In [None]:
# Read both call listings.
df0 = pl.read_csv(fn0)
df1 = pl.read_csv(fn1)

# Anti-join: drop every df1 row whose "Call address" already occurs in df0,
# so df0 wins whenever both listings describe the same call site.
known_call_sites = df0.select("Call address")
df1_filtered = df1.join(known_call_sites, on="Call address", how="anti")

# Stack df0 on top of the remaining df1 rows; `df` is kept as an alias.
calls_table = pl.concat([df0, df1_filtered])
df = calls_table

df.head(3)

# Count calls

In [None]:
# Number of calls per "Destination", most frequent first.
# (`top5` holds every destination; only the display and `top5_unique` use the first five.)
top5 = df.group_by("Destination").len().sort("len", descending=True)

total_calls = df.height
total_unique = df["Destination"].n_unique()
# Calls covered by the five most frequent destinations.
top5_unique = top5["len"].head(5).sum()

print(total_calls, total_unique, top5_unique)
top5.head()

In [None]:
# Destination to inspect. Only one value can be active at a time; destinations
# that were inspected previously:
#   "0x60bcccc9", "0x7584eb00", "0x7584ea80", "0x60c17149"
target = "0x6a409442"

# All calls whose destination is `target`.
s = df.filter(pl.col("Destination") == target)
s.head()

# Map to calls

In [None]:
# Load the function listing; the (shape, columns) tuple is shown as the cell output.
functions_table = pl.read_csv(fn_calls)
functions_table.shape, functions_table.columns

In [None]:
# --- Convert addresses to integers ---
# Native string expressions are used instead of a per-row Python callback
# (`map_elements`), matching how the later cells parse hex addresses.
# Surrounding whitespace and an optional "0x" prefix are removed before parsing.
calls_table = calls_table.with_columns(
    pl.col("Destination")
    .str.strip_chars()
    .str.strip_prefix("0x")
    .str.to_integer(base=16)
    .alias("Dest_int")
)

functions_table = functions_table.with_columns(
    pl.col("Address")
    .str.strip_chars()
    .str.strip_prefix("0x")
    .str.to_integer(base=16)
    .alias("Addr_int")
)

In [None]:
# Spot check: show the second "Function" value that contains "Umbra::MatrixFormat".
functions_table.filter(pl.col("Function").str.contains("Umbra::MatrixFormat"))[
    "Function"
][1]

In [None]:
# Drop the rows whose "Module" is game.exe; only functions of other modules remain.
functions_table = functions_table.filter(pl.col("Module") != "game.exe")
functions_table

In [None]:
def remove_dupes(df: pl.DataFrame, expr: pl.Expr) -> pl.DataFrame:
    """Return `df` without duplicate rows, keeping the first row of each key.

    `expr` is forwarded unchanged as the `subset` argument of `DataFrame.unique`.
    """
    deduplicated = df.unique(keep="first", subset=expr)
    return deduplicated

In [None]:
# Deduplicate by "Address"; the shape is printed before and displayed after.
print(functions_table.shape)
functions_table = remove_dupes(functions_table, pl.col("Address"))
functions_table.shape

In [None]:
# --- fix names to decorated ---
# Name map with "undecorated" and "decorated" columns (both are used by the join below).
# NOTE(review): this path is hard-coded here instead of living with the other paths at the top.
dec_undec = pl.read_csv("../anti-debug-dump/names-map.csv")
print(dec_undec.shape)
# Keep a single row per undecorated name.
dec_undec = remove_dupes(dec_undec, pl.col("undecorated"))
dec_undec.shape

In [None]:
# Normalise the join keys on both sides the same way: remove every space and
# truncate to the first 200 characters.
right = dec_undec.with_columns(
    pl.col("undecorated").str.replace_all(" ", "").str.slice(0, 200)
)
left = functions_table.with_columns(
    pl.col("Function").str.replace_all(" ", "").str.slice(0, 200)
)

In [None]:
# Debug aid: the "Umbra::MatrixFormat" rows on each side of the upcoming join
# (a1/a2 are not referenced again in the visible notebook).
a1 = left.filter(pl.col("Function").str.contains("Umbra::MatrixFormat"))
a2 = right.filter(pl.col("undecorated").str.contains("Umbra::MatrixFormat"))

In [None]:
# Attach the decorated name where the normalised undecorated name matches.
left = left.join(right, left_on="Function", right_on="undecorated", how="left")
# Prefer the decorated name and fall back to the normalised "Function" value,
# then expose the result under the column name "Function" again.
functions_table = (
    left.with_columns(pl.coalesce(pl.col("decorated", "Function")))
    .drop("Function")
    .rename({"decorated": "Function"})
)

In [None]:
# --- Sort for searching ---
# Both frames are ordered by their integer address keys ahead of the as-of join.
functions_table = functions_table.sort("Addr_int")
calls_table = calls_table.sort("Dest_int")

In [None]:
# Keep one row per integer address. `maintain_order=True` preserves the
# "Addr_int" ordering produced by the preceding sort: without it `unique` gives
# no guarantee about row order, and the as-of join that follows expects
# `functions_table` to be sorted on "Addr_int".
functions_table = functions_table.unique(
    subset=["Addr_int"], keep="first", maintain_order=True
)

In [None]:
# --- Resolve function names ---
def resolve_func(addr, func_addrs, func_names):
    """Find the function that starts at the closest address at or below `addr`.

    `func_addrs` and `func_names` are indexed with boolean masks, so they are
    presumably parallel NumPy-style arrays (TODO confirm against callers).

    Returns `(name, precise)`, where `precise` tells whether `addr` is exactly
    the function's start address, or `(None, False)` when no function starts at
    or below `addr`.
    """
    candidates = func_addrs[func_addrs <= addr]
    if len(candidates) == 0:
        return None, False
    nearest = candidates.max()
    matching_names = func_names[func_addrs == nearest]
    return matching_names[0], addr == nearest

In [None]:
# --- Perform an asof join (find closest smaller or equal function address) ---
# Each call row is paired with the functions_table row whose "Addr_int" is the
# largest value not exceeding the call's "Dest_int".
joined = calls_table.join_asof(
    functions_table,
    left_on="Dest_int",
    right_on="Addr_int",
    strategy="backward",  # means <= (closest smaller or equal)
)

joined.head(2)

In [None]:
# --- Add Precise flag ---
# "Precise" is true when the call destination is exactly the start address of
# the function picked by the as-of join.
hits_function_start = pl.col("Dest_int") == pl.col("Addr_int")
joined = joined.with_columns(hits_function_start.alias("Precise"))

# manual check that we lost nothing important
# print(joined.filter(pl.col("Precise") == False).shape)
# for caddr, dest in joined.filter(pl.col("Precise") == False).select('Call address', 'Destination').to_numpy():
#     print(caddr, dest)
# Checked: we didn't
# UPD: not sure any more — the imprecise rows should be re-checked

# Keep only the exact matches.
joined = joined.filter(pl.col("Precise"))

In [None]:
# --- Clean up ---
# Keep only the listed columns; the matched "Function" is exposed as "Resolved name".
result = joined.select(
    [
        "function",
        "Call address",
        "Instruction",
        "Destination",
        "Module",
        pl.col("Function").alias("Resolved name"),
    ]
)

result

# Gather modules

In [None]:
# Limit table display to 10 rows.
pl.Config(tbl_rows=10)

In [None]:
# Number of distinct resolved function names per module.
modules = (
    result.group_by("Module")
    .agg(pl.col("Resolved name").n_unique().alias("Unique"))
    .sort("Module")
)
# print(modules['Module'].to_numpy())
# Displayed ordered by count; `modules` itself stays sorted by module name.
modules.sort("Unique")

In [None]:
# Limit table display to 10 rows.
pl.Config(tbl_rows=10)

In [None]:
# Module under inspection.
selected = "dsound.dll"
res_mod = result.filter(pl.col("Module") == selected).sort("Call address")

# One row per destination, split into jmp sites and all other instructions.
is_jmp = pl.col("Instruction") == "jmp"
jmp_targets = res_mod.filter(is_jmp).unique("Destination")
other_targets = res_mod.filter(~is_jmp).unique("Destination")

jmp_n = jmp_targets.shape[0]
call_n = other_targets.shape[0]
# Fewer jmp destinations than non-jmp destinations is reported as unresolved imports.
if jmp_n < call_n:
    print(f"WARNING! {call_n - jmp_n} imports not resolved")
else:
    print(f"All({call_n}) imports resolved({jmp_n})")

jmp_targets.with_row_index()

In [None]:
# Add placeholder columns; each is filled with the literal string "null"
# (a string value, not a missing value).
modules = modules.with_columns(
    pl.lit("null").alias("Name_begin"),
    pl.lit("null").alias("ILT_begin"),
    pl.lit("null").alias("ILT_length"),
    pl.lit("null").alias("IAT_begin"),
    pl.lit("null").alias("IAT_length"),
)

In [None]:
# fn_m = r"..\anti-debug-dump\modules_metadata.csv"
# modules.write_csv(fn_m)

# Find jump thunks

In [None]:
# Spot check: the instruction recorded for the call site at 0x14ad107.
result.filter(pl.col("Call address") == "0x14ad107")["Instruction"][0]

In [None]:
# Helper function to get a row by address
def get_row_by_address(addr: int):
    """Return the rows of `result` whose "Call address" equals `hex(addr)`."""
    wanted = hex(addr)
    return result.filter(pl.col("Call address") == wanted)


# a thunk is any `nop, e9 ? ? ? ?`, which is surrounded by at least 2 thunks (or two behind / two in front)
def is_thunk(call_address: str) -> bool:
    """Tell whether the instruction at `call_address` sits in a run of jmps spaced 6 bytes apart.

    `call_address` is a hex string. The result is False (after printing a short
    message) when the string cannot be parsed or no row of `result` has that
    call address. Otherwise it is True when at least one of these holds:

    * the rows at +6 and -6 both exist and are "jmp";
    * the rows at -6 and -12 both exist and are "jmp";
    * the rows at +6 and +12 both exist and are "jmp".
    """
    try:
        origin = int(call_address, 16)
    except ValueError:
        print("val err")
        return False

    if len(get_row_by_address(origin)) == 0:
        print("no command")
        return False

    # Rows found one and two slots away on either side.
    next1 = get_row_by_address(origin + 6)
    prev1 = get_row_by_address(origin - 6)
    prev2 = get_row_by_address(origin - 12)
    next2 = get_row_by_address(origin + 12)

    jmp_flag = pl.col("Instruction") == "jmp"

    # One jmp neighbour on each side.
    between = (
        len(next1) > 0
        and len(prev1) > 0
        and next1.select(jmp_flag).item()
        and prev1.select(jmp_flag).item()
    )

    # Two jmp neighbours directly behind.
    after_two = (
        len(prev1) > 0
        and len(prev2) > 0
        and prev1.select(jmp_flag).item()
        and prev2.select(jmp_flag).item()
    )

    # Two jmp neighbours directly ahead.
    before_two = (
        len(next1) > 0
        and len(next2) > 0
        and next1.select(jmp_flag).item()
        and next2.select(jmp_flag).item()
    )

    # True as soon as any one of the three neighbour patterns matched.
    return between | after_two | before_two

In [None]:
# Call sites that are always treated as thunks even though the neighbour-spacing
# rule below does not select them (the manual "exceptions" list).
MANUAL_THUNK_ADDRESSES = [
    "0x14aaadb",
    "0x14aaad5",
    "0x14aa9f5",
    "0x14aa9fb",
    "0x14aab3d",
    "0x14ab35d",
    "0x14ab363",
    "0x14ab405",
    "0x14ab40b",
    "0x14ab767",
    "0x14ab76d",
    "0x14abc33",
    "0x14abc39",
    "0x14abe61",
    "0xcedcb1",
    "0xcedcf1",
]

# A dedicated name is used instead of rebinding the generic `df`, so the earlier
# cells that read `df` keep working when they are re-run.
jmp_calls = (
    calls_table.filter(pl.col("Instruction") == "jmp")
    # Integer call address, then order by it so shift() sees address neighbours.
    .with_columns(
        pl.col("Call address").str.slice(2).str.to_integer(base=16).alias("call int")
    )
    .sort("call int")
    # Distance in bytes to the following / preceding jmp.
    .with_columns(
        (pl.col("call int").shift(-1) - pl.col("call int")).alias("to next"),
        (pl.col("call int") - pl.col("call int").shift(1)).alias("to prev"),
    )
    # 1) a jmp with a jmp exactly 6 bytes before AND 6 bytes after it
    .with_columns(
        ((pl.col("to prev") == 6) & (pl.col("to next") == 6)).alias("is_thunk")
    )
    # 2) also mark the row that follows a marked row
    .with_columns(pl.col("is_thunk") | pl.col("is_thunk").shift(1))
    # 3) also mark the row that precedes a marked row (including rows marked in step 2)
    .with_columns(pl.col("is_thunk") | pl.col("is_thunk").shift(-1))
    # 4) manual exceptions
    .with_columns(
        pl.col("is_thunk") | pl.col("Call address").is_in(MANUAL_THUNK_ADDRESSES)
    )
)

# Rows whose flag is null (possible at the ends of the table) are dropped by filter.
thunks = jmp_calls.filter("is_thunk")
thunks.shape

In [None]:
# Integer form of the thunk's destination, named "Addr_int" so it can be joined
# against functions_table.
thunks = thunks.with_columns(
    pl.col("Destination").str.slice(2).str.to_integer(base=16).alias("Addr_int")
)
# Left join: thunks without a matching function keep null "Module"/"Function".
thunks = thunks.join(
    functions_table.select("Module", "Function", "Addr_int"), on="Addr_int", how="left"
)
# Drop the helper columns.
# NOTE(review): "Resolved name" is not created anywhere in the thunk pipeline
# visible here — confirm it comes from the input CSVs, otherwise this drop may fail.
thunks = thunks.drop(
    "Resolved name",
    "Dest_int",
    "call int",
    "to next",
    "to prev",
    "Addr_int",
    "is_thunk",
    "function",
    "Instruction",
)

print(thunks.shape)
thunks.head(5)

In [None]:
# Thunks whose destination matched no row of functions_table ("Module" is null after the left join).
thunks.filter(pl.col("Module").is_null())

### Not enough thunks (adding a few from old IAT)

In [None]:
iat_entries = pl.read_csv(old_iat_csv)
# Skip empty slots.
iat_entries = iat_entries.filter(pl.col("Destination") != "00000000")
iat_entries = iat_entries.filter(pl.col("Address") != "0x01588AB0")  # broken entry

# Destinations already served by a thunk, collected once so that each IAT entry
# is checked with a set lookup instead of a full DataFrame filter.
thunk_destinations = set(thunks["Destination"])

# Old-IAT destinations that have no thunk yet, in first-seen order.
no_thunks: list[str] = []
seen_missing: set[str] = set()

# Each row is expected to be an (address, destination) pair.
for addr, dest in iat_entries.rows():
    # Thunk destinations are "0x"-prefixed, so normalise before comparing.
    if hex(int(dest, 16)) in thunk_destinations:
        continue
    if dest in seen_missing:
        dup = "Duplicated e"
    else:
        dup = "E"
        seen_missing.add(dest)
        no_thunks.append(dest)
    print(f"{dup}ntry {dest} from old iat at {addr} is not found among thunks")

In [None]:
available_thunk_places = [
    ["0x014AB2A4", 12],
    ["0x014AB368", 8],
    ["0x014AB3C2", 14],
    ["0x014AB5B6", 10],
    ["0x014AB602", 14],
]


def find_next_addr(size: int = 6) -> str | None:
    global available_thunk_places
    for i in range(len(available_thunk_places)):
        e = available_thunk_places[i]
        if e[1] >= size:
            e[1] -= size
            retval = int(e[0], 16)
            e[0] = hex(retval + size)
            return hex(retval)
    return None


# Mark the existing thunks as not new. The column is only added when missing, so
# re-running this cell does not reset the flag of thunks added by an earlier run.
if "new" not in thunks.columns:
    thunks = thunks.with_columns(pl.lit(False).alias("new"))

for dest in no_thunks:
    addr = find_next_addr()
    # Fail before logging, so the log never reports "Using None ...".
    if addr is None:
        raise RuntimeError("Can't place new thunk: no available space")
    print(f"Using {addr} to place a new thunk")

    # Module and function name of the destination.
    func = functions_table.filter(pl.col("Address") == dest)
    if func.height == 0:
        raise RuntimeError(
            f"Destination {dest} from the old IAT is not present in functions_table"
        )
    thunks = thunks.vstack(
        pl.DataFrame(
            {
                "Call address": hex(int(addr, 16)),
                "Destination": hex(int(dest, 16)),
                "Module": func["Module"][0],
                "Function": func["Function"][0],
                "new": True,
            }
        )
    )

# Emptied so that re-running the cell does not place the same thunks twice.
no_thunks = []
print(thunks.shape)

### Check and save

In [None]:
# Every destination that has a thunk; the count and a sample are displayed.
dests = set(thunks["Destination"])
len(dests), list(dests)[:10]

In [None]:
# check all the functions have thunks
# `df` collects the resolved calls whose destination has no thunk at all.
has_thunk = pl.col("Destination").is_in(dests)
df = result.filter(~has_thunk)
n_unthunked = df.height
assert n_unthunked == 0, f"Some({n_unthunked}) calls are not thunked!"

In [None]:
# Save the thunk table.
thunks.write_csv(fn_thunks)

# Building new IDT for the old IAT

In [None]:
# Resolve each old-IAT destination to a module/function through its integer address.
# No prefix is stripped before parsing, so "Destination" is presumably
# un-prefixed hex in this file — TODO confirm.
old_iat = iat_entries.with_columns(
    pl.col("Destination").str.to_integer(base=16).alias("Addr_int")
)
old_iat = old_iat.join(functions_table.drop("Address"), on="Addr_int", how="left")
# Order by IAT slot address and drop the lookup columns.
old_iat = old_iat.sort("Address").drop("Destination").drop("Addr_int")

print(old_iat.shape)
old_iat.head()

In [None]:
# Save the resolved old-IAT table.
old_iat.write_csv(new_idt_path)

# Stop here and run patch_iat.ipynb

Alternatively, run patch_idt.ipynb if you are restoring the original IDT.

In [None]:
# Deliberate stop: the exception halts a "Run All" here so the external
# notebook can be run before the cells below.
raise RuntimeError("Pause here!")

In [None]:
# Intentionally empty cell.
pass

# Constructing patch

## Patching calls and jmps

Redirects every call and jmp so that it targets the matching thunk.

In [None]:
def hex_to_BE(num: int) -> bytearray:
    """Encode `num` as 4 big-endian bytes.

    The value is first wrapped to 32 bits, so negative numbers come out in
    two's-complement form.
    """
    return bytearray((num & 0xFFFFFFFF).to_bytes(4, "big"))


def hex_to_LE(num: int) -> bytearray:
    """Encode `num` as 4 little-endian bytes (the byte-reversed `hex_to_BE` output)."""
    return hex_to_BE(num)[::-1]

In [None]:
def rel_call(src: str, dst: str) -> bytearray:
    """Encode the 32-bit relative operand of a 5-byte call/jmp at `src` targeting `dst`.

    The displacement is measured from the end of the instruction (`src + 5`).
    Both addresses are hex strings; the "0x" prefix is optional (previously an
    un-prefixed `dst` silently lost its first two digits).

    Returns the 4 bytes produced by `hex_to_LE`.
    """
    next_instruction = int(src, base=16) + 5
    displacement = int(dst, base=16) - next_instruction
    return hex_to_LE(displacement)


def create_jumpcall(src: str, dst: str) -> bytearray:
    """Encode the relative operand that redirects the call/jmp at `src` to the thunk for `dst`.

    The first row of `thunks` whose "Destination" equals `dst` is used.

    * Existing thunks are recorded at the address of their jmp, which is
      preceded by a nop; the patched thunk starts on that nop, so the target is
      one byte before the recorded address.
    * Thunks flagged in the "new" column are written starting at their recorded
      address, so no adjustment is applied to them (subtracting one there would
      land one byte before the thunk).

    Raises RuntimeError when no thunk serves `dst`.
    """
    thunk_rows = thunks.filter(pl.col("Destination") == dst)
    if thunk_rows.height < 1:
        raise RuntimeError(f"Failed to find thunk for src:{src}, dst:{dst}")

    recorded_addr = int(thunk_rows["Call address"][0][2:], 16)
    # The "new" column only exists once new thunks have been placed.
    is_new = "new" in thunk_rows.columns and bool(thunk_rows["new"][0])
    entry_addr = recorded_addr if is_new else recorded_addr - 1
    return rel_call(src, hex(entry_addr))


def to_bin(bt: bytearray) -> str:
    """Render `bt` as an upper-case hex string.

    Inputs shorter than 4 bytes are left-padded with zero bytes; longer inputs
    are rendered unchanged.
    """
    padded = b"\x00" * (4 - len(bt)) + bt
    return padded.hex().upper()


def from_bin(binary: str, signed: bool = False) -> int:
    """Decode a little-endian hex string (as produced by `to_bin`) into an integer.

    The byte order is passed explicitly: calling `int.from_bytes` without it only
    works on Python 3.11+.

    Args:
        binary: hex digits, least significant byte first.
        signed: interpret the bytes as two's complement (useful for backward
            displacements). Defaults to False, the previous behaviour.
    """
    return int.from_bytes(bytes.fromhex(binary), "little", signed=signed)

In [None]:
# Worked example for one call site: look up its destination and the thunk serving it.
src = "0x14abeb6"
dest = result.filter(pl.col("Call address") == src)["Destination"][0]
thunk = thunks.filter(pl.col("Destination") == dest)["Call address"][0]
# Step back one byte from the thunk's recorded address (the nop in front of the jmp).
thunk = hex(int(thunk[2:], 16) - 1)
src, dest, thunk

In [None]:
# Round-trip check: encode the displacement, decode it again and rebuild the
# target address, which should equal `thunk`.
binary = to_bin(rel_call(src, thunk))
delta = from_bin(binary)
# `delta` is decoded as unsigned, so the sum is wrapped to 32 bits; otherwise a
# backward jump (thunk below src) would produce an address above 0xFFFFFFFF.
hex((int(src, 16) + delta + 5) & 0xFFFFFFFF)

In [None]:
# Ad-hoc check of create_jumpcall for one hard-coded call site/destination pair,
# shown next to a hand-computed address difference.
(
    to_bin(create_jumpcall("0x40fcb1", "0x10009940")),
    create_jumpcall("0x40fcb1", "0x10009940"),
    hex(0xF610CF - 0x40FCB1),
)

In [None]:
# Column layout shared by all patch tables: the address to patch, the old bytes
# at that address, and the bytes to write (all stored as strings).
schema = {
    "patch_addr": pl.String,
    "mem_old": pl.String,
    "patch": pl.String,
}


def optimize_patch_generation(result, thunks):
    """Build the patch table that redirects the call/jmp sites of `result` to their thunks.

    Every row of `result` is expected to unpack into six values, of which the
    call address, the instruction and the destination are used. Rows whose call
    address is itself a thunk are skipped.

    NOTE(review): `create_jumpcall` reads the module-level `thunks`, not this
    function's `thunks` argument — confirm both are always the same frame.

    Returns a DataFrame with the columns described by `schema`.
    """
    # Call addresses of all thunks, for constant-time membership tests.
    thunk_sites = set(thunks["Call address"])

    records = []
    for _func, src, inst, dest, _mod, _res_name in tqdm(result.rows()):
        # Thunks themselves are patched separately.
        if src in thunk_sites:
            continue

        # Anything that is not a "call" is encoded as a jmp.
        opcode = "E8" if inst == "call" else "E9"

        records.append(
            {
                "patch_addr": src,
                # Current encoding: relative operand to the original destination.
                "mem_old": opcode + to_bin(rel_call(src, dest)),
                # New encoding: relative operand to the thunk.
                "patch": opcode + to_bin(create_jumpcall(src, dest)),
            }
        )

    # Build the frame in a single step.
    return pl.DataFrame(records, schema=schema)


# Patch table for all call/jmp sites.
patch = optimize_patch_generation(result, thunks)

In [None]:
# Save the call/jmp patch table.
patch.write_csv(calls_patch_path)

## Patching thunks

Finds the IAT in the target executable and binds thunks to it

In [None]:
import lief

# Parse the executable at `out_path`; `pe` supplies the import entries and the image base used below.
pe = lief.PE.parse(out_path)

In [None]:
def get_addr(entry: lief.PE.ImportEntry, imports: pl.DataFrame) -> str:
    """Return the "Call address" of the first row of `imports` that matches `entry`.

    Ordinal imports are looked up as "Ordinal#<n>", named imports by their name.
    Indexing the empty match raises when no row matches (callers catch IndexError).
    """
    lookup_name = f"Ordinal#{entry.ordinal}" if entry.is_ordinal else entry.name
    matches = imports.filter(pl.col("Function") == lookup_name)
    return matches["Call address"][0]


def get_import(name: str, pe: lief.PE.Binary) -> lief.PE.ImportEntry | None:
    """Return the first import entry called `name` in any module of `pe`, or None."""
    candidates = (
        entry for mod in pe.imports for entry in mod.entries if entry.name == name
    )
    return next(candidates, None)


def get_import_by_ordinal(module: str, ordinal: int) -> lief.PE.ImportEntry | None:
    """Return the first entry with the given `ordinal` in the import module named `module`.

    Reads the module-level `pe`. Returns None when nothing matches.
    """
    candidates = (
        entry
        for mod in pe.imports
        if mod.name == module
        for entry in mod.entries
        if entry.ordinal == ordinal
    )
    return next(candidates, None)

In [None]:
# thunk_loc = '0x14ab2a4'
# dll = "msvcp90.dll"
# imports = thunks.filter(pl.col("Module") == dll)

# fun1 = thunks.filter(pl.col("Call address") == thunk_loc)['Function'][0]
# fun2 = thunks.filter(pl.col("Call address") == hex(int(thunk_loc, 16) + 6))['Function'][0]

# fun1 == fun2

In [None]:
"""
For each module in IDT, for each entry in that module
finds the referring thunk, and creates a patch for that thunk.
"""

# One patch record per import entry that has a thunk; the frame is built once at
# the end instead of being grown row by row.
patch_rows = []

for mod in pe.imports:
    dll = mod.name

    # Thunks that belong to this module.
    imports = thunks.filter(pl.col("Module") == dll)
    for ientry in mod.entries:
        try:
            addr = get_addr(ientry, imports)
        except IndexError:
            print(f"Not found thunk to {dll}!{ientry.name}")
            continue

        thentry = thunks.filter(pl.col("Call address") == addr)
        dest = thentry["Destination"][0]
        new = thentry["new"][0]
        # Address of the IAT slot with the image base added.
        iat_entry_rva = ientry.iat_address + pe.imagebase
        # FF 25 <addr32>: the bytes written over the thunk.
        patch_bytes = "FF25" + to_bin(hex_to_LE(iat_entry_rva))

        if new:
            # New thunk: written at its recorded address, over six CC bytes.
            patch_addr = hex(int(addr, base=16))
            mem_old = "CC" * 6
        else:
            # Existing thunk: starts on the nop before the jmp.
            patch_addr = hex(int(addr, base=16) - 1)  # nop before jmp
            mem_old = "90E9" + to_bin(rel_call(addr, dest))

        patch_rows.append(
            {
                "patch_addr": patch_addr,
                "mem_old": mem_old,
                "patch": patch_bytes,
            }
        )

thunks_patch = pl.DataFrame(patch_rows, schema=schema)
thunks_patch.shape

In [None]:
# Save the thunk patch table.
thunks_patch.write_csv(thunks_patch_path)

## Patching old IAT (better put it to use)

Skip this if you restored original IAT

In [None]:
# When True, the next cell generates no old-IAT patch entries.
skip: bool = True

In [None]:
# Start from an empty patch table; it stays empty when `skip` is set.
old_iat_patch = pl.DataFrame(schema=schema)

if not skip:
    # Each row is expected to be an (address, destination) pair.
    for addr, dest in iat_entries.rows():
        # Thunk destinations are "0x"-prefixed, so normalise before comparing.
        thunk = thunks.filter(pl.col("Destination") == hex(int(dest, 16)))
        if thunk.shape[0] == 0:
            print(f"Entry {dest} from old iat at {addr} is not found among thunks")
            continue

        # Find the matching import entry: by ordinal within the thunk's module,
        # or by name across all modules.
        funcname = thunk["Function"][0]
        if "Ordinal#" in funcname:
            entry = get_import_by_ordinal(
                thunk["Module"][0], int(funcname.removeprefix("Ordinal#"))
            )
        else:
            entry = get_import(funcname, pe)

        if entry is None:
            raise RuntimeError(
                f"ImportEntry {thunk['Function'][0]} from module {thunk['Module'][0]} is not found in PE IDT"
            )
        # Address of the new IAT slot with the image base added.
        iat_entry_rva = entry.iat_address + pe.imagebase

        # Old slot content (the destination) and its replacement, both little-endian.
        mem_old = to_bin(hex_to_LE(int(dest, 16)))
        mem_new = to_bin(hex_to_LE(iat_entry_rva))

        old_iat_patch = old_iat_patch.vstack(
            pl.DataFrame(
                {
                    "patch_addr": hex(int(addr, base=16)),
                    "mem_old": mem_old,
                    "patch": mem_new,
                }
            )
        )

# NOTE(review): this runs even when `skip` is True, in which case an empty
# patch table is written — confirm that is intended.
old_iat_patch.write_csv(old_iat_patch_path)

# Troubleshooting

In [None]:
# One row per import entry of the parsed executable; ordinal imports are listed
# by their ordinal number. The frame is built in one step instead of being
# extended row by row.
new_imports = pl.DataFrame(
    [
        {
            "Module": dll.name,
            "Function": str(impo.ordinal) if impo.is_ordinal else impo.name,
        }
        for dll in pe.imports
        for impo in dll.entries
    ],
    schema={"Module": pl.String, "Function": pl.String},
)
new_imports.shape, iat_entries.shape

In [None]:
# Rebuild the resolved old-IAT table for troubleshooting; unlike the earlier
# version it is not sorted and keeps the "Destination" column.
old_iat = iat_entries.with_columns(
    pl.col("Destination").str.to_integer(base=16).alias("Addr_int")
)
old_iat = old_iat.join(functions_table.drop("Address"), on="Addr_int", how="left").drop(
    "Addr_int"
)

old_iat.shape

In [None]:
# Limit table display to 10 rows.
pl.Config(tbl_rows=10)

In [None]:
# Flag the rows whose "Module" differs from the previous row's, i.e. the places
# where the table switches from one module to another.
old_iat = old_iat.with_columns(
    (pl.col("Module").shift(1) != pl.col("Module")).alias("skip")
)
# Number of module switches next to the number of distinct modules.
old_iat.filter(pl.col("skip")).shape, old_iat.select("Module").unique().shape

In [None]:
# Print the module at every switch. The loop variable is not called `skip`:
# that name is the boolean flag gating the old-IAT patch cell, and rebinding it
# to a module name here would silently change that cell when it is re-run.
for module_name in old_iat.filter(pl.col("skip")).select("Module").to_series():
    print(module_name)

In [None]:
# Print each distinct module of the old IAT once.
distinct_modules = old_iat["Module"].unique()
for mod in distinct_modules:
    print(mod)