## **Этот ноутбук** использует запросы, сгенерированные моделью Qwen, и добавляет в них опечатки. Так запросы будут ближе к реальным.

In [None]:
import pandas as pd

# Load the two parquet files that hold the generated queries.
df_1 = pd.read_parquet(path="/content/queries_and_desc (2).parquet")
df_2 = pd.read_parquet(path="/content/queries_and_desc.parquet")

In [None]:
# Stack both frames into one and renumber the rows from 0.
df = pd.concat(objs=[df_1, df_2], ignore_index=True)

In [None]:
# Drop rows whose query is an empty string (missing values are kept here).
# The explicit copy makes the result an independent frame, so the column
# assignments done in later cells do not write into a slice of the original
# frame and do not raise pandas' SettingWithCopyWarning.
df = df.loc[df["queries"] != ""].copy()

In [None]:
# Number of missing queries. Uses the vectorised Series reduction instead of
# feeding the Series element by element to the builtin sum().
int(df["queries"].isna().sum())

1226

In [None]:
# Keep only rows with a non-missing query, and only the first row for every
# distinct query text. This replaces the previous round-trip through a
# '<missed>' placeholder, which would also have removed a genuine query equal
# to the placeholder. The original row index is preserved. The final copy()
# makes df an independent frame so later column assignments do not raise
# SettingWithCopyWarning.
df = (
    df.dropna(subset=["queries"])
    .drop_duplicates(subset=["queries"], keep="first")
    .copy()
)

In [None]:
import re
import random
import numpy as np
import pandas as pd

# ---------------------------------------
# Settings
# ---------------------------------------

# Share of rows that receives each kind of noise (the shares add up to 1.0).
TYPO_DISTRIBUTION = dict(
    none=0.70,      # leave the text untouched
    neighbor=0.15,  # replace a letter with a keyboard neighbour
    double=0.05,    # duplicate a letter
    drop=0.05,      # delete a letter
    swap=0.03,      # transpose two adjacent letters
    extra=0.02,     # insert a spurious character
)

# Fixed seed for reproducible noise; change or remove it to get a different
# result on every run. Both generators are seeded because the typo functions
# use `random` while the type assignment uses `np.random`.
RANDOM_SEED = 1
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# ---------------------------------------
# Helpers
# ---------------------------------------

# Keyboard neighbours on a QWERTY layout, keyed by lowercase letter. Each
# value is a string of candidate replacement letters. The map is deliberately
# minimal and can be extended.
QWERTY_NEIGHBORS = dict(
    # top row
    q="w", w="qe", e="wr", r="et", t="ry",
    y="tu", u="yi", i="uo", o="ip", p="o",
    # home row
    a="qs", s="adw", d="sef", f="dgr", g="fht",
    h="gjy", j="huk", k="jl", l="k",
    # bottom row
    z="xs", x="zsdc", c="xdfv", v="cfgb",
    b="vhn", n="bjm", m="n",
)

# Matches a single Latin letter of either case.
ALPHA_RE = re.compile(r"[A-Za-z]")

def _pick_token_for_typo(text):
    """Выбираем слово для опечатки: латинское, длиной >= 3."""
    tokens = re.findall(r"\b[ A-Za-z'-]+\b", text)
    candidates = []
    idx = 0
    for t in tokens:
        start = text.find(t, idx)
        if start == -1:
            continue
        end = start + len(t)
        clean = re.sub(r"[^A-Za-z]", "", t)
        if len(clean) >= 3:
            candidates.append((start, end, t))
        idx = end
    if not candidates:
        return None
    return random.choice(candidates)

def _replace_char(s, pos, new_char):
    """Return ``s`` with the character at index ``pos`` replaced by ``new_char``."""
    before = s[:pos]
    after = s[pos + 1:]
    return "".join((before, new_char, after))

def _insert_char(s, pos, new_char):
    """Return ``s`` with ``new_char`` inserted before index ``pos``."""
    before = s[:pos]
    after = s[pos:]
    return "".join((before, new_char, after))

# ---------------------------------------
# Опечатки
# ---------------------------------------

def typo_neighbor(text):
    """Replace one letter of a random word with a keyboard-adjacent letter.

    The replacement keeps the case of the original letter. ``text`` is
    returned unchanged when no word can be picked or the picked token has no
    character listed in ``QWERTY_NEIGHBORS``.
    """
    pick = _pick_token_for_typo(text)
    if not pick:
        return text
    start, _end, token = pick
    # Choose directly among the replaceable positions. Probing random
    # positions a fixed number of times could give up even though a
    # replaceable letter exists.
    eligible = [i for i, ch in enumerate(token) if ch.lower() in QWERTY_NEIGHBORS]
    if not eligible:
        return text
    i = random.choice(eligible)
    original = token[i]
    repl = random.choice(QWERTY_NEIGHBORS[original.lower()])
    if original.isupper():
        repl = repl.upper()
    pos = start + i  # position of the chosen letter inside text
    return text[:pos] + repl + text[pos + 1:]

def typo_double(text):
    """Duplicate one letter of a randomly picked token.

    Returns ``text`` unchanged when no token can be picked or the token
    contains no Latin letter.
    """
    picked = _pick_token_for_typo(text)
    if not picked:
        return text
    begin, finish, word = picked
    # ALPHA_RE matches exactly one letter, so each match start is a letter index.
    letter_positions = [m.start() for m in ALPHA_RE.finditer(word)]
    if not letter_positions:
        return text
    pos = random.choice(letter_positions)
    doubled = word[:pos + 1] + word[pos] + word[pos + 1:]
    return "".join((text[:begin], doubled, text[finish:]))

def typo_drop(text):
    """Delete one letter from a randomly picked token.

    Returns ``text`` unchanged when no token can be picked or the token has
    fewer than two Latin letters.
    """
    picked = _pick_token_for_typo(text)
    if not picked:
        return text
    begin, finish, word = picked
    # ALPHA_RE matches exactly one letter, so each match start is a letter index.
    letter_positions = [m.start() for m in ALPHA_RE.finditer(word)]
    if len(letter_positions) < 2:
        return text
    pos = random.choice(letter_positions)
    shortened = word[:pos] + word[pos + 1:]
    return "".join((text[:begin], shortened, text[finish:]))

def typo_swap(text):
    """Transpose two adjacent letters inside a randomly picked token.

    Only pairs where both characters are Latin letters are eligible. Returns
    ``text`` unchanged when no token can be picked or no such pair exists.
    """
    picked = _pick_token_for_typo(text)
    if not picked:
        return text
    begin, finish, word = picked
    swappable = [
        left
        for left, (first, second) in enumerate(zip(word, word[1:]))
        if ALPHA_RE.match(first) and ALPHA_RE.match(second)
    ]
    if not swappable:
        return text
    left = random.choice(swappable)
    chars = list(word)
    chars[left], chars[left + 1] = chars[left + 1], chars[left]
    return "".join((text[:begin], "".join(chars), text[finish:]))

def typo_extra(text):
    """Insert one spurious character inside a randomly picked token.

    The inserted character is a keyboard neighbour of the preceding character
    when that character is a Latin letter listed in ``QWERTY_NEIGHBORS``.
    Otherwise it is a random lowercase Latin letter. It is upper-cased when
    the preceding character is upper case. Returns ``text`` unchanged when no
    token can be picked.
    """
    picked = _pick_token_for_typo(text)
    if not picked:
        return text
    begin, finish, word = picked
    # The insertion index starts at 1, so nothing is ever put before the
    # first character of the token.
    at = random.randrange(1, len(word))
    before = word[at - 1]
    neighbours = None
    if ALPHA_RE.match(before):
        neighbours = QWERTY_NEIGHBORS.get(before.lower())
    if neighbours is None:
        pool = "abcdefghijklmnopqrstuvwxyz"
    else:
        pool = neighbours
    extra = random.choice(pool)
    if before.isupper():
        extra = extra.upper()
    return "".join((text[:begin], word[:at], extra, word[at:], text[finish:]))

# Dispatch table: typo type label -> function that applies it to a string.
TYPO_FUNCS = dict(
    none=lambda s: s,  # identity: the text is left as is
    neighbor=typo_neighbor,
    double=typo_double,
    drop=typo_drop,
    swap=typo_swap,
    extra=typo_extra,
)

# ---------------------------------------
# Присвоение типа ошибки и инъекция
# ---------------------------------------

def assign_typo_types(df, distribution=TYPO_DISTRIBUTION, col_name="typo_type"):
    """Add a column with a randomly drawn typo type for every row.

    The keys of ``distribution`` are the labels and its values are the
    sampling weights, which are normalised so that they sum to 1. Labels are
    drawn with ``np.random.choice``, i.e. from NumPy's global random state.

    The column ``col_name`` is written into ``df`` itself and the same frame
    is returned.
    """
    labels = list(distribution)
    weights = np.array([distribution[label] for label in labels], dtype=float)
    weights = weights / weights.sum()
    drawn = np.random.choice(labels, size=len(df), p=weights)
    df[col_name] = drawn
    return df

def inject_typo_by_row(row, text_col="query", type_col="typo_type", out_col="query_noisy"):
    """Apply the typo named in ``row[type_col]`` to the text in ``row[text_col]``.

    Args:
        row: mapping-like row (e.g. a pandas Series) holding both columns.
        text_col: key of the source text.
        type_col: key of the typo type label, looked up in ``TYPO_FUNCS``.
        out_col: unused; kept so the signature matches ``apply_typos``.

    Returns:
        The noisy text. The value is returned unchanged when it is not a
        non-empty string (e.g. NaN/None for a missing query) or when the typo
        type is unknown.
    """
    text = row[text_col]
    ttype = row[type_col]
    # The typo functions run regexes over the text and would raise a
    # TypeError on a missing value, so pass such values through untouched.
    if not isinstance(text, str) or not text:
        return text
    func = TYPO_FUNCS.get(ttype, lambda s: s)
    return func(text)

def apply_typos(df, text_col="query", type_col="typo_type", out_col="query_noisy"):
    """Fill ``df[out_col]`` with the noisy version of ``df[text_col]``.

    Every row is passed to ``inject_typo_by_row``, which applies the typo
    named in ``type_col``. The column is written into ``df`` itself and the
    same frame is returned.
    """
    def _noisy_text(row):
        return inject_typo_by_row(row, text_col, type_col, out_col)

    df[out_col] = df.apply(_noisy_text, axis=1)
    return df

# ---------------------------------------
# Пример использования
# ---------------------------------------

# The source df must contain the column with the generated query/phrase
# ("query" is the default name; this notebook passes "queries" explicitly).
# 1) Assign a typo type to every row using the configured shares.
df = assign_typo_types(df, distribution=TYPO_DISTRIBUTION, col_name="typo_type")

# 2) Apply the typos and write the result to "query_noisy"
#    (apply_typos is called in a later cell of this notebook).

# After both steps df has:
# - query        (the original text)
# - typo_type    (which kind of noise was assigned)
# - query_noisy  (the text with or without a typo)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_name] = np.random.choice(types, size=len(df), p=probs)


In [None]:
# Preview the frame, which now carries the assigned typo_type column.
df

Unnamed: 0,desc,prompt,queries,typo_type
0,"genre: war, not_specified, not_specified\nsubg...",You will receive a compact movie description b...,"Tense, suspenseful post-apocalyptic war film a...",none
1,"genre: horror, not_specified, not_specified\ns...",You will receive a compact movie description b...,"dark, tense, suspenseful present-day horror re...",neighbor
2,"genre: musical, romance, not_specified\nsubgen...",You will receive a compact movie description b...,"A present-day romantic, uplifting tale of love...",none
3,"genre: crime, mystery, not_specified\nsubgenre...",You will receive a compact movie description b...,A desert tale of love and justice battling aga...,none
4,"genre: family, drama, not_specified\nsubgenre:...",You will receive a compact movie description b...,Melancholic yet hopeful suburban rescue tale s...,none
...,...,...,...,...
72427,"genre: horror, sci_fi, not_specified\nsubgenre...",You will receive a compact movie description b...,"Present-day tense, suspenseful horror where he...",none
72428,"genre: romance, not_specified, not_specified\n...",You will receive a compact movie description b...,bittersweet tragic romance involving a parent-...,none
72429,"genre: action, thriller, not_specified\nsubgen...",You will receive a compact movie description b...,"A gritty, tense present-day revenge storyline ...",none
72430,"genre: thriller, crime, romance\nsubgenre: not...",You will receive a compact movie description b...,"Present-day tense, suspenseful thriller where ...",double


In [None]:
# Apply the assigned typo to every query and store the result in a new column.
df = apply_typos(
    df,
    text_col="queries",
    type_col="typo_type",
    out_col="queries_with_mistackes",
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[out_col] = df.apply(lambda r: inject_typo_by_row(r, text_col, type_col, out_col), axis=1)


In [None]:
# One example row per typo type, restricted to the last three columns.
show_ = (
    df.groupby("typo_type")
    .head(1)
    .iloc[:, -3:]
)
show_

Unnamed: 0,queries,typo_type,queries_with_mistackes
0,"Tense, suspenseful post-apocalyptic war film a...",none,"Tense, suspenseful post-apocalyptic war film a..."
1,"dark, tense, suspenseful present-day horror re...",neighbor,"eark, tense, suspenseful present-day horror re..."
13,"Neo-noir crime thriller set in the city, explo...",double,"Neo-noir crime thriller set in the cityy, expl..."
21,Outsider battles supernatural madness in a ten...,swap,Outsidre battles supernatural madness in a ten...
40,Present-day thriller about betrayal and family...,extra,Present-day thriller about bhetrayal and famil...
46,bittersweet satirical romance with betrayal an...,drop,bittersweet satrical romance with betrayal and...


In [None]:
# Persist the frame, including the noisy queries, to a parquet file.
df.to_parquet(path="queries_desc_q_with_mistackes.parquet")

# Отлично! Используя трансформеры и ловкость рук, мы создали 70к естественных запросов, которые очень близки к реальным как по стилю, так и по опечаткам. Они могут быть пригодны для анализа или обучения retrieval-моделей. Суммарно на генерацию 70к запросов ушло 7 часов на 2 бесплатных GPU с Kaggle.