In [15]:
import pandas as pd
from gensim import utils
from gensim.models import FastText
from razdel import tokenize
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm
from feature_extractor import FeatureExtractor
import math
import matplotlib.pyplot as plt

In [16]:
ext = FeatureExtractor(
    town_index_path="/work/hack/additional_data/town_20230808.csv",
    district_index_path="/work/hack/additional_data/district_20230808.csv",
    street_abbv_index_path="/work/hack/additional_data/geonimtype_20230808.csv",
    town_abbv_index_path="/work/hack/additional_data/subrf_20230808.csv"
)

In [17]:
data = pd.read_csv("/work/hack/datasets/dataset_1.csv")

for i in range(1, 5):
    new_data = pd.read_csv(f"/work/hack/datasets/dataset_{i}.csv")
    data = pd.concat([data, new_data])

In [18]:
building = pd.read_csv("/work/hack/additional_data/building_20230808.csv")
valset = pd.read_csv("/work/hack/test_example.csv", sep=";")

  building = pd.read_csv("/work/hack/additional_data/building_20230808.csv")


In [19]:
ext = FeatureExtractor(
    town_index_path="/work/hack/additional_data/town_20230808.csv",
    district_index_path="/work/hack/additional_data/district_20230808.csv",
    street_abbv_index_path="/work/hack/additional_data/geonimtype_20230808.csv",
    town_abbv_index_path="/work/hack/additional_data/subrf_20230808.csv"
)

In [20]:
building = building[building["is_actual"] == True]

In [21]:
trainset = building["short_address"].tolist()
trainset = trainset + building["full_address"].tolist()
trainset = trainset + data[data["address"].notna()]["address"].tolist()
trainset = trainset + data[data["target_address"].notna()]["target_address"].tolist()

In [22]:
trainset = [ext.clear_text(i) for i in trainset]

In [23]:
# class MyIter:
#     def __iter__(self):
#         for index, row in building.iterrows():
#             tokens = list(tokenize(ext.resolve_abbv(row["full_address"])))
#             yield [_.text for _ in tokens]

class MyIter:
    def __iter__(self):
        for row in trainset:
            tokens = list(tokenize(row))
            yield [_.text for _ in tokens]      

model = FastText(vector_size=256, window=7, min_count=5, workers=30, sg=1, negative=25, seed=42)
model.build_vocab(corpus_iterable=MyIter())
total_examples = model.corpus_count
model.train(corpus_iterable=MyIter(), total_examples=total_examples, epochs=20)

(27792145, 57928220)

In [24]:
vector_index = []

for index, row in tqdm(building.iterrows()):
    tokens = list(tokenize(ext.clear_text(row['full_address'])))
    tokens = [_.text for _ in tokens]
    predict = np.array([model.wv[token] for token in tokens])
    predict = np.mean(predict, axis=0)
    predict = predict / np.linalg.norm(predict)
    vector_index.append((row['id'], predict))

133462it [00:30, 4320.42it/s]


In [25]:
def get_id(text, target_id):
    index = [i[0] for i in vector_index].index(target_id)
    vector = model.wv[text] 
    tokens = list(tokenize(text))
    tokens = [_.text for _ in tokens]
    vector = np.array([model.wv[token] for token in tokens])
    vector = np.mean(vector, axis=0)
    vector = vector / np.linalg.norm(vector).reshape(1, -1) 
    max_sym = -1
    best_id = 0
    step = 10000

    all_syms = np.ones((1, 1))
    
    for ind in range(0, len(vector_index), step):
        vectors = np.array([i[1] for i in vector_index[ind: ind + step]]).T
        syms = vector @ vectors
        all_syms = np.hstack([all_syms, syms])
        best_local_ind = np.argmax(syms)
        sym = syms[0][best_local_ind]
        if sym > max_sym:
            max_sym = sym
            best_id = vector_index[ind + best_local_ind][0]
    all_syms = all_syms[:, 1:]
    target_sym = all_syms[0][index]
    top_k = np.sum(np.where(all_syms > target_sym, True, False))
    
    return best_id, max_sym, top_k

In [26]:
cnt = 0
ind = 0
top_k_s = []
bads = []
for index, row in tqdm(valset.iterrows()):
    ind += 1
    if math.isnan(row['target_building_id']):
        continue
    best_id, max_sym, top_k = get_id(ext.clear_text(row['address']), row['target_building_id'])
    top_k_s.append(top_k)
    if top_k > 300:
        bads.append((best_id, max_sym, row['target_building_id'], index, top_k))
    if best_id == row['target_building_id']:
        cnt += 1
    # if ind == 100:
    #     break
print(cnt / len(valset), sum(top_k_s) / len(top_k_s))

0it [00:00, ?it/s]

328it [00:42,  7.72it/s]

0.6615853658536586 1038.219512195122





In [27]:
def get_top_n(text, n):
    vector = model.wv[text] 
    tokens = list(tokenize(text))
    tokens = [_.text for _ in tokens]
    vector = np.array([model.wv[token] for token in tokens])
    vector = np.mean(vector, axis=0)
    vector = vector / np.linalg.norm(vector).reshape(1, -1) 
    max_sym = -1
    best_id = 0
    step = 10000

    all_syms = np.ones((1, 1))
    
    for ind in range(0, len(vector_index), step):
        vectors = np.array([i[1] for i in vector_index[ind: ind + step]]).T
        syms = vector @ vectors
        all_syms = np.hstack([all_syms, syms])
        best_local_ind = np.argmax(syms)
        sym = syms[0][best_local_ind]
        if sym > max_sym:
            max_sym = sym
            best_id = vector_index[ind + best_local_ind][0]
    all_syms = all_syms[:, 1:]
    top_k = np.argpartition(all_syms[0], -n)[-n:].tolist()
    top_k = [(vector_index[i][0], all_syms[0][i]) for i in top_k]
    
    return best_id, max_sym, top_k

In [28]:
cnt = 0
ind = 0
top_k_s = []
bads = []
for index, row in tqdm(valset.iterrows()):
    ind += 1
    if math.isnan(row['target_building_id']):
        continue
    best_id, max_sym, top_k = get_top_n(ext.clear_text(row['address']), 300)
    top_k_inds = [i[0] for i in top_k]
    map_local_to_global = {i: top_k_inds[i] for i in range(len(top_k_inds))}
    map_global_to_local = {top_k_inds[i]: i for i in range(len(top_k_inds))}
    data_top_k_inds = building[building['id'].isin(top_k_inds)]
    features = ext.get_features(row['address'])
    for feature in features:
        if features[feature] is not None and \
        feature not in ["src_text", "preproc_text", "street", "structure", "country"]:
            data_top_k_inds = data_top_k_inds[(data_top_k_inds[feature] == features[feature]) | (data_top_k_inds[feature].isna())]
    if len(data_top_k_inds) != 0:
        data_top_k_inds['sim'] = data_top_k_inds['id'].apply(lambda x: top_k[map_global_to_local[x]][1])
        best_id = data_top_k_inds['id'].iloc[data_top_k_inds['sim'].argmax()]
    if best_id == row['target_building_id']:
        cnt += 1
print(cnt / len(valset))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_top_k_inds['sim'] = data_top_k_inds['id'].apply(lambda x: top_k[map_global_to_local[x]][1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_top_k_inds['sim'] = data_top_k_inds['id'].apply(lambda x: top_k[map_global_to_local[x]][1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_top_k_

0.6951219512195121



