## Импорт библиотек

In [None]:
# Install the libomp-dev system package and the two extra Python packages
# imported below (faiss, adjdatatools).
# NOTE(review): no versions are pinned, so a fresh run may pull different
# releases than the ones these results were produced with — consider pinning.
!apt install libomp-dev
!pip install faiss-gpu
!pip install adjdatatools

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the read_csv cells in this notebook load from a local
# /Users/... path rather than from the mounted drive — confirm which
# environment the notebook is meant to run in.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd
import numpy as np
import faiss
from faiss import write_index, read_index
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from adjdatatools.preprocessing import AdjustedScaler
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import warnings
warnings.simplefilter("ignore")

## Знакомство с данными

### Вспомогательные функции

In [2]:
def graph(df, color):
    """Draw a 100-bin histogram for every column of ``df`` on a two-column grid.

    The number of grid rows is derived from the number of columns, so frames
    with more than 72 columns no longer overflow a fixed 36x2 grid (which made
    ``plt.subplot`` raise). For 72 columns the layout is the same as before:
    36 rows in a 10x200 inch figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted, one subplot per column.
    color : str
        Matplotlib colour used for the histogram bars.

    Returns
    -------
    None
    """
    features = list(df.columns)
    if not features:
        # Nothing to plot; avoid creating an empty zero-height figure.
        return

    n_cols = 2
    n_rows = (len(features) + n_cols - 1) // n_cols  # ceiling division
    # Keep the per-row height of the original 36-row, 200-inch-tall layout.
    row_height = 200 / 36
    plt.figure(figsize=(10, row_height * n_rows))

    for position, feature in enumerate(features, start=1):
        plt.subplot(n_rows, n_cols, position)
        df[feature].hist(bins=100, color=color)
        plt.title(feature)
        plt.xlabel("Значение признака")
        plt.ylabel("Количество")

    # Spacing is the same for every subplot, so set it once for the figure.
    plt.subplots_adjust(wspace=0.3, hspace=0.7)

In [3]:
def hello(df, color):
    """Print a quick overview of ``df`` and plot its per-column histograms.

    Shows, in order: the first five rows, the frame shape, the number of
    duplicated rows, the total number of missing values, the output of
    ``df.describe()``, and the histograms drawn by ``graph``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    color : str
        Colour forwarded to ``graph`` for the histogram bars.

    Returns
    -------
    None
    """
    print('Пример строк \n')
    display(df.head(5))
    print('Размер фрейма:', df.shape)
    print('Количество дубликатов:', df.duplicated().sum())
    print('Количество пропусков:', df.isnull().sum().sum())
    # describe() yields summary statistics, not correlations, so the heading
    # says "descriptive statistics" instead of the former "feature correlation".
    print('Описательная статистика \n')
    display(df.describe())
    print('Визуализация признаков \n')
    graph(df, color)

### Base dataset

In [72]:
# Load the base set; the first CSV column becomes the index.
# NOTE(review): absolute, machine-specific path — it will not resolve on
# another machine or in Colab.
df_base = pd.read_csv("/Users/sirena0789/Desktop/Матчинг/data/base.csv", index_col=0)

In [None]:
# Overview of the base set (sample rows, shape, duplicates, gaps, histograms).
hello(df_base, 'pink')

### Train dataset

In [73]:
# Load the train (query) set; the first CSV column becomes the index.
# NOTE(review): absolute, machine-specific path — it will not resolve on
# another machine or in Colab.
df_train = pd.read_csv("/Users/sirena0789/Desktop/Матчинг/data/train.csv", index_col=0)

In [None]:
# Overview of the train set. At this point df_train still contains the
# "Target" column, which is only split off further down in the notebook.
hello(df_train, 'lightgray')

### Validation dataset

In [109]:
# Load the validation set; the first CSV column becomes the index.
# NOTE(review): absolute, machine-specific path — it will not resolve on
# another machine or in Colab.
df_valid = pd.read_csv("/Users/sirena0789/Desktop/Матчинг/data/validation.csv", index_col=0)

In [None]:
# Overview of the validation set (sample rows, shape, duplicates, gaps, histograms).
hello(df_valid, 'lightblue')

### Вывод
Всем признакам необходима нормализация, а признакам под номерами 6, 21, 25, 33, 44, 59, 65, 70 еще нужна стандартизация.

## Работа с признаками

In [92]:
def st_scaler(df_base, df_train):
    """Standardise both frames with a StandardScaler fitted on ``df_base`` only.

    Parameters
    ----------
    df_base : pandas.DataFrame
        Frame the scaler is fitted on; also transformed.
    df_train : pandas.DataFrame
        Frame transformed with the statistics learned from ``df_base``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(base_scaled, train_scaled)``, each keeping the column names and
        index of its input frame.
    """
    fitted = StandardScaler().fit(df_base)

    def _rescale(frame):
        # transform() returns a bare array, so re-attach the labels.
        return pd.DataFrame(
            fitted.transform(frame),
            columns=frame.columns,
            index=frame.index,
        )

    return _rescale(df_base), _rescale(df_train)

In [66]:
def minmax_scaler(df_base, df_train):
    """Min-max scale both frames with a MinMaxScaler fitted on ``df_base`` only.

    The returned frames keep the index of their inputs, matching
    ``st_scaler``. Previously the index was replaced by a fresh RangeIndex,
    which lost the row labels needed for label-based joins.

    Parameters
    ----------
    df_base : pandas.DataFrame
        Frame the scaler is fitted on; also transformed.
    df_train : pandas.DataFrame
        Frame transformed with the ranges learned from ``df_base``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(base_scaled, train_scaled)`` with the original columns and index.
    """
    scaler = MinMaxScaler()
    scaler.fit(df_base)
    df_base_scaled = pd.DataFrame(
        scaler.transform(df_base), columns=df_base.columns, index=df_base.index
    )
    df_train_scaled = pd.DataFrame(
        scaler.transform(df_train), columns=df_train.columns, index=df_train.index
    )
    return df_base_scaled, df_train_scaled

In [20]:
def adj_scaler(df_base, df_train):
    """Scale both frames with an AdjustedScaler fitted on ``df_base`` only.

    Parameters
    ----------
    df_base : pandas.DataFrame
        Frame the scaler is fitted on; also transformed.
    df_train : pandas.DataFrame
        Frame transformed with the scaler fitted on ``df_base``.

    Returns
    -------
    tuple
        ``(base_scaled, train_scaled)`` exactly as returned by the scaler's
        ``transform``; nothing is re-wrapped here.
    """
    fitted = AdjustedScaler()
    fitted.fit(df_base)
    base_scaled = fitted.transform(df_base)
    train_scaled = fitted.transform(df_train)
    return base_scaled, train_scaled

In [75]:
# Split the target column off into its own Series; pop() removes it from
# df_train in place, leaving only the feature columns.
targets = df_train.pop("Target")

In [93]:
# Give each scaler variant its own pair of names. These are plain aliases of
# the same unscaled frames (no copies are made); each name is rebound to a
# scaled frame by the corresponding scaler cell below.
df_base_st = df_base_minmax = df_base_adj = df_base
df_train_st = df_train_minmax = df_train_adj = df_train

In [94]:
%%time

# Standardise base and train with a scaler fitted on the base set only.
df_base_st, df_train_st = st_scaler(df_base_st, df_train_st)

CPU times: user 1.49 s, sys: 2.74 s, total: 4.23 s
Wall time: 8.16 s


In [23]:
%%time

# Min-max scale base and train with a scaler fitted on the base set only.
df_base_minmax, df_train_minmax = minmax_scaler(df_base_minmax, df_train_minmax)

CPU times: user 746 ms, sys: 1.23 s, total: 1.98 s
Wall time: 2.46 s


In [24]:
%%time

# Scale base and train with an AdjustedScaler fitted on the base set only.
df_base_adj, df_train_adj = adj_scaler(df_base_adj, df_train_adj)

CPU times: user 1min 37s, sys: 1min, total: 2min 37s
Wall time: 2min 44s


In [27]:
# Map a row's position in df_base (0, 1, 2, ...) to its index label, so the
# positional ids returned by the FAISS search can be turned back into labels.
base_index = dict(enumerate(df_base.index.to_list()))

## Create index

In [79]:
%%time


dimensions = df_base_st.shape[1] # n-мерность векторов
idx_l2 = faiss.IndexFlatL2(dimensions)

# обучение делителя пространства на первых 50 тысячах объектов из df_base
idx_l2.train(np.ascontiguousarray(df_base_st.values).astype('float32'))
# деление всех векторов по правилам, сформированным на предыдущем шаге
idx_l2.add(np.ascontiguousarray(df_base_st.values).astype('float32'))


top_count = 10 # количество кандидатов на сопоставление
vectors_st, idx_st = idx_l2.search(np.ascontiguousarray(df_train_st.values).astype('float32'), top_count)

acc_st = 0
for target, candidates in zip(targets.values.tolist(), idx_st.tolist()):
    acc_st += int(target in [base_index[number] for number in candidates])

print('accuracy StandardScaler : {}%'.format(acc_st / len(idx_st) * 100))

accuracy StandardScaler : 69.789%
CPU times: user 46min 8s, sys: 2min 27s, total: 48min 36s
Wall time: 6min 24s


In [36]:
%%time


dimensions = df_base_adj.shape[1] # n-мерность векторов
n_cells = 10 # кол-во кластеров на которые делится всё векторное пространство
idx_l2 = faiss.IndexFlatL2(dimensions)
#idx_l2 = faiss.IndexIVFFlat(quantizer, dimensions, n_cells)

# обучение делителя пространства на первых 50 тысячах объектов из df_base
idx_l2.train(np.ascontiguousarray(df_base_adj.values).astype('float32'))
# деление всех векторов по правилам, сформированным на предыдущем шаге
idx_l2.add(np.ascontiguousarray(df_base_adj.values).astype('float32'))


top_count = 20 # количество кандидатов на сопоставление
vectors_adj, idx_adj = idx_l2.search(np.ascontiguousarray(df_train_adj.values).astype('float32'), top_count)

acc_adj = 0
for target, candidates in zip(targets.values.tolist(), idx_adj.tolist()):
    acc_adj += int(target in [base_index[number] for number in candidates])

print('accuracy adj: {}%'.format(acc_adj / len(idx_adj) * 100))

NameError: name 'acc_adj' is not defined

## Создание датафрейма для обучения

In [108]:
def one_df(j):
    """Build the candidate table for the query at position ``j`` of ``df_train_st``.

    Each row describes one candidate returned by the search for that query:
    the candidate's features from ``df_base_st``, the query's features from
    ``df_train_st`` (overlapping column names get pandas' default ``_x`` /
    ``_y`` suffixes), and a binary ``target`` that is 1 when the candidate is
    the query's true match.

    The query is identified by position throughout. The query label comes
    from ``df_train_st.index[j]`` and the target from ``targets.iloc[j]``,
    because ``idx_st[j]`` is the search result for the j-th row. Building the
    label as ``str(j) + '-query'`` only worked if labels happened to be
    numbered in row order; otherwise it joined another query's features or
    produced an empty frame.

    Reads the notebook-level names ``idx_st``, ``base_index``, ``df_base_st``,
    ``df_train_st`` and ``targets``.

    Parameters
    ----------
    j : int
        Row position of the query in ``df_train_st`` / ``idx_st``.

    Returns
    -------
    pandas.DataFrame
        Columns ``inx``, ``base_inx``, ``query_inx``, the suffixed feature
        columns, ``target_inx`` and ``target``.
    """
    final_df = pd.DataFrame({'inx': idx_st[j]})

    # Positional ids -> base index labels. An id with no entry in base_index
    # becomes NaN and is dropped by the inner merge below.
    final_df['base_inx'] = final_df['inx'].map(base_index)
    final_df['query_inx'] = df_train_st.index[j]

    final_df = final_df.merge(df_base_st, how='inner', left_on='base_inx', right_index=True)
    final_df = final_df.merge(df_train_st, how='inner', left_on='query_inx', right_index=True)

    final_df['target_inx'] = targets.iloc[j]
    final_df['target'] = (final_df['target_inx'] == final_df['base_inx']).astype('int')
    return final_df

In [120]:
def fin_df(n_queries=9999):
    """Stack the per-query candidate tables from ``one_df`` into one frame.

    Parameters
    ----------
    n_queries : int, default 9999
        Number of queries to process; ``one_df`` is called for positions
        ``0 .. n_queries - 1``. The default keeps the previously hard-coded
        value.

    Returns
    -------
    pandas.DataFrame
        All per-query tables concatenated with a fresh 0..N-1 index, or an
        empty frame when ``n_queries`` is 0.
    """
    frames = [one_df(j) for j in range(n_queries)]
    if not frames:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [None]:
# Assemble the per-query candidate tables into a single frame.
new_df = fin_df()

In [None]:
# Display the assembled frame.
new_df