In [1]:
import tqdm
import numpy as np
import pandas as pd

from sklearn.neighbors import NearestNeighbors

In [137]:
def precission(y_hat, y_true):
    tp = np.sum(y_true[y_hat == 1])
    tot_p = np.sum(y_hat)
    
    return tp / tot_p

def recall(y_hat, y_true):
    tp = np.sum(y_true[y_hat == 1])
    fn = np.sum(y_true[y_hat==0])
    
    return tp / (tp + fn)

In [3]:
ds = pd.read_csv('data/hw02/radar_points.txt')
ds.seq = ds.seq.astype(int)
ds.object_label = ds.object_label.astype(int)
ds.belongs_to_object = ds.belongs_to_object.astype(int)
ds.head()

Unnamed: 0,id,stamp,seq,x,y,z,probability,relative_radial_velocity,relative_lateral_velocity,cross_section,distance_rms,angle_rms,radial_velocity_rms,is_cylindrical,absolute_radial_velocity,belongs_to_object,object_label
0,0,1523894000.0,52,476.396881,559.627075,0.3,0.999,0.0,0.0,-9.0,0.0,0.0,0.0,0.0,0.0,0,0
1,1,1523894000.0,52,479.438354,566.93103,0.3,0.25,1.100242,0.0,-3.5,0.0,0.0,0.0,0.0,1.100242,0,0
2,2,1523894000.0,52,480.062286,566.430298,0.3,0.75,1.488896,0.0,-12.5,0.0,0.0,0.0,0.0,1.488896,0,0
3,3,1523894000.0,52,480.721039,568.209595,0.3,0.25,0.677458,0.0,-5.5,0.0,0.0,0.0,0.0,0.677458,0,0
4,4,1523894000.0,52,481.002228,568.240417,0.3,0.75,0.225063,0.0,-9.5,0.0,0.0,0.0,0.0,0.225063,0,0


In [4]:
ds.tail()

Unnamed: 0,id,stamp,seq,x,y,z,probability,relative_radial_velocity,relative_lateral_velocity,cross_section,distance_rms,angle_rms,radial_velocity_rms,is_cylindrical,absolute_radial_velocity,belongs_to_object,object_label
99112,126,1523894000.0,1032,678.717834,805.607727,0.3,0.999,-0.741797,0.0,-14.0,0.0,0.0,0.0,0.0,-1.836032,0,0
99113,127,1523894000.0,1032,677.080444,807.090637,0.3,0.75,-0.249768,0.0,-17.0,0.0,0.0,0.0,0.0,0.375898,0,0
99114,128,1523894000.0,1032,677.180908,806.095703,0.3,0.999,-0.499849,0.0,-23.5,0.0,0.0,0.0,0.0,-0.483745,0,0
99115,129,1523894000.0,1032,673.983521,807.501404,0.3,0.999,0.0,0.0,-8.5,0.0,0.0,0.0,0.0,2.112201,0,0
99116,130,1523894000.0,1032,674.780273,802.797729,0.3,0.25,0.249426,0.0,6.5,0.0,0.0,0.0,0.0,-0.123979,0,0


In [5]:
ds.shape

(99117, 17)

In [6]:
DROPS = [  # Single value columns.
    'z',
    'relative_lateral_velocity',
    'distance_rms',
    'angle_rms',
    'radial_velocity_rms',
    'is_cylindrical',
]

In [7]:
ds.drop(DROPS, axis=1, inplace=True)
ds.head()

Unnamed: 0,id,stamp,seq,x,y,probability,relative_radial_velocity,cross_section,absolute_radial_velocity,belongs_to_object,object_label
0,0,1523894000.0,52,476.396881,559.627075,0.999,0.0,-9.0,0.0,0,0
1,1,1523894000.0,52,479.438354,566.93103,0.25,1.100242,-3.5,1.100242,0,0
2,2,1523894000.0,52,480.062286,566.430298,0.75,1.488896,-12.5,1.488896,0,0
3,3,1523894000.0,52,480.721039,568.209595,0.25,0.677458,-5.5,0.677458,0,0
4,4,1523894000.0,52,481.002228,568.240417,0.75,0.225063,-9.5,0.225063,0,0


In [8]:
# label_counts = ds.object_label.value_counts()
# bad_labels = label_counts[label_counts < 10].index.tolist()
# for bad_label in bad_labels:
#     ds = ds[ds.object_label != bad_label]
# ds.object_label.value_counts()

## Что тут есть?

cross_section - можно думать, что это логарифм площади  
relative_lateral_velocity - скорость относительно радара, радиальная ее часть  
absolute_radial_velocity - скорость относительно мира

belongs_to_object - попала ли точка по нашему мненияю на машину

### Эвристика №1

![](./img/credo.jpg)

Машины большие и едут быстро

In [9]:
def big_cars_sol(ds):
    y_hat = np.zeros(len(ds.values))
    dummy_mask = ds.absolute_radial_velocity > 2
    y_hat[dummy_mask] = 1
    return y_hat.astype(int)

y_hat = big_cars_sol(ds)
print('precission', precission(y_hat, ds.belongs_to_object.values))
print('recall', recall(y_hat, ds.belongs_to_object.values))

precission 0.3082339135536867
recall 0.11514575874917805


### Эвристика №2

Машины отражают по несколько точек

In [10]:
def kkn2_sol(ds):
    dummy_mask = ds.absolute_radial_velocity > 2
    dummy_cars = ds[dummy_mask][['x', 'y']].values

    y_hat = np.zeros(len(ds.values))
    if dummy_cars.shape[0] >= 2:
        index = NearestNeighbors(metric='l2')
        index.fit(dummy_cars)
        d, _ = index.kneighbors(ds[['x', 'y']], n_neighbors=2)
        y_hat[(d < 2.0)[:, 0]] = 1

    return y_hat.astype(int)

y_hat = kkn2_sol(ds)
print('precission', precission(y_hat, ds.belongs_to_object.values))
print('recall', recall(y_hat, ds.belongs_to_object.values))

precission 0.1570003605335897
recall 0.9544823555198363


In [11]:
import matplotlib.pyplot as plt
from IPython import display
import time

In [12]:
# ax = plt.gca()
# for seq in sorted(ds.seq.unique()):
#     ax.clear()
#     scene = ds[ds.seq == seq]
#     plt.scatter(scene.x, scene.y, s=5, c=scene.relative_radial_velocity)
#     ax.set_xlim((450,490))
#     ax.set_ylim((555,580))
#     display.clear_output(wait=True)
#     display.display(plt.gcf())
#     time.sleep(0.01)

### Tree

In [223]:
stamp_order = ds.groupby('seq').apply(lambda x: x.stamp.max()).index.to_list()
assert stamp_order == sorted(stamp_order), 'Seqs ordered by number.'

In [224]:
def add_features(scene):
    index = NearestNeighbors(metric='l2', n_jobs=8)
    index.fit(scene[scene.absolute_radial_velocity > 2][['x', 'y']].values)
    scene = scene.copy()
    for radius in tqdm.tqdm((2, 5)):
        _, a = index.radius_neighbors(scene[['x', 'y']].values, radius=radius)
        a = np.array([len(e) for e in a])
        scene['rad_{}'.format(radius)] = a
    return scene

In [225]:
scenes = []
for seq in tqdm.tqdm(sorted(ds.seq.unique())):
    scene = ds[ds.seq == seq]
    scenes.append(scene)
n = len(scenes)
train = scenes[:int(0.8 * n)]
val = scenes[int(0.8 * n): int(0.85 * n)]
test = scenes[int(0.85 * n):]
assert len(train) + len(val) + len(test) == n
len(train), len(val), len(test)

100%|██████████| 981/981 [00:00<00:00, 1434.40it/s]


(784, 49, 148)

In [226]:
def to_ds(scenes):
    scene_list = []
    for scene in scenes:
        scene_list.append(scene.values)
    ds = np.concatenate(scene_list, axis=0)
    ds = pd.DataFrame(ds, columns=scenes[0].columns)
    ds = add_features(ds)
    ds.drop(['id', 'stamp', 'seq'], axis=1, inplace=True)
    return ds

In [227]:
train_ds = to_ds(train)
val_ds = to_ds(train + val).iloc[train_ds.shape[0] + 1:, :]
test_ds = to_ds(train + val + test).iloc[train_ds.shape[0] + val_ds.shape[0] + 1:, :]
train_ds.shape, val_ds.shape, test_ds.shape

100%|██████████| 2/2 [00:00<00:00,  3.44it/s]
100%|██████████| 2/2 [00:00<00:00,  3.81it/s]
100%|██████████| 2/2 [00:00<00:00,  3.00it/s]


((74496, 10), (4834, 10), (19786, 10))

In [236]:
LABEL = 'belongs_to_object'
LABEL_COLUMS = ['x', 'y', 'object_label', 'belongs_to_object']

In [252]:
import catboost

def learn(X_train, X_val, y_train, y_val):
    train_pool = catboost.Pool(X_train, y_train, cweight=[0.8, 0.2])
    clf = catboost.CatBoostClassifier(
        custom_loss=['AUC', 'Accuracy'],
        n_estimators=100,
        depth=8,
    )
    clf.fit(
        train_pool,
        early_stopping_rounds=10,
        use_best_model=True, 
        eval_set=(X_val.values, y_val.values),
        plot=True,
        verbose=False,
    )
    return clf

X_train = train_ds.drop(LABEL_COLUMS, axis=1)
y_train = train_ds[LABEL]


X_val = val_ds.drop(LABEL_COLUMS, axis=1)
y_val = val_ds[LABEL]

In [253]:
cls = learn(X_train, X_val, y_train, y_val)

CatBoostError: Length of weight=2 and length of data=74496 are different.

In [231]:
def cnt(y):
    return pd.Series(y).value_counts()

In [245]:
cnt(y_test)

0    16829
1     2957
dtype: int64

In [244]:
cnt(y_hat)

0    19481
1      305
dtype: int64

In [243]:
from sklearn.metrics import precision_recall_fscore_support
X_test = test_ds.drop(LABEL_COLUMS, axis=1)
y_test = test_ds[LABEL].values.astype(int)
y_hat = cls.predict(X_test).astype(int)
# y_hat = y_hat | kkn2_sol(test)
# y_hat = kkn2_sol(test)
# prec, recall, _, _ = precision_recall_fscore_support(y_test, y_hat, average='micro')
# prec, recall
print('precission', precission(y_hat, y_test))
print('recall', recall(y_hat, y_test))

precission 0.3344262295081967
recall 0.034494420020290836


In [235]:
for c, i in zip(X_test.columns, cls.feature_importances_):
    print(c, i)

x 14.6349302900808
y 22.96465100538107
probability 0.0
relative_radial_velocity 21.11788020214757
cross_section 1.6291053501005714
absolute_radial_velocity 5.232028743333452
rad_2 8.151956062831742
rad_5 26.269448346124804
