In [77]:
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

from IPython.display import display, Image
from pigeon import annotate

from collections import defaultdict
from pathlib import Path
from typing import List
from matplotlib import pyplot as plt

In [97]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [98]:
ROOT: Path = Path("../")
CSV_FILE: Path = ROOT / "results" / "people_data_manual_labels_sports.csv"
FEATURES: Path = ROOT / "data" / "features" / "visual_genome" / "images" / "clip" / "VG_100K"
SEED: int = 892892

In [99]:
df: pd.DataFrame = pd.read_csv(CSV_FILE)
df.set_index("image_id", inplace=True)
df.head()

Unnamed: 0_level_0,is_corrupted,contains_person,used_people_detector,is_sport_manual_label
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2320642,0,1,1,0
2356706,0,1,0,0
2383721,0,1,0,1
2342419,0,1,0,0
2338896,0,1,1,0


In [100]:
df_unlabelled: pd.DataFrame = pd.read_csv(
    ROOT / "data" / "visual_genome" / "processed" / "people_data.csv"
)
df_unlabelled.set_index("image_id", inplace=True)
df_unlabelled = df_unlabelled[df_unlabelled.contains_person == 1]
df_unlabelled.drop(df.index, inplace=True)

df_unlabelled.head()

Unnamed: 0_level_0,is_corrupted,contains_person,used_people_detector,is_sport_hf_vqa,is_sport_hf_vqa_conf,is_sport_blip,is_work_blip,is_work_hf_vqa,is_work_hf_vqa_conf
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,1,0,0,0.997345,no,no,0,0.99165
2,0,1,0,0,0.988688,no,no,0,0.905619
3,0,1,0,0,0.984434,no,yes,1,0.995035
7,0,1,0,0,0.987285,no,yes,1,0.990308
8,0,1,0,0,0.973007,no,yes,1,0.999335


In [103]:
train, test, y_train, y_test = train_test_split(df, df.is_sport_manual_label, test_size=0.5, random_state=SEED, stratify=df.is_sport_manual_label)

In [104]:
y_test.value_counts()

is_sport_manual_label
0    344
1    156
Name: count, dtype: int64

In [105]:
def load_space(df: pd.DataFrame) -> np.array:
    embeddings: List[np.array] = []
    for i in df.index:
        emb_path: Path = FEATURES/ f"{i}.npy"
        emb: np.array = np.load(emb_path)
        embeddings.append(emb)
    return np.stack(embeddings)

In [106]:
train_space: np.array = load_space(train)
test_space: np.array = load_space(test)
unlabelled_space: np.array = load_space(df_unlabelled)
train_space.shape, test_space.shape, unlabelled_space.shape

((500, 512), (500, 512), (57391, 512))

In [107]:
# 1. train model
model = LogisticRegression(random_state=SEED)
model.fit(train_space, y_train)
y_pred = model.predict(test_space)

In [108]:
y_test.values

array([1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,

In [109]:
def perf_measure(y_actual, y_hat):
    TP = 0
    FP = 0
    TN = 0
    FN = 0

    for i in range(len(y_hat)): 
        if (y_actual[i]==1) and (y_hat[i]==1):
           TP += 1
        if (y_hat[i]==1) and (y_actual[i]!=y_hat[i]):
           FP += 1
        if (y_actual[i]==0) and (y_hat[i]==0):
           TN += 1
        if (y_hat[i]==0) and (y_actual[i]!=y_hat[i]):
           FN += 1

    return(TP, FP, TN, FN)

TP, FP, TN, FN = perf_measure(y_test.values, y_pred)

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)
# Specificity or true negative rate
TNR = TN/(TN+FP) 
# Precision or positive predictive value
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)
# Fall out or false positive rate
FPR = FP/(FP+TN)
# False negative rate
FNR = FN/(TP+FN)
# False discovery rate
FDR = FP/(TP+FP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)

print(f"Accuracy: {ACC}, FPR: {FPR}")
### IF ACCURACY OR FPR IS NOT SATISFACTORY, GO TO STEP 3 ###

Accuracy: 0.966, FPR: 0.020348837209302327


In [96]:
TNR + FPR

1.0

In [None]:
# 3. Here goes the active learning code

In [None]:
# 4. labelling


### GO BACK TO STEP 1 ###

In [110]:
((model.predict(unlabelled_space) == 1).sum() + (y_train == 1).sum() + (y_test == 1).sum()) / (len(unlabelled_space) + len(y_train) + len(y_test))

0.3160932335462657

In [72]:
(y_train == 1).sum() + (y_test == 1).sum() / 1000

37.036

In [73]:
cc = (model.predict(unlabelled_space) == 1).sum() / len(unlabelled_space)

In [75]:
(cc - FPR) / (TPR - FPR)

0.03364961727771227