In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas()

# Preprocessing

In [None]:
# Read the training annotations and preview the first rows.
df_train = pd.read_csv(
    "../input/coleridgeinitiative-show-us-the-data/train.csv"
)
print(df_train.shape)
df_train.head()

In [None]:
# Label encoding: one row per cleaned label, numbered by descending frequency
# (ties broken alphabetically on the stripped label).
df_label = df_train.value_counts('cleaned_label').reset_index()
df_label.columns = ['label', 'count']
df_label = (
    df_label
    # negated count so a single ascending sort puts the most frequent label first
    .assign(**{'-count': lambda frame: -frame['count']})
    # remove leading/trailing whitespace from every label
    .assign(label=lambda frame: frame['label'].map(str.strip))
    .sort_values(['-count', 'label'])
    .reset_index(drop=True)
)
# target id == rank in the sorted table
df_label['target'] = np.arange(len(df_label))
print(df_label.shape)
df_label.head()

In [None]:
# Map every row's cleaned label to its integer target with one vectorized lookup.
# The lookup table is built once instead of scanning df_label for each row.
# drop_duplicates keeps the first row per label, which is the row the previous
# `.values[0]` selection would have picked.
label_to_target = df_label.drop_duplicates('label').set_index('label')['target']
df_train['target'] = df_train['cleaned_label'].str.strip().map(label_to_target)
# Fail loudly if any label could not be resolved to a target.
assert df_train['target'].notna().all(), "some cleaned_label values have no target"
df_train.head()

In [None]:
# aggregation: build df_train_agg with one row per Id whose 'target' holds that Id's targets
# collect the targets of each Id into a numpy array
df_tmp = df_train.groupby('Id')['target'].agg(lambda x: np.array(x)).reset_index()
df_tmp.columns = ['Id', 'target']
# keep only the first row of every Id (rows whose Id was already seen are dropped)
df_train_agg = df_train[df_train['Id'].duplicated()==False].reset_index(drop=True)
# join the per-Id target arrays onto the de-duplicated Id column
df_train_agg = pd.merge(df_train_agg['Id'], df_tmp, on='Id')
# flatten every target entry to a 1-D array
# NOTE(review): presumably guards against non-1-D values coming out of the agg above — confirm
df_train_agg['target'] = df_train_agg['target'].apply(lambda x: x.reshape(-1))
print(df_train_agg.shape)
df_train_agg.head()

# Metric implementation

In [None]:
def calc_score(y_true, y_pred, beta=0.5):
    """Compute the micro-averaged F-beta score over all samples.

    True positives, false positives and false negatives are accumulated over
    every sample before the score is computed.

    Args:
        y_true: indexable sequence; item ``i`` holds the true labels of sample ``i``.
        y_pred: indexable sequence; item ``i`` holds the predicted labels of
            sample ``i``. Must provide an item for every index of ``y_true``.
        beta: weight of the F-beta score (default 0.5).

    Returns:
        ``(1+beta^2)*TP / ((1+beta^2)*TP + beta^2*FP + FN)`` as a float, or
        ``0.0`` when the denominator is zero (e.g. no samples, or no true and
        no predicted labels at all).
    """
    TP = 0
    FP = 0
    FN = 0
    for i in range(len(y_true)):
        true_i = y_true[i]
        pred_i = y_pred[i]
        # every true label found in the prediction is a hit, the rest are misses
        hits = sum(1 for label in true_i if label in pred_i)
        TP += hits
        FN += len(true_i) - hits
        # every predicted label absent from the truth is a false positive
        FP += sum(1 for label in pred_i if label not in true_i)
    weighted_tp = (1 + beta**2) * TP
    denominator = weighted_tp + beta**2 * FP + FN
    if denominator == 0:
        # nothing to score: avoid ZeroDivisionError
        return 0.0
    return weighted_tp / denominator

# Let's do math!

Let $T$ be the number of samples in the train data. Let $N$ be the number of positive labels in the train data. 
Then the average number of positive labels per sample of the train data can be written as $N/T$.


In [None]:
# T: number of samples (rows of the per-Id table); N: number of label rows in the train data.
T, N = len(df_train_agg), len(df_train)
N_per_T = N / T
print("N/T: {:.6f}".format(N_per_T))

The metric of this competition is $F_\beta$, 
$$ F_{\beta}(P) = \frac{(1+\beta^2)TP}{(1+\beta^2)TP + \beta^2FP + FN}, \tag{1}$$
where $P$ is the prediction and $TP$, $FP$ and $FN$ are the numbers of true positives, false positives and false negatives, respectively.  
Let $n_0$ be the number of occurrences of target 0 in the train data and let $P_0$ be the prediction that predicts 0 for every sample. Then, 

$$ F_{\beta}(P_0) = \frac{(1+\beta^2)n_0}{(1+\beta^2)n_0 + \beta^2(T-n_0) + (N-n_0)} = \frac{(1+\beta^2)n_0}{N+\beta^2T } \tag{2}$$

In the same way, let $n_1$ be the number of occurrences of target 1 in the train data and let $P_1$ be the prediction that predicts 1 for every sample. Then,

$$ F_{\beta}(P_1) = \frac{(1+\beta^2)n_1}{N+\beta^2T} \tag{3}$$

Moreover, let $P_{0,1}$ be the prediction that predicts both 0 and 1 for every sample (so $2T$ labels are predicted in total). Then,

$$ F_{\beta}(P_{0,1}) = \frac{(1+\beta^2)(n_0 + n_1)}{(1+\beta^2)(n_0+n_1) + \beta^2(2T-n_0-n_1) + (N-n_0-n_1)} = \frac{(1+\beta^2)(n_0+n_1)}{N+2\beta^2T} \tag{4}$$

$F_{\beta}(P_0)$, $F_{\beta}(P_1)$ and $F_{\beta}(P_{0,1})$ are calculated in the next cell.

In [None]:
# P_0: every sample gets the single prediction 0
pred0 = np.zeros([len(df_train_agg), 1])
F_beta0 = calc_score(df_train_agg['target'].values, pred0)
print("F_beta0 : {:.6f}".format(F_beta0))

# P_1: every sample gets the single prediction 1
pred1 = np.ones([len(df_train_agg), 1])
F_beta1 = calc_score(df_train_agg['target'].values, pred1)
print("F_beta1 : {:.6f}".format(F_beta1))


# P_{0,1}: every sample gets both predictions, column 0 -> 0 and column 1 -> 1
pred01 = np.ones([len(df_train_agg), 2])
pred01[:, 0] = 0
F_beta01 = calc_score(df_train_agg['target'].values, pred01)
print("F_beta01: {:.6f}".format(F_beta01))

From the equation (2), (3) and (4) we have, 


$$ F_{\beta}(P_0)(N+\beta^2T) = (1+\beta^2)n_0 \tag{5}$$

$$ F_{\beta}(P_1)(N+\beta^2T) = (1+\beta^2)n_1 \tag{6}$$

$$ F_{\beta}(P_{0,1})(N+2\beta^2T) = (1+\beta^2)(n_0+n_1). \tag{7}$$

By equation (5) + (6), we have, 


$$ (F_{\beta}(P_0)+F_{\beta}(P_1))(N+\beta^2T) = (1+\beta^2)(n_0+n_1) \tag{8}$$

From equation (7) and (8), we have,

$$ (F_{\beta}(P_0)+F_{\beta}(P_1))(N+\beta^2T) = F_{\beta}(P_{0,1})(N+2\beta^2T) \tag{9}$$

By transforming (9), we have,


$$ \frac{N}{T} = \beta^2\frac{2F_{\beta}(P_{0,1})-F_{\beta}(P_0)-F_{\beta}(P_1)}{F_{\beta}(P_0)+F_{\beta}(P_1)-F_{\beta}(P_{0,1})} \tag{10}$$

OK. Using equation (10), we can calculate $N/T$ from $F_{\beta}(P_0)$, $F_{\beta}(P_1)$ and $F_{\beta}(P_{0,1})$. Let's check it.


In [None]:
def calc_N_per_T(F_beta0, F_beta1, F_beta01, beta=0.5):
    """Estimate N/T from the F-beta scores of three constant predictions.

    Evaluates
    ``beta^2 * (2*F_beta01 - F_beta0 - F_beta1) / (F_beta0 + F_beta1 - F_beta01)``.

    Args:
        F_beta0: score obtained when every sample is predicted as 0.
        F_beta1: score obtained when every sample is predicted as 1.
        F_beta01: score obtained when every sample is predicted as both 0 and 1.
        beta: the beta used by the F-beta metric (default 0.5).

    Returns:
        The estimated ratio N/T.
    """
    numerator = 2*F_beta01 - F_beta0 - F_beta1
    denominator = F_beta0 + F_beta1 - F_beta01
    return (beta**2) * numerator / denominator

# Recover N/T from the three train scores; it should agree with the directly computed value.
tmp = calc_N_per_T(
    F_beta0=F_beta0,
    F_beta1=F_beta1,
    F_beta01=F_beta01,
)
print('estimated N/T: {:.6f}'.format(tmp))

Looks fine! We can do the same calculation on the test data by submitting $P_0$, $P_1$ and $P_{0,1}$. I submitted them and got the results below. 

In [None]:
# Scores obtained on the test data by submitting the three constant predictions.
# NOTE(review): the values carry only three decimals; since the N/T estimate divides by
# a small difference of these scores, rounding may shift it noticeably — confirm precision.
F_beta0_test = 0.022  # submission predicting 0 for every sample (P_0)
F_beta1_test = 0.014  # submission predicting 1 for every sample (P_1)
F_beta01_test = 0.026  # submission predicting both 0 and 1 for every sample (P_{0,1})

Let's calculate $N/T$ of the test data!

In [None]:
# Apply the same estimate to the scores obtained on the test data.
tmp = calc_N_per_T(
    F_beta0=F_beta0_test,
    F_beta1=F_beta1_test,
    F_beta01=F_beta01_test,
)
print('estimated N/T of the test data: {:.6f}'.format(tmp))

Now, we have the average number of positive labels for each sample of the test data. But the result is surprising. It's much less than that of the train data. Is it correct? Or is there something wrong with my equation or implementation? If you find any problems with my code, please let me know.

# make submission with prediction of all [0, 1]

In [None]:
# make submission with P_0,1: every row predicts the labels of target 0 and target 1
df_sub = pd.read_csv("../input/coleridgeinitiative-show-us-the-data/sample_submission.csv")
# df_label is sorted by target, so positions 0 and 1 are the labels of targets 0 and 1
df_sub['PredictionString'] = "{}|{}".format(df_label['label'].iloc[0], df_label['label'].iloc[1])
# index=False is the documented way to leave the row index out of the CSV
df_sub.to_csv("submission.csv", index=False)
df_sub.head()