In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas()

# The description below is completely wrong. Please wait for revision.

# Preprocessing

In [None]:
# Load train data: read the competition's train.csv into df_train,
# then show its shape and first rows.
train_csv_path = "../input/coleridgeinitiative-show-us-the-data/train.csv"
df_train = pd.read_csv(train_csv_path)
print(df_train.shape)
df_train.head()

In [None]:
# Label encoding: build df_label with one row per distinct cleaned_label and an
# integer `target` id (0 = most frequent label; ties broken alphabetically).
#
# Whitespace is stripped *before* counting so that labels differing only by
# leading/trailing spaces collapse into a single row with a single count;
# stripping after counting could leave duplicate `label` rows and target ids
# that are never assigned.
stripped_labels = df_train['cleaned_label'].str.strip()
df_label = (
    stripped_labels.value_counts()
    .rename_axis('label')
    .reset_index(name='count')
)
# Helper column so that a single ascending sort orders by descending count,
# then by label.
df_label['-count'] = -df_label['count']
df_label = df_label.sort_values(['-count', 'label']).reset_index(drop=True)
df_label['target'] = np.arange(len(df_label))
print(df_label.shape)
df_label.head()

In [None]:
# Attach the integer target id of each row's (stripped) cleaned_label to df_train.
# One vectorized lookup is used instead of scanning df_label once per row.
# drop_duplicates keeps the first row per label, i.e. the first match in df_label.
label_to_target = df_label.drop_duplicates('label').set_index('label')['target']
df_train['target'] = df_train['cleaned_label'].str.strip().map(label_to_target)
# Fail loudly if any label could not be encoded (it would otherwise become NaN).
assert df_train['target'].notna().all(), "every cleaned_label must be present in df_label"
df_train.head()

In [None]:
# Aggregation: one row per Id, with `target` holding a 1-D numpy array of all
# target ids recorded for that Id.
#
# `agg(list)` is used because an aggregation function that returns an ndarray
# is not treated as a reduction by pandas and can raise
# "Must produce aggregated value". sort=False keeps the Ids in order of first
# appearance in df_train.
df_train_agg = (
    df_train.groupby('Id', sort=False)['target']
    .agg(list)
    .reset_index()
)
df_train_agg['target'] = df_train_agg['target'].map(np.asarray)
print(df_train_agg.shape)
df_train_agg.head()

# Metric implementation

In [None]:
def calc_score(y_true, y_pred, beta=0.5):
    """Compute a micro-averaged F-beta score over per-sample label collections.

    Counts are pooled over all samples before the score is formed:
    a true label found in the sample's prediction is a true positive (TP),
    a true label missing from it is a false negative (FN), and every
    predicted entry not accounted for by a true positive is a false
    positive (FP).

    Parameters
    ----------
    y_true : sequence of sequences
        ``y_true[i]`` holds the true labels of sample ``i``.
    y_pred : sequence of sequences
        ``y_pred[i]`` holds the predicted labels of sample ``i``. It is
        indexed by the positions of ``y_true``, so it must be at least as long.
    beta : float, default 0.5
        Weight of recall relative to precision.

    Returns
    -------
    float
        ``(1+beta^2)*TP / ((1+beta^2)*TP + beta^2*FP + FN)``, or ``0.0`` when
        there is nothing to score (TP, FP and FN are all zero), instead of
        raising ``ZeroDivisionError``.
    """
    TP = 0
    FP = 0
    FN = 0
    for i, true_labels in enumerate(y_true):
        pred_labels = y_pred[i]
        # Membership is tested with `in`, so int labels match float predictions
        # of the same value (e.g. 0 and 0.0).
        hits = sum(1 for label in true_labels if label in pred_labels)
        TP += hits
        FN += len(true_labels) - hits
        FP += len(pred_labels) - hits
    denominator = (1+beta**2)*TP + beta**2*FP + FN
    if denominator == 0:
        # No true labels and no predictions at all: define the score as 0.
        return 0.0
    return (1+beta**2)*TP/denominator

# Let's do math!

Let $T$ be the number of samples in the train data, and let $N$ be the total number of positive labels in the train data. 
Then the average number of positive labels per sample of the train data can be written as $N/T$.


In [None]:
# T: number of rows of df_train_agg (one per Id).
# N: number of rows of df_train (one per recorded label).
T = df_train_agg.shape[0]
N = df_train.shape[0]
N_per_T = N / T
print("N/T: {:.6f}".format(N_per_T))

The metric of this competition is $F_\beta$, 
$$ F_{\beta}(P) = \frac{(1+\beta^2)TP}{(1+\beta^2)TP + \beta^2FP + FN}, \tag{1}$$
where $P$ is the prediction and $TP$, $FP$ and $FN$ are the numbers of true positives, false positives and false negatives, respectively.  
Let $n_0$ be the number of occurrences of target 0 in the train data and let $P_0$ be the prediction that predicts label 0 for every sample. Then $TP = n_0$, $FP = T - n_0$ and $FN = N - n_0$, so

$$ F_{\beta}(P_0) = \frac{(1+\beta^2)n_0}{(1+\beta^2)n_0 + \beta^2(T-n_0) + (N-n_0)} = \frac{(1+\beta^2)n_0}{N+\beta^2T } \tag{2}$$

Similarly, let $n_1$ be the number of occurrences of target 1 in the train data and let $P_1$ be the prediction that predicts label 1 for every sample. Then,

$$ F_{\beta}(P_1) = \frac{(1+\beta^2)n_1}{N+\beta^2T} \tag{3}$$

Moreover, let $P_{0,1}$ be the prediction that predicts all samples as 0 and 1. Then,

$$ F_{\beta}(P_{0,1}) = \frac{(1+\beta^2)(n_0 + n_1)}{(1+\beta^2)(n_0+n_1) + \beta^2(2T-n_0-n_1) + (N-n_0-n_1)} = \frac{(1+\beta^2)(n_0+n_1)}{N+2\beta^2T} \tag{4}$$

$F_{\beta}(P_0)$, $F_{\beta}(P_1)$ and $F_{\beta}(P_{0,1})$ are calculated in the next cell.

In [None]:
# Score three constant predictions against the aggregated train targets.
y_true_train = df_train_agg['target'].values
n_samples = len(df_train_agg)

# predict all data as 0
pred0 = np.zeros([n_samples, 1])
F_beta0 = calc_score(y_true_train, pred0)
print("F_beta0 : {:.6f}".format(F_beta0))

# predict all data as 1
pred1 = np.ones([n_samples, 1])
F_beta1 = calc_score(y_true_train, pred1)
print("F_beta1 : {:.6f}".format(F_beta1))

# predict all data as [0,1]
pred01 = np.tile([0.0, 1.0], (n_samples, 1))
F_beta01 = calc_score(y_true_train, pred01)
print("F_beta01: {:.6f}".format(F_beta01))

From the equation (2), (3) and (4) we have, 


$$ F_{\beta}(P_0)(N+\beta^2T) = (1+\beta^2)n_0 \tag{5}$$

$$ F_{\beta}(P_1)(N+\beta^2T) = (1+\beta^2)n_1 \tag{6}$$

$$ F_{\beta}(P_{0,1})(N+2\beta^2T) = (1+\beta^2)(n_0+n_1). \tag{7}$$

By equation (5) + (6), we have, 


$$ (F_{\beta}(P_0)+F_{\beta}(P_1))(N+\beta^2T) = (1+\beta^2)(n_0+n_1) \tag{8}$$

From equation (7) and (8), we have,

$$ (F_{\beta}(P_0)+F_{\beta}(P_1))(N+\beta^2T) = F_{\beta}(P_{0,1})(N+2\beta^2T) \tag{9}$$

By transforming (9), we have,


$$ \frac{N}{T} = \beta^2\frac{2F_{\beta}(P_{0,1})-F_{\beta}(P_0)-F_{\beta}(P_1)}{F_{\beta}(P_0)+F_{\beta}(P_1)-F_{\beta}(P_{0,1})} \tag{10}$$

OK. Using equation (10), we can calculate $N/T$ from $F_{\beta}(P_0)$, $F_{\beta}(P_1)$ and $F_{\beta}(P_{0,1})$. Let's check it.


In [None]:
def calc_N_per_T(F_beta0, F_beta1, F_beta01, beta=0.5):
    """Estimate N/T from three F-beta scores, following equation (10).

    Parameters
    ----------
    F_beta0, F_beta1 : float
        Scores of the predictions that assign only label 0 / only label 1
        to every sample.
    F_beta01 : float
        Score of the prediction that assigns both labels 0 and 1 to every sample.
    beta : float, default 0.5
        The beta assumed for the F-beta metric.

    Returns
    -------
    float
        ``beta^2 * (2*F01 - F0 - F1) / (F0 + F1 - F01)``.

    Raises
    ------
    ZeroDivisionError
        If ``F_beta0 + F_beta1 == F_beta01`` (for plain Python numbers).
    """
    numerator = (beta**2)*(2*F_beta01 - F_beta0 - F_beta1)
    denominator = F_beta0 + F_beta1 - F_beta01
    return numerator/denominator

# Sanity check on the train data: recover N/T from the three F-beta scores
# alone, for comparison with the N/T computed directly from the row counts.
tmp = calc_N_per_T(F_beta0, F_beta1, F_beta01)
print('estimated N/T: {:.6f}'.format(tmp))

Looks fine! We can do the same calculation on the test data by submitting $P_0$, $P_1$ and $P_{0,1}$. I submitted them and got the results below. 

In [None]:
# Scores entered by hand for the three constant-prediction submissions.
# NOTE(review): these values carry only three decimals, and calc_N_per_T
# divides by F_beta0 + F_beta1 - F_beta01 (here about 0.010), so small rounding
# errors are strongly amplified -- treat the resulting estimate as rough.
F_beta0_test = 0.022 # submission 1
F_beta1_test = 0.014 # submission 2
F_beta01_test = 0.026 # submission 3

Let's calculate $N/T$ of the test data!

In [None]:
# Apply equation (10) to the hand-entered test scores, using calc_N_per_T's
# default beta of 0.5.
N_per_T_predicted = calc_N_per_T(F_beta0_test, F_beta1_test, F_beta01_test)
print('estimated N/T of the test data: {:.6f}'.format(N_per_T_predicted))

Now we have the average number of positive labels per sample of the test data. But the result is surprising: it is much less than that of the train data.  
Anyway, let's calculate $T$ and $N$! $T$ of the private test data is about 8000, as described [here](https://www.kaggle.com/c/coleridgeinitiative-show-us-the-data/data), and the ratio of $T$ of public to $T$ of private is 12:88, as described [here](https://www.kaggle.com/c/coleridgeinitiative-show-us-the-data/leaderboard). Therefore, 

In [None]:
# Estimate the size of the public test set from the approximate private size
# (8000) and the public:private split (12:88) quoted in the text, then scale
# by the estimated N/T to get the number of positive labels.
t_private_approx = 8000
public_share, private_share = 12, 88
T_public_predicted = t_private_approx * public_share / private_share
N_public_predicted = N_per_T_predicted * T_public_predicted
print('estimated T of the public test data: {:.6f}'.format(T_public_predicted))
print('estimated N of the public test data: {:.6f}'.format(N_public_predicted))

# Another approach
As described [here](https://www.kaggle.com/c/coleridgeinitiative-show-us-the-data/data), the public test data includes train data. Therefore, we can make a submission that contains only true positive labels. I made a submission with 200 true positive labels and submitted it. The score is 

In [None]:
# Score entered by hand for the submission containing 200 true positive labels
# and no other predictions.
F_beta_200TP = 0.164 # submission 4

$F_{\beta}(P_{200TP})$ can be written as 

$$ F_{\beta}(P_{200TP}) = \frac{(1+\beta^2)200}{(1+\beta^2)200 + (N-200)} \tag{11}$$

By transforming (11), we have,


$$ N = \frac{200}{F_{\beta}(P_{200TP})}(1+\beta^2-\beta^2F_{\beta}(P_{200TP})) \tag{12}$$

OK. We've got another equation to get $N$. Let's calculate this!

In [None]:
# Solve equation (12) for N, using the nominal beta of 0.5 and the 200
# true positives contained in submission 4.
beta = 0.5
beta_sq = beta**2
n_true_positives = 200
N_public_predicted2 = n_true_positives/F_beta_200TP*(1 + beta_sq - F_beta_200TP*beta_sq)
print('estimated N of the public test data: {:.6f}'.format(N_public_predicted2))

Oops, the estimated $N$ is very different from the one above! Assuming that my calculation is correct, these results suggest that the $\beta$ used for the leaderboard scoring is not 0.5. In fact, if $\beta = 1.2$, all the results become plausible, as shown below.

In [None]:
# Recompute both estimates of N under the hypothesis beta = 1.2.
# Note: this overwrites N_per_T_predicted, N_public_predicted and
# N_public_predicted2 computed earlier with beta = 0.5.
beta = 1.2
beta_sq = beta**2
N_per_T_predicted = calc_N_per_T(F_beta0_test, F_beta1_test, F_beta01_test, beta=beta)
N_public_predicted = N_per_T_predicted * T_public_predicted
N_public_predicted2 = 200/F_beta_200TP*(1 + beta_sq - F_beta_200TP*beta_sq)
for template, value in (
    ('estimated N/T of the test data: {:.6f}', N_per_T_predicted),
    ('estimated N of the public test data: {:.6f}', N_public_predicted),
    ('estimated N of the public test data (ver 2): {:.6f}', N_public_predicted2),
):
    print(template.format(value))

This conclusion explains our situation well. If $\beta$ really were 0.5, it would be easy to make a better submission than the string-matching baseline model, because false negative labels would be penalized less. If $\beta$ is bigger, the FNs will be heavily penalized, and a model that explores unseen datasets cannot beat the baseline.

# Make the submissions described above

In [None]:
# make submission 1: predict the most frequent label (target 0) for every Id.
# Every submission cell writes to the same "submission.csv", so the file
# holds whichever of them ran last.
df_sub1 = (
    pd.read_csv("../input/coleridgeinitiative-show-us-the-data/sample_submission.csv")
    .assign(PredictionString=df_label['label'].iloc[0])
)
df_sub1.to_csv("submission.csv", index=False)
df_sub1.head()

In [None]:
# make submission 2: predict the second most frequent label (target 1) for every Id.
# Every submission cell writes to the same "submission.csv", so the file
# holds whichever of them ran last.
df_sub2 = (
    pd.read_csv("../input/coleridgeinitiative-show-us-the-data/sample_submission.csv")
    .assign(PredictionString=df_label['label'].iloc[1])
)
df_sub2.to_csv("submission.csv", index=False)
df_sub2.head()

In [None]:
# make submission 3: predict both of the two most frequent labels
# (targets 0 and 1), joined by "|", for every Id.
# Every submission cell writes to the same "submission.csv", so the file
# holds whichever of them ran last.
top_two_labels = "{}|{}".format(df_label['label'].iloc[0], df_label['label'].iloc[1])
df_sub = (
    pd.read_csv("../input/coleridgeinitiative-show-us-the-data/sample_submission.csv")
    .assign(PredictionString=top_two_labels)
)
df_sub.to_csv("submission.csv", index=False)
df_sub.head()

In [None]:
# # make submission 4
# # aggregate train data
# df_train_reduced = df_train[df_train['Id'].duplicated()==False].reset_index(drop=True)
# df_train_reduced.head()

In [None]:
# # aggregate labels
# def agg_label(x):
#     labels = df_train['cleaned_label'][df_train['Id']==x].values
#     labels = np.sort(np.unique(labels))
#     labels_str = []
#     labels_str.append('|'.join(labels))
#     labels_str = labels_str[0]
#     return labels_str

# df_train_reduced['cleaned_labels'] = df_train_reduced['Id'].progress_apply(lambda x: agg_label(x))
# df_train_reduced.head()

In [None]:
# # label encoding
# def get_target(x):
#     ans = []
#     x_list = x.split('|')
#     for i, item in enumerate(x_list):
#         ans.append(df_label['target'][df_label['label']==item.strip()].values[0])
#     return ans

# df_train_reduced['targets'] = df_train_reduced['cleaned_labels'].progress_apply(get_target)
# df_train_reduced.head()

In [None]:
# # load text
# import os, json

# train_files_path = "../input/coleridgeinitiative-show-us-the-data/train"

# def read_append_return(filename, train_files_path=train_files_path, output='text'):
#     """
#     Function to read json file and then return the text data from them and append to the dataframe
#     """
#     json_path = os.path.join(train_files_path, (filename+'.json'))
#     headings = []
#     contents = []
#     combined = []
#     with open(json_path, 'r') as f:
#         json_decode = json.load(f)
#         for data in json_decode:
#             headings.append(data.get('section_title'))
#             contents.append(data.get('text'))
#             combined.append(data.get('section_title'))
#             combined.append(data.get('text'))
    
#     all_headings = ' '.join(headings)
#     all_contents = ' '.join(contents)
#     all_data = '. '.join(combined)
    
#     if output == 'text':
#         return all_contents
#     elif output == 'head':
#         return all_headings
#     else:
#         return all_data
    

# df_train_reduced['text'] = df_train_reduced['Id'].progress_apply(lambda x: read_append_return(x))

In [None]:
# # load test data and test text
# df_test = pd.read_csv("../input/coleridgeinitiative-show-us-the-data/sample_submission.csv")
# test_files_path = "../input/coleridgeinitiative-show-us-the-data/test"
# df_test['text'] = df_test['Id'].progress_apply(lambda x: read_append_return(x, train_files_path=test_files_path))
# df_test.head()

In [None]:
# # find train data in test data
# def detect_duplicated(x):
#     for i in range(len(df_train_reduced)):
#         if x==df_train_reduced['text'][i]:
#             return df_train_reduced['Id'][i]
#     return 'no dup'

# df_test['dup_id'] = df_test['text'].progress_apply(lambda x: detect_duplicated(x))
# df_test['dup'] = df_test['dup_id']!='no dup'
# df_test.head()

In [None]:
# # use true label of train data
# def pred_dup(x):
#     df_tmp = df_train_reduced[df_train_reduced['text']==x]
#     if len(df_tmp)>0:
#         label_list = df_tmp['targets'].values[0]
#     else:
#         label_list = np.zeros(0, np.int64)
#     return label_list

# df_test['pred_dup'] = df_test['text'].apply(lambda x: pred_dup(x))
# df_test.head()

In [None]:
# # reduce true label to 200
# REDUCE_THRESHOLD = 200
# count = 0
# new_preds = np.zeros([len(df_test), 0]).tolist()
# # df_test['pred_det_rduced'] = df_test['pred_det_reduced'].apply(lambda x: [])
# for i in range(len(df_test)):
#     if count>=REDUCE_THRESHOLD: break
#     tmp_pred = list(df_test['pred_dup'][i])
#     new_pred = []
#     for j in range(len(tmp_pred)):
#         if count>=REDUCE_THRESHOLD: break
#         new_pred.append(tmp_pred[j])
#         count += 1
#     new_preds[i] = new_pred
# # 
# print(new_preds)
# print(count)
# df_test['pred_dup_reduced'] = new_preds
# df_test.head()

In [None]:
# # decode label encoding
# def get_label(x, ref_label, ref_target):
#     predict = []
#     for i in range(len(x)):
#         predict.append(ref_label[ref_target==x[i]][0])
#     predict = np.unique(predict).tolist()
#     tmp_list = []
#     tmp_list.append('|'.join(predict))
#     return tmp_list[0]

# df_test['pred_dup_str'] = df_test['pred_dup_reduced'].progress_apply(lambda x: 
#         get_label(x, df_label['label'].values, df_label['target'].values)
# )
# df_sub4 = df_test[['Id', 'pred_dup_str']]
# df_sub4.columns = ['Id', 'PredictionString']
# df_sub4.to_csv("submission.csv", index=None)
# df_sub4.head()