In [59]:
import json
import numpy as np
import pandas as pd
import mylib
import scml
from scml import nlp as snlp

In [60]:
# Percentile grid passed to the describe() calls in the cells below.
percentiles = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]

# Options are addressed by their fully qualified names: pandas resolves
# shorthand keys by pattern matching, which can break when a similarly
# named option is added in a later release.
# NOTE(review): "mode.use_inf_as_na" is deprecated in recent pandas
# releases -- confirm the installed version still accepts it.
pd.set_option("mode.use_inf_as_na", True)
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)

In [61]:
# Read the training labels and order the rows by dataset title;
# ignore_index=True renumbers the index 0..n-1 after the sort.
train = pd.read_csv(
    "input/train.csv", engine="c", low_memory=False
).sort_values("dataset_title", ignore_index=True)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19661 entries, 0 to 19660
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Id             19661 non-null  object
 1   pub_title      19661 non-null  object
 2   dataset_title  19661 non-null  object
 3   dataset_label  19661 non-null  object
 4   cleaned_label  19661 non-null  object
dtypes: object(5)
memory usage: 768.1+ KB


In [62]:
# Text columns to eyeball: publication title, dataset title, and the
# dataset label recorded for that row.
cols = ["pub_title", "dataset_title", "dataset_label"]
train.loc[:, cols].head()

Unnamed: 0,pub_title,dataset_title,dataset_label
0,Depth varying rupture properties during the 2015 Mw 7.8 Gorkha (Nepal) earthquake,Advanced National Seismic System (ANSS) Comprehensive Catalog (ComCat),ANSS Comprehensive Catalog
1,Shaking from Injection-Induced Earthquakes in the Central and Eastern United States,Advanced National Seismic System (ANSS) Comprehensive Catalog (ComCat),Advanced National Seismic System Comprehensive Catalog
2,Resolving Teleseismic Earthquake Catalog and InSAR Data Discrepancies in Absolute Space to Explore Rupture Complexity Along the Ecuadorian Megathrust Fault,Advanced National Seismic System (ANSS) Comprehensive Catalog (ComCat),ANSS Comprehensive Catalog
3,Resolving Teleseismic Earthquake Catalog and InSAR Data Discrepancies in Absolute Space to Explore Rupture Complexity Along the Ecuadorian Megathrust Fault,Advanced National Seismic System (ANSS) Comprehensive Catalog (ComCat),Advanced National Seismic System (ANSS) Comprehensive Catalog (ComCat)
4,"Landslides and Megathrust Splay Faults Captured by the Late Holocene Sediment Record of Eastern Prince William Sound, Alaska",Advanced National Seismic System (ANSS) Comprehensive Catalog (ComCat),ANSS Comprehensive Catalog


In [63]:
# Number of training rows per dataset title.
title_counts = train["dataset_title"].value_counts()
title_counts

Alzheimer's Disease Neuroimaging Initiative (ADNI)                                             6144
Baltimore Longitudinal Study of Aging (BLSA)                                                   1589
Trends in International Mathematics and Science Study                                          1163
Early Childhood Longitudinal Study                                                             1011
SARS-CoV-2 genome sequence                                                                      860
Census of Agriculture                                                                           743
Education Longitudinal Study                                                                    676
Agricultural Resource Management Survey                                                         660
North American Breeding Bird Survey (BBS)                                                       585
National Education Longitudinal Study                                                           550


In [64]:
# Row count per cleaned label as a plain dict. Series.to_dict() yields
# native Python ints, whereas dict(series) performs one label lookup per
# key and keeps NumPy scalar values, which newer NumPy versions display
# as np.int64(...).
train["cleaned_label"].value_counts().to_dict()

{'adni': 3673,
 'alzheimer s disease neuroimaging initiative adni ': 2400,
 'trends in international mathematics and science study': 1163,
 'baltimore longitudinal study of aging': 1156,
 'early childhood longitudinal study': 1011,
 'education longitudinal study': 676,
 'census of agriculture': 643,
 'agricultural resource management survey': 623,
 'national education longitudinal study': 550,
 'rural urban continuum codes': 490,
 'baltimore longitudinal study of aging blsa ': 433,
 'survey of earned doctorates': 428,
 'north american breeding bird survey': 380,
 'world ocean database': 314,
 'slosh model': 304,
 'noaa tide gauge': 299,
 'survey of doctorate recipients': 298,
 'ibtracs': 280,
 'coastal change analysis program': 255,
 'common core of data': 252,
 'sars cov 2 genome sequences': 244,
 'beginning postsecondary students': 241,
 'genome sequence of sars cov 2': 222,
 'our world in data': 212,
 'baccalaureate and beyond': 199,
 'north american breeding bird survey bbs ': 198,

In [65]:
# Character length of each cleaned label, stored as a new column and
# summarised over the shared percentile grid.
label_lengths = train["cleaned_label"].str.len()
train["cleaned_label_len"] = label_lengths
train["cleaned_label_len"].describe(percentiles=percentiles)

count    19661.000000
mean        29.939983
std         16.889102
min          4.000000
1%           4.000000
5%           4.000000
10%          4.000000
20%          7.000000
30%         21.000000
40%         27.000000
50%         31.000000
60%         37.000000
70%         39.000000
80%         49.000000
90%         49.000000
95%         53.000000
99%         62.000000
max        128.000000
Name: cleaned_label_len, dtype: float64

In [66]:
def cleaned_label_digits(row):
    """Return ``snlp.count_digit`` applied to the row's ``cleaned_label``.

    Intended for ``DataFrame.apply(..., axis=1)``; ``row`` must support
    lookup by the key ``"cleaned_label"``.

    NOTE(review): presumably the result is the number of digit characters
    in the label -- confirm against ``scml.nlp.count_digit``.
    """
    label = row["cleaned_label"]
    return snlp.count_digit(label)

In [67]:
# Apply cleaned_label_digits to every row (axis=1), store the result as a
# new column and summarise it over the shared percentile grid.
digit_counts = train.apply(cleaned_label_digits, axis=1)
train["cleaned_label_digits"] = digit_counts
train["cleaned_label_digits"].describe(percentiles=percentiles)

count    19661.000000
mean         0.086059
std          0.408605
min          0.000000
1%           0.000000
5%           0.000000
10%          0.000000
20%          0.000000
30%          0.000000
40%          0.000000
50%          0.000000
60%          0.000000
70%          0.000000
80%          0.000000
90%          0.000000
95%          1.000000
99%          2.000000
max          4.000000
Name: cleaned_label_digits, dtype: float64

In [71]:
# Ratio of the digit-count column to the label-length column, row by row.
digit_fraction = train["cleaned_label_digits"].div(train["cleaned_label_len"])
train["cleaned_label_digits_frac"] = digit_fraction
train["cleaned_label_digits_frac"].describe(percentiles=percentiles)

count    19661.000000
mean         0.002921
std          0.013558
min          0.000000
1%           0.000000
5%           0.000000
10%          0.000000
20%          0.000000
30%          0.000000
40%          0.000000
50%          0.000000
60%          0.000000
70%          0.000000
80%          0.000000
90%          0.000000
95%          0.033333
99%          0.066667
max          0.160000
Name: cleaned_label_digits_frac, dtype: float64

In [73]:
%%time
# Count of non-null dataset_title entries for each Id.
s = train["dataset_title"].groupby(train["Id"]).count()
s.describe(percentiles=percentiles)

CPU times: user 15.6 ms, sys: 0 ns, total: 15.6 ms
Wall time: 15.8 ms


count    14316.000000
mean         1.373358
std          0.647486
min          1.000000
1%           1.000000
5%           1.000000
10%          1.000000
20%          1.000000
30%          1.000000
40%          1.000000
50%          1.000000
60%          1.000000
70%          2.000000
80%          2.000000
90%          2.000000
95%          2.000000
99%          3.000000
max         22.000000
Name: dataset_title, dtype: float64

In [68]:
%%time
# Sanity check: no cleaned label contains a literal "|" (regex=False makes
# this a plain substring test rather than a regex alternation).
# NOTE(review): presumably "|" is reserved as a delimiter elsewhere -- confirm.
has_pipe = train["cleaned_label"].str.contains("|", regex=False)
df = train.loc[has_pipe]
assert df.shape[0] == 0

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.83 ms


In [69]:
def jaccard_score(row):
    """Return ``mylib.jaccard`` of the row's dataset title and cleaned label.

    Intended for ``DataFrame.apply(..., axis=1)``; ``row`` must support
    lookup by the keys ``"dataset_title"`` and ``"cleaned_label"``.

    NOTE(review): how the two strings are tokenised and compared is defined
    in ``mylib.jaccard`` and is not visible here.
    """
    title = row["dataset_title"]
    label = row["cleaned_label"]
    return mylib.jaccard(title, label)

In [70]:
# Score every row with jaccard_score, then flag rows whose score is at
# least 0.5 as matched (1) and all others as unmatched (0).
jaccard_values = train.apply(jaccard_score, axis=1)
train["jaccard"] = jaccard_values
train["matched"] = (train["jaccard"] >= 0.5).astype(int)
train["matched"].value_counts(normalize=True)

1    0.560297
0    0.439703
Name: matched, dtype: float64