# Prep sentences
Preparing training data for classification by extracting features

**Target (y):**
- [x] has dataset reference

**Features (X:**
- [x] has indicator terms
- [x] in section
- [x] has acronym

In [19]:
import glob
import json
import re
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import sent_tokenize

from random import randint

In [20]:
path = '../data/'
DATA_DIR = '/nfs/turbo/hrg/coleridge/'

In [21]:
def clean_text(txt):
    """
    Convert to lowercase, remove special characters, and punctuation.
    """
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower())

def find_acronyms(txt):
    """
    finds and returns a sequence of capital letters
    for use on dataset_titles, dataset_labels, or full text
    """
    matches = re.findall(r"\b[A-Z\.]{2,}s?\b", txt)
    if matches:
        return 1
    else:
        return 0

def count_acronyms(txt):
    """
    finds and returns a sequence of capital letters
    for use on dataset_titles, dataset_labels, or full text
    """
    matches = re.findall(r"\b[A-Z\.]{2,}s?\b", txt)
    if matches:
        return len(matches)
    else:
        return 0

# def find_ents(txt):
#     """
#     loads our custom model to determine if a sentence has a dataset reference, binary
#     """
#     doc = custom_ner_model(txt)
#     if len(doc.ents) > 0:
#         return 1
#     else:
#         return 0

# def count_ents(txt):
#     """
#     loads our custom model to determine if a sentence has a dataset reference, counts number of times entities appear
#     """
#     label = []
#     doc = custom_ner_model(txt)
#     if len(doc.ents) > 0:
#         for entity in doc:
#             label.append(doc.ents)
#         return len(label)
#     else:
#         return 0

## Data

In [4]:
submission_df = pd.read_csv('../data/sample_submission.csv')

In [5]:
df_train = pd.read_csv('../data/train.csv')

In [6]:
train_files = glob.glob('../data/train/*.json')

df_train_pubs = pd.DataFrame()
for train_file in train_files: 
    file_data = pd.read_json(train_file)
    file_data.insert(0,'Id', train_file.split('/')[-1].split('.')[0])
    df_train_pubs = pd.concat([df_train_pubs, file_data])

df_train_pubs['clean_text'] = df_train_pubs['text'].apply(clean_text)
df_train_pubs.head()

Unnamed: 0,Id,section_title,text,clean_text
0,07cbcedc-9f95-42e3-8340-468a866916b9,Abstract,"In this study, we highlight the importance of ...",in this study we highlight the importance of s...
1,07cbcedc-9f95-42e3-8340-468a866916b9,,consequences of these early work experiences a...,consequences of these early work experiences a...
2,07cbcedc-9f95-42e3-8340-468a866916b9,Socioeconomic Disadvantage and Early Work Expe...,Some scholars have suggested that long work ho...,some scholars have suggested that long work ho...
3,07cbcedc-9f95-42e3-8340-468a866916b9,The Youth Development Study,"To address these issues, we draw on data from ...",to address these issues we draw on data from t...
4,07cbcedc-9f95-42e3-8340-468a866916b9,Teenage Work and the Process of Socioeconomic ...,Precursors of Teenage Work. We first distingui...,precursors of teenage work we first distinguis...


In [7]:
df_full = df_train.merge(df_train_pubs, on='Id')
df_full = df_full.drop(columns=['pub_title','dataset_title','cleaned_label','clean_text'])

train_sentences = []

for row in df_full.itertuples():
    for sent in sent_tokenize(row[4]):
        train_sentences.append((row[1], row[2], row[3], sent))
        
df_train_sent = pd.DataFrame(train_sentences, columns=['Id', 'dataset_label', 'section_title', 'sent'])

df_train_sent['dataset_label'] = df_train_sent['dataset_label'].astype(str)
df_train_sent['sent'] = df_train_sent['sent'].astype(str)
df_train_sent['section_title'] = df_train_sent['section_title'].astype(str)

## Target
- consider adding a separate col for `match_count` if we want to predict continuous (number of times dataset is mentioned) rather than categorical (0, 1)

In [8]:
df_train_sent['match'] = df_train_sent.apply(lambda x: x.dataset_label in x.sent, axis=1)
# df_train_sent['match_count'] = df_train_sent.apply(lambda x: sum(1 if x.dataset_label in x.sent))
df_train_sent['match'].value_counts()

False    6734263
True       54804
Name: match, dtype: int64

In [9]:
df_train_sent['sent_clean'] = df_train_sent['sent'].apply(clean_text)
df_train_sent['section_clean']= df_train_sent['section_title'].apply(clean_text)

df_train_sent

Unnamed: 0,Id,dataset_label,section_title,sent,match,sent_clean,section_clean
0,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,National Education Longitudinal Study,What is this study about?,This study used data from the National Educati...,True,this study used data from the national educati...,what is this study about
1,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,National Education Longitudinal Study,What is this study about?,The study also reported whether the impacts of...,False,the study also reported whether the impacts of...,what is this study about
2,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,National Education Longitudinal Study,What is this study about?,"In addition, a supplemental analysis reports o...",False,in addition a supplemental analysis reports on...,what is this study about
3,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,National Education Longitudinal Study,What is this study about?,Dual enrollment programs offer college-level l...,False,dual enrollment programs offer college level l...,what is this study about
4,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,National Education Longitudinal Study,What is this study about?,The programs offer college courses and/or the ...,False,the programs offer college courses and or the ...,what is this study about
...,...,...,...,...,...,...,...
6789062,fd23e7e0-a5d2-4f98-992d-9209c85153bb,CAS COVID-19 antiviral candidate compounds data,Summary and conclusions,As a consequence of the current COVID-19 pande...,False,as a consequence of the current covid 19 pande...,summary and conclusions
6789063,fd23e7e0-a5d2-4f98-992d-9209c85153bb,CAS COVID-19 antiviral candidate compounds data,Summary and conclusions,"As a side effect of analyzing the data, we are...",False,as a side effect of analyzing the data we are ...,summary and conclusions
6789064,fd23e7e0-a5d2-4f98-992d-9209c85153bb,CAS COVID-19 antiviral candidate compounds data,Summary and conclusions,"In addition, the workflow has been used to det...",False,in addition the workflow has been used to dete...,summary and conclusions
6789065,fd23e7e0-a5d2-4f98-992d-9209c85153bb,CAS COVID-19 antiviral candidate compounds data,Summary and conclusions,The material has been used successfully for te...,False,the material has been used successfully for te...,summary and conclusions


## Features

### has Indicator terms

In [10]:
df_train_sent['freqData'] = df_train_sent['sent_clean'].str.count('data')
df_train_sent['freqEdu'] = df_train_sent['sent_clean'].str.count('edu')
df_train_sent['freqSample'] = df_train_sent['sent_clean'].str.count('sample')
df_train_sent['freqNational'] = df_train_sent['sent_clean'].str.count('national')
df_train_sent['freqSurvey'] = df_train_sent['sent_clean'].str.count('survey')
df_train_sent['freqPublic'] = df_train_sent['sent_clean'].str.count('public')
df_train_sent['freqAvail'] = df_train_sent['sent_clean'].str.count('avail')
df_train_sent['freqNSF'] = df_train_sent['sent_clean'].str.count('nsf')
df_train_sent['freqGov'] = df_train_sent['sent_clean'].str.count('gov')
df_train_sent['freqAccess'] = df_train_sent['sent_clean'].str.count('access')

df_train_sent.sample(n=10)

Unnamed: 0,Id,dataset_label,section_title,sent,match,sent_clean,section_clean,freqData,freqEdu,freqSample,freqNational,freqSurvey,freqPublic,freqAvail,freqNSF,freqGov,freqAccess
3146495,b262c60b-5b5f-4765-93bd-fb88458b4d7f,Beginning Postsecondary Student,Commissioner's Statement,As various groups voice their desires and conc...,False,as various groups voice their desires and conc...,commissioner s statement,0,0,0,0,0,0,0,0,0,0
1145560,773a390a-8b92-40a6-931d-c78d76d8f980,Alzheimer's Disease Neuroimaging Initiative (A...,Discussion,Because much literature still benchmarks AD co...,False,because much literature still benchmarks ad co...,discussion,0,0,0,0,0,0,0,0,0,0
1761877,ce6ef969-f018-4360-a709-6c6e093ee2f4,ADNI,Background,"Therefore, it would be reasonable to determine...",False,therefore it would be reasonable to determine ...,background,0,0,0,0,0,0,0,0,0,0
461686,9ad8bb80-6760-4e93-8583-71adf24ff721,NOAA Tide Gauge,Tidal Zone Determination,Subtidal areas were pixels with a probability ...,False,subtidal areas were pixels with a probability ...,tidal zone determination,0,0,0,0,0,0,0,0,0,0
2070288,25692b4d-0218-4e86-a5d3-e5ff725e4e67,ADNI,Introduction,Autoantibodies against a variety of molecules ...,False,autoantibodies against a variety of molecules ...,introduction,0,0,0,0,0,0,0,0,0,0
3791492,de3e7ef1-5cf8-4e50-882a-1493e3c5ff3d,Education Longitudinal Study,Domain Coverage,"An attempt was made to vary the content, conte...",False,an attempt was made to vary the content contex...,domain coverage,0,0,0,0,0,0,0,0,0,0
5382926,bc66d792-89b7-4fb5-9822-b1747ae031e9,Education Longitudinal Study,Antecedents to Underemployment,Low income (or working poor) includes full-tim...,False,low income or working poor includes full time ...,antecedents to underemployment,0,0,0,0,0,0,0,0,0,0
4616493,170113f9-399c-489e-ab53-2faf5c64c5bc,Survey of Science and Engineering Research Fac...,,National Research Council (NRC).,False,national research council nrc,,0,0,0,1,0,0,0,0,0,0
1073629,56634038-aabf-430f-bbd7-7b6cd1cd8d85,Alzheimer's Disease Neuroimaging Initiative (A...,Approach to case selection,No covariate adjustments are applied to the ra...,False,no covariate adjustments are applied to the ra...,approach to case selection,1,0,0,0,0,0,0,0,0,0
4435765,430aa11c-0283-411b-8edc-08f5df3db258,Survey of Graduate Students and Postdoctorates...,,Part-time employment excludes those employed p...,False,part time employment excludes those employed p...,,0,0,0,0,0,0,0,0,0,0


In [11]:
df_train_sent['hasData'] = np.where(df_train_sent['sent_clean'].str.contains('data'), 1, 0)
df_train_sent['hasEdu'] = np.where(df_train_sent['sent_clean'].str.contains('edu'), 1, 0)
df_train_sent['hasSample'] = np.where(df_train_sent['sent_clean'].str.contains('sample'), 1, 0)
df_train_sent['hasNational'] = np.where(df_train_sent['sent_clean'].str.contains('national'), 1, 0)
df_train_sent['hasSurvey'] = np.where(df_train_sent['sent_clean'].str.contains('survey'), 1, 0)
df_train_sent['hasPublic'] = np.where(df_train_sent['sent_clean'].str.contains('public'), 1, 0)
df_train_sent['hasAvail'] = np.where(df_train_sent['sent_clean'].str.contains('survey'), 1, 0)
df_train_sent['hasNSF'] = np.where(df_train_sent['sent_clean'].str.contains('nsf'), 1, 0)
df_train_sent['hasGov'] = np.where(df_train_sent['sent_clean'].str.contains('gov'), 1, 0)
df_train_sent['hasAccess'] = np.where(df_train_sent['sent_clean'].str.contains('access'), 1, 0)

### in Section

In [12]:
df_train_sent['inIntro'] = np.where(df_train_sent['section_clean'].str.contains('intro'), 1, 0)
df_train_sent['inDisc'] = np.where(df_train_sent['section_clean'].str.contains('discus'), 1, 0)
df_train_sent['inAbst'] = np.where(df_train_sent['section_clean'].str.contains('abstr'), 1, 0)
df_train_sent['inResult'] = np.where(df_train_sent['section_clean'].str.contains('resul'), 1, 0)
df_train_sent['inConcl'] = np.where(df_train_sent['section_clean'].str.contains('conclu'), 1, 0)
df_train_sent['inMethod'] = np.where(df_train_sent['section_clean'].str.contains('meth'), 1, 0)
df_train_sent['inBack'] = np.where(df_train_sent['section_clean'].str.contains('back'), 1, 0)
df_train_sent['inData'] = np.where(df_train_sent['section_clean'].str.contains('data'), 1, 0)
df_train_sent['inSumm'] = np.where(df_train_sent['section_clean'].str.contains('summ'), 1, 0)
df_train_sent['inAckno'] = np.where(df_train_sent['section_clean'].str.contains('acknowl'), 1, 0)

df_train_sent.sample(n=10)

Unnamed: 0,Id,dataset_label,section_title,sent,match,sent_clean,section_clean,freqData,freqEdu,freqSample,...,inIntro,inDisc,inAbst,inResult,inConcl,inMethod,inBack,inData,inSumm,inAckno
6232258,ad9e5140-f5d5-4711-a917-f526321b6425,Census of Agriculture,Estimation of the Tillage Adoption Model,"Regarding the first factor, one would expect a...",False,regarding the first factor one would expect a ...,estimation of the tillage adoption model,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1809065,c543b483-1bd0-44b2-843f-b9da9adcce8b,ADNI,Progression to AD and non-AD dementia,"Within the non-AD dementia cohort, 30 (16% of ...",False,within the non ad dementia cohort 30 16 of all...,progression to ad and non ad dementia,0,0,0,...,0,0,0,0,0,0,0,0,0,0
431033,36858562-8e80-4ce8-816b-7b5daca034d9,Baltimore Longitudinal Study of Aging,"Adiposity, Lipid Abnormalities, and Glucose Me...",It appears that testosterone levels are involv...,False,it appears that testosterone levels are involv...,adiposity lipid abnormalities and glucose meta...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
475354,c4f3fe76-aac2-4c6d-9a09-3f762bf16fea,NOAA Tide Gauge,b. Storm surge and inundation modeling and tid...,"At the same time, the mesh covers a large geog...",False,at the same time the mesh covers a large geogr...,b storm surge and inundation modeling and tida...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4353648,dd6df078-2010-42fa-ad30-9c302d2ba55b,Education Longitudinal Study,National Study of Postsecondary Faculty,The second cycle of NSOPF (NSOPF:93) was condu...,False,the second cycle of nsopf nsopf 93 was conduct...,national study of postsecondary faculty,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5018868,34f25eee-4aaf-4e67-873d-f34fd1fc5d8d,Baccalaureate and Beyond Longitudinal Study,Chapter 1. Why the Pay Gap Matters,"As described in Chapters 3 and 4, mothers pay ...",False,as described in chapters 3 and 4 mothers pay a...,chapter 1 why the pay gap matters,0,1,0,...,0,0,0,0,0,0,0,0,0,0
263620,f59da1b0-bb75-41ea-8492-6b1ebca2548d,Alzheimers Disease Neuroimaging Initiative,Evolution model estimation in the atlas space,The role of the kernel function K h (·) here i...,False,the role of the kernel function k h here is to...,evolution model estimation in the atlas space,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5187305,1a550b18-5a4e-4904-a3c2-f978a1044fa4,Survey of Doctorate Recipients,Section II: The Increasing Presence of the For...,"This is seen in Figure 2, which demonstrates h...",False,this is seen in figure 2 which demonstrates ho...,section ii the increasing presence of the fore...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6131058,086282a3-6a47-4b06-b7b7-c093b9a14236,World Ocean Database,,"White, 2017: Climatic regulation of the neurot...",False,white 2017 climatic regulation of the neurotox...,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2001141,b0d90754-8b81-4c8f-b8c7-38d534ef6e53,ADNI,DISCUSSION,It is also the first use of the abbreviated no...,False,it is also the first use of the abbreviated no...,discussion,0,0,0,...,0,1,0,0,0,0,0,0,0,0


Check that the columns were added

In [13]:
df_train_sent.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6789067 entries, 0 to 6789066
Data columns (total 37 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   Id             object
 1   dataset_label  object
 2   section_title  object
 3   sent           object
 4   match          bool  
 5   sent_clean     object
 6   section_clean  object
 7   freqData       int64 
 8   freqEdu        int64 
 9   freqSample     int64 
 10  freqNational   int64 
 11  freqSurvey     int64 
 12  freqPublic     int64 
 13  freqAvail      int64 
 14  freqNSF        int64 
 15  freqGov        int64 
 16  freqAccess     int64 
 17  hasData        int64 
 18  hasEdu         int64 
 19  hasSample      int64 
 20  hasNational    int64 
 21  hasSurvey      int64 
 22  hasPublic      int64 
 23  hasAvail       int64 
 24  hasNSF         int64 
 25  hasGov         int64 
 26  hasAccess      int64 
 27  inIntro        int64 
 28  inDisc         int64 
 29  inAbst         int64 
 30  inResult       int

Let's inspect the distribution of the binary "in Introduction"

In [14]:
df_train_sent['inIntro'].value_counts()

0    6428172
1     360895
Name: inIntro, dtype: int64

### has Acronyms

This is reported as a binary and a count


In [15]:
df_train_sent['hasAcronym'] = df_train_sent['sent'].apply(find_acronyms)
df_train_sent['hasAcronym'].value_counts()

0    4503779
1    2285288
Name: hasAcronym, dtype: int64

In [16]:
df_train_sent['freqAcronym'] = df_train_sent['sent'].apply(count_acronyms)
df_train_sent['freqAcronym'].unique()

array([    1,     0,     2,     3,     6,     4,     5,     8,    12,
          11,     7,    10,    15,     9,    17,    19,    16,    14,
          20,    13,    31,    25,    93,   145,    90,    29,    34,
          18,    43,    30,    28,    26,    21,    24,    35,    55,
          22,    23,    44,    33,    41,    42,    57,    75,    27,
          32,    62,    40,    36,    64,    48,    50,    68,    38,
          49,  2081,  6231,   403,    51,    74,    39,    47,   320,
        3157,   178,   281,  3461,   196,    76,   125,   221,    97,
          56,    52,   117,    86,    98,    70,    53,   154,   743,
          45,    37,    94,    46,    58,   126,    67,    72,    78,
         242,   113,   119,   157,    63,   269,    77,    59,   169,
          61,    54,   387,    73,   120,   162,   186,    65,   133,
          83,    87,    82,    89,    88,    69,   435,   188,    80,
         106,   298,   316,   104,   137,   109,   519,   201,   328,
          66,   123,

Workaround: Save df as an intermediate frame in case connection is disrupted

In [17]:
# df_train_sent.to_csv(DATA_DIR + 'train_sents_comb.csv')

## Titles
Check if a sentence has a dataset title from Data.gov or ICPSR
- consider replacing these files with study names and harvested titles

In [6]:
icpsr = pd.read_csv(DATA_DIR + 'labels/icpsr_studies.csv')
icpsr_labels = icpsr['NAME'].apply(clean_text).str.replace('\d+', '')

In [12]:
datagov = pd.read_csv(DATA_DIR + 'labels/kaggle_data_800.csv')
datagov_labels = datagov['title'].apply(clean_text).str.replace('\d+', '')

Workaround: load intermediate frame for final steps

In [16]:
# df_train_sent = pd.read_csv(DATA_DIR + 'train_sents_comb.csv', index_col=0)
# df_train_sent.info()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  mask |= (ar1 == a)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 6789067 entries, 0 to 6789066
Data columns (total 39 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   Id             object
 1   dataset_label  object
 2   section_title  object
 3   sent           object
 4   match          bool  
 5   sent_clean     object
 6   section_clean  object
 7   freqData       int64 
 8   freqEdu        int64 
 9   freqSample     int64 
 10  freqNational   int64 
 11  freqSurvey     int64 
 12  freqPublic     int64 
 13  freqAvail      int64 
 14  freqNSF        int64 
 15  freqGov        int64 
 16  freqAccess     int64 
 17  hasData        int64 
 18  hasEdu         int64 
 19  hasSample      int64 
 20  hasNational    int64 
 21  hasSurvey      int64 
 22  hasPublic      int64 
 23  hasAvail       int64 
 24  hasNSF         int64 
 25  hasGov         int64 
 26  hasAccess      int64 
 27  inIntro        int64 
 28  inDisc         int64 
 29  inAbst         int64 
 30  inResult       int

In [None]:
%time df_train_sent['hasICPSRTitle'] = df_train_sent['sent_clean'].apply(lambda x: any([k in x for k in icpsr_labels]))
%time df_train_sent['hasDATAGOVTitle'] = df_train_sent['sent_clean'].apply(lambda x: any([k in x for k in datagov_labels]))

In [None]:
# df_train_sent.to_csv(DATA_DIR + 'train_sents_all.csv')

In [24]:
df_train_sent['hasDATAGOVTitle'].value_counts()

False    6663091
True      125976
Name: hasDATAGOVTitle, dtype: int64

In [25]:
df_train_sent['hasICPSRTitle'].value_counts()

False    6770850
True       18217
Name: hasICPSRTitle, dtype: int64

## Save

Check that all columns have been added

In [22]:
# df_train_sent = pd.read_csv(DATA_DIR + 'train_sents_all.csv', index_col=0)
# df_train_sent.info()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  mask |= (ar1 == a)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 6789067 entries, 0 to 6789066
Data columns (total 41 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   Id               object
 1   dataset_label    object
 2   section_title    object
 3   sent             object
 4   match            bool  
 5   sent_clean       object
 6   section_clean    object
 7   freqData         int64 
 8   freqEdu          int64 
 9   freqSample       int64 
 10  freqNational     int64 
 11  freqSurvey       int64 
 12  freqPublic       int64 
 13  freqAvail        int64 
 14  freqNSF          int64 
 15  freqGov          int64 
 16  freqAccess       int64 
 17  hasData          int64 
 18  hasEdu           int64 
 19  hasSample        int64 
 20  hasNational      int64 
 21  hasSurvey        int64 
 22  hasPublic        int64 
 23  hasAvail         int64 
 24  hasNSF           int64 
 25  hasGov           int64 
 26  hasAccess        int64 
 27  inIntro          int64 
 28  inDisc      

Dataframe with all target and feature columns generated in this notebook

In [None]:
df_train_sent.to_csv(DATA_DIR + 'train_sents_all.csv')

## Filter

Optional: filter dataframe to keep only true citances

In [None]:
# df_true = df_train_sent[df_train_sent['match']==True]
# df_true = df_true.drop(columns=['match'])
# df_true['sent'] = df_true['sent'].astype(str)

# df_true

## Demo
Randomly generated example of a citance (a sentence with a data label)

- `section_title`
- `dataset_label`
- `sent`

In [None]:
# example = randint(0, len(df_true))

# print(df_true.iloc[example,2])
# print(df_true.iloc[example,1])
# print(df_true.iloc[example,3])