In [13]:
import pandas as pd
import numpy as np
import sentencepiece as spm
import nltk
import ast
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import time

In [14]:
ncbi_com_0 = pd.read_csv("data/ncbi_comm_use_000000000000.csv")

In [15]:
ncbi_com_1 = pd.read_csv("data/ncbi_comm_use_000000000001.csv")

In [16]:
ncbi_non_com_0 =  pd.read_csv("data/ncbi_non_comm_use_000000000000.csv")

In [17]:
ncbi_non_com_1 =  pd.read_csv("data/ncbi_non_comm_use_000000000001.csv")

In [18]:
ncbi_com_0.shape

(5958, 5)

In [19]:
ncbi_com_1.shape

(4790, 5)

In [20]:
ncbi_non_com_0.shape

(7924, 5)

In [21]:
ncbi_non_com_1.shape

(9445, 5)

In [22]:
df1 = ncbi_com_0

In [23]:
df1.shape

(5958, 5)

In [24]:
df2 = pd.concat([df1, ncbi_com_1])

In [25]:
df2.shape

(10748, 5)

In [26]:
df3 = pd.concat([df2, ncbi_non_com_0])

In [27]:
df3.shape

(18672, 5)

In [28]:
df4 = pd.concat([df3, ncbi_non_com_1])

In [29]:
df4.shape

(28117, 5)

# Data Preprocessing

In [1]:
def remove_newline_char(text):
    text = text.replace("\n", " ")
    return text

def nltk_sent_tokenize(text):
    text = sent_tokenize(text)
    return text

def contains_coronavirus(text):
    if "coronavirus" in text.lower():
        return 1
    else:
        return 0
    
def contains_COVID(text):
    if "COVID" in text:
        return 1
    else:
        return 0

In [31]:
def preprocess(df):
    # remove rows that have null Body 
    df = df[~df['Body'].isnull()]
    df['Body'] = df['Body'].apply(remove_newline_char)
    df['Body_sents'] = df['Body'].apply(nltk_sent_tokenize)
    df['Body_tokens'] = df['Body'].apply(word_tokenize)
    df['len_body'] = df['Body_tokens'].apply(lambda x: len(x))
    df['has_coronavirus'] = df['Body'].apply(contains_coronavirus)
    df['has_COVID'] = df['Body'].apply(contains_COVID)
    df['len_sents'] = df['Body_sents'].apply(lambda x: len(x))
    return df

# Build and save corpus

In [32]:
def build_raw_corpus(df):
    raw_corpus = []
    for i, row in df.iterrows():
        raw_corpus += row['Body_sents']
    return raw_corpus

In [33]:
def save_corpus_as_txt(filename, corpus):
    with open(filename, 'w') as f:
        for sent in corpus:
            f.write(sent)
            f.write('\n')
    f.close()

In [34]:
def build_tokenizer_input(df, filename):
    raw_corpus = build_raw_corpus(df)
    save_corpus_as_txt(filename, raw_corpus)

# Train SentencePiece tokenizer

In [35]:
def train_tokenizer(model_prefix, input_file, vocab_size):
    spm.SentencePieceTrainer.train('--model_prefix={} --input={} --vocab_size={}'.format(model_prefix, 
                                                                                         input_file, vocab_size))

# Load model

In [36]:
def load_model(model_file):
    sp = spm.SentencePieceProcessor()
    sp.Load(model_file)
    return sp

# Tokenize text

In [37]:
def sp_tokenize(model, text):
    tokenized_text = model.EncodeAsPieces(text)
    return tokenized_text

# Experiments

In [38]:
# rows: 5958
# vocab_size=5000

In [39]:
# preprocess data
t1 = time.time()
df1 = preprocess(df1)
t2 = time.time()
print ("Time:", (t2-t1)/60)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: htt

Time: 5.109312570095062


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [41]:
# build corpus
input_file_1 = "sample_input_1.txt"
t1 = time.time()
raw_corpus_1 = build_tokenizer_input(df1, input_file_1)
t2 = time.time()
print ("Time:", (t2-t1)/60)

Time: 0.05040566523869832


In [42]:
# train sp tokenizer
model_prefix_1 = "m1"
vocab_size = 5000
t1 = time.time()
train_tokenizer(model_prefix_1, input_file_1, vocab_size)
t2 = time.time()
print ("Time:", (t2-t1)/60)

Time: 6.44949103196462


In [43]:
# load model
model_file_1 = model_prefix_1 + ".model"
sp1 = load_model(model_file_1)

In [44]:
# tokenize text
text = "This is a novel coronavirus disease."
tokenized_text = sp_tokenize(sp1, text)

In [45]:
tokenized_text

['▁This', '▁is', '▁a', '▁novel', '▁cor', 'on', 'a', 'virus', '▁disease', '.']

# Some EDA on data

In [46]:
df1.head()

Unnamed: 0,Refs,Body,Front,Meta,Filename,Body_sents,Body_tokens,len_body,has_coronavirus,has_COVID,len_sents
19,,Aims and Scope Molecular Genetics & Genomic Me...,Mol Genet Genomic MedMol Genet Genomic Medmgg3...,,comm_use.I-N.txt.tar.gz-unpacked/Mol_Genet_Gen...,[Aims and Scope Molecular Genetics & Genomic M...,"[Aims, and, Scope, Molecular, Genetics, &, Gen...",622,0,0,20
21,,Aims and Scope Molecular Genetics & Genomic Me...,Mol Genet Genomic MedMol Genet Genomic Medmgg3...,,comm_use.I-N.txt.tar.gz-unpacked/Mol_Genet_Gen...,[Aims and Scope Molecular Genetics & Genomic M...,"[Aims, and, Scope, Molecular, Genetics, &, Gen...",619,0,0,20
22,,The original article to which this erratum ref...,Mol Genet Genomic MedMol Genet Genomic Medmgg3...,,comm_use.I-N.txt.tar.gz-unpacked/Mol_Genet_Gen...,[The original article to which this erratum re...,"[The, original, article, to, which, this, erra...",135,0,0,4
23,,Aims and Scope Molecular Genetics & Genomic Me...,Mol Genet Genomic MedMol Genet Genomic Medmgg3...,,comm_use.I-N.txt.tar.gz-unpacked/Mol_Genet_Gen...,[Aims and Scope Molecular Genetics & Genomic M...,"[Aims, and, Scope, Molecular, Genetics, &, Gen...",619,0,0,20
24,,In the house of my grandparents in Heinzendorf...,Mol Genet Genomic MedMol Genet Genomic Med10.1...,,comm_use.I-N.txt.tar.gz-unpacked/Mol_Genet_Gen...,[In the house of my grandparents in Heinzendor...,"[In, the, house, of, my, grandparents, in, Hei...",1441,0,0,53


In [48]:
df1.has_coronavirus.value_counts()

0    5923
1      14
Name: has_coronavirus, dtype: int64

In [50]:
df1.has_COVID.value_counts()

0    5937
Name: has_COVID, dtype: int64

In [51]:
np.mean(df1['len_body'])

5638.820953343439

In [52]:
np.mean(df1['len_sents'])

218.91510864072765