In [1]:
# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp

# Start a Spark session with Spark NLP.
# start() accepts two optional parameters, gpu and spark23:
#   sparknlp.start(gpu=True) starts the session with GPU support
#   sparknlp.start(spark23=True) is for when Apache Spark 2.3.x is installed
spark = sparknlp.start()

In [2]:
# Report the versions of Spark NLP and Apache Spark behind this session.
version_report = [
    ("Spark NLP version", sparknlp.version()),
    ("Apache Spark version:", spark.version),
]
for label, value in version_report:
    print(label, value)

Spark NLP version 2.7.3
Apache Spark version: 2.4.4


In [3]:
# Read the tab-separated onto_bc_ner.txt input file into a DataFrame.

import csv
import pandas as pd

# The file has no header row, so the five column names are supplied explicitly.
# QUOTE_NONE makes the parser treat quote characters as ordinary data.
df = pd.read_csv(
    "/Users/ramybal/Downloads/bio/spark/test/onto_bc_ner.txt",
    sep="\t",
    header=None,
    names=["Word", "POS", "DEREP", "TYPE", "SENT_NO"],
    quoting=csv.QUOTE_NONE,
    encoding='utf-8',
)

In [4]:
# Summarize the loaded frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32488 entries, 0 to 32487
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Word     32488 non-null  object
 1   POS      32488 non-null  object
 2   DEREP    32488 non-null  object
 3   TYPE     32488 non-null  object
 4   SENT_NO  32488 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.2+ MB


In [5]:
from faker import Faker

# Seed the generator so the perturbed corpus is identical on every fresh run.
Faker.seed(0)
fake = Faker()

# Build the replacement Word column: rows tagged B-PERSON receive a synthetic
# first name, rows tagged I-PERSON a synthetic last name, and every other row
# keeps its original token.
#
# first_name()/last_name() are used instead of splitting fake.name(), because
# name() may include a title or suffix (e.g. "Dr.", "MD") that would otherwise
# be picked up as the first or last piece.
#
# Iterating the two columns with zip avoids building a row Series through
# df.iloc for every single row.
words = []
for word, tag in zip(df["Word"], df["TYPE"]):
    if tag == 'B-PERSON':
        words.append(fake.first_name())
    elif tag == 'I-PERSON':
        words.append(fake.last_name())
    else:
        words.append(word)

In [6]:
# Overwrite the Word column with the perturbed tokens (one entry per row).
df["Word"] = words

In [7]:
# Save the perturbed frame as a tab-separated file without index or header.
# quoting=csv.QUOTE_NONE mirrors how the input was read: with the default
# quoting, any token containing a double quote would be wrapped in quotes and
# have its quote doubled, so the written token would no longer match the input.
df.to_csv(
    "/Users/ramybal/Downloads/bio/spark/test/onto_bc_ner_perturb3.txt",
    sep="\t",
    index=False,
    header=False,
    quoting=csv.QUOTE_NONE,
)

In [8]:
import time
import pathlib
# Recursively collect, as strings, every file under the test directory whose
# path ends in "_ner.txt".
path = pathlib.Path("/Users/ramybal/Downloads/bio/spark/test")
flist = list(
    filter(
        lambda name: name.endswith("_ner.txt"),
        map(str, path.rglob("*.txt")),
    )
)

In [9]:
# Display the discovered *_ner.txt input files.
flist

['/Users/ramybal/Downloads/bio/spark/test/onto_nw_ner.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_bc_ner.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_wb_ner.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_mz_ner.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_pt_ner.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_bn_ner.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_tc_ner.txt']

In [10]:
# Take the bc file out of the batch list; it is read and perturbed on its own
# in separate cells.
bc_file = '/Users/ramybal/Downloads/bio/spark/test/onto_bc_ner.txt'
# Guarded so that re-running this cell (or a missing bc file) does not raise
# ValueError from list.remove().
if bc_file in flist:
    flist.remove(bc_file)

In [11]:
# Display the batch list again, now without the bc file.
flist

['/Users/ramybal/Downloads/bio/spark/test/onto_nw_ner.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_wb_ner.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_mz_ner.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_pt_ner.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_bn_ner.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_tc_ner.txt']

In [12]:
# Batch PERSON perturbation: for every file in flist, replace PERSON tokens
# with synthetic names and save a flat "<name>_perturb3.txt" copy next to it.
for f in flist:
    print(f)
    df = pd.read_csv(
        f,
        delimiter="\t",
        quoting=csv.QUOTE_NONE,
        encoding='utf-8',
        header=None,
        names=["Word","POS","DEREP","TYPE","SENT_NO"],
        # Keep tokens such as "NA" or "null" as text; by default pandas would
        # parse them as NaN and they would be written back as empty strings.
        keep_default_na=False,
    )
    # B-PERSON rows get a synthetic first name, I-PERSON rows a synthetic last
    # name, everything else keeps its token. first_name()/last_name() are used
    # because fake.name() may carry a title or suffix (e.g. "Dr.", "MD").
    # zip over the columns avoids a per-row df.iloc lookup.
    words = []
    for word, tag in zip(df["Word"], df["TYPE"]):
        if tag == 'B-PERSON':
            words.append(fake.first_name())
        elif tag == 'I-PERSON':
            words.append(fake.last_name())
        else:
            words.append(word)
    # Sanity check: the two printed numbers must match (one word per row).
    print(len(words))
    print(df.shape[0])
    df.Word = words
    # Swap only the trailing ".txt"; str.replace would rewrite every ".txt"
    # occurrence in the path, including one inside a directory name.
    new_filename = f[:-len(".txt")] + "_perturb3.txt"
    # QUOTE_NONE matches the read side, so tokens containing a double quote are
    # written back unchanged instead of being quoted and doubled.
    df.to_csv(new_filename,sep="\t",index=False,header=False,quoting=csv.QUOTE_NONE)

/Users/ramybal/Downloads/bio/spark/test/onto_nw_ner.txt
49235
49235
/Users/ramybal/Downloads/bio/spark/test/onto_wb_ner.txt
18945
18945
/Users/ramybal/Downloads/bio/spark/test/onto_mz_ner.txt
17875
17875
/Users/ramybal/Downloads/bio/spark/test/onto_pt_ner.txt
16851
16851
/Users/ramybal/Downloads/bio/spark/test/onto_bn_ner.txt
23209
23209
/Users/ramybal/Downloads/bio/spark/test/onto_tc_ner.txt
10976
10976


In [13]:
# Reload the original bc file; df currently holds the last file processed by
# the batch loop.
df = pd.read_csv(
    "/Users/ramybal/Downloads/bio/spark/test/onto_bc_ner.txt",
    sep="\t",
    header=None,
    names=["Word", "POS", "DEREP", "TYPE", "SENT_NO"],
    quoting=csv.QUOTE_NONE,
    encoding='utf-8',
)

In [14]:
# Build the replacement Word column for the reloaded bc frame: B-PERSON rows
# receive a synthetic first name, I-PERSON rows a synthetic last name, and all
# other rows keep their token.
#
# first_name()/last_name() replace fake.name().split()[0] / [-1], because
# name() may include a title or suffix (e.g. "Dr.", "MD") that would be picked
# up as the first or last piece. Zipping the columns avoids a df.iloc row
# lookup on every iteration.
words = []
for word, tag in zip(df["Word"], df["TYPE"]):
    if tag == 'B-PERSON':
        words.append(fake.first_name())
    elif tag == 'I-PERSON':
        words.append(fake.last_name())
    else:
        words.append(word)

In [15]:
# Put the perturbed tokens into the Word column of the bc frame.
df["Word"] = words

In [16]:
# Write the perturbed bc frame sentence by sentence (grouped on SENT_NO), with
# one blank line after each sentence.
bc_perturb4_path = "/Users/ramybal/Downloads/bio/spark/test/onto_bc_ner_perturb4.txt"
# The file is opened once in "w" mode, so re-running this cell replaces the
# output instead of appending a second copy of the corpus to it.
# newline="" leaves line endings to the csv writer, as pandas asks for when it
# is handed an open file object.
with open(bc_perturb4_path, "w", encoding="utf-8", newline="") as out:
    # Iterating the groupby yields each sentence's rows directly, so no
    # label-vs-position lookup through df.iloc is needed.
    for _, sentence in df.groupby("SENT_NO"):
        # QUOTE_NONE matches how the input was read, so tokens containing a
        # double quote are written back unchanged.
        sentence.to_csv(out, sep="\t", index=False, header=False, quoting=csv.QUOTE_NONE)
        out.write("\n")

In [17]:
# Display the batch list that the next cell iterates over.
flist

['/Users/ramybal/Downloads/bio/spark/test/onto_nw_ner.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_wb_ner.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_mz_ner.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_pt_ner.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_bn_ner.txt',
 '/Users/ramybal/Downloads/bio/spark/test/onto_tc_ner.txt']

In [18]:
# Batch PERSON perturbation with sentence breaks: for every file in flist,
# replace PERSON tokens with synthetic names and write "<name>_perturb4.txt"
# with one blank line after each sentence (grouped on SENT_NO).
for f in flist:
    print(f)
    df = pd.read_csv(
        f,
        delimiter="\t",
        quoting=csv.QUOTE_NONE,
        encoding='utf-8',
        header=None,
        names=["Word","POS","DEREP","TYPE","SENT_NO"],
        # Keep tokens such as "NA" or "null" as text; by default pandas would
        # parse them as NaN and they would be written back as empty strings.
        keep_default_na=False,
    )
    # B-PERSON rows get a synthetic first name, I-PERSON rows a synthetic last
    # name, everything else keeps its token. first_name()/last_name() are used
    # because fake.name() may carry a title or suffix (e.g. "Dr.", "MD").
    # zip over the columns avoids a per-row df.iloc lookup.
    words = []
    for word, tag in zip(df["Word"], df["TYPE"]):
        if tag == 'B-PERSON':
            words.append(fake.first_name())
        elif tag == 'I-PERSON':
            words.append(fake.last_name())
        else:
            words.append(word)
    # Sanity check: the two printed numbers must match (one word per row).
    print(len(words))
    print(df.shape[0])
    df.Word = words
    # Swap only the trailing ".txt"; str.replace would rewrite every ".txt"
    # occurrence in the path, including one inside a directory name.
    new_filename = f[:-len(".txt")] + "_perturb4.txt"
    # Open the output once in "w" mode so a re-run replaces the file rather
    # than appending a duplicate corpus. The handle is named `out` so it does
    # not shadow the loop variable `f`. newline="" leaves line endings to the
    # csv writer, as pandas asks for when it is handed an open file object.
    with open(new_filename, "w", encoding="utf-8", newline="") as out:
        for _, sentence in df.groupby("SENT_NO"):
            # QUOTE_NONE matches the read side, so tokens containing a double
            # quote are written back unchanged.
            sentence.to_csv(out, sep="\t", index=False, header=False, quoting=csv.QUOTE_NONE)
            out.write("\n")

/Users/ramybal/Downloads/bio/spark/test/onto_nw_ner.txt
49235
49235
/Users/ramybal/Downloads/bio/spark/test/onto_wb_ner.txt
18945
18945
/Users/ramybal/Downloads/bio/spark/test/onto_mz_ner.txt
17875
17875
/Users/ramybal/Downloads/bio/spark/test/onto_pt_ner.txt
16851
16851
/Users/ramybal/Downloads/bio/spark/test/onto_bn_ner.txt
23209
23209
/Users/ramybal/Downloads/bio/spark/test/onto_tc_ner.txt
10976
10976
