In [1]:
# Clone git to get the data
!git clone https://github.com/rasyidpurnama/abd2022

Cloning into 'abd2022'...
remote: Enumerating objects: 13, done.[K
remote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 13 (delta 2), reused 3 (delta 0), pack-reused 0[K
Unpacking objects: 100% (13/13), done.


In [2]:
cd abd2022

/content/abd2022


In [3]:
# Importing packages 
import numpy as np
import pandas as pd
from functools import reduce

In [4]:
# Read the data
df = pd.read_csv("mtsamples_20220222.csv").drop_duplicates().dropna().reset_index(drop=True)
df.head()

Unnamed: 0,medical_speciality,sample_name,description,transcription,keywords
0,Allergy / Immunology,Allergic Rhinitis,A 23-year-old white female presents with compl...,SUBJECTIVE: This 23-year-old white female pres...,"allergy / immunology, allergic rhinitis, aller..."
1,Allergy / Immunology,Allergy Evaluation Consult,"Acute allergic reaction, etiology uncertain, h...",HISTORY: A 34-year-old male presents today sel...,"allergy / immunology, keflex, acute allergic r..."
2,Allergy / Immunology,Asthma in a 5-year-old,Mother states he has been wheezing and coughing.,CHIEF COMPLAINT: This 5-year-old male presents...,"allergy / immunology, breathing treatment, air..."
3,Allergy / Immunology,Chronic Sinusitis,Patient having severe sinusitis about two to t...,HISTORY: I had the pleasure of meeting and eva...,"allergy / immunology, nasal congestion, facial..."
4,Allergy / Immunology,Evaluation of Allergies,"Chronic glossitis, xerostomia, probable enviro...",HISTORY: A 55-year-old female presents self-re...,"allergy / immunology, chronic glossitis, xeros..."


In [5]:
# only use transcription columns for input  
df['text'] = df.apply(lambda x: x['transcription'], axis=1).str.lower()

# a little bit preprocesing delete unknown characters
text_to_filter = ' '.join(df.text.values.flatten().tolist())
import re, numpy as np
excluded = [ st for st in np.unique(re.findall(r'.', text_to_filter)) if not re.search(r'[a-z]', st) and not re.search(r'[0-9]', st) and st not in ['%','/','-','!',' ']]
print(excluded)
for exc in excluded:
    df['text'] = df['text'].str.replace(exc, '')
df['text'] = df['text'].str.replace('\n', ' ')
df['text'] = df['text'].astype(str)
df.head(5)

['"', '#', '$', '&', "'", '(', ')', '*', '+', ',', '.', ':', ';', '<', '=', '>', '?', '@', '[', ']', '_', '{', '|', '}', '©', '®', '°', 'µ', '·', 'º', '¼', '½', 'è', 'é', 'ü', '–', '’', '“', '”', '…']


  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,medical_speciality,sample_name,description,transcription,keywords,text
0,Allergy / Immunology,Allergic Rhinitis,A 23-year-old white female presents with compl...,SUBJECTIVE: This 23-year-old white female pres...,"allergy / immunology, allergic rhinitis, aller...",subjective this 23-year-old white female prese...
1,Allergy / Immunology,Allergy Evaluation Consult,"Acute allergic reaction, etiology uncertain, h...",HISTORY: A 34-year-old male presents today sel...,"allergy / immunology, keflex, acute allergic r...",history a 34-year-old male presents today self...
2,Allergy / Immunology,Asthma in a 5-year-old,Mother states he has been wheezing and coughing.,CHIEF COMPLAINT: This 5-year-old male presents...,"allergy / immunology, breathing treatment, air...",chief complaint this 5-year-old male presents ...
3,Allergy / Immunology,Chronic Sinusitis,Patient having severe sinusitis about two to t...,HISTORY: I had the pleasure of meeting and eva...,"allergy / immunology, nasal congestion, facial...",history i had the pleasure of meeting and eval...
4,Allergy / Immunology,Evaluation of Allergies,"Chronic glossitis, xerostomia, probable enviro...",HISTORY: A 55-year-old female presents self-re...,"allergy / immunology, chronic glossitis, xeros...",history a 55-year-old female presents self-ref...


In [6]:
# get 10 random samples for each top 25 medical_speciality  
df = pd.concat([df[df.medical_speciality==_].sample(100, random_state=10) for _ in df.medical_speciality.value_counts()[:10].index], axis=0).reset_index(drop=False)

In [7]:
# save to csv on column text 
df[['text']].to_csv('data.csv', index=False)
pd.read_csv('data.csv').shape

(1000, 1)

In [8]:
df.medical_speciality.value_counts()[:10]

Surgery                          100
Consult - History and Phy.       100
Cardiovascular / Pulmonary       100
Orthopedic                       100
Radiology                        100
General Medicine                 100
Gastroenterology                 100
Neurology                        100
SOAP / Chart / Progress Notes    100
Obstetrics / Gynecology          100
Name: medical_speciality, dtype: int64

### Perbandingan menggunakan sparksql dan pandas

In [31]:
%%time
# duplikasi baris pandas 
pandas = pd.concat([df[['medical_speciality','keywords','text']]]*1000).reset_index(drop=True)

CPU times: user 169 ms, sys: 29 ms, total: 198 ms
Wall time: 239 ms


In [20]:
# save to csv, then load to spark dataframe
pandas.to_csv('spark_data.csv',index=False)

In [23]:
!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
!bash colab_setup.sh

# Install sparknlp-display
! pip install spark-nlp-display

--2022-03-30 22:51:31--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1453 (1.4K) [text/plain]
Saving to: ‘colab_setup.sh’


2022-03-30 22:51:31 (17.7 MB/s) - ‘colab_setup.sh’ saved [1453/1453]

setup Colab for PySpark 3.0.3 and Spark NLP 3.4.2
Installing PySpark 3.0.3 and Spark NLP 3.4.2
[K     |████████████████████████████████| 209.1 MB 58 kB/s 
[K     |████████████████████████████████| 142 kB 36.9 MB/s 
[K     |████████████████████████████████| 198 kB 33.0 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
Collecting spark-nlp-display
  Downloading spark_nlp_display-1.9.1-py3-none-any.whl (95 kB)
[K     |████████████████████████████████| 95 

In [24]:
import sparknlp
from pyspark.ml import Pipeline
from sparknlp.base import *
import pyspark.sql.functions as F
from sparknlp.annotator import *

spark = sparknlp.start()

print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

Spark NLP version 3.4.2
Apache Spark version: 3.0.3


In [33]:
sparkdata = spark.read.csv('spark_data.csv', header=True, inferSchema=True)
sparkdata.show(truncate=90)
sparkdata.count()

+------------------+------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------+
|medical_speciality|                                                                                  keywords|                                                                                      text|
+------------------+------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------+
|           Surgery|surgery, range-of-motion, hip, total hip replacement, gluteus maximus, femoral head, su...|procedure total hip replacement  procedure description the patient was bought to the op...|
|           Surgery|surgery, lens implantation, anterior chamber, lid speculum, eye, sinskey hook, cataract...|procedure performed cataract extraction with lens implantation right eye  des

1000000

`Pandas - Menghitung angka dan menghapusnya`

In [32]:
%%time
pandas['text_prep'] = pandas['text']
for i in range(10):
    pandas['text_prep'] = pandas.text_prep.replace(str(i), "")
    pandas['count_'+str(i)] = pandas.text.str.len() - pandas.text_prep.str.len()

CPU times: user 9.44 s, sys: 568 ms, total: 10 s
Wall time: 10.3 s


`SparkSQL - Menghitung angka dan menghapusnya`

In [34]:
%%time
sparkdata = sparkdata.withColumn('text_prep', sparkdata.text)
for i in range(10):
    sparkdata = sparkdata.withColumn('text_prep', F.regexp_replace('text_prep',str(i),""))
    sparkdata = sparkdata.withColumn('count_'+str(i), F.length("text") - F.length("text_prep"))

CPU times: user 53.8 ms, sys: 6.69 ms, total: 60.5 ms
Wall time: 1.3 s


SparkSQL jauh lebih cepat dari pada pandas ketika memproses data yang besar