In [None]:
# Clone git to get the data
!git clone https://github.com/rasyidpurnama/abd2022

In [2]:
!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
!bash colab_setup.sh

# Install sparknlp-display
! pip install spark-nlp-display

--2022-03-30 22:10:12--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1453 (1.4K) [text/plain]
Saving to: ‘colab_setup.sh’


2022-03-30 22:10:12 (24.3 MB/s) - ‘colab_setup.sh’ saved [1453/1453]

setup Colab for PySpark 3.0.3 and Spark NLP 3.4.2
Installing PySpark 3.0.3 and Spark NLP 3.4.2
[K     |████████████████████████████████| 209.1 MB 43 kB/s 
[K     |████████████████████████████████| 142 kB 2.7 MB/s 
[K     |████████████████████████████████| 198 kB 42.4 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
Collecting spark-nlp-display
  Downloading spark_nlp_display-1.9.1-py3-none-any.whl (95 kB)
[K     |████████████████████████████████| 95 k

In [3]:
cd abd2022

/content/abd2022


In [4]:
# Importing packages 
import numpy as np
import pandas as pd
from functools import reduce

### Clustering with K-Means on mtsamples dataset


In [34]:
# Read the data
df = pd.read_csv("mtsamples_20220222.csv").drop_duplicates().dropna().reset_index(drop=True)
df.head()

Unnamed: 0,medical_speciality,sample_name,description,transcription,keywords
0,Allergy / Immunology,Allergic Rhinitis,A 23-year-old white female presents with compl...,SUBJECTIVE: This 23-year-old white female pres...,"allergy / immunology, allergic rhinitis, aller..."
1,Allergy / Immunology,Allergy Evaluation Consult,"Acute allergic reaction, etiology uncertain, h...",HISTORY: A 34-year-old male presents today sel...,"allergy / immunology, keflex, acute allergic r..."
2,Allergy / Immunology,Asthma in a 5-year-old,Mother states he has been wheezing and coughing.,CHIEF COMPLAINT: This 5-year-old male presents...,"allergy / immunology, breathing treatment, air..."
3,Allergy / Immunology,Chronic Sinusitis,Patient having severe sinusitis about two to t...,HISTORY: I had the pleasure of meeting and eva...,"allergy / immunology, nasal congestion, facial..."
4,Allergy / Immunology,Evaluation of Allergies,"Chronic glossitis, xerostomia, probable enviro...",HISTORY: A 55-year-old female presents self-re...,"allergy / immunology, chronic glossitis, xeros..."


In [35]:
# only use transcription columns for input  
df['text'] = df.apply(lambda x: x['transcription'], axis=1).str.lower()

# a little bit preprocesing delete unknown characters
text_to_filter = ' '.join(df.text.values.flatten().tolist())
import re, numpy as np
excluded = [ st for st in np.unique(re.findall(r'.', text_to_filter)) if not re.search(r'[a-z]', st) and not re.search(r'[0-9]', st) and st not in ['%','/','-','!',' ']]
print(excluded)
for exc in excluded:
    df['text'] = df['text'].str.replace(exc, '')
df['text'] = df['text'].str.replace('\n', ' ')
df['text'] = df['text'].astype(str)
df.head(5)

['"', '#', '$', '&', "'", '(', ')', '*', '+', ',', '.', ':', ';', '<', '=', '>', '?', '@', '[', ']', '_', '{', '|', '}', '©', '®', '°', 'µ', '·', 'º', '¼', '½', 'è', 'é', 'ü', '–', '’', '“', '”', '…']


  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,medical_speciality,sample_name,description,transcription,keywords,text
0,Allergy / Immunology,Allergic Rhinitis,A 23-year-old white female presents with compl...,SUBJECTIVE: This 23-year-old white female pres...,"allergy / immunology, allergic rhinitis, aller...",subjective this 23-year-old white female prese...
1,Allergy / Immunology,Allergy Evaluation Consult,"Acute allergic reaction, etiology uncertain, h...",HISTORY: A 34-year-old male presents today sel...,"allergy / immunology, keflex, acute allergic r...",history a 34-year-old male presents today self...
2,Allergy / Immunology,Asthma in a 5-year-old,Mother states he has been wheezing and coughing.,CHIEF COMPLAINT: This 5-year-old male presents...,"allergy / immunology, breathing treatment, air...",chief complaint this 5-year-old male presents ...
3,Allergy / Immunology,Chronic Sinusitis,Patient having severe sinusitis about two to t...,HISTORY: I had the pleasure of meeting and eva...,"allergy / immunology, nasal congestion, facial...",history i had the pleasure of meeting and eval...
4,Allergy / Immunology,Evaluation of Allergies,"Chronic glossitis, xerostomia, probable enviro...",HISTORY: A 55-year-old female presents self-re...,"allergy / immunology, chronic glossitis, xeros...",history a 55-year-old female presents self-ref...


In [37]:
# get 10 random samples for each top 25 medical_speciality  
df = pd.concat([df[df.medical_speciality==_].sample(100, random_state=10) for _ in df.medical_speciality.value_counts()[:10].index], axis=0).reset_index(drop=False)

In [38]:
# save to csv on column text 
df[['text']].to_csv('data.csv', index=False)
pd.read_csv('data.csv').shape

(1000, 1)

In [39]:
df.medical_speciality.value_counts()[:25]

Surgery                          100
Consult - History and Phy.       100
Cardiovascular / Pulmonary       100
Orthopedic                       100
Radiology                        100
General Medicine                 100
Gastroenterology                 100
Neurology                        100
SOAP / Chart / Progress Notes    100
Obstetrics / Gynecology          100
Name: medical_speciality, dtype: int64

### Start using SparkNLP


In [40]:
import sparknlp
from pyspark.ml import Pipeline
from sparknlp.base import *
import pyspark.sql.functions as F
from sparknlp.annotator import *

spark = sparknlp.start()

print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

Spark NLP version 3.4.2
Apache Spark version: 3.0.3


In [41]:
data = spark.read.csv('data.csv', header=True, inferSchema=True)
data.show(truncate=90)
data.count()

+------------------------------------------------------------------------------------------+
|                                                                                      text|
+------------------------------------------------------------------------------------------+
|procedure total hip replacement  procedure description the patient was bought to the op...|
|procedure performed cataract extraction with lens implantation right eye  description o...|
|preop diagnoses 1 left pilon fracture 2 left great toe proximal phalanx fracture  posto...|
|preoperative diagnosis recurrent bladder tumors  postoperative diagnosis recurrent blad...|
|preoperative diagnosis degenerative disk disease at l4-l5 and l5-s1  postoperative diag...|
|the patient was consented for skin biopsy the complications instructions as to how the ...|
|preoperative diagnosis varicose veins  postoperative diagnosis varicose veins  procedur...|
|preoperative diagnosis t12 compression fracture with cauda equina syn

1000

In [42]:
text_col = 'text'
text = data.select(text_col).filter(F.col(text_col).isNotNull())

In [43]:
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer
from nltk.corpus import stopwords
from sparknlp.annotator import StopWordsCleaner
from sparknlp.annotator import NGramGenerator
from sparknlp.annotator import PerceptronModel
from sparknlp.base import Finisher
from pyspark.ml import Pipeline
from sparknlp.annotator import Normalizer
from sparknlp.annotator import LemmatizerModel
import nltk

documentAssembler = DocumentAssembler() \
     .setInputCol(text_col) \
     .setOutputCol('document')

tokenizer = Tokenizer() \
     .setInputCols(['document']) \
     .setOutputCol('tokenized')

normalizer = Normalizer() \
     .setInputCols(['tokenized']) \
     .setOutputCol('normalized') \
     .setLowercase(True)

lemmatizer = LemmatizerModel.pretrained() \
     .setInputCols(['normalized']) \
     .setOutputCol('lemmatized')

nltk.download('stopwords')

eng_stopwords = stopwords.words('english')

stopwords_cleaner = StopWordsCleaner() \
     .setInputCols(['lemmatized']) \
     .setOutputCol('unigrams') \
     .setStopWords(eng_stopwords)

finisher = Finisher() \
     .setInputCols(['unigrams']) \

pipeline = Pipeline() \
     .setStages([documentAssembler,                  
                 tokenizer,
                 normalizer,                  
                 lemmatizer,                  
                 stopwords_cleaner, 
                 finisher])                  

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
processed_text = pipeline.fit(text).transform(text)
processed_text.limit(5).show()

+--------------------+--------------------+
|                text|   finished_unigrams|
+--------------------+--------------------+
|procedure total h...|[procedure, total...|
|procedure perform...|[procedure, perfo...|
|preop diagnoses 1...|[preop, diagnosis...|
|preoperative diag...|[preoperative, di...|
|preoperative diag...|[preoperative, di...|
+--------------------+--------------------+



In [45]:
from pyspark.ml.feature import CountVectorizer

tfizer = CountVectorizer(inputCol='finished_unigrams', outputCol='tf_features')
tf_model = tfizer.fit(processed_text)
tf_result = tf_model.transform(processed_text)

In [46]:
from pyspark.ml.feature import IDF

idfizer = IDF(inputCol='tf_features', outputCol='tf_idf_features')
idf_model = idfizer.fit(tf_result)
tfidf_result = idf_model.transform(tf_result)

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator


silhouette_score=[]
evaluator = ClusteringEvaluator(predictionCol='prediction', featuresCol='tf_idf_features', \
                                metricName='silhouette', distanceMeasure='squaredEuclidean')
for i in range(3,12):
    
    KMeans_algo=KMeans(featuresCol='tf_idf_features', k=i)
    
    KMeans_fit=KMeans_algo.fit(tfidf_result)
    
    output=KMeans_fit.transform(tfidf_result)
    
    score=evaluator.evaluate(output)
    
    silhouette_score.append(score)
    
    print("Silhouette Score:",score)

In [None]:
#Visualizing the silhouette scores in a plot
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1,1, figsize =(8,6))
ax.plot(range(2,10),silhouette_score)
ax.set_xlabel("k")
ax.set_ylabel("cost")

In [50]:
# Trains a k-means model.

KMeans1=KMeans(featuresCol='tf_idf_features', k=10)
KMeans_fit=KMeans1.fit(tfidf_result)

In [51]:
result = KMeans_fit.transform(tfidf_result).select('prediction')
result.show()
df_result = result.toPandas()

+----------+
|prediction|
+----------+
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
+----------+
only showing top 20 rows



In [52]:
df_result.columns = ['cluster']
data = pd.concat([df,df_result],axis=1)
data[['medical_speciality', 'cluster']].groupby('medical_speciality').agg(list)

Unnamed: 0_level_0,cluster
medical_speciality,Unnamed: 1_level_1
Cardiovascular / Pulmonary,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
Consult - History and Phy.,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, ..."
Gastroenterology,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
General Medicine,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
Neurology,"[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
Obstetrics / Gynecology,"[0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 7, 0, 7, 0, ..."
Orthopedic,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
Radiology,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
SOAP / Chart / Progress Notes,"[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
Surgery,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [54]:
data[['medical_speciality', 'cluster']].groupby('medical_speciality').agg(pd.Series.mode)

Unnamed: 0_level_0,cluster
medical_speciality,Unnamed: 1_level_1
Cardiovascular / Pulmonary,0
Consult - History and Phy.,0
Gastroenterology,0
General Medicine,0
Neurology,0
Obstetrics / Gynecology,0
Orthopedic,0
Radiology,0
SOAP / Chart / Progress Notes,0
Surgery,0


Sebaran cluster belum cukup mengikuti pengelompokan berdasarkan medical_speciality