In [1]:
!pip install rake-nltk
!pip install gensim==3.6.0
!pip install yake

Collecting rake-nltk
  Downloading rake_nltk-1.0.6-py3-none-any.whl (9.1 kB)
Installing collected packages: rake-nltk
Successfully installed rake-nltk-1.0.6
[0mCollecting gensim==3.6.0
  Downloading gensim-3.6.0.tar.gz (23.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.1/23.1 MB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: gensim
  Building wheel for gensim (setup.py) ... [?25ldone
[?25h  Created wheel for gensim: filename=gensim-3.6.0-cp37-cp37m-linux_x86_64.whl size=24613059 sha256=6cfe1f838b3392941cf780f9ecd4b0a6876a221df710139e3359862e4acd7aac
  Stored in directory: /root/.cache/pip/wheels/53/c8/f9/afb722099bdb5d73e5807019ce1512fd065502ccc15ea2b5bd
Successfully built gensim
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 4.0.1
    Uninstalling gensim-4.0.1:
      Successfully uninstall

In [2]:
import numpy as np 
import pandas as pd 
from rake_nltk import Rake
import re
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from nltk.corpus import stopwords
nltk.download('stopwords')
from operator import itemgetter
import networkx as nx
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import preprocessing

# English stop words from the NLTK corpus, stored as a set because the
# text-cleaning helpers only use it for `w not in nltk_stop_words` lookups.
nltk_stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def clean_text(txt):
    """Turn a stringified list of sentences into one cleaned, lower-cased string.

    Leading/trailing ``[`` and ``]`` characters are stripped and the remainder
    is split on ``', '``. Every piece has all characters that are neither word
    characters nor whitespace removed; pieces that end up empty are discarded
    and the others get a ``"."`` appended. Each surviving piece is then
    lower-cased, tokenised with ``word_tokenize``, filtered against
    ``nltk_stop_words`` and re-joined with single spaces.

    Returns the cleaned pieces joined with ``". "``.
    """
    stripped_pieces = (
        re.sub(r'[^\w\s]', '', piece)
        for piece in txt.strip('][').split(', ')
    )
    # The appended "." survives tokenisation as its own token.
    sentences = [piece + "." for piece in stripped_pieces if len(piece) > 0]

    cleaned_sentences = [
        " ".join(
            token
            for token in word_tokenize(sentence.lower())
            if token not in nltk_stop_words
        )
        for sentence in sentences
    ]
    return ". ".join(cleaned_sentences)


def clean_transcription_text(txt):
    """Strip punctuation and NLTK stop words from a transcription string.

    All characters that are neither word characters nor whitespace are removed,
    the text is lower-cased and tokenised with ``word_tokenize``, and tokens
    found in ``nltk_stop_words`` are dropped.

    Returns the remaining tokens joined with single spaces.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', txt)
    kept_tokens = [
        token
        for token in word_tokenize(without_punctuation.lower())
        if token not in nltk_stop_words
    ]
    # Joining a one-element list with ". " yields that element unchanged,
    # so the space-joined string is returned directly.
    return " ".join(kept_tokens)


def get_stop_words(cleaned_symptoms, min_count=5, max_count=90):
    """Train a Word2Vec model on the texts and derive count-based stop words.

    Parameters
    ----------
    cleaned_symptoms : object with ``.tolist()`` yielding strings
        The documents; each one is lower-cased and tokenised with
        ``word_tokenize`` before training.
    min_count : int
        Words whose corpus count is ``<= min_count`` become stop words.
    max_count : int
        Words whose corpus count is ``>= max_count`` become stop words.

    Returns
    -------
    (set, Word2Vec)
        The stop-word set and the trained model.

    Side effects: saves the model to ``word2vec.model`` in the working
    directory and prints the vocabulary size and the number of stop words.
    """
    tokenised_docs = [word_tokenize(doc.lower()) for doc in cleaned_symptoms.tolist()]
    # min_count=1 keeps every token in the model vocabulary; the min_count
    # argument of this function is only applied to the stop-word selection below.
    model = Word2Vec(sentences=tokenised_docs, size=1000, window=5, min_count=1, workers=4)
    model.save("word2vec.model")

    vocab = model.wv.vocab
    word_list = sorted(
        ((word, vocab[word].count) for word in vocab),
        key=lambda pair: pair[1],
        reverse=True,
    )
    print("Total vocabulary = ", len(word_list))

    stop_words = {
        word
        for word, count in word_list
        if count <= min_count or count >= max_count
    }
    print("identified ", len(stop_words),  " stopwords.")

    return stop_words, model

def remove_stop_words(x, stop_words):
    """Return ``x`` with every token contained in ``stop_words`` removed.

    ``x`` is tokenised with ``word_tokenize``; the surviving tokens are joined
    with single spaces.
    """
    kept_tokens = (token for token in word_tokenize(x) if token not in stop_words)
    return " ".join(kept_tokens)


def get_keywords(text):
    """Extract the distinct words that occur in RAKE key phrases of ``text``.

    A fresh ``Rake`` instance ranks the phrases of ``text``; every phrase is
    split with ``word_tokenize`` and the union of all resulting tokens is
    returned as a list. The list is built from a set, so it contains no
    duplicates and its order is not defined.
    """
    rake = Rake()
    rake.extract_keywords_from_text(text)
    unique_phrases = list(set(rake.get_ranked_phrases()))
    keywords = {
        token
        for phrase in unique_phrases
        for token in word_tokenize(phrase)
    }
    return list(keywords)

def get_vector(model, keywords):
    """Average the word embeddings of ``keywords`` into a single row vector.

    Parameters
    ----------
    model : object
        Must expose ``model.wv`` supporting ``word in model.wv`` and
        ``model.wv[word]`` (e.g. a trained gensim Word2Vec model).
    keywords : iterable of str
        Words to average. Words without an embedding in ``model.wv`` are
        skipped instead of raising ``KeyError``.

    Returns
    -------
    numpy.ndarray or float
        Array of shape ``(1, dim)`` holding the element-wise mean of the
        embeddings that were found. When no keyword has an embedding
        (including an empty ``keywords``), ``np.nan`` is returned so that the
        row can be filtered out with ``DataFrame.dropna``.
    """
    embeddings = [
        np.asarray(model.wv[word]).reshape(1, -1)
        for word in keywords
        if word in model.wv
    ]
    if not embeddings:
        # Explicit missing-value marker: averaging an empty array would also
        # give NaN, but only after emitting numpy RuntimeWarnings.
        return np.nan
    # Stacked shape is (n_words, 1, dim); reducing axis 0 leaves (1, dim).
    return np.mean(np.array(embeddings), axis=0)

In [4]:
# disease_components = pd.read_csv("/kaggle/input/diseases-dataset/disease_components.csv", encoding='latin1')
# disease_components.dropna(inplace=True)
# disease_components['cleaned_symptoms'] = disease_components.apply(lambda x: clean_text(x['Symptoms']), axis=1)
# stop_words, word2ec_model = get_stop_words(disease_components['cleaned_symptoms'], min_count=5, max_count=100)
# disease_components['cleaned_symptoms'] = disease_components.apply(lambda x: remove_stop_words(x['cleaned_symptoms'], stop_words), axis=1)
# disease_components['keywords'] = disease_components.apply(lambda x: get_keywords(x['cleaned_symptoms']), axis=1)

In [5]:
# disease_components['symptom_vector'] = disease_components.apply(lambda x: get_vector(word2ec_model, x['keywords']), axis=1)

## MT Samples dataset

In [6]:
# Load the MTSamples transcriptions and keep only the label and the text column.
df = pd.read_csv(
    "/kaggle/input/medicaltranscriptions/mtsamples.csv"
)[['medical_specialty', 'transcription']]

In [7]:
# Drop incomplete rows *before* fitting the encoder: this way a missing
# specialty never reaches LabelEncoder and `le.classes_` only contains the
# specialties of rows that are actually kept.
df = df.dropna()

le = preprocessing.LabelEncoder()
df['medical_specialty_encoded'] = le.fit_transform(df['medical_specialty'])

In [8]:
# Clean every transcription: strip punctuation, lower-case, drop NLTK stop words.
df['transcription_cleaned'] = df['transcription'].apply(clean_transcription_text)

In [9]:
# Train Word2Vec on the cleaned transcriptions and collect corpus-specific
# stop words: tokens whose corpus count is <= 5 or >= 100.
stop_words, word2ec_model = get_stop_words(
    df['transcription_cleaned'],
    min_count=5,
    max_count=100,
)

Total vocabulary =  44687
identified  32340  stopwords.


In [10]:
# 1) remove the corpus-specific stop words, 2) extract RAKE keywords,
# 3) average the keyword embeddings into one vector per transcription.
df['transcription_cleaned'] = df['transcription_cleaned'].apply(
    lambda text: remove_stop_words(text, stop_words)
)
df['transcription_keywords'] = df['transcription_cleaned'].apply(get_keywords)
df['symptom_vector'] = df['transcription_keywords'].apply(
    lambda keywords: get_vector(word2ec_model, keywords)
)
# Rows containing a missing value in any column are removed here.
df.dropna(inplace=True)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [11]:
# Each stored vector has shape (1, dim); take its single row as a plain list.
vectors = [k.tolist()[0] for k in df['symptom_vector'].tolist()]

# Derive the number of feature columns from the vectors themselves instead of
# hard-coding the embedding size, so this cell stays correct if the Word2Vec
# dimensionality is changed.
n_features = len(vectors[0]) if vectors else 0
df_final = pd.DataFrame(vectors, columns=["f_"+str(i) for i in range(n_features)])
df_final['target'] = df['medical_specialty_encoded'].tolist()

X = df_final.drop(columns=['target'])
y = df_final['target']

In [12]:
# Hold out 33% of the rows for testing, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=42,
)

In [13]:
# Logistic-regression baseline on the averaged Word2Vec features.
clf = LogisticRegression(
    random_state=0,
    solver='lbfgs',
    multi_class='auto',
    max_iter=10000,
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Score the predictions on the held-out split.
acc = accuracy_score(y_test, y_pred)
f1_ = f1_score(y_test, y_pred, average='weighted')
print("accuracy = ", acc)
print("F1 score = ", f1_)


accuracy =  0.35683629675045986
F1 score =  0.26818154736675454


In [14]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes on the same train/test split.
clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Score the predictions on the held-out split.
acc = accuracy_score(y_test, y_pred)
f1_ = f1_score(y_test, y_pred, average='weighted')
print("accuracy = ", acc)
print("F1 score = ", f1_)

accuracy =  0.1894543225015328
F1 score =  0.1829017368425353


In [15]:

from sklearn.linear_model import SGDClassifier

# Linear model trained with SGD (hinge loss, L2 penalty) on the same split.
# NOTE(review): max_iter=5 with tol=None looks like very few passes — confirm intent.
clf = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Score the predictions on the held-out split.
acc = accuracy_score(y_test, y_pred)
f1_ = f1_score(y_test, y_pred, average='weighted')
print("accuracy = ", acc)
print("F1 score = ", f1_)

accuracy =  0.21949724095646841
F1 score =  0.20826109936886789


In [21]:
from sklearn.svm import LinearSVC

# Linear support-vector classifier on the same split.
clf = LinearSVC(random_state=0, tol=1e-5)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Score the predictions on the held-out split.
acc = accuracy_score(y_test, y_pred)
f1_ = f1_score(y_test, y_pred, average='weighted')
print("accuracy = ", acc)
print("F1 score = ", f1_)

accuracy =  0.3703249540159411
F1 score =  0.2873525152146599


In [16]:
# main_df = pd.read_csv("/kaggle/input/diseases-and-symptoms/main.csv")

# disease_symp = defaultdict(list)

# def clean_data(x):
#     conditions = x['label'].split('^')
#     conditions = [ k.split('_')[1] for k in conditions]
    
#     symptoms = []
#     for symp in x.keys():
#         if symp!='label' and symp!='frequency' and x[symp]==1:
#             symptoms.append( symp.split('_')[1] )
            
#     for c in conditions:
#         disease_symp[c.lower()] = symptoms
        
    
# _ = main_df.apply(lambda x: clean_data(x), axis=1)

In [17]:
# diseases = set(disease_symp.keys())

# big_dataset = set(disease_components['*'].tolist())
# big_dataset = [k.lower() for k in big_dataset]

# diseases.intersection(big_dataset)

In [18]:
# def get_bigram_network(textlist):
#     G = nx.Graph()
#     for text in textlist:
#         words = word_tokenize(text.lower())
#         for i in range(len(words)-1):
#             a = words[i]
#             b = words[i+1]
#             G.add_edge(a, b)
        
#     return G

# G = get_bigram_network(disease_components['cleaned_symptoms'].tolist())
# G.number_of_nodes()

In [19]:
# def knn(graph, node, n):
#     return list(map(itemgetter(1),
#                     sorted([(e)
#                             for e in graph.edges(node, data=True)])[:n]))

# node_degree = []
# for n in G.nodes:
#     if G.degree[n]!=0:
#         node_degree.append((n, G.degree[n]))
        
# node_degree.sort(key = lambda x : x[1], reverse=True)

In [20]:
# from gensim.summarization import keywords
# print(keywords(cleaned_text))

# import yake
# kw_extractor = yake.KeywordExtractor()
# language = "en"
# max_ngram_size = 3
# deduplication_threshold = 0.9
# numOfKeywords = 20
# custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)
# keywords = custom_kw_extractor.extract_keywords(text)
# for kw in keywords:
#     print(kw)