Upload the diseases CSV before running this notebook (the next cell reads it as `diseases_2.csv`).

In [None]:
import pandas as pd

# Load the disease table; the relative path means the file must be in the
# working directory.
df = pd.read_csv('diseases_2.csv')

In [None]:
# pandas labels a header-less first CSV column "Unnamed: 0"; when present it
# is renamed to "name", the column later used as the disease name.
if "Unnamed: 0" in df.columns:
  df = df.rename(columns={"Unnamed: 0": "name"})
df

Unnamed: 0,name,description,causes,frequency,inheritance
0,Tangier disease,Tangier disease is an inherited disorder chara...,Mutations in theABCA1gene cause Tangier diseas...,Tangier disease is a rare disorder with approx...,This condition is inherited in anautosomal rec...
1,Ataxia-telangiectasia,Ataxia-telangiectasia is a rare inherited diso...,Variants (also called mutations) in theATMgene...,"Ataxia-telangiectasia occurs in 1 in 40,000 to...",Ataxia-telangiectasia is inherited in anautoso...
2,Alopecia areata,Alopecia areata is a common disorder that caus...,The causes of alopecia areata are complex and ...,"Alopecia areata affects 1 in every 500 to 1,00...",The inheritance pattern of alopecia areata is ...
3,Triple A syndrome,Triple A syndrome is an inherited condition ch...,Mutations in theAAASgene cause triple A syndro...,"Triple A syndrome is a rare condition, althoug...",This condition is inherited in anautosomal rec...
4,Aromatic l-amino acid decarboxylase deficiency,Aromatic l-amino acid decarboxylase (AADC) def...,Mutations in theDDCgene cause AADC deficiency....,AADC deficiency is a rare disorder. Only about...,This condition is inherited in anautosomal rec...
...,...,...,...,...,...
321,Spastic paraplegia type 11,Spastic paraplegia type 11 is part of a group ...,Mutations in theSPG11gene cause spastic parapl...,Over 100 cases of spastic paraplegia type 11 h...,This condition is inherited in anautosomal rec...
322,Spastic paraplegia type 49,Spastic paraplegia type 49 is part of a group ...,Spastic paraplegia type 49 is caused by mutati...,Spastic paraplegia type 49 is a rare disorder....,This condition is inherited in anautosomal rec...
323,JAK3-deficient severe combined immunodeficiency,JAK3-deficient severe combined immunodeficienc...,JAK3-deficient SCID is caused by mutations in ...,JAK3-deficient SCID accounts for an estimated ...,This condition is inherited in anautosomal rec...
324,Pulmonary arterial hypertension,Pulmonary arterial hypertension is a progressi...,Mutations in theBMPR2gene are the most common ...,"In the United States, about 1,000 new cases of...",Pulmonary arterial hypertension is usually spo...


In [None]:
# Show the first value of the "name" column.
df["name"].iloc[0]

'Tangier disease'

In [None]:
import re

def mask(text, string, replacement):
  """Replace every case-insensitive occurrence of `string` in `text`.

  `string` is treated as literal text, not as a regular expression, so
  names or tokens containing characters such as ``(``, ``+`` or ``.`` are
  matched verbatim instead of raising ``re.error`` or matching unintended
  text.

  Strings longer than 30 characters are matched loosely: a span counts as
  an occurrence when it starts with the first 10 characters of `string`,
  ends with its last 6 characters, and has between 5 and ``len(string)``
  arbitrary characters (newlines excluded) in between.

  Args:
    text: The text to search.
    string: The literal string to look for.
    replacement: Replacement handed to ``Pattern.sub``.

  Returns:
    A copy of `text` with each match replaced. `text` is returned unchanged
    when `string` is empty.
  """
  if not string:
    # An empty pattern matches at every position and would insert the
    # replacement between all characters.
    return text

  if len(string) > 30:
    start = re.escape(string[:10])
    end = re.escape(string[-6:])
    middle = '.{5,' + str(len(string)) + '}'
    pattern = re.compile(start + middle + end, re.IGNORECASE)
  else:
    pattern = re.compile(re.escape(string), re.IGNORECASE)

  return pattern.sub(replacement, text)

In [None]:
def all_matches(text1, text2, min_len=5):
  """Collect tokens of `text2` that prefix-match a token of `text1`.

  Both texts are split on whitespace. Tokens whose raw length (punctuation
  included) is `min_len` or shorter are ignored on both sides. The remaining
  tokens are stripped of non-word characters and compared case-insensitively:
  a `text2` token matches when some `text1` token starts with it.

  Args:
    text1: Text searched for matches.
    text2: Text whose tokens are looked up in `text1`.
    min_len: Tokens must be strictly longer than this to be considered.

  Returns:
    A list of the original (unstripped) `text2` tokens, one entry per
    matching (`text2` token, `text1` token) pair, so it may contain
    duplicates.
  """
  # Strip the text1 tokens once instead of once per text2 token.
  candidates = [
      re.sub(r'\W+', '', other_tok)
      for other_tok in text1.split()
      if len(other_tok) > min_len
  ]

  matches = []
  for tok in text2.split():
    if len(tok) <= min_len:
      continue
    test_tok = re.sub(r'\W+', '', tok)
    if not test_tok:
      # A token made only of punctuation strips to '', and an empty pattern
      # would match every candidate.
      continue
    # test_tok holds only word characters, so it is safe to use as a pattern.
    pattern = re.compile(test_tok, re.IGNORECASE)
    matches.extend(tok for candidate in candidates if pattern.match(candidate))

  return matches

    

In [None]:
def mask_df(df):
  """Add a ``causes_masked`` column with each row's disease name masked.

  For every row, occurrences of the row's ``name`` inside its ``causes``
  text are replaced by ``'DISEASE NAME'`` using `mask`.

  Args:
    df: DataFrame with ``name`` and ``causes`` columns. It is modified in
      place.

  Returns:
    The same DataFrame, with the ``causes_masked`` column added or replaced.
  """
  # Assign a plain list so values are placed by row position. Wrapping them in
  # pd.Series would align on a fresh 0..n-1 index and give NaN or misplaced
  # values whenever df.index is not that default range.
  df['causes_masked'] = [
      mask(causes, name, 'DISEASE NAME')
      for causes, name in zip(df['causes'], df['name'])
  ]
  return df


In [None]:
# Adds the 'causes_masked' column; mask_df modifies df in place and returns it.
df = mask_df(df)

In [None]:
from collections import defaultdict

def aggro_mask_df(df, threshold = 10):
  """Mask rare words that the causes text shares with the description.

  For each row, `all_matches` finds the tokens of ``causes_masked`` that
  also occur in ``description``. A token found this way in fewer than
  `threshold` rows is replaced by ``'MASK'`` in that row's
  ``causes_masked`` text; tokens found in `threshold` or more rows are left
  alone.

  Args:
    df: DataFrame with ``description`` and ``causes_masked`` columns. It is
      modified in place.
    threshold: Minimum number of rows a token must be matched in to be kept.

  Returns:
    The same DataFrame with ``causes_masked`` updated.
  """
  # Pass 1: exactly one match list per row, plus the number of rows each
  # token was matched in.
  word_counts = defaultdict(int)
  match_lists = []
  for description, causes in zip(df['description'], df['causes_masked']):
    matches = all_matches(description, causes)
    match_lists.append(matches)
    for m in set(matches):
      word_counts[m] += 1

  # Pass 2: mask the rare tokens row by row.
  new_causes = []
  for cur_causes, matches in zip(df['causes_masked'], match_lists):
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # masking order is the same on every run.
    for m in dict.fromkeys(matches):
      if word_counts[m] < threshold:
        cur_causes = mask(cur_causes, m, 'MASK')
    new_causes.append(cur_causes)

  # Write the column back in one step; assigning through df.iloc[i] is
  # chained assignment and is not guaranteed to reach df.
  df['causes_masked'] = new_causes
  return df
    

In [None]:
import nltk
import string
# Set of ASCII punctuation characters, consulted by english_mask so that
# punctuation is not masked.
punct = set(string.punctuation)

# Fetch the NLTK resources. 'words' backs `words.words()` and 'punkt' backs
# `word_tokenize`; 'wordnet' and 'omw-1.4' are not referenced by the code
# visible in this notebook.
nltk.download('words')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import words
from nltk.tokenize import word_tokenize

from nltk.stem import SnowballStemmer

# English Snowball stemmer; also the default `stemmer` argument of english_mask.
stemmer = SnowballStemmer('english')
# Vocabulary taken from the NLTK `words` corpus.
english = set(words.words())
# Vocabulary extended with the stem of every word, so a token can be accepted
# when only its stem (rather than its surface form) is known.
english_plus = english.copy()
for word in english:
  english_plus.add(stemmer.stem(word))

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
def english_mask(text, corpus, mask="MASK", stemmer = stemmer):
  """Replace tokens that are not recognised as English with `mask`.

  Hyphens are turned into spaces, the text is tokenized with
  `word_tokenize`, and each token is kept when the token itself or its stem
  is in `corpus`, or when it consists solely of punctuation characters.
  Every other token is replaced by `mask`. The tokens are re-joined with
  single spaces, so punctuation ends up separated from words.

  Args:
    text: Text to filter.
    corpus: Container of accepted words and stems (membership-tested).
    mask: Placeholder for rejected tokens.
    stemmer: Object with a ``stem(token)`` method; defaults to the
      module-level stemmer.

  Returns:
    The space-joined token string with rejected tokens replaced.
  """
  def is_punctuation(tok):
    # Compare character by character: the tokenizer can emit multi-character
    # punctuation tokens (converted double quotes, "..."), which are never
    # members of the single-character `punct` set.
    return all(ch in punct for ch in tok)

  def is_known(tok):
    # The stemmer is only consulted when the surface form is unknown.
    return tok in corpus or stemmer.stem(tok) in corpus

  tokens = word_tokenize(text.replace('-', ' '))
  return " ".join(
      tok if is_known(tok) or is_punctuation(tok) else mask
      for tok in tokens
  )

In [None]:
# Run english_mask over every causes_masked entry, using the stem-extended
# vocabulary as the accepted corpus.
df["causes_masked"] = df["causes_masked"].apply(english_mask, args=(english_plus,))

In [None]:
# Show the masked causes text of the first row.
df["causes_masked"].iloc[0]

'Mutations in MASK cause DISEASE NAME . This gene provides instructions for making a protein that releases cholesterol and phospholipids from cells . These substances are used to MASK , MASK MASK MASK in MASK prevent the release of cholesterol and phospholipids from cells . As a result , these substances accumulate within cells , causing certain body tissues to enlarge and the tonsils to acquire a yellowish orange color . A buildup of cholesterol can be toxic to cells , leading to impaired cell function or cell death . In addition , the inability to transport cholesterol and phospholipids out of cells results in very low MASK levels , which increases the risk of cardiovascular disease . These combined factors cause the signs and symptoms of DISEASE MASK more about the gene associated with DISEASE MASK'

In [None]:
# Save the masked table to the working directory without the index column.
df.to_csv("masked.csv", index = False)