In [1]:
import os, sys, re
import json
from watson_developer_cloud import AlchemyLanguageV1
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from com.ibm.watson import DESKTOP, DOCS_DIR

In [2]:
# Working directory for this notebook: the "Concept_Detection" folder under DESKTOP.
base_dir = os.path.join(DESKTOP, "Concept_Detection")

In [3]:
# Build a flat list of stop words from a file whose lines each hold ", "-separated terms.
# The file is opened in a `with` block so the handle is closed once reading is done, and
# the lines are flattened in a single pass instead of repeatedly concatenating lists.
# NOTE(review): absolute, user-specific path; consider deriving it from a configurable data dir.
with open("/Users/singhv/Documents/Data/DL/pubmed_stopwords.txt") as stopword_file:
    stop_words = [
        word
        for line in stopword_file
        for word in line.strip().split(", ")
    ]

In [4]:
# Text classification pipeline: TF-IDF features over word 1- to 3-grams
# (accents stripped, custom stop words removed) feeding a multinomial Naive Bayes model.
vectorizer = TfidfVectorizer(
    analyzer='word',
    strip_accents='unicode',
    ngram_range=(1, 3),
    stop_words=stop_words,
)
classifier = MultinomialNB()
text_clf = Pipeline(steps=[
    ('vect', vectorizer),
    ('clf', classifier),
])

In [5]:
# Load the concept-detection datasheet from the working directory into a DataFrame.
csv_path = os.path.join(base_dir, "datasheet_for_CUI_code_concept_detection.csv")
df = pd.read_csv(csv_path)

In [6]:
# Quick sanity check of the loaded frame: row count and column names.
# Single-argument print(...) produces the same output on Python 2 and also runs on Python 3.
print(len(df))
print(df.columns)

496362
Index([u'CUI', u'NCI_CODES', u'PCUI', u'P_NCI_CODES', u'STR', u'SAB', u'STY',
       u'SYNS'],
      dtype='object')


In [7]:
# Group the rows by the STY column; the groups are reused below to fit the
# label encoder and to count rows per STY value.
grouped = df.groupby(['STY'])
#print grouped.groups.keys()

In [8]:
# Map every distinct STY value to an integer class id, then encode the
# whole STY column so each row has a numeric target.
sty_values = list(grouped.groups)
le = LabelEncoder()
le.fit(sty_values)
labels = le.transform(df["STY"])

In [9]:
print le.inverse_transform([1,2, 30])
print labels[1734]

['Activity' 'Age Group' 'Chemical Viewed Structurally']
30


In [24]:
# Display the rows whose STY value is 'Animal'.
df[df["STY"].isin(['Animal'])]

Unnamed: 0,CUI,NCI_CODES,PCUI,P_NCI_CODES,STR,SAB,STY,SYNS
809,C0003062,C14182,C0029235,C14250,Animal,NCI,Animal,Animal| Animal| Animals| Animals| Animalia| Ki...
810,C0003063,C117982,C1518665,C14376,Domestic Animal,NCI,Animal,Domestic Animal
811,C0003064,C14183,C1517710,C14354,Laboratory Animal,NCI,Animal,Laboratory Animal
812,C0003069,C14184,C0003064,C14183,Transgenic Animal,NCI,Animal,"Transgenic Animal| Animals, Transgenic| Transg..."
3891,C0012656,C14198,C0314732,C14316,Disease Vector,NCI,Animal,Disease Vector| Disease Vectors| Vector (Infec...
183930,C0562690,,C0562623,,Carnivore,SNOMEDCT_US,Animal,Carnivore| Carnivore (organism)
183931,C0562691,,C0562623,,Herbivore,SNOMEDCT_US,Animal,Herbivore| Herbivore (organism)
183932,C0562693,,C0562623,,Omnivore,SNOMEDCT_US,Animal,Omnivore| Omnivore (organism)
183933,C0562694,,C0562691,,Frugivore,SNOMEDCT_US,Animal,Frugivore| Fructivore| Frugivore (organism)
208414,C0599779,C71164,C1515657,C19148,Animal Model,NCI,Animal,"Animal Model| Animal Model, Generic| Research ..."


In [None]:
# Hold out 20% of the rows for evaluation, stratified by encoded STY label.
# train_test_split returns two pieces per input array, so both the STR text and
# the encoded labels are passed in to obtain the four-way unpacking.
# Stratification needs at least two rows per class, so rows whose STY value
# occurs only once in the frame are left out of the split.
sty_row_counts = df["STY"].map(df["STY"].value_counts())
splittable = (sty_row_counts >= 2).values
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[splittable, "STR"],
    labels[splittable],
    train_size=0.8,
    stratify=labels[splittable],
    random_state=0,  # fixed seed so the split is reproducible across runs
)

In [13]:
# Fit the pipeline on the STR text of every row against its encoded STY label.
# NOTE(review): this trains on the full frame rather than on a held-out training split.
text_clf.fit(df["STR"], labels)

Pipeline(steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm=u'l2', preprocessor=None, smooth_idf=True,...rue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [25]:
# Classify one sample term and decode the predicted class id back to its STY name.
cls = text_clf.predict(["lobular carcinoma"])
print(cls)
# predict returns an array of class ids, so it is passed to inverse_transform as is;
# wrapping it in another list would hand the encoder a nested (2-D) input.
print(le.inverse_transform(cls))

[88]
[['Neoplastic Process']]


In [27]:
# Number of rows per STY value, sorted so the smallest classes come first.
grouped.size().sort_values()

STY
Vertebrate                                  1
Carbohydrate Sequence                       2
Molecular Sequence                          3
Fully Formed Anatomical Structure           5
Chemical                                    6
Self-help or Relief Organization            8
Entity                                     11
Professional Society                       17
Animal                                     17
Group                                      20
Behavior                                   20
Environmental Effect of Humans             21
Group Attribute                            26
Physical Object                            35
Biologic Function                          40
Research Device                            41
Age Group                                  45
Patient or Disabled Group                  50
Organism                                   62
Experimental Model of Disease              85
Nucleotide Sequence                        87
Human                         