## text-analysis-patent-titles-4-Pfizer

### Load all patents from patent extraction - Pfizer

In [3]:
import numpy as np
import pandas as pd

df_patents = pd.read_csv('/Users/stevehansen/Desktop/Data_Science_iPython_Chicago/working-directory/output_pfizer.csv')
df_titles = df_patents['title']

###  Parse titles and print results.  


In [4]:
title_word = [words for segments in df_titles for words in segments.split()]
title_word = [element.lower() for element in title_word]
#print (title_word)

### Remove stop words from patent titles. 

In [5]:
# Make a list of the words you want remove.  Remove non-chemical stop words.  Method from:  
# https://gist.githubusercontent.com/glenbot/4684356/raw/e55ff0ed411763d5db28043b739f4fe2f44d1b69/remove_stop_words.py
# for methods of removing stop words. 

stop_words = ['and', 'or', 'not', 'treatment', 'of', 'for', 'derivatives', 'novel', 'production', 'composition',
              'process', 'the', 'methods', 'product', 'compounds', 'thereof', 'related', 'compositions',
              'related', 'prepared', 'certain', 'use', 'used', 'uses', 'thereof', 'method', 'therefor', 'vectors',
              'agent', 'agents', 'preparing', 'intermediates', 'its', 'having', 'system', 'from', 'coupled' 
              'permissibly-substituted', 'levels', 'release', 'therefore', 'activity', 'pharmaceutical',
              'controlled', 'pharmacologically', 'as', 'in', 'to', 'thereby', 'a', 'by', 'lower', 'host',
              'storage', 'binding', 'coupled', 'isolation', 'organism', 'resistance', 'solution', 'synthetic',
              'inverse', '  pharmaceutically', 'treating', 'stable', 'small', 'with', 'cells', 'reset', 'plastic',
              'dual', 'treat', 'hindered', 'conferring', 'response', 'using', 'prodrug', 'drive', 'disorders',
              'stabilizing', 'cell', 'humanized', 'stabilizing', 'agents', 'responsiveness', 'provokes', 'signaling',
              'module', 'notch', 'modulating', 'automatic', 'peripheral', 'analog','analogue','co-',
              'administration', 'producing', 'useful', 'bind', 'delay', 'their', 'airborne', 'restore','substituted',
              'permissibly', 'and...', 'substituted', 'an', 'at', 'making', 'selective', 'healing', 'that', 
              'based', 'them', 'create', 'comprising', 'diseases', 'disease', 'wound', 'growth', 'pain', 'same', 
              'forms', 'advantage', 'with', 'late', 'paper', 'thermal', 'optical', 'cleaning', 
             ]

def sanitize_1(user_input, stop_words):
    """Sanitize using set subtraction then wrapped in list()"""
    return list(set(user_input) - set(stop_words))

cleaned_text = sanitize_1 (title_word, stop_words)
#print (cleaned_text)


### Use ntlk stemmer. 

In [6]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

def stem_words(content):
    stemmed_words = [stemmer.stem(word) for word in str(cleaned_text).split()]
    return " ".join(stemmed_words)

#stem_words('cleaned_text')

### Apply count vectorizer.

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(cleaned_text)
counts.shape

print (counts)

  (0, 1949)	1
  (1, 1976)	1
  (2, 1248)	1
  (3, 1303)	1
  (4, 1158)	1
  (5, 1466)	1
  (5, 23)	1
  (5, 1758)	1
  (6, 1537)	1
  (6, 60)	1
  (6, 85)	1
  (6, 173)	1
  (7, 1460)	1
  (7, 1669)	1
  (7, 2159)	1
  (7, 2101)	1
  (7, 291)	1
  (7, 2097)	1
  (8, 337)	1
  (9, 1145)	1
  (10, 1671)	1
  (11, 2122)	1
  (12, 1185)	1
  (13, 502)	1
  (13, 1358)	1
  :	:
  (2287, 1433)	1
  (2288, 404)	1
  (2289, 320)	1
  (2290, 2114)	1
  (2292, 1480)	1
  (2293, 1518)	1
  (2294, 75)	1
  (2295, 1918)	1
  (2296, 218)	1
  (2297, 136)	1
  (2297, 1349)	1
  (2298, 2065)	1
  (2299, 1180)	1
  (2300, 1938)	1
  (2300, 1091)	1
  (2301, 1385)	1
  (2302, 1544)	1
  (2303, 636)	1
  (2304, 1472)	1
  (2305, 120)	1
  (2306, 1003)	1
  (2306, 158)	1
  (2307, 215)	1
  (2308, 1864)	1
  (2309, 1227)	1


 We can use the tf_idf transformer to take our counts and make it into a Document X Term Matrix with tfidf values in the cells.

In [8]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=True).fit(counts)
tf_idf = tf_transformer.transform(counts)

print (tf_idf)

  (0, 1949)	1.0
  (1, 1976)	1.0
  (2, 1248)	1.0
  (3, 1303)	1.0
  (4, 1158)	1.0
  (5, 1758)	0.656415387631
  (5, 23)	0.566857491436
  (5, 1466)	0.497786523807
  (6, 173)	0.370503273102
  (6, 85)	0.568382570961
  (6, 60)	0.519455762144
  (6, 1537)	0.519455762144
  (7, 2097)	0.44269318026
  (7, 291)	0.332453236657
  (7, 2101)	0.484389829279
  (7, 2159)	0.319513093571
  (7, 1669)	0.44269318026
  (7, 1460)	0.40099653124
  (8, 337)	1.0
  (9, 1145)	1.0
  (10, 1671)	1.0
  (11, 2122)	1.0
  (12, 1185)	1.0
  (13, 1358)	0.725127607905
  (13, 502)	0.688614516441
  :	:
  (2287, 1433)	1.0
  (2288, 404)	1.0
  (2289, 320)	1.0
  (2290, 2114)	1.0
  (2292, 1480)	1.0
  (2293, 1518)	1.0
  (2294, 75)	1.0
  (2295, 1918)	1.0
  (2296, 218)	1.0
  (2297, 1349)	0.584526945819
  (2297, 136)	0.811374296864
  (2298, 2065)	1.0
  (2299, 1180)	1.0
  (2300, 1091)	0.56313661452
  (2300, 1938)	0.82636381418
  (2301, 1385)	1.0
  (2302, 1544)	1.0
  (2303, 636)	1.0
  (2304, 1472)	1.0
  (2305, 120)	1.0
  (2306, 158)	0.6644920

###  Use Latent Dirichlet Allocation (LDA) example

In [9]:
from sklearn.decomposition import LatentDirichletAllocation

n_samples = 1000
n_features = 60

# In this model, we need to explicitly specify the number of topics we want the model to uncover.

n_topics = 25
n_top_words = 40

dataset = pd.DataFrame(cleaned_text)
print (dataset)
data_samples = dataset[:n_samples]


vectorizer = CountVectorizer(max_df=.90, min_df=2,max_features=n_features)
tf = vectorizer.fit_transform(cleaned_text)
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=1,learning_method='online', random_state=0)
tf_feature_names = vectorizer.get_feature_names()

print (lda.fit(tf))

                                                      0
0                                      sulfonamidoalkyl
1                                            sweetening
2                                             japonicus
3                                              magnetic
4                                              implants
5                                  quinazolin-4(3h)-one
6     6'-(2-amino-2-[4-acyloxyphenyl]acetamido)penic...
7     1-triazolyl-2-aryl-3-(5-trifluoromethylimidazo...
8                                   azahomoerythromycin
9                                    imidazolylpyridine
10                                         propanediols
11                                               unroll
12                                         infestations
13                   3-methylthiophene-2-carboxaldehyde
14                                               trauma
15                                 hypercholesterolemia
16                                           ery

In [10]:
tf_feature_names = vectorizer.get_feature_names()

# lda.components_ is stored as an unnormalized probability distribution, meaning the sum of the probabilities 
# of the words isn't 1 for the topics, but we can ensure the sum is 1 by dividing each element by its row's sum.  

normalized_components = lda.components_ / lda.components_.sum(axis=1)[:,None]

for topic_idx, topic in enumerate(normalized_components):
    print ("Topic #%d:" % topic_idx)
    topic_words = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words-1:-1]]
    print (" ".join(topic_words))
  

Topic #0:
pyrido fluorophenyl spiro 1h fluoro tetrahydro aryl benzyl 2h 11 bis carbonate carboxylic pyridines chloro piperazinyl oxo alkyl imidazolyl pyrimidine benzo penicillanic imidazoles amino gamma alkoxy non anti methyl quinolines alpha ones acyl piperidino aza disubstituted one heteroaryl pyridine triazol
Topic #1:
dihydropyridine pyridine fluoro 11 aza dihydro pyridines anti chloro quinolines alkoxy heterocyclic 1h gamma acid imidazoles beta spiro carboxamides one acyl benzo piperazinyl bis alkyl hydroxy amino triazol phenyl carboxamide oxo substituted cis yl piperidino pyrimidine 2h non pyrido aryl
Topic #2:
heterocyclic acid tetrahydro yl penicillanic acyl methyl ol ones carboxylic chloro 11 piperazinyl beta dione substituted dihydropyridine carbonate bis disubstituted piperidino cis hydroxyoctahydrobenzo alpha benzo hydroxy 1h pyridines anti gamma amino heteroaryl alkoxy fluoro triazol imidazoles pyrido carboxamide carboxamides guanidino
Topic #3:
aryl yl triazol 1h tetrahyd

Sum of probabilities.

In [11]:
# Sum of probabilities. 

for n in range(30):
    sum_pr = sum(counts[n,:])
    print("document: {} sum: {}".format(n, sum_pr))

document: 0 sum:   (0, 1949)	1
document: 1 sum:   (0, 1976)	1
document: 2 sum:   (0, 1248)	1
document: 3 sum:   (0, 1303)	1
document: 4 sum:   (0, 1158)	1
document: 5 sum:   (0, 1466)	1
  (0, 23)	1
  (0, 1758)	1
document: 6 sum:   (0, 1537)	1
  (0, 60)	1
  (0, 85)	1
  (0, 173)	1
document: 7 sum:   (0, 1460)	1
  (0, 1669)	1
  (0, 2159)	1
  (0, 2101)	1
  (0, 291)	1
  (0, 2097)	1
document: 8 sum:   (0, 337)	1
document: 9 sum:   (0, 1145)	1
document: 10 sum:   (0, 1671)	1
document: 11 sum:   (0, 2122)	1
document: 12 sum:   (0, 1185)	1
document: 13 sum:   (0, 502)	1
  (0, 1358)	1
document: 14 sum:   (0, 2083)	1
document: 15 sum:   (0, 1122)	1
document: 16 sum:   (0, 873)	1
document: 17 sum:   (0, 1071)	1
document: 18 sum:   (0, 79)	1
document: 19 sum:   (0, 1587)	1
document: 20 sum:   (0, 1467)	1
  (0, 23)	1
document: 21 sum:   (0, 352)	1
  (0, 1021)	1
  (0, 1890)	1
document: 22 sum:   (0, 1611)	1
document: 23 sum:   (0, 1285)	1
document: 24 sum:   (0, 364)	1
document: 25 sum:   (0, 114)	1


In [12]:
chemwords = [x.encode('UTF8') for x in topic_words]

for n in range (30):
    topic_most_pr = counts[n].argmax()
    print("doc: {} topic: {}\n{}...".format(n, topic_most_pr, chemwords[n][:60]))

doc: 0 topic: 1949
alpha...
doc: 1 topic: 1976
disubstituted...
doc: 2 topic: 1248
fluorophenyl...
doc: 3 topic: 1303
pyrimidine...
doc: 4 topic: 1158
cis...
doc: 5 topic: 23
ethyl...
doc: 6 topic: 60
substituted...
doc: 7 topic: 291
chloro...
doc: 8 topic: 337
penicillanic...
doc: 9 topic: 1145
non...
doc: 10 topic: 1671
heteroaryl...
doc: 11 topic: 2122
amino...
doc: 12 topic: 1185
oxindole...
doc: 13 topic: 502
diones...
doc: 14 topic: 2083
triazol...
doc: 15 topic: 1122
pyrido...
doc: 16 topic: 873
gamma...
doc: 17 topic: 1071
pyridines...
doc: 18 topic: 79
imidazolyl...
doc: 19 topic: 1587
ones...
doc: 20 topic: 23
one...
doc: 21 topic: 352
guanidino...
doc: 22 topic: 1611
spiro...
doc: 23 topic: 1285
hydroxyoctahydrobenzo...
doc: 24 topic: 364
acyl...
doc: 25 topic: 114
heterocyclic...
doc: 26 topic: 562
fluoro...
doc: 27 topic: 1477
benzyl...
doc: 28 topic: 423
phenyl...
doc: 29 topic: 618
methyl...
