<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Python-Setup" data-toc-modified-id="Python-Setup-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Python Setup</a></span></li><li><span><a href="#Load-data" data-toc-modified-id="Load-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load data</a></span></li><li><span><a href="#Patents-with-index-below-0" data-toc-modified-id="Patents-with-index-below-0-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Patents with index below 0</a></span></li><li><span><a href="#Patents-with-index-above-0" data-toc-modified-id="Patents-with-index-above-0-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Patents with index above 0</a></span></li><li><span><a href="#Find-which-verbs-and-nouns-are-unique-to-above_0-and-below_0-patents" data-toc-modified-id="Find-which-verbs-and-nouns-are-unique-to-above_0-and-below_0-patents-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Find which verbs and nouns are unique to above_0 and below_0 patents</a></span></li><li><span><a href="#Patents-with-index-1" data-toc-modified-id="Patents-with-index-1-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Patents with index 1</a></span></li><li><span><a href="#Patents-with-closer-to--1-index" data-toc-modified-id="Patents-with-closer-to--1-index-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Patents with closer to -1 index</a></span></li></ul></div>

### Python Setup

In [138]:
# Data manipulation

import pandas as pd
from collections import Counter

# POS tagging

from textblob import TextBlob
import nltk
import spacy
nlp = spacy.load('en_core_web_lg')  # this is the large model to train on: https://spacy.io/models/en

### Load data

In [4]:
text = pd.read_csv('patents_text_cdindex_i_2017y_gt_1000_20190901.csv', error_bad_lines=False)

b'Skipping line 8: expected 15 fields, saw 18\nSkipping line 38: expected 15 fields, saw 19\nSkipping line 65: expected 15 fields, saw 23\nSkipping line 99: expected 15 fields, saw 21\nSkipping line 101: expected 15 fields, saw 21\nSkipping line 121: expected 15 fields, saw 25\nSkipping line 122: expected 15 fields, saw 25\nSkipping line 131: expected 15 fields, saw 21\nSkipping line 138: expected 15 fields, saw 25\nSkipping line 144: expected 15 fields, saw 20\nSkipping line 148: expected 15 fields, saw 26\nSkipping line 151: expected 15 fields, saw 43\nSkipping line 153: expected 15 fields, saw 31\nSkipping line 156: expected 15 fields, saw 27\nSkipping line 160: expected 15 fields, saw 25\nSkipping line 161: expected 15 fields, saw 25\nSkipping line 172: expected 15 fields, saw 16\nSkipping line 177: expected 15 fields, saw 21\n'


In [6]:
text = text[['patent_number','title','abstract','cd_2017y','mcd_2017y']]

### Patents with index below 0 

In [201]:
text_below_0 = text[text['cd_2017y'] < 0]

In [202]:
# Use POS tagging in spacy

results_below_0 = []
for index,i in enumerate(text_below_0['title']):
    doc = nlp(i)
    for token in doc:
        results_below_0.append([index,token.lemma_, token.pos_])

In [203]:
results_below_0 = pd.DataFrame(results_below_0)

In [204]:
results_below_0.head()

Unnamed: 0,0,1,2
0,0,mesh,PROPN
1,0,tissue,NOUN
2,0,fastener,NOUN
3,1,disposable,ADJ
4,1,linear,ADJ


In [None]:
# Get all verbs 

results_below_0_verbs = results_below_0[results_below_0[2] == 'VERB']

In [152]:
# Count the number of unique verbs

count_below_0_verbs = Counter(results_below_0_verbs[1])

In [None]:
# A list of unique verbs

set_below_0_verbs = list(set(results_below_0_verbs[1]))

In [139]:
# Get all nouns

results_below_0_nouns = results_below_0[results_below_0[2] == 'NOUN']

In [151]:
# Count of unique nouns

count_below_0_nouns = Counter(results_below_0_nouns[1])

In [175]:
# A list of unique nouns

set_below_0_nouns = list(set(results_below_0_nouns[1]))

### Patents with index above 0

In [94]:
text_above_0 = text[text['cd_2017y'] > 0]

In [87]:
# Use POS tagging in spacy

results_above_0 = []
for index,i in enumerate(text_above_0['title']):
    doc = nlp(i)
    for token in doc:
        results_above_0.append([index,token.lemma_, token.pos_])

In [88]:
results_above_0 = pd.DataFrame(results_above_0)

In [205]:
results_above_0.head()

Unnamed: 0,0,1,2
0,0,process,NOUN
1,0,for,ADP
2,0,produce,VERB
3,0,porous,ADJ
4,0,product,NOUN


In [119]:
# Get all verbs

results_above_0_verbs = results_above_0[results_above_0[2] == 'VERB']

In [142]:
# Get all nouns

results_above_0_nouns = results_above_0[results_above_0[2] == 'NOUN']

In [146]:
# Get unique nouns

set_above_0_nouns = list(set(results_above_0_nouns[1]))

In [149]:
# Count unique nouns

count_above_0_nouns = Counter(results_above_0_nouns[1])

In [147]:
# Get unique verbs

set_above_0_verbs = list(set(results_above_0_verbs[1]))

In [150]:
# Count unique verbs

count_above_0_verbs = Counter(results_above_0_verbs[1])

### Find which verbs and nouns are unique to above_0 and below_0 patents

In [176]:
difference_nouns_in_above_0 = list(set(set_above_0_nouns) - set(set_below_0_nouns))

In [177]:
difference_nouns_in_below_0 = list(set(set_below_0_nouns) - set(set_above_0_nouns))

In [168]:
difference_verbs_in_above_0 = list(set(set_above_0_verbs) - set(set_below_0_verbs))

In [170]:
difference_verbs_in_below_0 = list(set(set_below_0_verbs) - set(set_above_0_verbs))

In [171]:
# Unique verbs in below_0

difference_verbs_in_below_0

['perform',
 'copyright',
 'simulate',
 'lh283btmon810',
 'cool',
 'ablate',
 'analyze',
 'bifurcate',
 'expand',
 'classify',
 'sense',
 'game',
 'utilize',
 'process',
 'trap',
 'end',
 'integrate',
 'monitor',
 'select',
 'obtain',
 'transmit',
 'stack',
 'manage']

In [169]:
# Unique verbs in above_0

difference_verbs_in_above_0

['incorporate',
 'deliver',
 'project',
 'win',
 'generate',
 'authenticate',
 'etch',
 'transfer',
 'interact',
 'design',
 'internetwork',
 'assist',
 'track',
 'present',
 'make',
 'manufacture',
 'read',
 'create',
 'form',
 'customize',
 'emulate',
 'facilitate',
 'target',
 'delay',
 'determine',
 'write',
 'attach',
 'balance',
 'nonwoven',
 'apply',
 'say',
 'define',
 'contact',
 'measure',
 'drive',
 'comprise',
 'bind',
 'link',
 'modify',
 'include']

### Patents with index 1

In [180]:
text_1 = text[text['cd_2017y'] == 1]

In [68]:
# POS tagging with textblob

results_1_textblob = []
for index,i in enumerate(text_1['title']):
    blob = TextBlob(i)
    for words, tag in blob.tags:
        results_1_textblob.append([index,words,tag])

In [69]:
results_1_textblob

[[0, 'Process', 'NN'],
 [0, 'for', 'IN'],
 [0, 'producing', 'VBG'],
 [0, 'porous', 'JJ'],
 [0, 'products', 'NNS'],
 [1, 'Nonwoven', 'NNP'],
 [1, 'fabric', 'NN'],
 [1, 'and', 'CC'],
 [1, 'method', 'NN'],
 [1, 'of', 'IN'],
 [1, 'producing', 'VBG'],
 [1, 'same', 'JJ'],
 [2, 'Cryptographic', 'NNP'],
 [2, 'communications', 'NNS'],
 [2, 'system', 'NN'],
 [2, 'and', 'CC'],
 [2, 'method', 'NN'],
 [3, 'Process', 'NN'],
 [3, 'for', 'IN'],
 [3, 'amplifying', 'VBG'],
 [3, 'nucleic', 'JJ'],
 [3, 'acid', 'NN'],
 [3, 'sequences', 'NNS'],
 [4, 'Inbred', 'NNP'],
 [4, 'corn', 'NN'],
 [4, 'line', 'NN'],
 [4, 'PHT47', 'NNP'],
 [5, 'Mutant', 'JJ'],
 [5, 'dwarfism', 'NN'],
 [5, 'gene', 'NN'],
 [5, 'of', 'IN'],
 [5, 'petunia', 'NN'],
 [6, 'Inbred', 'NNP'],
 [6, 'maize', 'MD'],
 [6, 'line', 'NN'],
 [6, 'PH0HC', 'VB'],
 [7, 'Oxide', 'NNP'],
 [7, 'thin', 'JJ'],
 [7, 'film', 'NN'],
 [8, 'Field-effect', 'JJ'],
 [8, 'transistor', 'NN'],
 [8, 'and', 'CC'],
 [8, 'method', 'NN'],
 [8, 'for', 'IN'],
 [8, 'manufacturin

In [125]:
results_1 = []
for index,i in enumerate(text_1['title']):
    doc = nlp(i)
    for token in doc:
        results_1.append([index,token.lemma_, token.pos_,token.tag_])

In [126]:
results_1 = pd.DataFrame(results_1)

In [128]:
# Get all verbs 

results_1_verbs = results_1[results_1[2] == 'VERB']

In [129]:
# Count unique verbs

Counter(results_1_verbs[1])

Counter({'produce': 2, 'nonwoven': 1, 'amplify': 1, 'manufacture': 1})

In [130]:
# Get unique verbs

set(results_1_verbs[1])

{'amplify', 'manufacture', 'nonwoven', 'produce'}

In [206]:
results_1_nouns = results_1[results_1[2] == 'NOUN']

### Patents with closer to -1 index

In [34]:
text_minus_1 = text[text['cd_2017y'] < -0.8]

In [207]:
# POS tagging with spacy

results_minus_1 = []
for index,i in enumerate(text_minus_1['title']):
    doc = nlp(i)
    for token in doc:
        results_minus_1.append([index,token.lemma_, token.pos_])

In [132]:
results_minus_1 = pd.DataFrame(results_minus_1)

In [134]:
# Get all verbs

results_minus_1_verbs = results_minus_1[results_minus_1[2] == 'VERB']

In [136]:
# Count unique verbs

Counter(results_minus_1_verbs[1])

Counter({'use': 1, 'fabricate': 1})

In [137]:
# Get unique verbs

set(results_minus_1_verbs[1])

{'fabricate', 'use'}

In [208]:
# Get unique nouns

results_minus_1_nouns = results_minus_1[results_minus_1[2] == 'NOUN']