In [111]:
from sklearn.feature_extraction.text import CountVectorizer

train_text = ['A bird in the hand is worth two in the bush. ' + \
'GOOD things comes to those to wait. ' + \
'These watches cost $255. ' + \
'Mr.Smith gies to Washington. ' + \
'Doogle Broswer M.D.']

In [112]:
count_vectorizer = CountVectorizer()

In [113]:
count_vectorizer.fit(train_text)

CountVectorizer()

In [114]:
count_vectorizer.get_feature_names()

['255',
 'bird',
 'broswer',
 'bush',
 'comes',
 'cost',
 'doogle',
 'gies',
 'good',
 'hand',
 'in',
 'is',
 'mr',
 'smith',
 'the',
 'these',
 'things',
 'those',
 'to',
 'two',
 'wait',
 'washington',
 'watches',
 'worth']

In [115]:
count_vectorizer.get_stop_words()

In [116]:
count_vectorizer.vocabulary_

{'bird': 1,
 'in': 10,
 'the': 14,
 'hand': 9,
 'is': 11,
 'worth': 23,
 'two': 19,
 'bush': 3,
 'good': 8,
 'things': 16,
 'comes': 4,
 'to': 18,
 'those': 17,
 'wait': 20,
 'these': 15,
 'watches': 22,
 'cost': 5,
 '255': 0,
 'mr': 12,
 'smith': 13,
 'gies': 7,
 'washington': 21,
 'doogle': 6,
 'broswer': 2}

In [117]:
count_vectorizer.vocabulary_.get('doogle')

6

In [118]:
# Lets us use Bag of Words representation of Text data
train_text

['A bird in the hand is worth two in the bush. GOOD things comes to those to wait. These watches cost $255. Mr.Smith gies to Washington. Doogle Broswer M.D.']

In [119]:
transformed_vector = count_vectorizer.transform(train_text)

In [120]:
print(transformed_vector)

  (0, 0)	1
  (0, 1)	1
  (0, 2)	1
  (0, 3)	1
  (0, 4)	1
  (0, 5)	1
  (0, 6)	1
  (0, 7)	1
  (0, 8)	1
  (0, 9)	1
  (0, 10)	2
  (0, 11)	1
  (0, 12)	1
  (0, 13)	1
  (0, 14)	2
  (0, 15)	1
  (0, 16)	1
  (0, 17)	1
  (0, 18)	3
  (0, 19)	1
  (0, 20)	1
  (0, 21)	1
  (0, 22)	1
  (0, 23)	1


In [121]:
transformed_vector.shape

(1, 24)

In [122]:
print(transformed_vector.toarray())

[[1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 1 3 1 1 1 1 1]]


In [123]:
test_text = ['Every CLoud has a silver linig.']
count_vectorizer.transform(test_text).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]], dtype=int64)

In [124]:
count_vectorizer.fit(train_text + test_text)

CountVectorizer()

In [125]:
print(count_vectorizer.vocabulary_)

{'bird': 1, 'in': 13, 'the': 19, 'hand': 11, 'is': 14, 'worth': 28, 'two': 24, 'bush': 3, 'good': 10, 'things': 21, 'comes': 5, 'to': 23, 'those': 22, 'wait': 25, 'these': 20, 'watches': 27, 'cost': 6, '255': 0, 'mr': 16, 'smith': 18, 'gies': 9, 'washington': 26, 'doogle': 7, 'broswer': 2, 'every': 8, 'cloud': 4, 'has': 12, 'silver': 17, 'linig': 15}


In [126]:
count_vectorizer.transform(test_text).toarray()

array([[0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [127]:
text =  ['A bird in the hand is worth two in the bush. ' + \
         'GOOD things comes to those to wait. ' + \
         'Watches are cool']
transformed_vector = count_vectorizer.transform(text)
transformed_vector

<1x29 sparse matrix of type '<class 'numpy.int64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [128]:
print(transformed_vector)

  (0, 1)	1
  (0, 3)	1
  (0, 5)	1
  (0, 10)	1
  (0, 11)	1
  (0, 13)	2
  (0, 14)	1
  (0, 19)	2
  (0, 21)	1
  (0, 22)	1
  (0, 23)	2
  (0, 24)	1
  (0, 25)	1
  (0, 27)	1
  (0, 28)	1


In [129]:
transformed_vector.shape

(1, 29)

In [130]:
print(transformed_vector.toarray())

[[0 1 0 1 0 1 0 0 0 0 1 1 0 2 1 0 0 0 0 2 0 1 1 2 1 1 0 1 1]]


In [131]:
# Generating n-grams is very simillar to that of normal count vectorixer
train_text

['A bird in the hand is worth two in the bush. GOOD things comes to those to wait. These watches cost $255. Mr.Smith gies to Washington. Doogle Broswer M.D.']

In [132]:
text

['A bird in the hand is worth two in the bush. GOOD things comes to those to wait. Watches are cool']

In [133]:
# We can use n-grams from SKlearn and pass with 
# two or three as n-gram values or we can use NLTK to use bo-gram Tri-grams

In [134]:
import nltk
from nltk import bigrams
from nltk import trigrams
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

word_tokens = word_tokenize(' '.join(train_text))
word_tokens

['A',
 'bird',
 'in',
 'the',
 'hand',
 'is',
 'worth',
 'two',
 'in',
 'the',
 'bush',
 '.',
 'GOOD',
 'things',
 'comes',
 'to',
 'those',
 'to',
 'wait',
 '.',
 'These',
 'watches',
 'cost',
 '$',
 '255',
 '.',
 'Mr.Smith',
 'gies',
 'to',
 'Washington',
 '.',
 'Doogle',
 'Broswer',
 'M.D',
 '.']

In [135]:
nltk_bigrams = bigrams(word_tokens)
list(nltk_bigrams)

TypeError: 'generator' object is not callable

In [136]:
nltk_trigrams = trigrams(word_tokens)
list(nltk_trigrams)

TypeError: 'generator' object is not callable

In [137]:
nltk.__version__

'3.5'

In [138]:
from nltk import ngrams

fivegrams = ngrams(word_tokens, 5)
for grams in fivegrams:
    print(grams)

('A', 'bird', 'in', 'the', 'hand')
('bird', 'in', 'the', 'hand', 'is')
('in', 'the', 'hand', 'is', 'worth')
('the', 'hand', 'is', 'worth', 'two')
('hand', 'is', 'worth', 'two', 'in')
('is', 'worth', 'two', 'in', 'the')
('worth', 'two', 'in', 'the', 'bush')
('two', 'in', 'the', 'bush', '.')
('in', 'the', 'bush', '.', 'GOOD')
('the', 'bush', '.', 'GOOD', 'things')
('bush', '.', 'GOOD', 'things', 'comes')
('.', 'GOOD', 'things', 'comes', 'to')
('GOOD', 'things', 'comes', 'to', 'those')
('things', 'comes', 'to', 'those', 'to')
('comes', 'to', 'those', 'to', 'wait')
('to', 'those', 'to', 'wait', '.')
('those', 'to', 'wait', '.', 'These')
('to', 'wait', '.', 'These', 'watches')
('wait', '.', 'These', 'watches', 'cost')
('.', 'These', 'watches', 'cost', '$')
('These', 'watches', 'cost', '$', '255')
('watches', 'cost', '$', '255', '.')
('cost', '$', '255', '.', 'Mr.Smith')
('$', '255', '.', 'Mr.Smith', 'gies')
('255', '.', 'Mr.Smith', 'gies', 'to')
('.', 'Mr.Smith', 'gies', 'to', 'Washington')

In [139]:
# To measure / score to determine whether occurence of Bigram is meaningful
from nltk.collocations import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import TrigramAssocMeasures
from nltk.collocations import TrigramCollocationFinder

In [140]:
with open('textdoc.txt', 'r') as f:
    file_contents = f.read()
print(file_contents)

1) Why does NVDIMM doesn't support compute nodes?
2) Big and Small Controllers differences 
3) No need of 105, 106 VLAN's in our RHEV deployment appropriate VLAN's needs to mentioned in the Legend
4) 
Installing and Configuring DNS on RHEL

Step 1: Installing BIND DNS
1.	Install Bind DNS and its utilities on rhel server
# yum install bind bind-utils
2.	Start the DNS service (named daemon) for now, then enable it to auto-start at system boot and check if it is up and running using the below commands
# systemctl start named
# systemctl enable named
#systemctl status named
Step 2: Configuring BIND DNS
3.	To configure Bind DNS server, first we need to take a backup of the original configuration file /etc/named.conf using following cp command.
# cp /etc/named.conf /etc/named.conf.orig
4.	Open /etc/named.conf configuration file for editing as follows
# vi /etc/named.conf
5.	Under the options configuration section, comment out the following lines.
                
6.	Now look for the allow-qu

In [None]:
word_tokens = word_tokenize(file_contents)

In [143]:
bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(word_tokens)
finder.apply_freq_filter(3)

In [145]:
matches = finder.nbest(bigram_measures.raw_freq, 10)
matches

[]