### Read entire text as string

# Load the whole of One.txt into a single string (newline characters included).
with open('One.txt') as mytext:
    a = mytext.read()

# print() renders the embedded newlines as real line breaks.
print(a)

In [3]:
# Same pattern for the second document: slurp the file, then show its contents.
with open('Two.txt') as mytext:
    b = mytext.read()

print(b)

This story is about surfing
Catching waves is fun
Surfing is a popular water sport



In [4]:
a

'This is a story about dogs\nour canine pets\nDogs are furry animals\n'

In [5]:
# Iterating a text file yields one line at a time (trailing '\n' kept),
# so materialising the iterator gives the file as a list of lines.
with open('Two.txt') as mytext:
    b1 = list(mytext)

### Read entire text as a list of lines

In [6]:
b1

['This story is about surfing\n',
 'Catching waves is fun\n',
 'Surfing is a popular water sport\n']

### Read in words separately

In [13]:
# Read the raw text while the file is open ...
with open('One.txt') as mytext:
    raw_text = mytext.read()

# ... then lower-case it and split on whitespace to get a flat list of words.
a1 = raw_text.lower().split()


In [14]:
a1

['this',
 'is',
 'a',
 'story',
 'about',
 'dogs',
 'our',
 'canine',
 'pets',
 'dogs',
 'are',
 'furry',
 'animals']

### Build a Vocabulary (Create a 'Bag of Words')

In [15]:
# Tokenise document one: normalise case first so 'Dogs' and 'dogs' are the
# same token, then split on whitespace.
with open('One.txt') as mytext:
    lowered = mytext.read().lower()
    word_one = lowered.split()


In [16]:
word_one

['this',
 'is',
 'a',
 'story',
 'about',
 'dogs',
 'our',
 'canine',
 'pets',
 'dogs',
 'are',
 'furry',
 'animals']

In [17]:
len(word_one)

13

In [18]:
# Collapse repeated tokens: the set holds each distinct word of document one once.
uni_words_one = set(word_one)

In [19]:
uni_words_one

{'a',
 'about',
 'animals',
 'are',
 'canine',
 'dogs',
 'furry',
 'is',
 'our',
 'pets',
 'story',
 'this'}

In [20]:
len(uni_words_one)

12

In [21]:
# Tokenise document two the same way as document one (lower-case, whitespace split).
with open('Two.txt') as mytext:
    word_two = mytext.read().lower().split()

# The file is no longer needed here; de-duplicate the tokens into a set.
uni_words_two = set(word_two)


In [22]:
uni_words_two

{'a',
 'about',
 'catching',
 'fun',
 'is',
 'popular',
 'sport',
 'story',
 'surfing',
 'this',
 'water',
 'waves'}

### Get all the unique words across all the documents

In [24]:
# Vocabulary of the whole corpus: every word that appears in either document.
all_uni_words = set().union(uni_words_one, uni_words_two)

In [25]:
all_uni_words

{'a',
 'about',
 'animals',
 'are',
 'canine',
 'catching',
 'dogs',
 'fun',
 'furry',
 'is',
 'our',
 'pets',
 'popular',
 'sport',
 'story',
 'surfing',
 'this',
 'water',
 'waves'}

In [27]:
# Assign every vocabulary word a unique column index in 0 .. len(vocab) - 1.
# enumerate() supplies the running index, so no hand-maintained counter is needed.
# NOTE: a set has no defined order, so which word receives which index is
# arbitrary and may differ between kernel sessions; later cells rely only on
# the word <-> index pairing being consistent within one session.
full_vocab = {w: i for i, w in enumerate(all_uni_words)}

In [28]:
full_vocab

{'animals': 0,
 'popular': 1,
 'furry': 2,
 'a': 3,
 'this': 4,
 'our': 5,
 'catching': 6,
 'surfing': 7,
 'pets': 8,
 'dogs': 9,
 'are': 10,
 'story': 11,
 'is': 12,
 'waves': 13,
 'fun': 14,
 'water': 15,
 'sport': 16,
 'about': 17,
 'canine': 18}

### Bag of Words to Frequency Counts

In [30]:
vocab_size = len(full_vocab)

# One zeroed count slot per vocabulary word, for each of the two documents.
one_freq = [0] * vocab_size
two_freq = [0] * vocab_size

# Placeholder list; each position will later hold the word with that index.
all_words = [''] * vocab_size

In [31]:
all_words

['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']

In [34]:
# Invert the vocabulary mapping: store each word at the list position given
# by its index, so all_words[i] is the word whose column index is i.
for w, word_ind in full_vocab.items():
    all_words[word_ind] = w

In [35]:
word_ind

18

In [36]:
all_words

['animals',
 'popular',
 'furry',
 'a',
 'this',
 'our',
 'catching',
 'surfing',
 'pets',
 'dogs',
 'are',
 'story',
 'is',
 'waves',
 'fun',
 'water',
 'sport',
 'about',
 'canine']

### Add in counts per word per doc

In [37]:
# Token list for document one: lower-cased, whitespace-separated words.
with open('One.txt') as f:
    one_text = (
        f.read()
        .lower()
        .split()
    )

In [38]:
one_text

['this',
 'is',
 'a',
 'story',
 'about',
 'dogs',
 'our',
 'canine',
 'pets',
 'dogs',
 'are',
 'furry',
 'animals']

In [39]:
# Start from a zeroed vector inside this cell: the counts are accumulated with
# +=, so without the reset, running the cell a second time would double them.
one_freq = [0] * len(full_vocab)

# Tally every occurrence of each word of document one at that word's index.
for w in one_text:
    word_ind = full_vocab[w]
    one_freq[word_ind] += 1

In [40]:
one_freq

[1, 0, 1, 1, 1, 1, 0, 0, 1, 2, 1, 1, 1, 0, 0, 0, 0, 1, 1]

In [42]:
all_words

['animals',
 'popular',
 'furry',
 'a',
 'this',
 'our',
 'catching',
 'surfing',
 'pets',
 'dogs',
 'are',
 'story',
 'is',
 'waves',
 'fun',
 'water',
 'sport',
 'about',
 'canine']

In [43]:
# Token list for document two: lower-cased, whitespace-separated words.
with open('Two.txt') as f:
    two_text = f.read().lower().split()

# Reset before counting so that re-running this cell does not add the same
# occurrences on top of the previous run's totals.
two_freq = [0] * len(full_vocab)

# Tally every occurrence of each word of document two at that word's index.
for w in two_text:
    word_ind = full_vocab[w]
    two_freq[word_ind] += 1

In [44]:
two_freq

[0, 1, 0, 1, 1, 0, 1, 2, 0, 0, 0, 1, 3, 1, 1, 1, 1, 1, 0]

In [45]:
two_text

['this',
 'story',
 'is',
 'about',
 'surfing',
 'catching',
 'waves',
 'is',
 'fun',
 'surfing',
 'is',
 'a',
 'popular',
 'water',
 'sport']

In [46]:
import pandas as pd

# Document-term table: row 0 holds the counts for One.txt, row 1 those for
# Two.txt; each column is labelled with the vocabulary word at that index.
pd.DataFrame([one_freq, two_freq], columns=all_words)

Unnamed: 0,animals,popular,furry,a,this,our,catching,surfing,pets,dogs,are,story,is,waves,fun,water,sport,about,canine
0,1,0,1,1,1,1,0,0,1,2,1,1,1,0,0,0,0,1,1
1,0,1,0,1,1,0,1,2,0,0,0,1,3,1,1,1,1,1,0


### Sklearn's Text Feature Extraction

In [47]:
# Tiny three-document corpus used by the scikit-learn examples below.
text = [
    'This is a line',
    'This is another line',
    'Completely different line',
]

In [48]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer

In [49]:
# Token-count vectorizer created with all default settings.
cv = CountVectorizer()

In [50]:
# Fit the vectorizer on the corpus and return the per-document token counts.
# The result is a sparse matrix, so the cell output is only its summary repr.
cv.fit_transform(text)

<3x6 sparse matrix of type '<class 'numpy.int64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [51]:
# Keep the sparse result this time, then expand it to a dense matrix so the
# individual counts (rows = documents, columns = vocabulary indices) are visible.
sparse_mat = cv.fit_transform(text)
sparse_mat.todense()

matrix([[0, 0, 0, 1, 1, 1],
        [1, 0, 0, 1, 1, 1],
        [0, 1, 1, 0, 1, 0]])

In [52]:
cv.vocabulary_

{'this': 5, 'is': 3, 'line': 4, 'another': 0, 'completely': 1, 'different': 2}

In [54]:
# Second vectorizer that filters out scikit-learn's built-in English stop words
# before counting; the dense result therefore has fewer columns.
cv1 = CountVectorizer(stop_words='english')
cv1.fit_transform(text).todense()

matrix([[0, 0, 1],
        [0, 0, 1],
        [1, 1, 1]])

In [55]:
cv1.vocabulary_

{'line': 2, 'completely': 0, 'different': 1}

### TF-IDF Transformer

In [56]:
# Fresh count vectorizer to produce the raw document-term counts ...
cv = CountVectorizer()

# ... and a transformer that re-weights a count matrix into TF-IDF scores.
tfidf_transformer = TfidfTransformer()

In [58]:
# Step 1 of 2: raw token counts for the corpus (sparse matrix).
counts = cv.fit_transform(text)

In [60]:
# Step 2 of 2: convert the raw counts into TF-IDF weights, then densify the
# sparse result so the values can be inspected.
tfidf = tfidf_transformer.fit_transform(counts)
tfidf.todense()

matrix([[0.        , 0.        , 0.        , 0.61980538, 0.48133417,
         0.61980538],
        [0.63174505, 0.        , 0.        , 0.4804584 , 0.37311881,
         0.4804584 ],
        [0.        , 0.65249088, 0.65249088, 0.        , 0.38537163,
         0.        ]])

In [61]:
from sklearn.pipeline import Pipeline

# Chain the two stages so a single fit_transform call goes from raw strings
# to TF-IDF weights: counting first, re-weighting second.
pipe = Pipeline(steps=[
    ('cv', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
])

results = pipe.fit_transform(text)

In [62]:
results.todense()

matrix([[0.        , 0.        , 0.        , 0.61980538, 0.48133417,
         0.61980538],
        [0.63174505, 0.        , 0.        , 0.4804584 , 0.37311881,
         0.4804584 ],
        [0.        , 0.65249088, 0.65249088, 0.        , 0.38537163,
         0.        ]])

### TF-IDF Vectorizer

In [63]:
# Single estimator that goes from raw strings straight to TF-IDF weights.
# NOTE(review): this rebinds `tfidf`, which an earlier cell used for a matrix.
tfidf = TfidfVectorizer()

In [65]:
# Fit on the corpus and compute its TF-IDF matrix in one call (sparse result).
new = tfidf.fit_transform(text)

In [66]:
new.todense()

matrix([[0.        , 0.        , 0.        , 0.61980538, 0.48133417,
         0.61980538],
        [0.63174505, 0.        , 0.        , 0.4804584 , 0.37311881,
         0.4804584 ],
        [0.        , 0.65249088, 0.65249088, 0.        , 0.38537163,
         0.        ]])