In [2]:
import string
import pandas as pd
import pprint
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
documents = ["So, so you think you can tell",
"Heaven from hell?",
"Blue skies from pain?",
"Can you tell a green field",
"From a cold steel rail?",
"A smile from a veil?",
"Do you think you can tell?"]

print(documents)

['So, so you think you can tell', 'Heaven from hell?', 'Blue skies from pain?', 'Can you tell a green field', 'From a cold steel rail?', 'A smile from a veil?', 'Do you think you can tell?']


In [4]:
removed_punctuation_documents = []
for i in documents:
    removed_punctuation_documents.append(''.join(c for c in i if c not in string.punctuation))  
print(removed_punctuation_documents)

['So so you think you can tell', 'Heaven from hell', 'Blue skies from pain', 'Can you tell a green field', 'From a cold steel rail', 'A smile from a veil', 'Do you think you can tell']


In [5]:
clean_document = []
ignore = ['a','A', "the", "is","from"]
for i in removed_punctuation_documents:
    words = re.sub("[^\w]", " ",  i).split()
    cleaned_text = [w.lower() for w in words if w not in ignore]
    clean_document.append(cleaned_text)
print(clean_document)

[['so', 'so', 'you', 'think', 'you', 'can', 'tell'], ['heaven', 'hell'], ['blue', 'skies', 'pain'], ['can', 'you', 'tell', 'green', 'field'], ['from', 'cold', 'steel', 'rail'], ['smile', 'veil'], ['do', 'you', 'think', 'you', 'can', 'tell']]


In [6]:
frequency_list = []
for i in clean_document:
    frequency_list.append(Counter(i))
    
pprint.pprint(frequency_list)

[Counter({'so': 2, 'you': 2, 'think': 1, 'can': 1, 'tell': 1}),
 Counter({'heaven': 1, 'hell': 1}),
 Counter({'blue': 1, 'skies': 1, 'pain': 1}),
 Counter({'can': 1, 'you': 1, 'tell': 1, 'green': 1, 'field': 1}),
 Counter({'from': 1, 'cold': 1, 'steel': 1, 'rail': 1}),
 Counter({'smile': 1, 'veil': 1}),
 Counter({'you': 2, 'do': 1, 'think': 1, 'can': 1, 'tell': 1})]


In [7]:
count_vector = CountVectorizer(documents)
print(count_vector)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8',
                input=['So, so you think you can tell', 'Heaven from hell?',
                       'Blue skies from pain?', 'Can you tell a green field',
                       'From a cold steel rail?', 'A smile from a veil?',
                       'Do you think you can tell?'],
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)


In [8]:
count_vector.fit(documents)
count_vector.get_feature_names()

['blue',
 'can',
 'cold',
 'do',
 'field',
 'from',
 'green',
 'heaven',
 'hell',
 'pain',
 'rail',
 'skies',
 'smile',
 'so',
 'steel',
 'tell',
 'think',
 'veil',
 'you']

In [9]:
doc_array = count_vector.transform(documents).toarray()
doc_array

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 1, 0, 2],
       [0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 2]])

In [10]:
frequency_matrix = pd.DataFrame(doc_array,index=documents,columns=count_vector.get_feature_names())
frequency_matrix

Unnamed: 0,blue,can,cold,do,field,from,green,heaven,hell,pain,rail,skies,smile,so,steel,tell,think,veil,you
"So, so you think you can tell",0,1,0,0,0,0,0,0,0,0,0,0,0,2,0,1,1,0,2
Heaven from hell?,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0
Blue skies from pain?,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0
Can you tell a green field,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1
From a cold steel rail?,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0
A smile from a veil?,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0
Do you think you can tell?,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,2
