# Traditional Text Representation and Feature Engineering

## Common Imports

In [1]:
import pandas as pd
import re
import numpy as np

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

import nltk
from nltk import sent_tokenize, word_tokenize
STOPWORDS = nltk.corpus.stopwords.words('english')

## One-Hot Representation

In [2]:
sample = '''This will be followed by more of the same with the mist and fog clearing to give a day of unbroken sunshine 
everywhere on Tuesday and temperatures of between 22 and 27 degrees. It will warmest in the midlands. Temperatures 
could reach a September record for the century in Ireland, but are unlikely to surpass the 29.1 degrees recorded 
at Kildare’s Clongowes Wood College on September 1st, 1906. Tuesday, however, will be the last day of the sunshine 
with rain arriving across the country on Wednesday morning. Temperatures will remain as high as 24 degrees with the 
warmth punctuated by heavy showers.'''

In [3]:
cleaned_sample = re.sub("[^A-Za-z0-9\s.]", "" , sample.replace('\n', '').lower())
cleaned_sample

'this will be followed by more of the same with the mist and fog clearing to give a day of unbroken sunshine everywhere on tuesday and temperatures of between 22 and 27 degrees. it will warmest in the midlands. temperatures could reach a september record for the century in ireland but are unlikely to surpass the 29.1 degrees recorded at kildares clongowes wood college on september 1st 1906. tuesday however will be the last day of the sunshine with rain arriving across the country on wednesday morning. temperatures will remain as high as 24 degrees with the warmth punctuated by heavy showers.'

In [4]:
tokens_docs = [word_tokenize(doc) for doc in sent_tokenize(cleaned_sample)]
print(tokens_docs)

[['this', 'will', 'be', 'followed', 'by', 'more', 'of', 'the', 'same', 'with', 'the', 'mist', 'and', 'fog', 'clearing', 'to', 'give', 'a', 'day', 'of', 'unbroken', 'sunshine', 'everywhere', 'on', 'tuesday', 'and', 'temperatures', 'of', 'between', '22', 'and', '27', 'degrees', '.'], ['it', 'will', 'warmest', 'in', 'the', 'midlands', '.'], ['temperatures', 'could', 'reach', 'a', 'september', 'record', 'for', 'the', 'century', 'in', 'ireland', 'but', 'are', 'unlikely', 'to', 'surpass', 'the', '29.1', 'degrees', 'recorded', 'at', 'kildares', 'clongowes', 'wood', 'college', 'on', 'september', '1st', '1906.', 'tuesday', 'however', 'will', 'be', 'the', 'last', 'day', 'of', 'the', 'sunshine', 'with', 'rain', 'arriving', 'across', 'the', 'country', 'on', 'wednesday', 'morning', '.'], ['temperatures', 'will', 'remain', 'as', 'high', 'as', '24', 'degrees', 'with', 'the', 'warmth', 'punctuated', 'by', 'heavy', 'showers', '.']]


In [5]:
word_to_id = {token: idx for idx, token in enumerate(set(word_tokenize(cleaned_sample)))}
word_to_id

{'27': 0,
 'more': 1,
 'with': 2,
 'recorded': 3,
 '1906.': 4,
 'surpass': 5,
 'by': 6,
 'degrees': 7,
 'last': 8,
 'warmest': 9,
 'will': 10,
 'clearing': 11,
 'but': 12,
 'century': 13,
 'wood': 14,
 'mist': 15,
 'the': 16,
 'sunshine': 17,
 'showers': 18,
 'across': 19,
 'could': 20,
 'however': 21,
 'wednesday': 22,
 'to': 23,
 'midlands': 24,
 'record': 25,
 'and': 26,
 'temperatures': 27,
 'reach': 28,
 'unlikely': 29,
 '29.1': 30,
 'remain': 31,
 'clongowes': 32,
 'tuesday': 33,
 'warmth': 34,
 'as': 35,
 '24': 36,
 'a': 37,
 'followed': 38,
 'it': 39,
 'morning': 40,
 'heavy': 41,
 'rain': 42,
 'ireland': 43,
 'everywhere': 44,
 'of': 45,
 'for': 46,
 '1st': 47,
 'on': 48,
 '22': 49,
 'this': 50,
 'september': 51,
 'high': 52,
 'give': 53,
 'fog': 54,
 'punctuated': 55,
 'same': 56,
 'arriving': 57,
 'be': 58,
 'day': 59,
 'college': 60,
 'country': 61,
 'unbroken': 62,
 'at': 63,
 'are': 64,
 'in': 65,
 'kildares': 66,
 'between': 67,
 '.': 68}

In [6]:
token_ids = [[word_to_id[token] for token in tokens_doc] for tokens_doc in tokens_docs]
print(token_ids)

[[50, 10, 58, 38, 6, 1, 45, 16, 56, 2, 16, 15, 26, 54, 11, 23, 53, 37, 59, 45, 62, 17, 44, 48, 33, 26, 27, 45, 67, 49, 26, 0, 7, 68], [39, 10, 9, 65, 16, 24, 68], [27, 20, 28, 37, 51, 25, 46, 16, 13, 65, 43, 12, 64, 29, 23, 5, 16, 30, 7, 3, 63, 66, 32, 14, 60, 48, 51, 47, 4, 33, 21, 10, 58, 16, 8, 59, 45, 16, 17, 2, 42, 57, 19, 16, 61, 48, 22, 40, 68], [27, 10, 31, 35, 52, 35, 36, 7, 2, 16, 34, 55, 6, 41, 18, 68]]


In [7]:
num_words = len(word_to_id)
num_words

69

In [8]:
encoded_sequences = []
for each_seq in token_ids:
    encoded_tokens = []
    for each_token_id in each_seq:
        a = np.zeros((1, num_words)) # create a matrix
        a[0, each_token_id] = 1 # replace the position with 1
        encoded_tokens.append(a)
    encoded_sequences.append(encoded_tokens)

In [9]:
encoded_sequences

[[array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0.]]),
  array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0.]]),
  array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0.]]),
  array([[0.,

## Bag-of-Words (BoW)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
vectorizer = CountVectorizer(stop_words=STOPWORDS)

In [12]:
vectorizer.fit([sample])

CountVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...])

In [13]:
vectorizer.vocabulary_ 

{'followed': 18,
 'mist': 27,
 'fog': 17,
 'clearing': 9,
 'give': 19,
 'day': 14,
 'unbroken': 41,
 'sunshine': 37,
 'everywhere': 16,
 'tuesday': 40,
 'temperatures': 39,
 '22': 2,
 '27': 4,
 'degrees': 15,
 'warmest': 43,
 'midlands': 26,
 'could': 12,
 'reach': 31,
 'september': 35,
 'record': 32,
 'century': 8,
 'ireland': 23,
 'unlikely': 42,
 'surpass': 38,
 '29': 5,
 'recorded': 33,
 'kildare': 24,
 'clongowes': 10,
 'wood': 46,
 'college': 11,
 '1st': 1,
 '1906': 0,
 'however': 22,
 'last': 25,
 'rain': 30,
 'arriving': 7,
 'across': 6,
 'country': 13,
 'wednesday': 45,
 'morning': 28,
 'remain': 34,
 'high': 21,
 '24': 3,
 'warmth': 44,
 'punctuated': 29,
 'heavy': 20,
 'showers': 36}

**Note:**

A mapping of terms to feature indices. Logic is similar to word_to_id in One Hot Encoding. Since the count vectorizer was initiated with instrcuting to remove the stopwords, the vocab size decreased.

In [14]:
len(vectorizer.vocabulary_)

47

In [15]:
cv_vector = vectorizer.transform([sample])

In [16]:
cv_vector.shape

(1, 47)

In [17]:
cv_vector.toarray()

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 3, 2, 1, 1, 1,
        1, 1, 1]], dtype=int64)

The size of the array is the final vocab size (obtained with vectorizer.vocabulary_) and the counts of the tokens' in the text is displayed. For example, the tokne at 14th position appeared twice. In the vocab, at 14 we have the word 'day' and it appears twice in the sample text. Similarly, the word 'degrees' occur thrice, and is in the 15th position which indicates '3' in the cv_vector array.

**Sample Text**

'This will be followed by more of the same with the mist and fog clearing to give a **day** of unbroken sunshine everywhere on Tuesday and temperatures of between 22 and 27 degrees. It will warmest in the midlands. Temperatures could reach a September record for the century in Ireland, but are unlikely to surpass the 29.1 degrees recorded at Kildare’s Clongowes Wood College on September 1st, 1906. Tuesday, however, will be the last **day** of the sunshine with rain arriving across the country on Wednesday morning. Temperatures will remain as high as 24 degrees with the warmth punctuated by heavy showers.'

## N-Grams

In [25]:
vectorizer = CountVectorizer(ngram_range=(1, 2))
cv_vector = vectorizer.fit_transform([sample])

In [26]:
cv_vector = cv_vector.toarray()
vocab = vectorizer.get_feature_names()
pd.DataFrame(cv_vector, columns=vocab)

Unnamed: 0,1906,1906 tuesday,1st,1st 1906,22,22 and,24,24 degrees,27,27 degrees,29,29 degrees,across,across the,and,and 27,and fog,and temperatures,are,are unlikely,arriving,arriving across,as,as 24,as high,at,at kildare,be,be followed,be the,between,between 22,but,but are,by,by heavy,by more,century,century in,clearing,clearing to,clongowes,clongowes wood,college,college on,could,could reach,country,country on,day,day of,degrees,degrees it,degrees recorded,degrees with,everywhere,everywhere on,fog,fog clearing,followed,followed by,for,for the,give,give day,heavy,heavy showers,high,high as,however,however will,in,in ireland,in the,ireland,ireland but,it,it will,kildare,kildare clongowes,last,last day,midlands,midlands temperatures,mist,mist and,more,more of,morning,morning temperatures,of,of between,of the,of unbroken,on,on september,on tuesday,on wednesday,punctuated,punctuated by,rain,rain arriving,reach,reach september,record,record for,recorded,recorded at,remain,remain as,same,same with,september,september 1st,september record,showers,sunshine,sunshine everywhere,sunshine with,surpass,surpass the,temperatures,temperatures could,temperatures of,temperatures will,the,the 29,the century,the country,the last,the midlands,the mist,the same,the sunshine,the warmth,this,this will,to,to give,to surpass,tuesday,tuesday and,tuesday however,unbroken,unbroken sunshine,unlikely,unlikely to,warmest,warmest in,warmth,warmth punctuated,wednesday,wednesday morning,will,will be,will remain,will warmest,with,with rain,with the,wood,wood college
0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,2,1,1,1,1,2,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,4,1,2,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,2,1,1,1,1,3,1,1,1,9,1,1,1,1,1,1,1,1,1,1,1,2,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,4,2,1,1,3,1,2,1,1


In [28]:
from nltk import ngrams
n = 3

for grams in ngrams(word_tokenize(sample), n):
    print(grams)

('This', 'will', 'be')
('will', 'be', 'followed')
('be', 'followed', 'by')
('followed', 'by', 'more')
('by', 'more', 'of')
('more', 'of', 'the')
('of', 'the', 'same')
('the', 'same', 'with')
('same', 'with', 'the')
('with', 'the', 'mist')
('the', 'mist', 'and')
('mist', 'and', 'fog')
('and', 'fog', 'clearing')
('fog', 'clearing', 'to')
('clearing', 'to', 'give')
('to', 'give', 'a')
('give', 'a', 'day')
('a', 'day', 'of')
('day', 'of', 'unbroken')
('of', 'unbroken', 'sunshine')
('unbroken', 'sunshine', 'everywhere')
('sunshine', 'everywhere', 'on')
('everywhere', 'on', 'Tuesday')
('on', 'Tuesday', 'and')
('Tuesday', 'and', 'temperatures')
('and', 'temperatures', 'of')
('temperatures', 'of', 'between')
('of', 'between', '22')
('between', '22', 'and')
('22', 'and', '27')
('and', '27', 'degrees')
('27', 'degrees', '.')
('degrees', '.', 'It')
('.', 'It', 'will')
('It', 'will', 'warmest')
('will', 'warmest', 'in')
('warmest', 'in', 'the')
('in', 'the', 'midlands')
('the', 'midlands', '.')
('

## Term Frequency - Inverse Document Frequent

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [30]:
tf_idf = TfidfVectorizer(min_df=0.,
                         max_df=1.,
                         use_idf=True,
                         stop_words=STOPWORDS)

In [35]:
tfidf_vector = tf_idf.fit([sample])
tfidf_vector.vocabulary_

{'followed': 18,
 'mist': 27,
 'fog': 17,
 'clearing': 9,
 'give': 19,
 'day': 14,
 'unbroken': 41,
 'sunshine': 37,
 'everywhere': 16,
 'tuesday': 40,
 'temperatures': 39,
 '22': 2,
 '27': 4,
 'degrees': 15,
 'warmest': 43,
 'midlands': 26,
 'could': 12,
 'reach': 31,
 'september': 35,
 'record': 32,
 'century': 8,
 'ireland': 23,
 'unlikely': 42,
 'surpass': 38,
 '29': 5,
 'recorded': 33,
 'kildare': 24,
 'clongowes': 10,
 'wood': 46,
 'college': 11,
 '1st': 1,
 '1906': 0,
 'however': 22,
 'last': 25,
 'rain': 30,
 'arriving': 7,
 'across': 6,
 'country': 13,
 'wednesday': 45,
 'morning': 28,
 'remain': 34,
 'high': 21,
 '24': 3,
 'warmth': 44,
 'punctuated': 29,
 'heavy': 20,
 'showers': 36}

In [36]:
len(tfidf_vector.vocabulary_)

47

In [37]:
tfidf_vector = tf_idf.transform([sample])
tfidf_vector.toarray()

array([[0.11547005, 0.11547005, 0.11547005, 0.11547005, 0.11547005,
        0.11547005, 0.11547005, 0.11547005, 0.11547005, 0.11547005,
        0.11547005, 0.11547005, 0.11547005, 0.11547005, 0.23094011,
        0.34641016, 0.11547005, 0.11547005, 0.11547005, 0.11547005,
        0.11547005, 0.11547005, 0.11547005, 0.11547005, 0.11547005,
        0.11547005, 0.11547005, 0.11547005, 0.11547005, 0.11547005,
        0.11547005, 0.11547005, 0.11547005, 0.11547005, 0.11547005,
        0.23094011, 0.11547005, 0.23094011, 0.11547005, 0.34641016,
        0.23094011, 0.11547005, 0.11547005, 0.11547005, 0.11547005,
        0.11547005, 0.11547005]])

In [38]:
tfidf_vector.toarray().shape

(1, 47)

In [39]:
vocab = tf_idf.get_feature_names()
pd.DataFrame(np.round(tfidf_vector.toarray(), 2), columns=vocab)

Unnamed: 0,1906,1st,22,24,27,29,across,arriving,century,clearing,clongowes,college,could,country,day,degrees,everywhere,fog,followed,give,heavy,high,however,ireland,kildare,last,midlands,mist,morning,punctuated,rain,reach,record,recorded,remain,september,showers,sunshine,surpass,temperatures,tuesday,unbroken,unlikely,warmest,warmth,wednesday,wood
0,0.12,0.12,0.12,0.12,0.12,0.12,0.12,0.12,0.12,0.12,0.12,0.12,0.12,0.12,0.23,0.35,0.12,0.12,0.12,0.12,0.12,0.12,0.12,0.12,0.12,0.12,0.12,0.12,0.12,0.12,0.12,0.12,0.12,0.12,0.12,0.23,0.12,0.23,0.12,0.35,0.23,0.12,0.12,0.12,0.12,0.12,0.12


## Count Vectorizer and Tf-idf on sent_tokenized(sample)

In [40]:
from sklearn.feature_extraction.text import CountVectorizer

In [41]:
vectorizer = CountVectorizer() # Not using stopwords to demo the behavior with multiple token occurrence in one document

In [42]:
vectorizer.fit(sent_tokenize(cleaned_sample)) # here each element in the sent_tokenize list is a doc

CountVectorizer()

In [43]:
vectorizer.transform(sent_tokenize(cleaned_sample)).toarray()

array([[0, 0, 1, 0, 1, 0, 0, 3, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 3, 1, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1,
        0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
        0],
       [1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1,
        1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 2, 0,
        1, 1, 1, 1, 0, 0, 2, 0, 1, 1, 1, 5, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1,
        1],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
        0]], dtype=int64)

In [44]:
encoded_vector = vectorizer.transform(sent_tokenize(cleaned_sample)).toarray()
len(encoded_vector[0])

67

In [45]:
len(vectorizer.vocabulary_)

67

In [46]:
vectorizer.vocabulary_

{'this': 56,
 'will': 64,
 'be': 12,
 'followed': 26,
 'by': 15,
 'more': 39,
 'of': 41,
 'the': 55,
 'same': 49,
 'with': 65,
 'mist': 38,
 'and': 7,
 'fog': 25,
 'clearing': 17,
 'to': 57,
 'give': 28,
 'day': 22,
 'unbroken': 59,
 'sunshine': 52,
 'everywhere': 24,
 'on': 42,
 'tuesday': 58,
 'temperatures': 54,
 'between': 13,
 '22': 2,
 '27': 4,
 'degrees': 23,
 'it': 34,
 'warmest': 61,
 'in': 32,
 'midlands': 37,
 'could': 20,
 'reach': 45,
 'september': 50,
 'record': 46,
 'for': 27,
 'century': 16,
 'ireland': 33,
 'but': 14,
 'are': 8,
 'unlikely': 60,
 'surpass': 53,
 '29': 5,
 'recorded': 47,
 'at': 11,
 'kildares': 35,
 'clongowes': 18,
 'wood': 66,
 'college': 19,
 '1st': 1,
 '1906': 0,
 'however': 31,
 'last': 36,
 'rain': 44,
 'arriving': 9,
 'across': 6,
 'country': 21,
 'wednesday': 63,
 'morning': 40,
 'remain': 48,
 'as': 10,
 'high': 30,
 '24': 3,
 'warmth': 62,
 'punctuated': 43,
 'heavy': 29,
 'showers': 51}

**Note:**

All document is of same length as the vocabulary.

Explanation - 
1. 1st document, 1st word or position 0 = 0 ; because the word at index 0 is 1906 in the vocabulary. 1906 is not there in this document. Thus, it is zero.

2. 1st document, 2nd word or position 1 = 0 ; because the word at index 1 is 1st in the vocabulary. 1st is not there in this document. Thus, it is zero.

3. 1st document, 3rd word or position 2 = 1 ; because the word at index 2 is 22 in the vocabulary. 22 is there in this document. Thus, it is 1.

3. 1st document, 8th word or position 7  = 3 ; because the word at index 7 is 'and' in the vocabulary. 'and' is there in this document thrice. Thus, it is 3.

**Ref**

'This will be followed by more of the same with the mist and fog clearing to give a day of unbroken sunshine \neverywhere on Tuesday and temperatures of between 22 and 27 degrees.',

 'It will warmest in the midlands.',
 
 'Temperatures \ncould reach a September record for the century in Ireland, but are unlikely to surpass the 29.1 degrees recorded \nat Kildare’s Clongowes Wood College on September 1st, 1906.',
 
 'Tuesday, however, will be the last day of the sunshine \nwith rain arriving across the country on Wednesday morning.',
 'Temperatures will remain as high as 24 degrees with the \nwarmth punctuated by heavy showers.'

In [47]:
tf_idf = TfidfVectorizer()
tf_idf.fit(sent_tokenize(cleaned_sample))

TfidfVectorizer()

In [48]:
tf_idf.transform(sent_tokenize(cleaned_sample)).toarray()

array([[0.        , 0.        , 0.17017904, 0.        , 0.17017904,
        0.        , 0.        , 0.51053712, 0.        , 0.        ,
        0.        , 0.        , 0.13417111, 0.17017904, 0.        ,
        0.13417111, 0.        , 0.17017904, 0.        , 0.        ,
        0.        , 0.        , 0.13417111, 0.10862308, 0.17017904,
        0.17017904, 0.17017904, 0.        , 0.17017904, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.17017904, 0.17017904,
        0.        , 0.40251333, 0.13417111, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.17017904,
        0.        , 0.        , 0.13417111, 0.        , 0.10862308,
        0.17761297, 0.17017904, 0.13417111, 0.13417111, 0.17017904,
        0.        , 0.        , 0.        , 0.        , 0.08880648,
        0.10862308, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.     

In [49]:
vocab = tf_idf.get_feature_names()
pd.DataFrame(np.round(tf_idf.transform(sent_tokenize(cleaned_sample)).toarray(), 2), columns=vocab)

Unnamed: 0,1906,1st,22,24,27,29,across,and,are,arriving,as,at,be,between,but,by,century,clearing,clongowes,college,could,country,day,degrees,everywhere,fog,followed,for,give,heavy,high,however,in,ireland,it,kildares,last,midlands,mist,more,morning,of,on,punctuated,rain,reach,record,recorded,remain,same,september,showers,sunshine,surpass,temperatures,the,this,to,tuesday,unbroken,unlikely,warmest,warmth,wednesday,will,with,wood
0,0.0,0.0,0.17,0.0,0.17,0.0,0.0,0.51,0.0,0.0,0.0,0.0,0.13,0.17,0.0,0.13,0.0,0.17,0.0,0.0,0.0,0.0,0.13,0.11,0.17,0.17,0.17,0.0,0.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.17,0.0,0.4,0.13,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.13,0.0,0.11,0.18,0.17,0.13,0.13,0.17,0.0,0.0,0.0,0.0,0.09,0.11,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.39,0.0,0.49,0.0,0.0,0.49,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.26,0.0,0.0,0.0,0.0,0.0,0.49,0.0,0.0,0.26,0.0,0.0
2,0.15,0.15,0.0,0.0,0.0,0.15,0.15,0.0,0.15,0.15,0.0,0.15,0.12,0.0,0.15,0.0,0.15,0.0,0.15,0.15,0.15,0.15,0.12,0.09,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.15,0.12,0.15,0.0,0.15,0.15,0.0,0.0,0.0,0.15,0.12,0.23,0.0,0.15,0.15,0.15,0.15,0.0,0.0,0.29,0.0,0.12,0.15,0.09,0.38,0.0,0.12,0.12,0.0,0.15,0.0,0.0,0.15,0.08,0.09,0.15
3,0.0,0.0,0.0,0.27,0.0,0.0,0.0,0.0,0.0,0.0,0.55,0.0,0.0,0.0,0.0,0.22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.0,0.0,0.0,0.27,0.27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.27,0.0,0.0,0.0,0.0,0.27,0.0,0.0,0.27,0.0,0.0,0.17,0.14,0.0,0.0,0.0,0.0,0.0,0.0,0.27,0.0,0.14,0.17,0.0
