In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt#plotting library for the Python programming language
#and its numerical mathematics extension NumPy
import nltk

In [15]:
#Download the NLTK 'stopwords' corpus that stop_words is loaded from further down
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
#Building a corpus of documents

In [3]:
#Toy corpus: eight short documents about weather, animals and food
corpus = [
    'The sky is blue and beautiful.',
    'Love this blue and beautiful sky!',
    'The quick brown fox jumps over the lazy dog.',
    "A king's breakfast has sausages, ham, bacon, eggs, toast, beans",
    'I love green eggs, ham, sausages and bacon!',
    'The brown fox is quick and the blue dog is lazy!',
    'The sky is very blue and the sky is very beautiful today',
    'The dog is lazy but the brown fox is quick!',
]

In [4]:
#Category of each document, listed in the same order as the corpus
labels = [
    'weather', 'weather',
    'animals',
    'food', 'food',
    'animals',
    'weather',
    'animals',
]

In [5]:
#Rebind corpus from a Python list to a NumPy array of strings
corpus=np.array(corpus)

In [6]:
#Show the array; note the fixed-width string dtype at the end of the repr
corpus

array(['The sky is blue and beautiful.',
       'Love this blue and beautiful sky!',
       'The quick brown fox jumps over the lazy dog.',
       "A king's breakfast has sausages, ham, bacon, eggs, toast, beans",
       'I love green eggs, ham, sausages and bacon!',
       'The brown fox is quick and the blue dog is lazy!',
       'The sky is very blue and the sky is very beautiful today',
       'The dog is lazy but the brown fox is quick!'], dtype='<U63')

In [7]:
#dtype '<U63': '<' = little-endian, 'U' = Unicode string, 63 = maximum length in characters - see https://docs.scipy.org/doc/numpy-1.15.4/reference/arrays.dtypes.html

In [9]:
#Two-column frame: the text of each document alongside its category label
corpus_df = pd.DataFrame({
    'Document': corpus,
    'Category': labels,
})

In [10]:
#Display the documents next to their category labels
corpus_df

Unnamed: 0,Document,Category
0,The sky is blue and beautiful.,weather
1,Love this blue and beautiful sky!,weather
2,The quick brown fox jumps over the lazy dog.,animals
3,"A king's breakfast has sausages, ham, bacon, e...",food
4,"I love green eggs, ham, sausages and bacon!",food
5,The brown fox is quick and the blue dog is lazy!,animals
6,The sky is very blue and the sky is very beaut...,weather
7,The dog is lazy but the brown fox is quick!,animals


In [11]:
#Keep just the two columns of interest, in this order, then display the frame
corpus_df = corpus_df.loc[:, ['Document', 'Category']]
corpus_df

Unnamed: 0,Document,Category
0,The sky is blue and beautiful.,weather
1,Love this blue and beautiful sky!,weather
2,The quick brown fox jumps over the lazy dog.,animals
3,"A king's breakfast has sausages, ham, bacon, e...",food
4,"I love green eggs, ham, sausages and bacon!",food
5,The brown fox is quick and the blue dog is lazy!,animals
6,The sky is very blue and the sky is very beaut...,weather
7,The dog is lazy but the brown fox is quick!,animals


In [12]:
#Preprocessing Text Corpus

In [13]:
#Tokenizer used by normalize_document to split a cleaned document into tokens
wpt=nltk.WordPunctTokenizer()

In [16]:
#English stop words from the NLTK 'stopwords' corpus (nltk.download('stopwords') must have run first);
#normalize_document drops every token that appears in it
stop_words=nltk.corpus.stopwords.words('english')

In [17]:
def normalize_document(doc):
  """Normalize one document for bag-of-words / TF-IDF feature extraction.

  Every character that is not an ASCII letter or whitespace is replaced by a
  space, the text is lowercased and stripped, tokenized with the module-level
  ``wpt`` tokenizer, and tokens found in the module-level ``stop_words`` are
  dropped. The surviving tokens are re-joined with single spaces.

  Parameters
  ----------
  doc : str
      Text of a single document.

  Returns
  -------
  str
      The remaining tokens separated by single spaces ('' if none remain).
  """
  #Replace special characters with a space.
  #The flags MUST be passed by keyword: the 4th positional argument of re.sub
  #is `count`, so passing re.I|re.A there (== 258) silently ignored the flags
  #and capped the number of substitutions at 258 per document.
  doc=re.sub(r'[^a-zA-Z\s]',' ',doc,flags=re.I|re.A)
  doc=doc.lower()
  #remove whitespace at the beginning and end of the string
  doc=doc.strip()
  #tokenize document
  tokens=wpt.tokenize(doc)
  #filter stopwords out of document (runs after lowercasing, so 'The' is matched as 'the')
  filtered_tokens=(token for token in tokens if token not in stop_words)
  #re-create document from filtered tokens
  return ' '.join(filtered_tokens)

In [18]:
#The basic preprocessing pipeline is ready, so we now apply it to our sample corpus

In [19]:
#Wrap normalize_document with np.vectorize so it can be called on the whole corpus array in one go
#NOTE(review): despite its name, vectorized_corpus is the wrapped function, not a corpus
vectorized_corpus=np.vectorize(normalize_document)
vectorized_corpus

<numpy.vectorize at 0x7f87cf3d69b0>

In [20]:
#Apply the normalizer to every document; the result is an array of cleaned strings
normalized_corpus=vectorized_corpus(corpus)
normalized_corpus

array(['sky blue beautiful', 'love blue beautiful sky',
       'quick brown fox jumps lazy dog',
       'king breakfast sausages ham bacon eggs toast beans',
       'love green eggs ham sausages bacon',
       'brown fox quick blue dog lazy', 'sky blue sky beautiful today',
       'dog lazy brown fox quick'], dtype='<U50')

In [21]:
#Uncomment if you do not have scikit-learn (the PyPI package is named scikit-learn; the import name is sklearn)
#%pip install --user scikit-learn

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

In [23]:
#Method 1. Get BOW features

In [24]:
#Bag-of-words counter that keeps the whole vocabulary.
#min_df: drop terms that occur in too few documents (0.0 = no lower cut-off)
#max_df: drop terms that occur in too many documents (1.0 = no upper cut-off)
cv = CountVectorizer(min_df=0.0, max_df=1.0)

In [26]:
#Learn the vocabulary and build the document-term count matrix.
#The result is in sparse format: the matrix is mostly zeros, so only non-zero counts are stored.
cv_matrix=cv.fit_transform(normalized_corpus)
cv_matrix

<8x20 sparse matrix of type '<class 'numpy.int64'>'
	with 42 stored elements in Compressed Sparse Row format>

In [27]:
#view non-zero feature positions in the sparse matrix:
#each printed line is (document row, vocabulary column) followed by the count
print(cv_matrix)

  (0, 17)	1
  (0, 3)	1
  (0, 2)	1
  (1, 17)	1
  (1, 3)	1
  (1, 2)	1
  (1, 14)	1
  (2, 15)	1
  (2, 5)	1
  (2, 8)	1
  (2, 11)	1
  (2, 13)	1
  (2, 6)	1
  (3, 12)	1
  (3, 4)	1
  (3, 16)	1
  (3, 10)	1
  (3, 0)	1
  (3, 7)	1
  (3, 18)	1
  (3, 1)	1
  (4, 14)	1
  (4, 16)	1
  (4, 10)	1
  (4, 0)	1
  (4, 7)	1
  (4, 9)	1
  (5, 3)	1
  (5, 15)	1
  (5, 5)	1
  (5, 8)	1
  (5, 13)	1
  (5, 6)	1
  (6, 17)	2
  (6, 3)	1
  (6, 2)	1
  (6, 19)	1
  (7, 15)	1
  (7, 5)	1
  (7, 8)	1
  (7, 13)	1
  (7, 6)	1


In [28]:
#view dense representation; may run out of memory on a large corpus because every zero is stored
#Built from cv.transform(...) rather than cv_matrix.toarray() so this cell can be re-run:
#once cv_matrix has been rebound to an ndarray it no longer has a .toarray() method.
cv_matrix=cv.transform(normalized_corpus).toarray()
cv_matrix

array([[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0],
       [1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1],
       [0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0]])

In [29]:
#vocabulary learned by the fitted CountVectorizer (one entry per matrix column)
vocab = cv.get_feature_names_out()
#document-term matrix: one row per document, one column per vocabulary word
pd.DataFrame(data=cv_matrix, columns=vocab)

Unnamed: 0,bacon,beans,beautiful,blue,breakfast,brown,dog,eggs,fox,green,ham,jumps,king,lazy,love,quick,sausages,sky,toast,today
0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
2,0,0,0,0,0,1,1,0,1,0,0,1,0,1,0,1,0,0,0,0
3,1,1,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0,1,0
4,1,0,0,0,0,0,0,1,0,1,1,0,0,0,1,0,1,0,0,0
5,0,0,0,1,0,1,1,0,1,0,0,0,0,1,0,1,0,0,0,0
6,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1
7,0,0,0,0,0,1,1,0,1,0,0,0,0,1,0,1,0,0,0,0


In [30]:
#ngram_range=(2,2) extracts bigrams only;
#set it to (1,2) instead to get unigrams as well as bigrams
bv = CountVectorizer(ngram_range=(2, 2))
bv_matrix = bv.fit_transform(normalized_corpus).toarray()
vocab = bv.get_feature_names_out()
pd.DataFrame(data=bv_matrix, columns=vocab)

Unnamed: 0,bacon eggs,beautiful sky,beautiful today,blue beautiful,blue dog,blue sky,breakfast sausages,brown fox,dog lazy,eggs ham,...,lazy dog,love blue,love green,quick blue,quick brown,sausages bacon,sausages ham,sky beautiful,sky blue,toast beans
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,1,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,1,0,0,0,0,0
3,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,1
4,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,1,0,0,0,0
5,0,0,0,0,1,0,0,1,1,0,...,0,0,0,1,0,0,0,0,0,0
6,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
7,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
#Method 2.TF-IDF

In [34]:
from sklearn.feature_extraction.text import TfidfTransformer
#Re-weight the term counts in cv_matrix with TF-IDF.
#norm='l2': each row is scaled so that the sum of squares of its elements is 1
#(norm='l1' would make the sum of absolute values 1 instead).
#With l2-normalised rows, the dot product of two rows equals their cosine similarity.
tt = TfidfTransformer(use_idf=True, norm='l2')
tt_matrix = tt.fit_transform(cv_matrix).toarray()
vocab = cv.get_feature_names_out()
pd.DataFrame(tt_matrix.round(2), columns=vocab)

Unnamed: 0,bacon,beans,beautiful,blue,breakfast,brown,dog,eggs,fox,green,ham,jumps,king,lazy,love,quick,sausages,sky,toast,today
0,0.0,0.0,0.6,0.53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0
1,0.0,0.0,0.49,0.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57,0.0,0.0,0.49,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.38,0.38,0.0,0.38,0.0,0.0,0.53,0.0,0.38,0.0,0.38,0.0,0.0,0.0,0.0
3,0.32,0.38,0.0,0.0,0.38,0.0,0.0,0.32,0.0,0.0,0.32,0.0,0.38,0.0,0.0,0.0,0.32,0.0,0.38,0.0
4,0.39,0.0,0.0,0.0,0.0,0.0,0.0,0.39,0.0,0.47,0.39,0.0,0.0,0.0,0.39,0.0,0.39,0.0,0.0,0.0
5,0.0,0.0,0.0,0.37,0.0,0.42,0.42,0.0,0.42,0.0,0.0,0.0,0.0,0.42,0.0,0.42,0.0,0.0,0.0,0.0
6,0.0,0.0,0.36,0.32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.72,0.0,0.5
7,0.0,0.0,0.0,0.0,0.0,0.45,0.45,0.0,0.45,0.0,0.0,0.0,0.0,0.45,0.0,0.45,0.0,0.0,0.0,0.0


In [35]:
#TF-IDF for Raw Data

In [36]:
#Calculate TF-IDF straight from the text documents with TfidfVectorizer
#(no separately computed count matrix is needed)
from sklearn.feature_extraction.text import TfidfVectorizer
#min_df / max_df given as floats are proportions of documents:
#  min_df=0.2 ignores terms that appear in fewer than 20% of the documents
#  max_df=0.8 ignores terms that appear in more than 80% of the documents (1.0 would ignore nothing)
tv = TfidfVectorizer(
    min_df=0.2,
    max_df=0.8,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
)
tv_matrix = tv.fit_transform(normalized_corpus).toarray()
vocab = tv.get_feature_names_out()
pd.DataFrame(tv_matrix.round(2), columns=vocab)

Unnamed: 0,bacon,beautiful,blue,brown,dog,eggs,fox,ham,lazy,love,quick,sausages,sky
0,0.0,0.6,0.53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6
1,0.0,0.49,0.43,0.0,0.0,0.0,0.0,0.0,0.0,0.57,0.0,0.0,0.49
2,0.0,0.0,0.0,0.45,0.45,0.0,0.45,0.0,0.45,0.0,0.45,0.0,0.0
3,0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.0,0.0,0.0,0.5,0.0
4,0.45,0.0,0.0,0.0,0.0,0.45,0.0,0.45,0.0,0.45,0.0,0.45,0.0
5,0.0,0.0,0.37,0.42,0.42,0.0,0.42,0.0,0.42,0.0,0.42,0.0,0.0
6,0.0,0.42,0.37,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.83
7,0.0,0.0,0.0,0.45,0.45,0.0,0.45,0.0,0.45,0.0,0.45,0.0,0.0
