In [1]:
import string
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

In [2]:
punct = [word for word in string.punctuation]
print(punct)

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


In [3]:
stop_words = list(ENGLISH_STOP_WORDS)
print(stop_words)

['across', 'themselves', 'front', 'therefore', 'below', 'first', 'though', 'both', 'find', 'should', 'thence', 'one', 'get', 'take', 'top', 'interest', 'fill', 'over', 'you', 'above', 'eleven', 'those', 'whoever', 'her', 'what', 'may', 'until', 'often', 'amongst', 'this', 'has', 'seem', 'few', 'because', 'must', 'me', 'nothing', 'through', 'ours', 'further', 'could', 'so', 'etc', 'hers', 'empty', 'back', 'ever', 'give', 'made', 'him', 'its', 'no', 'than', 'part', 'are', 'mine', 'fire', 'within', 'eg', 'everyone', 'might', 'sixty', 'therein', 'seems', 'side', 'somehow', 'own', 'latter', 'twelve', 'become', 'something', 'too', 'hereby', 'otherwise', 'already', 'ten', 'myself', 'their', 'where', 'amoungst', 'am', 'anywhere', 'forty', 'more', 'onto', 'almost', 'becomes', 'mill', 'becoming', 'wherever', 'whereupon', 'itself', 'whereby', 'being', 'very', 'himself', 'afterwards', 'most', 'however', 'mostly', 'thus', 'that', 'all', 'whose', 'fifty', 'couldnt', 'after', 'although', 'whole', 'if

In [4]:
corpus = [
    "John like horror movie.",
   "Ryan watches movie and dramatic movies."
]
corpus

['John like horror movie.', 'Ryan watches movie and dramatic movies.']

In [5]:
pre_proc_vocab = []
for doc in corpus:
    doc_as_list = doc.split(" ")
    for word in doc_as_list:
        word_lower = str(word).lower().replace('.','') 
        if word_lower not in stop_words:
            pre_proc_vocab.append(word_lower)
            
print(pre_proc_vocab)

['john', 'like', 'horror', 'movie', 'ryan', 'watches', 'movie', 'dramatic', 'movies']


In [6]:
corpus

['John like horror movie.', 'Ryan watches movie and dramatic movies.']

In [7]:
pd.set_option('display.max_columns',70)

#### **Applying Count Vectoriser**

In [60]:
cv = CountVectorizer(binary=True,analyzer='word',ngram_range=(1,3),max_df=1)
cv.fit(corpus)

CountVectorizer(binary=True, max_df=1, ngram_range=(1, 3))

In [61]:
cv_corp_results = cv.transform(corpus)
cv_corp_results

<2x22 sparse matrix of type '<class 'numpy.int64'>'
	with 22 stored elements in Compressed Sparse Row format>

In [62]:
pd.DataFrame(cv_corp_results.todense(),columns=cv.get_feature_names())

Unnamed: 0,and,and dramatic,and dramatic movies,dramatic,dramatic movies,horror,horror movie,john,john like,john like horror,like,like horror,like horror movie,movie and,movie and dramatic,movies,ryan,ryan watches,ryan watches movie,watches,watches movie,watches movie and
0,0,0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1


#### **Applying Tf-IDF Vectoriser**

In [67]:
tf = TfidfVectorizer(ngram_range=(1,2),analyzer='word',binary=False,max_df=2)
tf.fit(corpus)

TfidfVectorizer(max_df=2, ngram_range=(1, 2))

In [68]:
tf.vocabulary_, tf.idf_,

({'john': 6,
  'like': 8,
  'horror': 4,
  'movie': 10,
  'john like': 7,
  'like horror': 9,
  'horror movie': 5,
  'ryan': 13,
  'watches': 15,
  'and': 0,
  'dramatic': 2,
  'movies': 12,
  'ryan watches': 14,
  'watches movie': 16,
  'movie and': 11,
  'and dramatic': 1,
  'dramatic movies': 3},
 array([1.40546511, 1.40546511, 1.40546511, 1.40546511, 1.40546511,
        1.40546511, 1.40546511, 1.40546511, 1.40546511, 1.40546511,
        1.        , 1.40546511, 1.40546511, 1.40546511, 1.40546511,
        1.40546511, 1.40546511]))

In [69]:
tfidf_corp_results = tf.transform(corpus)
tfidf_corp_results

<2x17 sparse matrix of type '<class 'numpy.float64'>'
	with 18 stored elements in Compressed Sparse Row format>

In [70]:
pd.DataFrame(tfidf_corp_results.toarray(),columns=tf.get_feature_names())

Unnamed: 0,and,and dramatic,dramatic,dramatic movies,horror,horror movie,john,john like,like,like horror,movie,movie and,movies,ryan,ryan watches,watches,watches movie
0,0.0,0.0,0.0,0.0,0.392044,0.392044,0.392044,0.392044,0.392044,0.392044,0.278943,0.0,0.0,0.0,0.0,0.0,0.0
1,0.308515,0.308515,0.308515,0.308515,0.0,0.0,0.0,0.0,0.0,0.0,0.219511,0.308515,0.308515,0.308515,0.308515,0.308515,0.308515


In [63]:
tf = TfidfVectorizer(ngram_range=(1,2),analyzer='word',binary=False,min_df=2)
tf.fit(corpus)

TfidfVectorizer(min_df=2, ngram_range=(1, 2))

In [64]:
tf.vocabulary_, tf.idf_,

({'movie': 0}, array([1.]))

In [65]:
tfidf_corp_results = tf.transform(corpus)
tfidf_corp_results

<2x1 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [66]:
pd.DataFrame(tfidf_corp_results.toarray(),columns=tf.get_feature_names())

Unnamed: 0,movie
0,1.0
1,1.0


#### **Applying Tf-IDF Transformer**

In [198]:
tf_idf_tr = TfidfTransformer(sublinear_tf=True)
tf_idf_tr.fit(cv_corp_results)

TfidfTransformer(sublinear_tf=True)

In [199]:
tfidf_transformer_results = tf_idf_tr.transform(cv_corp_results)
tfidf_transformer_results

<2x23 sparse matrix of type '<class 'numpy.float64'>'
	with 24 stored elements in Compressed Sparse Row format>

In [200]:
pd.DataFrame(tfidf_transformer_results.todense(),columns=tf.get_feature_names())

Unnamed: 0,and,and dramatic,and dramatic movies,dramatic,dramatic movies,horror,horror movie,john,john like,john like horror,like,like horror,like horror movie,movie,movie and,movie and dramatic,movies,ryan,ryan watches,ryan watches movie,watches,watches movie,watches movie and
0,0.0,0.0,0.0,0.0,0.0,0.342871,0.342871,0.342871,0.342871,0.342871,0.342871,0.342871,0.342871,0.243956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.262556,0.262556,0.262556,0.262556,0.262556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.186811,0.262556,0.262556,0.262556,0.262556,0.262556,0.262556,0.262556,0.262556,0.262556


#### **On Unseen Dataset**

In [201]:
test_data = ["his name is horror and he likes horror movies.",
            "i like working on python-like languages."] 

In [202]:
tt = pd.DataFrame(cv.transform(test_data).todense(),columns=cv.get_feature_names())
tt

Unnamed: 0,and,and dramatic,and dramatic movies,dramatic,dramatic movies,horror,horror movie,john,john like,john like horror,like,like horror,like horror movie,movie,movie and,movie and dramatic,movies,ryan,ryan watches,ryan watches movie,watches,watches movie,watches movie and
0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [203]:
pd.DataFrame(tf.transform(test_data).todense(),columns=tf.get_feature_names())

Unnamed: 0,and,and dramatic,and dramatic movies,dramatic,dramatic movies,horror,horror movie,john,john like,john like horror,like,like horror,like horror movie,movie,movie and,movie and dramatic,movies,ryan,ryan watches,ryan watches movie,watches,watches movie,watches movie and
0,0.57735,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [204]:
pd.DataFrame(tf_idf_tr.transform(tt).todense(),columns=tf.get_feature_names())

Unnamed: 0,and,and dramatic,and dramatic movies,dramatic,dramatic movies,horror,horror movie,john,john like,john like horror,like,like horror,like horror movie,movie,movie and,movie and dramatic,movies,ryan,ryan watches,ryan watches movie,watches,watches movie,watches movie and
0,0.57735,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
