In [1]:
import re
import string 
import pandas as pd
from functools import reduce
from math import log


In [2]:
corpus = """ 
From fairest creatures we desire increase,
That thereby beauty's rose might never die,
But as the riper should by time decease,
His tender heir might bear his memory:
But thou, contracted to thine own bright eyes,
Feed'st thy light'st flame with self-substantial fuel,
Making a famine where abundance lies,
Thyself thy foe, to thy sweet self too cruel.
Thou that art now the world's fresh ornament
And only herald to the gaudy spring,
Within thine own bud buriest thy content
And, tender churl, makest waste in niggarding.
Pity the world, or else this glutton be,
To eat the world's due, by the grave and thee.

""". split("\n")[1:-1]

In [3]:
l_a = corpus[0].lower().split()
l_b = corpus[1].lower().split()
l_c = corpus[2].lower().split()


print(l_a)
print(l_b)
print(l_c)


['from', 'fairest', 'creatures', 'we', 'desire', 'increase,']
['that', 'thereby', "beauty's", 'rose', 'might', 'never', 'die,']
['but', 'as', 'the', 'riper', 'should', 'by', 'time', 'decease,']


In [4]:
word_set = set(l_a).union(set(l_b)).union(set(l_c))
print(word_set)

set(['desire', 'we', 'from', 'decease,', 'thereby', 'that', 'rose', 'never', 'increase,', "beauty's", 'should', 'by', 'riper', 'as', 'but', 'die,', 'time', 'the', 'might', 'fairest', 'creatures'])


In [5]:
word_dict_a = dict.fromkeys(word_set, 0)
word_dict_b = dict.fromkeys(word_set, 0)
word_dict_c = dict.fromkeys(word_set, 0)


for word in l_a:
    word_dict_a[word] += 1 

for word in l_b:
    word_dict_b[word] += 1 

for word in l_c:
    word_dict_c[word] += 1 

pd.DataFrame([word_dict_a, word_dict_b, word_dict_c])

Unnamed: 0,as,beauty's,but,by,creatures,"decease,",desire,"die,",fairest,from,...,might,never,riper,rose,should,that,the,thereby,time,we
0,0,0,0,0,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,1
1,0,1,0,0,0,0,0,1,0,0,...,1,1,0,1,0,1,0,1,0,0
2,1,0,1,1,0,1,0,0,0,0,...,0,0,1,0,1,0,1,0,1,0


In [6]:
def compute_tf(word_dict, l):
    tf = {}
    sum_nk = len(l)
    for word, count in word_dict.items():
        tf[word] =  count / sum_nk
    return tf

In [7]:
tf_a = compute_tf(word_dict_a, l_a)
tf_b = compute_tf(word_dict_b, l_b)
tf_c = compute_tf(word_dict_c, l_c)

In [8]:
def compute_idf(strings_list):
    n  = len(strings_list)
    idf = dict.fromkeys(strings_list[0].keys(), 0)
    for l in strings_list:
        for word, count in l.items():
            if count > 0:
                idf[word] += 1
                
    for word, v in idf.items():
        idf[word] = log(n / float(v))
    return idf

In [9]:
idf = compute_idf([word_dict_a, word_dict_b, word_dict_c                   
                  ])

In [10]:
def compute_tf_idf(tf, idf):
    tf_idf = dict.fromkeys(tf.keys(), 0)
    for word, v in tf.items():
        tf_idf[word] = v * idf[word]
    return tf_idf

In [11]:
tf_idf_a = compute_tf_idf(tf_a, idf)
tf_idf_b = compute_tf_idf(tf_b, idf)
tf_idf_c = compute_tf_idf(tf_c, idf)

In [12]:
pd.DataFrame([tf_idf_a, tf_idf_b, tf_idf_c])

Unnamed: 0,as,beauty's,but,by,creatures,"decease,",desire,"die,",fairest,from,...,might,never,riper,rose,should,that,the,thereby,time,we
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [14]:
all_text = """
From fairest creatures we desire increase,
That thereby beauty's rose might never die,
But as the riper should by time decease,
His tender heir might bear his memory:
But thou, contracted to thine own bright eyes,
Feed'st thy light'st flame with self-substantial fuel,
Making a famine where abundance lies,
Thyself thy foe, to thy sweet self too cruel.
Thou that art now the world's fresh ornament
And only herald to the gaudy spring,
Within thine own bud buriest thy content
And, tender churl, makest waste in niggarding.
Pity the world, or else this glutton be,
To eat the world's due, by the grave and thee.

""".split("\n")[1:-1]

In [15]:
def preprocessing(line):
    line = line.lower()
    line = re.sub(r"[{}]".format(string.punctuation), " ", line)
    return line

In [16]:
tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocessing)
tfidf = tfidf_vectorizer.fit_transform(all_text)

In [17]:
kmeans = KMeans(n_clusters = 2).fit(tfidf)

In [18]:
lines_for_predicting = ["Pity the world, or else this glutton be,",
                        "But as the riper should by time decease,"
                       ]
kmeans.predict(tfidf_vectorizer.transform(lines_for_predicting))

array([0, 0], dtype=int32)

In [19]:
lines_for_predicting = ["But as the riper should by time decease,",
                        "But as the riper should by time decease,"
                       ]
kmeans.predict(tfidf_vectorizer.transform(lines_for_predicting))

array([0, 0], dtype=int32)

In [20]:
lines_for_predicting = ["From fairest creatures we desire increase,",
                        "But as the riper should by time decease,"
                       ]
kmeans.predict(tfidf_vectorizer.transform(lines_for_predicting))

array([0, 0], dtype=int32)

In [21]:
lines_for_predicting = ["tf and idf is awesome!", "some androids is there"]
kmeans.predict(tfidf_vectorizer.transform(lines_for_predicting))

array([0, 0], dtype=int32)

Reference : https://medium.com/@MSalnikov/text-clustering-with-k-means-and-tf-idf-f099bcf95183