In [1]:
# pip install nltk

# Installing Libraries


In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from numpy.linalg import norm
import pandas as pd
from fractions import Fraction
import re
import math
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK 'punkt' tokenizer data and the 'stopwords' corpus.
# NOTE(review): word_tokenize and stopwords are imported above but are not
# used by any of the visible cells -- confirm these downloads are still needed.
nltk.download('punkt')
nltk.download('stopwords')
import numpy as np

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:


# Toy corpus: three short sentences, lower-cased so that token comparison is
# case-insensitive.
s1, s2, s3 = (
    sentence.lower()
    for sentence in (
        "data science is one of the most important courses in computer science",
        "this is one of the best data science courses",
        "the data scientists perform data analysis",
    )
)

# Remove commas from the second sentence (it currently contains none).
s2 = re.sub(r"[,]", "", s2)

# One document per row; column 0 holds the raw sentence text.
corpus = [s1, s2, s3]
df1 = pd.DataFrame(corpus)
df1

Unnamed: 0,0
0,data science is one of the most important cour...
1,this is one of the best data science courses
2,the data scientists perform data analysis


In [4]:
# Learn the vocabulary from the corpus, then count how often each vocabulary
# term occurs in each sentence (X is the sentence-by-term count matrix).
vectorizer = CountVectorizer()
vectorizer.fit(corpus)
X = vectorizer.transform(corpus)

# Vocabulary

In [5]:
# Vocabulary terms learned by the CountVectorizer, one per column of X
# (the array shown below lists them in alphabetical order).
features = vectorizer.get_feature_names_out()
features

array(['analysis', 'best', 'computer', 'courses', 'data', 'important',
       'in', 'is', 'most', 'of', 'one', 'perform', 'science',
       'scientists', 'the', 'this'], dtype=object)

# Bag of Words

In [6]:
# Dense view of the count matrix: one row per sentence (s1, s2, s3) and one
# column per vocabulary term.
print(X.toarray())

[[0 0 1 1 1 1 1 1 1 1 1 0 2 0 1 0]
 [0 1 0 1 1 0 0 1 0 1 1 0 1 0 1 1]
 [1 0 0 0 2 0 0 0 0 0 0 1 0 1 1 0]]


In [7]:
# Bag-of-words table: raw term counts, labelled by sentence (rows) and by
# vocabulary term (columns).
df2 = pd.DataFrame(
    data=X.toarray(),
    index=["s1", "s2", "s3"],
    columns=vectorizer.get_feature_names_out(),
)
df2

Unnamed: 0,analysis,best,computer,courses,data,important,in,is,most,of,one,perform,science,scientists,the,this
s1,0,0,1,1,1,1,1,1,1,1,1,0,2,0,1,0
s2,0,1,0,1,1,0,0,1,0,1,1,0,1,0,1,1
s3,1,0,0,0,2,0,0,0,0,0,0,1,0,1,1,0


# TF

In [8]:
# Term frequency: tf(t, d) = (count of t in d) / (number of whitespace-separated
# tokens in d), kept as exact Fractions so the table shows values such as 1/12.
#
# The table is rebuilt from the count matrix X instead of overwriting the rows
# of the existing frame in place. This keeps the cell idempotent (re-running it
# no longer divides already-divided values a second time) and avoids writing
# Fraction objects into integer columns, an upcasting assignment that pandas
# has deprecated.
tf_rows = [
    [Fraction(int(count), len(sentence.split())) for count in counts]
    for counts, sentence in zip(X.toarray(), (s1, s2, s3))
]
df2 = pd.DataFrame(
    tf_rows,
    index=["s1", "s2", "s3"],
    columns=vectorizer.get_feature_names_out(),
    dtype=object,  # Fractions are Python objects, not a native numeric dtype
)

In [9]:
# Display the table now that its counts have been replaced by term frequencies.
df2

Unnamed: 0,analysis,best,computer,courses,data,important,in,is,most,of,one,perform,science,scientists,the,this
s1,0,0,1/12,1/12,1/12,1/12,1/12,1/12,1/12,1/12,1/12,0,1/6,0,1/12,0
s2,0,1/9,0,1/9,1/9,0,0,1/9,0,1/9,1/9,0,1/9,0,1/9,1/9
s3,1/6,0,0,0,1/3,0,0,0,0,0,0,1/6,0,1/6,1/6,0


# IDF

In [10]:
# Inverse document frequency: idf(t) = log10(N / df(t)), where N is the number
# of documents and df(t) is the number of documents containing term t.
total_docs = X.shape[0]

# Document frequency per vocabulary term = number of rows of the count matrix
# with a non-zero entry in that term's column. Reading it from X (rather than
# re-splitting each sentence on whitespace) works for any number of documents
# and always agrees with the vectorizer's own tokenisation, so every term has
# df >= 1 and the division below cannot hit zero.
doc_freq = np.count_nonzero(X.toarray(), axis=0)

idf = {
    term: math.log10(Fraction(total_docs, int(n_docs_with_term)))
    for term, n_docs_with_term in zip(features, doc_freq)
}

In [11]:
# Show the IDF values as a single labelled row, one column per vocabulary term.
pd.DataFrame([idf], index=["IDF values"])

Unnamed: 0,analysis,best,computer,courses,data,important,in,is,most,of,one,perform,science,scientists,the,this
IDF values,0.477121,0.477121,0.477121,0.176091,0.0,0.477121,0.477121,0.176091,0.477121,0.176091,0.176091,0.477121,0.176091,0.477121,0.0,0.477121


# TF-IDF

In [12]:
# use_idf=True turns on IDF re-weighting of the term counts.
# NOTE: the weights this produces are not the product of the TF and IDF tables
# computed above. The values displayed further down match scikit-learn's
# smoothed natural-log IDF, ln((1 + N) / (1 + df)) + 1, followed by L2
# normalisation of each row.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)

In [13]:
# Fit the TF-IDF vectorizer on the corpus and transform the same corpus into
# its TF-IDF matrix in a single call.
tfidf_vector = tfidf_vectorizer.fit_transform(corpus)

In [14]:
# TF-IDF weights as a labelled table: one row per sentence, one column per
# vocabulary term.
tfidf_df = pd.DataFrame(
    data=tfidf_vector.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
    index=["s1", "s2", "s3"],
)
tfidf_df

Unnamed: 0,analysis,best,computer,courses,data,important,in,is,most,of,one,perform,science,scientists,the,this
s1,0.0,0.0,0.327476,0.249054,0.193412,0.327476,0.327476,0.249054,0.327476,0.249054,0.249054,0.0,0.498107,0.0,0.193412,0.0
s2,0.0,0.422968,0.0,0.321678,0.249812,0.0,0.0,0.321678,0.0,0.321678,0.321678,0.0,0.321678,0.0,0.249812,0.422968
s3,0.459115,0.0,0.0,0.0,0.542321,0.0,0.0,0.0,0.0,0.0,0.0,0.459115,0.0,0.459115,0.271161,0.0


# Cosine Similarity

In [15]:
# Bag-of-words count vectors for the three sentences, taken directly from the
# CountVectorizer matrix (rows are in corpus order: s1, s2, s3). Reading them
# from X instead of retyping the numbers by hand keeps the vectors in sync with
# the corpus if a sentence is ever changed.
S1, S2, S3 = X.toarray()

In [16]:
# Cosine similarity = dot product of the two count vectors divided by the
# product of their Euclidean (L2) norms.
dot_product = np.dot(S1, S3)
norm_product = norm(S1) * norm(S3)
sim = dot_product / norm_product
print("Cosine Similarity of S1 and S3 = ", sim)

Cosine Similarity of S1 and S3 =  0.5477225575051661
