In [95]:
#importing warnings
import warnings
warnings.filterwarnings('ignore')

In [96]:
#importing needed packages
import pandas as pd
import numpy as np
import re
import nltk

In [97]:
#building the corpus: two short documents, each paired with a topic label
labels = ['data-science', 'data-camp']
createCorpus = np.array([
    'Staying up to date with Data Science, whilst filtering the gold from the impurities is imperative for any Data Scientist who is serious about mastering their craft.',
    'Datacamp may be famous for their intensive courses that cover the full journey from an aspiring Data Scientist to a full-time professional.',
])

#one row per document; the column order is fixed explicitly in the constructor
corpus = pd.DataFrame({'Document': createCorpus, 'Label': labels},
                      columns=['Document', 'Label'])
corpus

Unnamed: 0,Document,Label
0,"Staying up to date with Data Science, whilst f...",data-science
1,Datacamp may be famous for their intensive cou...,data-camp


In [98]:
#resources for text clean-up: NLTK's English stop-word list and the tokenizer
#used by normalize_document to drop stop words after lowercasing and trimming
stopWords = nltk.corpus.stopwords.words('english')
wordPunct = nltk.WordPunctTokenizer()

def normalize_document(doc):
    """Return `doc` reduced to lowercase, letter-only, stop-word-free tokens.

    Processing steps, in order:
      1. delete every character that is not an ASCII letter or whitespace,
      2. lowercase and trim leading/trailing whitespace,
      3. tokenize with the module-level `wordPunct` tokenizer,
      4. drop tokens found in the module-level `stopWords` collection,
      5. join the surviving tokens with single spaces.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        The cleaned document; an empty string if no token survives.
    """
    #flags must be passed by keyword: the fourth positional parameter of
    #re.sub is `count`, so passing re.I|re.A (== 258) positionally applied no
    #flags and silently stopped after the first 258 replacements
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    tokens = wordPunct.tokenize(doc)
    #stop words are matched after lowercasing, so the comparison is effectively
    #case-insensitive with respect to the original text
    filtered_tokens = [token for token in tokens if token not in stopWords]
    doc = ' '.join(filtered_tokens)
    return doc

#np.vectorize wraps normalize_document so it can be called on a whole array,
#applying the function to each element in turn
normalize_corpus = np.vectorize(normalize_document)
#cleaning every document held in the createCorpus array
norm_corpus = normalize_corpus(createCorpus)
#bare last expression: shows the cleaned documents as the cell output
norm_corpus

array(['staying date data science whilst filtering gold impurities imperative data scientist serious mastering craft',
       'datacamp may famous intensive courses cover full journey aspiring data scientist fulltime professional'],
      dtype='<U108')

In [99]:
#creating a bag-of-words matrix (raw term counts, not TF-IDF) using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
#min_df / max_df are given as floats, i.e. document-frequency proportions;
#0. and 1. together mean no term is dropped for being too rare or too common
matrix = CountVectorizer(min_df=0., max_df=1.)
#learn the vocabulary from the cleaned corpus and count each term per document
createMatrix = matrix.fit_transform(norm_corpus)
#printing the sparse result lists one "(document, term index)  count" entry per line
print(createMatrix)

  (0, 22)	1
  (0, 6)	1
  (0, 4)	2
  (0, 19)	1
  (0, 23)	1
  (0, 8)	1
  (0, 11)	1
  (0, 13)	1
  (0, 12)	1
  (0, 20)	1
  (0, 21)	1
  (0, 16)	1
  (0, 3)	1
  (1, 4)	1
  (1, 20)	1
  (1, 5)	1
  (1, 17)	1
  (1, 7)	1
  (1, 14)	1
  (1, 1)	1
  (1, 2)	1
  (1, 9)	1
  (1, 15)	1
  (1, 0)	1
  (1, 10)	1
  (1, 18)	1


In [100]:
#building TF-IDF vectors for the cleaned documents, then calculating
#document similarity using cosine similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vectorize = TfidfVectorizer(min_df=0., max_df=1., norm='l2',
                     use_idf=True, smooth_idf=True)
vectorizeMatrix = vectorize.fit_transform(norm_corpus)
#densify so the weights can be rounded and tabulated below
vectorizeMatrix = vectorizeMatrix.toarray()

#get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
#use get_feature_names_out() when available and fall back on older releases.
#list(...) keeps vocab a plain list, as the old method returned
if hasattr(vectorize, 'get_feature_names_out'):
    vocab = list(vectorize.get_feature_names_out())
else:
    vocab = vectorize.get_feature_names()

#rounded TF-IDF weights: one row per document, one column per vocabulary term.
#bound to a name because a bare expression in the middle of a cell is neither
#displayed nor kept
tfidfTable = pd.DataFrame(np.round(vectorizeMatrix, 2), columns=vocab)

#pairwise cosine similarity between the document vectors; entry (i, j)
#compares document i with document j
similarity_matrix = cosine_similarity(vectorizeMatrix)
docSimilarity = pd.DataFrame(similarity_matrix)
docSimilarity

Unnamed: 0,0,1
0,1.0,0.119123
1,0.119123,1.0
