## Ways to determine the similarity score between 2 documents:
### 1. Jaccard Similarity
### 2. Cosine Similarity
### 3. SpaCy similraity
### 4. Bleu Score

In [4]:
import spacy
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [41]:
with open(r'./All_Rap_lyrics.txt', encoding='utf-8') as f1:
    data = f1.read().replace('\n', ' ')
    
data[:100]

"\ufeffAw, yeah (It's like this, like this) it's Eminem baby, back up in that motherfucking ass (Till fore"

In [11]:
with open('rap_song_generated.txt') as f2:
    data2 = f2.read().replace('\n', ' ')
    
data2[:100]

'yeah im the rhyming boy you think youre funny yall dont really know me best then i leave you breathl'

In [44]:
nlp = spacy.load('en_core_web_sm')

In [45]:
doc1 = nlp(data)
doc2 = nlp(data2)

In [46]:
print ('SpaCy similarity score is: ',doc1.similarity(doc2))

  "__main__", mod_spec)


SpaCy similarity score is:  0.9244528546827435


## Jaccard Similarity

In [42]:
def get_jaccard_sim(text1, text2): 
    a = set(text1.split()) 
    b = set(text2.split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [43]:
print('Jaccard similarity is: ', get_jaccard_sim(data,data2))

Jaccard similarity is:  0.023085669088931733


## Cosine similarity calculates similarity by measuring the cosine of angle between two vectors. With cosine similarity, we need to convert sentences into vectors. One way to do that is to use bag of words with either TF (term frequency) or TF-IDF (term frequency- inverse document frequency).
### TF is good for text similarity in general, but TF-IDF is good for search query relevance.

In [47]:
import nltk, string
from nltk.stem import WordNetLemmatizer
from pywsd.utils import lemmatize_sentence
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt') 
nltk.download('wordnet') 
nltk.download('averaged_perceptron_tagger')

stemmer = nltk.stem.porter.PorterStemmer()
lmm = WordNetLemmatizer()
word_list1 = nltk.word_tokenize(data)
print(word_list1)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91880\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\91880\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\91880\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!







In [24]:
lemmatized_output1 = ' '.join([lmm.lemmatize(w) for w in word_list1])

In [25]:
word_list2 = nltk.word_tokenize(data2)
print(word_list2)

['yeah', 'im', 'the', 'rhyming', 'boy', 'you', 'think', 'youre', 'funny', 'yall', 'dont', 'really', 'know', 'me', 'best', 'then', 'i', 'leave', 'you', 'breathless', 'gasps', 'you', 'know', 'i', 'took', 'a', 'few', 'gold', 'chains', 'got', 'ta', 'keep', 'it', 'basement', 'and', 'if', 'you', 'make', 'your', 'shit', 'for', 'all', 'my', 'homies', 'say', 'you', 'need', 'you', 'anyway', 'but', 'yo', 'this', 'blunts', 'for', 'you', 'lames', 'every', 'joke', 'that', 'they', 'was', 'ill', 'see', 'you', 'is', 'it', 'too', 'its', 'time', 'for', 'the', 'do', 'i', 'called', 'her', 'up', 'to', 'and', 'imma', 'call', 'your', 'crew', 'that', 'seem', 'to', 'only', 'date', 'white', 'men', 'not', 'us', 'so', 'you', 'can', 'predict', 'what', 'happen', 'this', 'another', 'one', 'from', 'your', 'spray', 'tan', 'but', 'this', 'shit', 'get', 'dangerous', 'show', 'love', 'i', 'need', 'jesus', 'and', 'when', 'it', 'breeze', 'yo', 'ooh', 'i', 'be', 'on', 'the', 'row', 'let', 'me', 'know', 'if', 'she', 'know', 'b




In [48]:
lemmatized_output2 = ' '.join([lmm.lemmatize(w) for w in word_list2])
print(lemmatized_output2)

yeah im the rhyming boy you think youre funny yall dont really know me best then i leave you breathless gasp you know i took a few gold chain got ta keep it basement and if you make your shit for all my homies say you need you anyway but yo this blunts for you lame every joke that they wa ill see you is it too it time for the do i called her up to and imma call your crew that seem to only date white men not u so you can predict what happen this another one from your spray tan but this shit get dangerous show love i need jesus and when it breeze yo ooh i be on the row let me know if she know back against the wall flow shouldve got that shit go rep for my one like a flat drum cuz he supposed to buy me some im withdrawn from crack addict and i release def cut you couldnt make the donut but we dont strike out nowadays rapping is a ice storm i got something for u but those can kiss my as well take the phoniness you in a graduating class and say a word it yacht and still the same spot she go

In [52]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))

vectorizer = TfidfVectorizer(stop_words='english')

def cosine_sim(text1, text2):
    tfidf = vectorizer.fit_transform([text1, text2])
    return ((tfidf * tfidf.T).A)[0,1]

In [53]:
print(cosine_sim(lemmatized_output1, lemmatized_output2))
tf = vectorizer.fit_transform([lemmatized_output1, lemmatized_output2])
similarity_matrix = cosine_similarity(tf)
print(similarity_matrix)

0.759150529976124
[[1.         0.75915053]
 [0.75915053 1.        ]]


In [51]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import corpus_bleu

In [38]:
data



In [39]:
data2

'yeah im the rhyming boy you think youre funny yall dont really know me best then i leave you breathless gasps you know i took a few gold chains gotta keep it basement and if you make your shit for all my homies say you need you anyway but yo this blunts for you lames every joke that they was ill see you is it too its time for the do i called her up to and imma call your crew that seem to only date white men not us so you can predict what happen this another one from your spray tan but this shit get dangerous show love i need jesus and when it breeze yo ooh i be on the row let me know if she know back against the wall flow shouldve got that shit go rep for my ones like a flat drum cuz hes supposed to buy me some im withdrawn from crack addicts and i release def cuts you couldnt make the donuts but we dont strike out nowadays rapping is a ice storm i got something for us but those can kiss my ass well take the phoniness you in a graduating class and say a word its yacht and still the sa

## Bleu Score

In [54]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import corpus_bleu

In [55]:
b = data.split(' ')

In [56]:
a = data2.split( ' ')

In [57]:
score = sentence_bleu(a, b)
print(score)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


1.7710701197517793e-232
