In [1]:
import pandas as pd
import scipy
import sklearn
from sklearn import *
import numpy as np
import os

In [2]:
# Home directory of the current user. The two loads below do not use it:
# they read from the current working directory.
path_data = os.path.expanduser('~')

# Training split: use this to train and VALIDATE your solution.
train_df = pd.read_csv(filepath_or_buffer="./quora_train_data.csv")

# Held-out split: use this only to report the expected generalization results.
test_df = pd.read_csv(filepath_or_buffer="./quora_test_data.csv")

In [3]:
# Bare last expression of the cell: shows the notebook's rendering of the training frame.
train_df

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,346692,38482,10706,Why do I get easily bored with everything?,Why do I get bored with things so quickly and ...,1
1,327668,454117,345117,How do I study for Honeywell company recruitment?,How do I study for Honeywell company recruitme...,1
2,272993,391373,391374,Which search engine algorithm is Quora using?,Why is Quora not using reliable search engine?,0
3,54070,82673,95496,How can I smartly cut myself?,Can someone who thinks about suicide for 7 yea...,0
4,46450,38384,72436,How do I see who is viewing my Instagram videos?,Can one tell who viewed my Instagram videos?,1
...,...,...,...,...,...,...
323427,192476,292119,292120,Is it okay to use a laptop while it is chargin...,Is it OK to use your phone while charging?,0
323428,17730,33641,33642,How can dogs understand human language?,Can dogs understand the human language?,0
323429,28030,52012,52013,What's your favourite lotion?,What's your favourite skin lotion?,1
323430,277869,397054,120852,How does one become a hedge fund manager?,What should I do to become a hedge fund manager?,1


# Out of vocabulary count

In [4]:
def oov_count(text, vocab, strip_punctuation=False):
    """
    Counts the out-of-vocabulary (OOV) tokens of a text.

    The text is split on whitespace and each token is lower-cased before the
    vocabulary lookup, so `vocab` is expected to hold lower-case entries.

    Parameters:
        text (str): The text to compute the OOV count for.
        vocab (set): The vocabulary of known (lower-case) words. Any container
            supporting `in` works; a set gives constant-time lookups.
        strip_punctuation (bool): When True, leading/trailing ASCII punctuation
            is removed from every token before the lookup (so "dog." matches
            "dog") and tokens made only of punctuation are ignored. Defaults to
            False, which keeps the plain whitespace tokenisation.

    Returns:
        int: The number of tokens (duplicates counted each time) that are not
        in the vocabulary.
    """
    import string  # local import: only needed for the optional punctuation stripping

    tokens = text.split()
    if strip_punctuation:
        stripped = (token.strip(string.punctuation) for token in tokens)
        # Drop tokens that were nothing but punctuation (e.g. "-" or "...").
        tokens = [token for token in stripped if token]

    return sum(1 for token in tokens if token.lower() not in vocab)

### Example of use

In [5]:
# Define a text to process
text = 'The quick brown fox jumped over the lazy dog.'

# Create a set of known words (vocabulary)
vocab = {'the', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog'}

# Compute the OOV count.
# The result gets its own name: binding it to `oov_count` would replace the
# function with an int and make every later call (or a re-run of this cell) fail.
n_oov = oov_count(text, vocab)

# Print the results
print(f'The text contains {n_oov} out of vocabulary words.')

# Add the OOV words to the vocabulary set.
# Tokens come from a plain whitespace split, so the trailing period stays
# attached and 'dog.' is added as a separate entry from 'dog'.
words = text.split()
oov_words = [word.lower() for word in words if word.lower() not in vocab]
vocab.update(oov_words)

# Print the updated vocabulary set
print('Updated vocabulary set:', vocab)

The text contains 2 out of vocabulary words.
Updated vocabulary set: {'dog', 'jumps', 'lazy', 'quick', 'jumped', 'over', 'brown', 'the', 'fox', 'dog.'}


In [6]:
# Sentence whose unknown words will be masked
test_set = 'the quick brown cat jumps over the lazy dog'

# Collect the lower-cased tokens that are missing from the vocabulary ...
words = test_set.split()
oov_words = {word.lower() for word in words if word.lower() not in vocab}

# ... and rebuild the sentence with each of them replaced by the <unk> marker
test_set_unk = ' '.join(
    '<unk>' if word.lower() in oov_words else word
    for word in words
)

print(test_set_unk)

the quick brown <unk> jumps over the lazy dog


# Rare word count

In [7]:
def rare_word_count(text, word_counts, threshold):
    """
    Counts how many tokens of a text are rare with respect to a corpus.

    A token is rare when its corpus count is strictly below `threshold`;
    tokens absent from `word_counts` count as 0 occurrences. The text is split
    on whitespace and tokens are looked up as-is (no case folding).

    Args:
        text (str): The input text to compute the rare word count for.
        word_counts (dict): Mapping from word to its count in the corpus.
        threshold (int): Counts strictly below this value mark a word as rare.

    Returns:
        int: The number of rare tokens in the text (repeated tokens are
        counted every time they appear).
    """
    rare_total = 0
    for token in text.split():
        occurrences = word_counts.get(token, 0)
        if occurrences < threshold:
            rare_total += 1
    return rare_total

### Example of use

In [8]:
import collections

# Toy corpus kept in memory: one string per sentence
corpus = [
    'this is a sentence',
    'this is another sentence',
    'yet another sentence',
]

# Accumulate whitespace-token frequencies over the whole corpus
word_counts = collections.Counter()
for sentence in corpus:
    words = sentence.split()
    word_counts.update(words)


In [9]:
# Sentence to score against the corpus counts built above
text = 'this is a sentence with some rare words'

# Words seen fewer than 2 times in the corpus (including unseen ones) are rare
rare_count = rare_word_count(text, word_counts, threshold=2)

# Report the result
print(f'The text contains {rare_count} rare words.')


The text contains 5 rare words.


# Named entity overlap

Named entity overlap refers to the measure of similarity between two texts based on the number and type of named entities they share. Named entities are words or phrases that refer to specific entities or concepts, such as people, organizations, locations, dates, etc. 

For example, consider the following two sentences:

- John Smith works at Google.
- Google is a technology company based in California.

Both sentences contain a named entity "Google", which is a type of organization. If we calculate the named entity overlap between these two sentences, we would find that they share one named entity in common.

In [10]:
import nltk

# NLTK data packages used by the function defined in this cell:
#   punkt                       -> nltk.sent_tokenize / nltk.word_tokenize
#   averaged_perceptron_tagger  -> nltk.pos_tag
#   maxent_ne_chunker + words   -> nltk.ne_chunk
# NOTE(review): resource identifiers differ between NLTK releases — confirm
# these names against the installed version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

def _extract_named_entities(text):
    """Returns the set of named-entity strings that NLTK's chunker finds in `text`."""
    entities = set()
    for sentence in nltk.sent_tokenize(text):
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        for node in nltk.ne_chunk(tagged, binary=False):
            # Recognised entities are returned as subtrees; everything else in
            # the chunk result is skipped.
            if isinstance(node, nltk.tree.Tree):
                entities.add(" ".join(leaf[0] for leaf in node))
    return entities


def named_entity_overlap(text1, text2, verbose=True):
    """
    Computes the named entity overlap between two texts.

    The score is the Jaccard index of the two entity sets:
    |entities1 & entities2| / |entities1 | entities2|.

    Args:
        text1 (str): The first text.
        text2 (str): The second text.
        verbose (bool): When True (the default), prints the entities found in
            each text. Pass False when scoring many pairs.

    Returns:
        float: The overlap score in [0, 1]. Returns 0.0 when neither text
        contains a named entity, instead of dividing by zero.
    """
    entities1 = _extract_named_entities(text1)
    entities2 = _extract_named_entities(text2)

    if verbose:
        print('Entities found for text 1: ', entities1)
        print('Entities found for text 2: ', entities2)

    all_entities = entities1 | entities2
    if not all_entities:
        # No entities on either side: nothing is shared, and the ratio is undefined.
        return 0.0

    return len(entities1 & entities2) / float(len(all_entities))

[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify
[nltk_data]     failed: Hostname mismatch, certificate is not valid
[nltk_data]     for 'raw.githubusercontent.com'. (_ssl.c:1129)>
[nltk_data] Error loading maxent_ne_chunker: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     Hostname mismatch, certificate is not valid for
[nltk_data]     'raw.githubusercontent.com'. (_ssl.c:1129)>


In [11]:
# Two sentences that share some named entities
text1 = "John Smith works at Google in California."
text2 = "Google is a technology company based in California."

# Score the pair
overlap = named_entity_overlap(text1=text1, text2=text2)

print("Named entity overlap:", overlap)

Entities found for text 1:  {'California', 'John', 'Google', 'Smith'}
Entities found for text 2:  {'Google', 'California'}
Named entity overlap: 0.5


The score is 0.5 because the two texts have 2 entities in common (Google and California) out of a total of 4 distinct entities across both texts. The named entity overlap is computed as: number of common entities / total number of distinct entities. In this case it is 2/4 = 0.5.

In [12]:
# Two sentences whose named entities do not coincide
text1 = "I saw a movie at the AMC theater with my friends."
text2 = "I ate dinner at a new Italian restaurant with my family."

# Score the pair
overlap = named_entity_overlap(text1=text1, text2=text2)

print("Named entity overlap:", overlap)

Entities found for text 1:  {'AMC'}
Entities found for text 2:  {'Italian'}
Named entity overlap: 0.0


# Word2vec and Fasttext embeddings

In [21]:
import gensim.downloader as api
from scipy.spatial.distance import cosine

# Vectors requested by name through gensim's downloader API; the embedding
# helper defined below reads this module-level `model`.
model = api.load("word2vec-google-news-300")

def compute_word2vec_embeddings(text, keyed_vectors=None):
    """
    Computes a text embedding as the mean of the embeddings of its known words.

    The text is lower-cased and split on whitespace; tokens missing from the
    vector vocabulary are skipped.

    Args:
        text (str): The input text for which the embedding is computed.
        keyed_vectors: Optional vector store exposing a `key_to_index` mapping
            and `store[word]` lookup (as gensim KeyedVectors do). Defaults to
            the notebook-level `model`.

    Returns:
        numpy.ndarray or None: The mean embedding of the text, or None when
        none of its tokens is in the vocabulary.
    """
    # Resolved at call time so the notebook-level `model` stays the default.
    vectors = model if keyed_vectors is None else keyed_vectors

    # `key_to_index` is a dict, so each membership test is a hash lookup.
    # Testing against the `index_to_key` list would scan the vocabulary for
    # every token.
    vocabulary = vectors.key_to_index

    embeddings = [
        vectors[word]
        for word in text.lower().split()
        if word in vocabulary
    ]

    if not embeddings:
        return None

    # Average over tokens to get a single vector for the whole text.
    return np.mean(embeddings, axis=0)

def compute_cosine_similarity(embedding1, embedding2):
    """
    Computes the cosine similarity between two embeddings.

    Args:
        embedding1 (numpy.ndarray or None): The first embedding.
        embedding2 (numpy.ndarray or None): The second embedding.

    Returns:
        float or None: 1 minus SciPy's cosine distance between the two
        embeddings, always as a Python float. Returns None if either
        embedding is None.
    """
    if embedding1 is None or embedding2 is None:
        return None

    # Coerce to float so the documented return type holds whatever numeric
    # type the distance function hands back.
    return float(1 - cosine(embedding1, embedding2))


# (label, first question, second question) triples to compare with word2vec.
question_pairs = [
    ("same question",
     "How can I prevent sunburn?",
     "How can I prevent sunburn?"),
    ("similar questions",
     "What are some good ways to prevent and treat sunburn?",
     "How can I prevent sunburn?"),
    ("different questions",
     "How do I learn Python?",
     "Where is the limit of the universe?"),
]

for position, (label, question1, question2) in enumerate(question_pairs):
    # Blank line between consecutive results (none after the last one).
    if position > 0:
        print("")

    embedding1 = compute_word2vec_embeddings(question1)
    embedding2 = compute_word2vec_embeddings(question2)

    similarity = compute_cosine_similarity(embedding1, embedding2)
    print(f"Cosine similarity ({label}): {similarity}")

Cosine similarity (same question): 1

Cosine similarity (similar questions): 0.6707423329353333

Cosine similarity (different questions): 0.34357038140296936


In [15]:
import fasttext.util
# Request the English ('en') FastText model.
# NOTE(review): if_exists='ignore' presumably skips the download when the model
# file is already present — confirm against the fasttext.util documentation.
fasttext.util.download_model('en', if_exists='ignore')  # English

In [16]:
# Load the FastText binary from the working directory into the module-level
# `ft_model` that compute_fasttext_embeddings reads.
# NOTE(review): presumably this is the file fetched by download_model('en') — confirm.
ft_model = fasttext.load_model('cc.en.300.bin')

def compute_fasttext_embeddings(text, fasttext_model=None):
    """
    Computes a text embedding as the mean of the FastText vectors of its words.

    The text is lower-cased and split on whitespace; a vector is requested
    from the model for every resulting token.

    Args:
        text (str): The input text for which the embedding is computed.
        fasttext_model: Optional model exposing `get_word_vector(word)`.
            Defaults to the notebook-level `ft_model`.

    Returns:
        numpy.ndarray or None: The mean embedding of the text, or None when
        the text contains no tokens (empty or whitespace-only input).
    """
    # Resolved at call time so the notebook-level `ft_model` stays the default.
    active_model = ft_model if fasttext_model is None else fasttext_model

    embeddings = [
        active_model.get_word_vector(word)
        for word in text.lower().split()
    ]

    if not embeddings:
        return None

    # Average over tokens to get a single vector for the whole text.
    return np.mean(embeddings, axis=0)



In [17]:
# (label, first question, second question) triples to compare with FastText.
question_pairs = [
    ("same question",
     "How can I prevent sunburn?",
     "How can I prevent sunburn?"),
    ("similar questions",
     "What are some good ways to prevent and treat sunburn?",
     "How can I prevent sunburn?"),
    ("different questions",
     "How do I learn Python?",
     "Where is the limit of the universe?"),
]

# After the loop, question1/question2/embedding1/embedding2/similarity hold the
# values of the last pair, so cells that read them afterwards still work.
for position, (label, question1, question2) in enumerate(question_pairs):
    # Blank line between consecutive results (none after the last one).
    if position > 0:
        print("")

    embedding1 = compute_fasttext_embeddings(question1)
    embedding2 = compute_fasttext_embeddings(question2)

    similarity = compute_cosine_similarity(embedding1, embedding2)
    print(f"Cosine similarity ({label}): {similarity}")

Cosine similarity (same question): 1

Cosine similarity (similar questions): 0.6515657305717468

Cosine similarity (different questions): 0.4463950991630554


### Use other metrics

In [33]:
from scipy.spatial.distance import euclidean

# Compares whatever `embedding1` / `embedding2` currently hold, i.e. the pair
# left behind by the most recently executed embedding cell.
distance = euclidean(embedding1, embedding2)

# Turn the distance into a similarity: 1 at distance 0, approaching 0 as the distance grows.
similarity = 1.0 / (1.0 + distance)
print(similarity)

0.4115611855086013


In [34]:
from scipy.spatial.distance import cityblock

# Compares whatever `embedding1` / `embedding2` currently hold, i.e. the pair
# left behind by the most recently executed embedding cell.
distance = cityblock(embedding1, embedding2)

# Turn the distance into a similarity: 1 at distance 0, approaching 0 as the distance grows.
similarity = 1.0 / (1.0 + distance)
print(similarity)

0.04907270311233638
