In [None]:
#@title << Setup Google Colab by running this cell {display-mode: "form"}
# One-time environment bootstrap. The body below only executes when the
# `google.colab` module is loaded; anywhere else this cell does nothing.
import sys
if 'google.colab' in sys.modules:
    # Clone GitHub repository
    !git clone https://github.com/pxydi/text.git
        
    # Copy files required to run the code
    # (data/, plots/ and tools.py are copied from the clone into the working directory)
    !cp -r "text/data" "text/plots" "text/tools.py" .
    
    # Install packages via pip
    !pip install -r "text/colab-requirements.txt"
    
    # Restart Runtime
    # Sends signal 9 to the current kernel process.
    # NOTE(review): presumably relies on Colab restarting the runtime automatically
    # so the freshly installed packages are picked up — confirm.
    import os
    os.kill(os.getpid(), 9)

# Text similarity

In the previous notebook, we transformed samples of text into lists of numbers using various methods. In each of these methods, the numbers in the vectors are defined somewhat differently: 
* in Bag of words, they correspond to word counts.
* in Tf-idf, they correspond to word counts, re-weighted by the *inverse document frequency*.
* in sentence embeddings, they are learned from the data using an "embedding method" in such a way that they encode semantic relationships; similar texts have similar embeddings.

## Visualize data

We already mentioned that we can think of these lists of numbers as points in a (high-dimensional) space. We can plot them and try to look for patterns, i.e. clusters of similar documents.

However, we can only plot in 2 or 3 dimensions, not more. This means that we need to perform "dimensionality reduction", which consists of "compressing" the data into 2 (or 3) dimensions without losing too much information. 

In [None]:
# Load libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random, re, os
import contractions

import tools

In [None]:
# Read the cleaned Sentiment140 tweets from the local data folder
path = os.path.join('data', 'clean_sentiment_140.csv')
df = pd.read_csv(path)

# Preview the first five rows
df.head(n=5)

In [None]:
# Load the three pre-computed feature matrices from a single .npz archive

with np.load('data/word_vectors.npz', allow_pickle=False) as data:
    # Arrays are pulled out while the archive is still open
    X_bow, X_tfidf, X_embed = (
        data[key] for key in ('BOW_features', 'tfidf_features', 'embeddings')
    )

# Report the shape of each feature matrix
print('Data loaded.')
print('BOW features: ', X_bow.shape)
print('Tf-idf features: ', X_tfidf.shape)
print('Embeddings: ', X_embed.shape)

## Visualize data

Machine learning can help with data visualization. A few popular techniques are

- [Principal component analysis](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html)
- [Truncated SVD](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html)
- [T-SNE](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html)

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.pipeline import make_pipeline

# Fixed seed: t-SNE (and PCA, whenever it selects a randomized solver) is
# stochastic, so without a seed the 2-D layout changes on every run and the
# plots cannot be reproduced with "Restart & Run All".
RANDOM_STATE = 42

# PCA first compresses the features to 40 dimensions, then t-SNE projects
# those 40 dimensions down to 2 for plotting.
decomp = make_pipeline(
    PCA(n_components=40, random_state=RANDOM_STATE),
    TSNE(n_components=2, random_state=RANDOM_STATE),
)

# BOW features
X_decomp_bow   = decomp.fit_transform(X_bow)

# Embeddings (fit_transform refits the whole pipeline on the new matrix)
X_decomp_embed = decomp.fit_transform(X_embed)

print(X_decomp_bow.shape)

In [None]:
# Side-by-side scatter plots of the two 2-D projections
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

panels = zip(axes, (X_decomp_bow, X_decomp_embed), ('Bag of words', 'Embeddings'))
for ax, points, title in panels:
    # First projected coordinate on x, second on y
    ax.scatter(points[:, 0], points[:, 1], alpha=0.8)
    ax.set_xlabel('TSNE 1')
    ax.set_ylabel('TSNE 2')
    ax.set_title(title)

Let's focus on a single topic label and compare where its tweets fall in the two projections.

In [None]:
# NOTE(review): `selected_labels` is not used in this cell; only 'books' is passed
# below. Presumably these are the other values `label` can take — confirm.
selected_labels = ['movies','Twitter', 'politics', 'sports', 'IT', 'books']

# NOTE(review): presumably highlights the tweets labelled 'books' in both 2-D
# projections — confirm against tools.visualize_bow_embeddings.
tools.visualize_bow_embeddings(X_decomp_bow,X_decomp_embed,df,label='books')

Each point in the plot represents a tweet. Using ML here has allowed grouping together tweets that are discussing the same topic. For example, we can see that tweets talking about Obama, North Korea, Iran, or China (i.e. politics) are grouped in the lower right corner of the plot. Similarly, tweets about (American) cable tv (Time Warner, ESPN, or Comcast) all appear together in the top right corner. In fact, there are many such groups in this plot (about movies, sports, food, etc).
What I like about it is that an algorithm has managed to capture semantic relationships between tweets, even if the tweets don’t use the same words. The ML algorithms used are capable of detecting similarities between e.g. “Night at the museum” and “Star Trek”, and then putting them close to each other in the plot. We, humans, would have to read all of these tweets (several thousand) and assign them to different groups one by one. ML can do this in two lines of code!


So Bag of words determines similarity from the words themselves: two tweets look alike only if they share vocabulary.

## Distance between vectors

Two vectors are similar if they point in the same direction. In a word space, this means that two documents use the same words, in the same proportions, hence they are likely to be discussing the same thing.

The idea is that if documents use the same words in the same proportions, then their document vectors will be close by in the word space; they will point in the same direction and have similar lengths.

In [None]:
# Toy vectors: three documents described by two word counts each

doc1 = np.array([1,3])
doc2 = np.array([16,2])
doc3 = np.array([18,5])

# Stack the documents row-wise into a (3, 2) matrix in one call instead of
# growing a zero-row placeholder inside a loop and slicing it off afterwards.
# The cast keeps the float dtype that the placeholder-based version produced.
docs = np.vstack((doc1, doc2, doc3)).astype(float)

tools.plot_vectors(doc1,doc2,doc3,plot_difference=True)

We can compare vectors using: 
* the euclidean distance
* the cosine similarity

### Euclidean distance 

The Euclidean distance is simply the straight-line distance between the end points of two vectors.

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

In [None]:
# Row and column labels for the pairwise tables: doc1, doc2, doc3
index_labels = ['doc' + str(n) for n in range(1, 4)]
column_labels = index_labels

# Pairwise Euclidean distances between the raw toy vectors
pd.DataFrame(
    data=euclidean_distances(docs),
    index=index_labels,
    columns=column_labels,
)

However, the Euclidean distance can be misleading when used to compare vectors of different lengths. It is better to use it with vectors of the same length. We will first normalize the document vectors (ensuring they all have a length of 1) and then recompute the Euclidean distance. 

In [None]:
# Same plot as before, but with each toy vector normalized first

tools.plot_vectors(
    *(tools.normalize_vector(vec) for vec in (doc1, doc2, doc3)),
    plot_difference=True,
)

In [None]:
# Pairwise Euclidean distances, this time between the normalized vectors

pd.DataFrame(
    data=euclidean_distances(tools.normalize_vector(docs)),
    index=index_labels,
    columns=column_labels,
)

### Cosine similarity

We saw that the Euclidean distance can be affected by the vectors' length. The cosine similarity is another commonly used metric to measure document similarity, and it isn't affected by the vectors' length.

The cosine similarity is the cosine of the angle between two vectors. 

* If the angle between two vectors is small (𝜽 → 0°), then the cosine of 𝜽 will be close to 1. 
* If the angle between two vectors is large (𝜽 → 90°), then the cosine of 𝜽 will be close to 0.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between the toy vectors

pd.DataFrame(
    data=cosine_similarity(docs),
    index=index_labels,
    columns=column_labels,
)

In [None]:
# Euclidean distances between the normalized vectors (same expression as in the
# Euclidean-distance section), presumably repeated here for comparison with the
# cosine similarities.
pd.DataFrame(euclidean_distances(tools.normalize_vector(docs)), index = index_labels, columns = column_labels)

Let's compare our tweets using the Twitter dataset. 

In [None]:
# Select the feature matrix to compare tweets with (here: the embeddings).
# X is another name for X_embed, not a copy.

X = X_embed
# Show the first three rows
X[0:3]

In [None]:
# Pairwise cosine similarity between all rows of X.
# TODO: also try the Euclidean distance on normalized vectors:
#similarity_df = pd.DataFrame(euclidean_distances(tools.normalize_vector(X)))
similarity_df = pd.DataFrame(cosine_similarity(X))

# Label rows and columns as doc_0, doc_1, ...
similarity_df.index = similarity_df.columns = ['doc_' + str(i) for i in range(len(X))]

similarity_df.head()

In [None]:
# Plot similarity metric using heatmaps
# NOTE(review): rendering is delegated to tools.plot_similarity; its sampling and
# colour scale are not visible here — see tools.py.

tools.plot_similarity(similarity_df)

In [None]:
def find_neighbohrs(idx,X,data = df, n_neighbors = 10):
    """Return the documents most similar to document ``idx``.

    Documents are ranked by cosine similarity to row ``idx`` of ``X``, most
    similar first. The query document itself is part of the ranking and will
    normally appear in first place.

    Parameters
    ----------
    idx : int
        Row position of the query document in ``X`` (and in ``data``).
    X : array-like of shape (n_documents, n_features)
        Feature matrix, one row per document.
    data : pandas.DataFrame
        Frame holding the tweets, row-aligned with ``X``. The tweet text is
        read from column position 1, the label from position -2 and the
        semantic category from position -1.
        NOTE(review): these positions are assumed from the column names used
        in the result — confirm against the CSV schema.
    n_neighbors : int, default 10
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``processed_tweet``, ``cosine_similarity``,
        ``euclidean_distance`` (computed on normalized vectors), ``label`` and
        ``semantic_category``, indexed by the matching ``data`` index labels.
    """
    # Only the query row of each pairwise matrix is needed, so compare that one
    # row against all rows instead of building two full N x N matrices.
    # `X[[idx]]` keeps the row 2-D and also works for negative positions.
    cos_to_query = cosine_similarity(X[[idx]], X)[0]

    # NOTE(review): assumes tools.normalize_vector returns a row-aligned array — confirm.
    X_unit = np.asarray(tools.normalize_vector(X))
    dist_to_query = euclidean_distances(X_unit[[idx]], X_unit)[0]

    # Sort neighbors with respect to cosine similarity, most similar first
    order = np.argsort(cos_to_query)[::-1][:n_neighbors]

    # Use the `data` argument (not the global frame) and attach every column by
    # position, so the result is correct even when `data` has a non-default index.
    return pd.DataFrame(
        {
            'processed_tweet': data.iloc[order, 1].to_numpy(),
            'cosine_similarity': cos_to_query[order],
            'euclidean_distance': dist_to_query[order],
            'label': data.iloc[order, -2].to_numpy(),
            'semantic_category': data.iloc[order, -1].to_numpy(),
        },
        index=data.index[order],
    )
 
# Pick one tweet uniformly at random

idx = random.randrange(len(df))
print('Doc idx: {}'.format(idx))

# Do not truncate long tweet texts when the table is displayed
pd.set_option('max_colwidth', None)
find_neighbohrs(idx, X_embed)

In [None]:
# Length (L2 norm) of every tf-idf row vector
np.linalg.norm(X_tfidf,axis=1)

In [None]:
# Norms of the 50 BOW rows with the smallest norm, in ascending order
np.linalg.norm(X_bow[np.linalg.norm(X_bow, axis=1).argsort()[:50]], axis=1)

In [None]:
# Norms of the 50 tf-idf rows with the smallest norm, in ascending order
np.linalg.norm(X_tfidf[np.linalg.norm(X_tfidf, axis=1).argsort()[:50]], axis=1)

In [None]:
# Row positions of the BOW vectors, ordered from smallest to largest norm
np.linalg.norm(X_bow, axis=1).argsort()

In [None]:
# Inspect specific tweets by row position.
# NOTE(review): hard-coded positions, presumably copied from the norm ordering
# computed earlier — confirm they still match the current data.
df.iloc[[113, 117, 346,  66]]

In [None]:
# Inspect specific tweets by row position.
# NOTE(review): hard-coded positions, presumably copied from the norm ordering
# computed earlier — confirm they still match the current data.
df.iloc[[113,  65,  66, 117, 346]]