In [1]:
import spacy

In [2]:
nlp = spacy.load("en_core_web_lg")

In [3]:
test = [nlp("i like trains"), nlp("i don't like trains")]

In [4]:
import numpy as np

len(test[0].vector)

300

In [5]:
test[0].similarity(test[1])

0.9298600073952216

In [6]:
import json

In [41]:
data = None
with open("./features-mfc_text.json", "r") as rfile:
    data = json.load(rfile)

In [42]:
data = data['queryData']

In [46]:
test_paragraph = list(data['raw'].values())[0][0]

print(test_paragraph)

print(nlp(test_paragraph).vector)


>
[ 6.2983e-02  2.3782e-01 -1.6229e-01 -9.9808e-02  7.9680e-01  1.6148e-01
  2.7428e-01 -1.7174e-02  5.3938e-01  5.2981e-01 -5.1177e-01 -4.7987e-01
  3.5326e-01 -2.7172e-01 -8.8922e-02 -2.5101e-01 -1.3392e-01  1.2531e+00
 -9.5383e-02 -1.6047e-01  1.1619e-01 -1.0296e-02 -2.2619e-01 -6.5673e-01
 -1.7455e-01 -6.1507e-01  4.4272e-01 -3.0587e-01  1.8314e-01  3.2081e-02
 -4.7337e-01  3.5571e-01  1.9935e-01  9.3620e-02  3.9080e-01 -1.3259e-01
 -5.0027e-01  3.3630e-01 -1.8837e-01  2.7080e-01  2.3933e-01  6.8090e-01
  1.7654e-01  4.7903e-01 -5.6406e-01 -5.5961e-01  7.9853e-02  1.4055e-02
  1.6635e-01  1.5758e-01 -1.2784e-01  2.6583e-01 -6.2025e-02 -3.5037e-02
 -1.4938e-01  9.3294e-04 -6.7079e-01 -5.4639e-01 -7.9506e-02 -1.9183e-01
 -5.1943e-01 -2.7233e-01  2.2175e-01 -3.5535e-01  6.1282e-01  8.5555e-02
 -1.5312e-01  3.7764e-01  2.7109e-01 -9.0895e-02  3.4908e-01  1.6253e-01
 -1.0036e-01  2.3266e-01  9.1574e-01  7.9775e-01 -4.1173e-01 -2.3230e-01
 -8.9770e-02 -2.8802e-01 -2.0416e-02  2.9105e-01 

In [78]:
import re

def get_paragraph_embeddings(data, nlp, min_doc_len = 10):
    
    urls = list(data['raw'].keys())
    
    paragraph_embeddings = np.zeros([1,300])
    
    for url in urls:
        
        for paragraph in data['raw'][url]:
            
            if len(paragraph) > min_doc_len:
                paragraph_embeddings = np.concatenate((paragraph_embeddings, nlp(paragraph).vector[:,np.newaxis].T), axis=0)
            else:
                paragraph_embeddings = np.concatenate((paragraph_embeddings, np.zeros([1,300])), axis=0)
        
    return paragraph_embeddings
    

    
        

In [79]:
p_embeddings = get_paragraph_embeddings(data, nlp)

In [80]:
p_embeddings.shape

(15516, 300)

In [81]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [82]:
model = KMeans(n_clusters=15)
scaler = StandardScaler()
norm_p_data = scaler.fit_transform(p_embeddings)

In [83]:
model.fit(norm_p_data)

KMeans(n_clusters=15)

In [84]:
centroids = model.cluster_centers_


In [85]:
from numpy.linalg import norm

centroid_norms = norm(centroids, axis = 1)
centroid_norms

array([ 6.18495573, 13.14363871, 18.40538196, 10.05729846,  5.51826104,
        5.14127209, 32.24653132, 11.88654789,  5.20187423, 28.24087105,
        6.54443483,  7.70116528,  6.12374557,  7.10556138,  6.3273087 ])

In [86]:
p_norms = norm(norm_p_data, axis=1)

In [87]:
p_norms.shape

(15516,)

In [88]:
sims = []

eps = np.array([0.0001] * p_norms.shape[0])

for i,centroid in enumerate(centroids):
    sims.append((p_embeddings @ centroid)/((p_norms*centroid_norms[i]) + eps))

In [89]:
sims

[array([ 0.        ,  0.        , -0.00179369, ...,  0.03817856,
         0.02194758,  0.00965754]),
 array([ 0.        ,  0.        , -0.03303529, ..., -0.20372383,
        -0.17862017, -0.12250312]),
 array([ 0.        ,  0.        , -0.00696822, ..., -0.09262263,
        -0.05196133, -0.05125579]),
 array([0.        , 0.        , 0.00851148, ..., 0.05084839, 0.04676363,
        0.04811449]),
 array([0.        , 0.        , 0.00730242, ..., 0.0147309 , 0.01394976,
        0.00902825]),
 array([0.        , 0.        , 0.01370431, ..., 0.04860636, 0.04178838,
        0.04969927]),
 array([ 0.        ,  0.        , -0.00267586, ..., -0.06470029,
        -0.03591674, -0.0393745 ]),
 array([ 0.        ,  0.        , -0.00392595, ...,  0.01284462,
        -0.00282267, -0.00442097]),
 array([0.        , 0.        , 0.00198391, ..., 0.0645111 , 0.04223795,
        0.02841419]),
 array([ 0.        ,  0.        ,  0.00478498, ...,  0.0014781 ,
        -0.02290019, -0.02035925]),
 array([0.    

In [90]:
np.shape(sims)

(15, 15516)

In [91]:
exemplar_indices = [np.argmax(sim) - 1 for sim in sims]

In [92]:
exemplar_indices

[2103,
 10890,
 5230,
 7413,
 1579,
 9814,
 916,
 2102,
 13747,
 300,
 10104,
 11620,
 5660,
 2845,
 5230]

In [93]:
paragraphs = []

for url in data['raw']:
    paragraphs = paragraphs + data['raw'][url]

for exemplar in exemplar_indices:
    print(paragraphs[exemplar])
    print("\n")

The analysis concludes that, in contrast to the U.S. position  at the talks, U.S. energy use, and thus emissions of carbon  dioxide, could be substantially reduced - with almost no harm to  the economy - by raising standards for energy and fuel efficiency,  increasing spending on government and private energy research and  adopting a national trading scheme for carbon dioxide cuts that gives a financial incentive to companies to go beyond required  reductions.




Dion Nickelson




Andrew Ross, Denver




The main difference now appears to be that more scientists - and others - are convinced that they know what's going to happen, and it's not good.




The United States signed the protocol in 1997 under President Bill Clinton, but the Senate refused to ratify it.


The event is his biggest effort so far to get Americans to recognize that global warming is one of the great challenges this country will face in the coming century, and that in his judgment, the time to begin dealing with 